优化传感器布置算法,修复数据库更新逻辑

This commit is contained in:
2026-04-17 17:21:50 +08:00
parent bf2aaa5ff7
commit 3b712ea467
7 changed files with 795 additions and 291 deletions
+475 -179
View File
@@ -1,18 +1,435 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import os import os
from app.algorithms._utils import fill_time_gaps ID_LIKE_COLUMNS = {
"id",
"device_id",
"node_id",
"sensor_id",
"monitor_id",
"junction_id",
}
def _normalize_time_frame(data: pd.DataFrame) -> pd.DataFrame:
"""返回按时间排序的副本,并尽量将 time 列解析为时间类型。"""
data = data.copy()
if "time" in data.columns:
data["time"] = pd.to_datetime(data["time"], errors="coerce")
data = data.sort_values(["time"]).reset_index(drop=True)
return data
def _select_pressure_columns(data: pd.DataFrame) -> tuple[list[str], list[str]]:
"""区分需要清洗的数值列与需要原样保留的列。"""
value_cols: list[str] = []
keep_cols: list[str] = []
for col in data.columns:
if col == "time":
continue
col_key = col.lower()
if col_key in ID_LIKE_COLUMNS or col_key.endswith("_id"):
keep_cols.append(col)
continue
numeric = pd.to_numeric(data[col], errors="coerce")
if numeric.notna().sum() == 0 or numeric.nunique(dropna=True) <= 1:
keep_cols.append(col)
else:
value_cols.append(col)
return value_cols, keep_cols
def _robust_scale(values: pd.Series) -> float:
"""基于 MAD 计算稳健尺度。"""
series = pd.to_numeric(values, errors="coerce").dropna()
if series.empty:
return 1.0
median = series.median()
mad = (series - median).abs().median()
if pd.notna(mad) and mad > 0:
return float(1.4826 * mad)
iqr = series.quantile(0.75) - series.quantile(0.25)
if pd.notna(iqr) and iqr > 0:
return float(iqr / 1.349)
std = series.std()
if pd.notna(std) and std > 0:
return float(std)
return 1.0
def _shrink_toward_baseline(observed: float, baseline: float, scale: float) -> float:
"""把观测值向基线值收缩,scale 越小,修复越强。"""
if pd.isna(observed):
return baseline
if pd.isna(baseline):
return observed
diff = observed - baseline
weight = scale / (abs(diff) + scale)
return float(baseline + diff * weight)
def _infer_time_frequency(time_values: pd.Series | pd.Index) -> pd.Timedelta:
"""从时间序列中推断采样频率,失败时默认 15 分钟。"""
parsed = pd.to_datetime(pd.Series(time_values), errors="coerce").dropna().sort_values()
if len(parsed) < 2:
return pd.Timedelta(minutes=15)
diffs = parsed.diff().dropna()
diffs = diffs[diffs > pd.Timedelta(0)]
if diffs.empty:
return pd.Timedelta(minutes=15)
mode = diffs.mode()
return mode.iloc[0] if not mode.empty else diffs.median()
def _build_local_pressure_baseline(series: pd.Series) -> pd.Series:
"""基于局部插值与中值滤波构造平滑基线。"""
baseline = _safe_time_interpolate(series)
baseline = baseline.rolling(window=5, center=True, min_periods=1).median()
baseline = _safe_time_interpolate(baseline)
return baseline.ffill().bfill()
def _build_seasonal_pressure_baseline(series: pd.Series) -> pd.Series:
"""按一天内的同一时刻构造季节性基线,适合日周期压力数据。"""
if not isinstance(series.index, pd.DatetimeIndex):
return pd.Series(np.nan, index=series.index, dtype=float)
slot_labels = pd.Series(series.index.strftime("%H:%M:%S"), index=series.index)
return series.groupby(slot_labels).transform("median")
def _detect_pressure_spikes(series: pd.Series, local_baseline: pd.Series) -> pd.Series:
"""识别单点异常上升/下降尖峰,避免过度修正正常波动。"""
residual = series - local_baseline
neighbor_center = (series.shift(1) + series.shift(-1)) / 2
curvature = series - neighbor_center
residual_scale = max(_robust_scale(residual), 1e-6)
curvature_scale = max(_robust_scale(curvature), 1e-6)
direction_flip = ((series - series.shift(1)) * (series.shift(-1) - series) < 0).fillna(False)
return (
residual.abs() > 3.5 * residual_scale
) & (
curvature.abs() > 3.0 * curvature_scale
) & direction_flip
def _fill_pressure_gaps(
original: pd.Series,
repaired: pd.Series,
local_baseline: pd.Series,
seasonal_baseline: pd.Series,
) -> pd.Series:
"""短缺口用局部插值,长缺口优先使用同一时刻的季节性轨迹。"""
missing_mask = original.isna()
if not missing_mask.any():
return repaired
gap_groups = (missing_mask != missing_mask.shift(fill_value=False)).cumsum()
gap_lengths = missing_mask.groupby(gap_groups).transform("sum").where(missing_mask, 0)
filled = repaired.copy()
short_gap_mask = missing_mask & (gap_lengths < 4)
long_gap_mask = missing_mask & ~short_gap_mask
filled[short_gap_mask] = local_baseline[short_gap_mask]
long_gap_fill = seasonal_baseline.where(seasonal_baseline.notna(), local_baseline)
filled[long_gap_mask] = long_gap_fill[long_gap_mask]
return filled
def _clean_pressure_series(series: pd.Series) -> pd.Series:
"""清洗单个压力时间序列。"""
series = pd.to_numeric(series, errors="coerce").astype(float)
local_baseline = _build_local_pressure_baseline(series)
spike_mask = _detect_pressure_spikes(series, local_baseline)
repaired = series.copy()
repaired[spike_mask] = local_baseline[spike_mask]
seasonal_baseline = _build_seasonal_pressure_baseline(repaired)
repaired = _fill_pressure_gaps(series, repaired, local_baseline, seasonal_baseline)
if repaired.isna().any():
repaired = repaired.where(repaired.notna(), local_baseline)
return repaired.ffill().bfill()
def _format_time_column(data: pd.DataFrame) -> pd.DataFrame:
"""统一输出时间格式,方便下游直接按 ISO 字符串解析。"""
if "time" not in data.columns:
return data
formatted = data.copy()
time_values = pd.to_datetime(formatted["time"], errors="coerce")
if time_values.isna().all():
return formatted
if time_values.dt.tz is not None:
time_strings = time_values.dt.strftime("%Y-%m-%dT%H:%M:%S%z")
time_strings = time_strings.str.replace(
r"([+-]\d{2})(\d{2})$",
r"\1:\2",
regex=True,
)
else:
time_strings = time_values.dt.strftime("%Y-%m-%dT%H:%M:%S")
formatted["time"] = time_strings.where(time_values.notna(), formatted["time"])
return formatted
def _expand_snapshot_time_grid(data: pd.DataFrame, freq: pd.Timedelta) -> pd.DataFrame:
"""仅补齐时间轴,不提前填充值,避免长缺口丢失原始形状特征。"""
expanded = data.copy()
expanded["time"] = pd.to_datetime(expanded["time"], errors="coerce")
expanded = expanded.dropna(subset=["time"]).sort_values("time")
if expanded.empty:
return data
indexed = expanded.set_index("time")
full_index = pd.date_range(indexed.index.min(), indexed.index.max(), freq=freq)
indexed = indexed.reindex(full_index)
indexed.index.name = "time"
return indexed.reset_index()
def _safe_datetime_index(values: pd.Series | pd.Index | list[object]) -> pd.DatetimeIndex | None:
"""尽量把时间值标准化为 DatetimeIndex;失败则返回 None。"""
parsed = pd.to_datetime(values, errors="coerce")
try:
datetime_index = pd.DatetimeIndex(parsed)
except (TypeError, ValueError):
return None
if datetime_index.isna().all():
return None
return datetime_index
def _safe_time_interpolate(series: pd.Series) -> pd.Series:
"""仅在索引确实是 DatetimeIndex 时使用 time interpolation。"""
if isinstance(series.index, pd.DatetimeIndex):
return series.interpolate(method="time", limit_direction="both")
return series.interpolate(limit_direction="both")
def _detect_long_form_identifier(data: pd.DataFrame, value_cols: list[str], keep_cols: list[str]) -> str | None:
"""识别 time/id/value 长表结构。"""
if "time" not in data.columns or len(value_cols) != 1:
return None
identifier_candidates = [
col
for col in keep_cols
if col.lower() in ID_LIKE_COLUMNS or col.lower().endswith("_id")
]
if len(identifier_candidates) != 1:
return None
if not data["time"].duplicated().any():
return None
return identifier_candidates[0]
def _clean_long_form_pressure(
data: pd.DataFrame,
value_col: str,
identifier_col: str,
keep_cols: list[str],
fill_gaps: bool,
) -> pd.DataFrame:
"""按测点拆分 long-form 压力数据,再逐列清洗后恢复原结构。"""
data = _normalize_time_frame(data)
wide_df = (
data[[identifier_col, "time", value_col]]
.pivot(index="time", columns=identifier_col, values=value_col)
.reset_index()
)
sensor_cols = [col for col in wide_df.columns if col != "time"]
cleaned_wide = _clean_snapshot_pressure(wide_df, sensor_cols, keep_cols=[], fill_gaps=fill_gaps)
cleaned_long = cleaned_wide.melt(
id_vars="time",
var_name=identifier_col,
value_name=value_col,
)
passthrough_cols = [col for col in keep_cols if col != identifier_col]
if passthrough_cols:
metadata = data[[identifier_col] + passthrough_cols].drop_duplicates(subset=[identifier_col])
cleaned_long = cleaned_long.merge(metadata, on=identifier_col, how="left")
try:
cleaned_long[identifier_col] = cleaned_long[identifier_col].astype(data[identifier_col].dtype)
except (TypeError, ValueError):
pass
cleaned_long = cleaned_long.sort_values(["time", identifier_col]).reset_index(drop=True)
ordered_cols = ["time", identifier_col] + passthrough_cols + [value_col]
cleaned_long = cleaned_long[[col for col in ordered_cols if col in cleaned_long.columns]]
return cleaned_long
def _build_time_slot_frame(
data: pd.DataFrame, value_col: str, expected_slots: int
) -> pd.DataFrame:
"""把重复时间点整理成 time x slot 的矩阵。"""
grouped = data.groupby("time", sort=True)
times = list(grouped.groups.keys())
slot_frame = pd.DataFrame(index=pd.Index(times, name="time"), columns=range(expected_slots), dtype=float)
for time_value, group in grouped:
values = pd.to_numeric(group[value_col], errors="coerce").tolist()
for slot_idx, value in enumerate(values[:expected_slots]):
slot_frame.loc[time_value, slot_idx] = value
return slot_frame
def _slot_baseline(slot_frame: pd.DataFrame) -> pd.DataFrame:
"""对每个槽位做时间插值和平滑,得到基线轨迹。"""
baseline = pd.DataFrame(index=slot_frame.index, columns=slot_frame.columns, dtype=float)
for col in slot_frame.columns:
series = slot_frame[col].astype(float)
series = _safe_time_interpolate(series)
series = series.rolling(window=5, center=True, min_periods=1).median()
series = _safe_time_interpolate(series).ffill().bfill()
baseline[col] = series
return baseline
def _choose_insertion_position(
observed: list[float], baseline_row: pd.Series, expected_slots: int
) -> int:
"""为少一个观测值的时间组选择最合理的插入位置。"""
missing_count = expected_slots - len(observed)
if missing_count <= 0:
return 0
best_pos = 0
best_cost = float("inf")
for insert_pos in range(expected_slots):
cost = 0.0
obs_idx = 0
for slot_idx in range(expected_slots):
if slot_idx == insert_pos:
continue
obs_value = observed[obs_idx]
base_value = float(baseline_row.iloc[slot_idx])
if pd.notna(obs_value) and pd.notna(base_value):
cost += abs(obs_value - base_value)
obs_idx += 1
if cost < best_cost:
best_cost = cost
best_pos = insert_pos
return best_pos
def _clean_repeated_timestamp_pressure(
data: pd.DataFrame, value_col: str, keep_cols: list[str]
) -> pd.DataFrame:
"""针对同一时间点重复采样的压力数据进行修复。"""
data = _normalize_time_frame(data)
grouped_sizes = data.groupby("time").size()
if grouped_sizes.empty:
return data
expected_slots = int(grouped_sizes.mode().iloc[0]) if not grouped_sizes.mode().empty else int(grouped_sizes.max())
expected_slots = max(expected_slots, int(grouped_sizes.max()))
slot_frame = _build_time_slot_frame(data, value_col, expected_slots)
baseline_frame = _slot_baseline(slot_frame)
residuals = slot_frame - baseline_frame
slot_scales = {
col: max(_robust_scale(residuals[col]), 1e-6) for col in residuals.columns
}
cleaned_rows: list[dict[str, object]] = []
grouped = data.groupby("time", sort=True)
for time_value, group in grouped:
observed_values = pd.to_numeric(group[value_col], errors="coerce").tolist()
baseline_row = baseline_frame.loc[time_value]
insert_pos = _choose_insertion_position(observed_values, baseline_row, expected_slots)
cleaned_values: list[float] = []
obs_idx = 0
for slot_idx in range(expected_slots):
if slot_idx == insert_pos and len(observed_values) < expected_slots:
cleaned_values.append(float(baseline_row.iloc[slot_idx]))
continue
if obs_idx >= len(observed_values):
cleaned_values.append(float(baseline_row.iloc[slot_idx]))
continue
observed = observed_values[obs_idx]
baseline = float(baseline_row.iloc[slot_idx])
cleaned_values.append(
_shrink_toward_baseline(observed, baseline, slot_scales.get(slot_idx, 1.0))
)
obs_idx += 1
# 其余字段原样保留;常量列(如 id)直接复制第一条记录即可
template_row = group.iloc[0].to_dict()
for slot_idx, cleaned_value in enumerate(cleaned_values):
row = dict(template_row)
row["time"] = time_value
row[value_col] = cleaned_value
cleaned_rows.append(row)
cleaned_df = pd.DataFrame(cleaned_rows)
cleaned_df = cleaned_df.sort_values(["time"]).reset_index(drop=True)
ordered_cols = ["time"] + keep_cols + [value_col]
ordered_cols = [col for col in ordered_cols if col in cleaned_df.columns]
remaining_cols = [col for col in cleaned_df.columns if col not in ordered_cols]
cleaned_df = cleaned_df[ordered_cols + remaining_cols]
return _format_time_column(cleaned_df)
def _clean_snapshot_pressure(
data: pd.DataFrame, value_cols: list[str], keep_cols: list[str], fill_gaps: bool
) -> pd.DataFrame:
"""针对单条时间序列或多列快照数据进行稳健修复。"""
data = _normalize_time_frame(data)
if fill_gaps and "time" in data.columns:
freq = _infer_time_frequency(data["time"])
data = _expand_snapshot_time_grid(data, freq)
data["time"] = pd.to_datetime(data["time"], errors="coerce")
data = data.sort_values(["time"]).reset_index(drop=True)
cleaned_df = data.copy()
time_index = (
_safe_datetime_index(cleaned_df["time"])
if "time" in cleaned_df.columns
else None
)
if time_index is None:
time_index = pd.RangeIndex(start=0, stop=len(cleaned_df))
for col in value_cols:
series = pd.Series(
pd.to_numeric(cleaned_df[col], errors="coerce").to_numpy(),
index=time_index,
dtype=float,
)
cleaned_df[col] = _clean_pressure_series(series).to_numpy()
ordered_cols = ["time"] + keep_cols + value_cols
ordered_cols = [col for col in ordered_cols if col in cleaned_df.columns]
remaining_cols = [col for col in cleaned_df.columns if col not in ordered_cols]
cleaned_df = cleaned_df[ordered_cols + remaining_cols]
return _format_time_column(cleaned_df)
def clean_pressure_data_km( def clean_pressure_data_km(
input_csv_path: str, show_plot: bool = False, fill_gaps: bool = True input_csv_path: str, show_plot: bool = False, fill_gaps: bool = True
) -> str: ) -> str:
""" """
读取输入 CSV,基于 KMeans 检测异常并用滚动平均修复。输出为 <input_basename>_cleaned.xlsx(同目录)。 读取输入 CSV,基于时间结构进行稳健修复。输出为 <input_basename>_cleaned.xlsx(同目录)。
原始数据在 sheet 'raw_pressure_data',处理后数据在 sheet 'cleaned_pressusre_data' 原始数据在 sheet 'raw_pressure_data',处理后数据在 sheet 'cleaned_pressusre_data'
返回输出文件的绝对路径。 返回输出文件的绝对路径。
@@ -24,80 +441,38 @@ def clean_pressure_data_km(
# 读取 CSV # 读取 CSV
input_csv_path = os.path.abspath(input_csv_path) input_csv_path = os.path.abspath(input_csv_path)
data = pd.read_csv(input_csv_path, header=0, index_col=None, encoding="utf-8") data = pd.read_csv(input_csv_path, header=0, index_col=None, encoding="utf-8")
data = _normalize_time_frame(data)
value_cols, keep_cols = _select_pressure_columns(data)
has_repeated_time = "time" in data.columns and data["time"].duplicated().any()
identifier_col = _detect_long_form_identifier(data, value_cols, keep_cols)
# 补齐时间缺口(如果数据包含 time 列) if identifier_col is not None:
if fill_gaps and "time" in data.columns: data_repaired = _clean_long_form_pressure(
data = fill_time_gaps( data,
data, time_col="time", freq="1min", short_gap_threshold=10 value_cols[0],
identifier_col,
keep_cols,
fill_gaps,
) )
elif has_repeated_time and len(value_cols) == 1:
data_repaired = _clean_repeated_timestamp_pressure(data, value_cols[0], keep_cols)
else:
data_repaired = _clean_snapshot_pressure(data, value_cols, keep_cols, fill_gaps)
# 分离时间列和数值列 # 可选可视化(只展示首个数值列
time_col_data = None
if "time" in data.columns:
time_col_data = data["time"]
data = data.drop(columns=["time"])
# 标准化
data_norm = (data - data.mean()) / data.std()
# 聚类与异常检测
k = 3
kmeans = KMeans(n_clusters=k, init="k-means++", n_init=50, random_state=42)
clusters = kmeans.fit_predict(data_norm)
centers = kmeans.cluster_centers_
distances = np.linalg.norm(data_norm.values - centers[clusters], axis=1)
threshold = distances.mean() + 3 * distances.std()
anomaly_pos = np.where(distances > threshold)[0]
anomaly_indices = data.index[anomaly_pos]
anomaly_details = {}
for pos in anomaly_pos:
row_norm = data_norm.iloc[pos]
cluster_idx = clusters[pos]
center = centers[cluster_idx]
diff = abs(row_norm - center)
main_sensor = diff.idxmax()
anomaly_details[data.index[pos]] = main_sensor
# 修复:滚动平均(窗口可调)
data_rolled = data.rolling(window=13, center=True, min_periods=1).mean()
data_repaired = data.copy()
for pos in anomaly_pos:
label = data.index[pos]
sensor = anomaly_details[label]
data_repaired.loc[label, sensor] = data_rolled.loc[label, sensor]
# 可选可视化(使用位置作为 x 轴)
plt.rcParams["font.sans-serif"] = ["SimHei"] plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False plt.rcParams["axes.unicode_minus"] = False
if show_plot and value_cols:
if show_plot and len(data.columns) > 0: plot_col = value_cols[0]
n = len(data) if "time" in data_repaired.columns:
time = np.arange(n) x = pd.to_datetime(data_repaired["time"], errors="coerce")
plt.figure(figsize=(12, 8)) else:
for col in data.columns: x = np.arange(len(data_repaired))
plt.plot(time, data[col].values, marker="o", markersize=3, label=col) plt.figure(figsize=(12, 6))
for pos in anomaly_pos: plt.plot(x, pd.to_numeric(data_repaired[plot_col], errors="coerce"), label="cleaned")
sensor = anomaly_details[data.index[pos]] plt.xlabel("时间" if "time" in data_repaired.columns else "序号")
plt.plot(pos, data.iloc[pos][sensor], "ro", markersize=8)
plt.xlabel("时间点(序号)")
plt.ylabel("压力监测值") plt.ylabel("压力监测值")
plt.title("各传感器折线图(红色标记主要异常点)") plt.title(f"{plot_col} 清洗结果")
plt.legend()
plt.show()
plt.figure(figsize=(12, 8))
for col in data_repaired.columns:
plt.plot(
time, data_repaired[col].values, marker="o", markersize=3, label=col
)
for pos in anomaly_pos:
sensor = anomaly_details[data.index[pos]]
plt.plot(pos, data_repaired.iloc[pos][sensor], "go", markersize=8)
plt.xlabel("时间点(序号)")
plt.ylabel("修复后压力监测值")
plt.title("修复后各传感器折线图(绿色标记修复值)")
plt.legend() plt.legend()
plt.show() plt.show()
@@ -110,9 +485,6 @@ def clean_pressure_data_km(
# 如果原始数据包含时间列,将其添加回结果 # 如果原始数据包含时间列,将其添加回结果
data_for_save = data.copy() data_for_save = data.copy()
data_repaired_for_save = data_repaired.copy() data_repaired_for_save = data_repaired.copy()
if time_col_data is not None:
data_for_save.insert(0, "time", time_col_data)
data_repaired_for_save.insert(0, "time", time_col_data)
if os.path.exists(output_path): if os.path.exists(output_path):
os.remove(output_path) # 覆盖同名文件 os.remove(output_path) # 覆盖同名文件
@@ -126,10 +498,10 @@ def clean_pressure_data_km(
return os.path.abspath(output_path) return os.path.abspath(output_path)
def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> dict: def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> pd.DataFrame:
""" """
接收一个 DataFrame 数据结构,使用KMeans聚类检测异常并用滚动平均修复 接收一个 DataFrame 数据结构,使用时间感知的稳健修复方法清洗压力数据
返回清洗后的字典数据结构 返回清洗后的 DataFrame
Args: Args:
data: 输入 DataFrame(可包含 time 列) data: 输入 DataFrame(可包含 time 列)
@@ -137,113 +509,37 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
""" """
# 使用传入的 DataFrame # 使用传入的 DataFrame
data = data.copy() data = data.copy()
data = _normalize_time_frame(data)
value_cols, keep_cols = _select_pressure_columns(data)
has_repeated_time = "time" in data.columns and data["time"].duplicated().any()
identifier_col = _detect_long_form_identifier(data, value_cols, keep_cols)
# 补齐时间缺口(如果启用且数据包含 time 列) if identifier_col is not None:
data_filled = fill_time_gaps( data_repaired = _clean_long_form_pressure(
data, time_col="time", freq="1min", short_gap_threshold=10 data,
) value_cols[0],
identifier_col,
keep_cols,
fill_gaps=True,
)
elif has_repeated_time and len(value_cols) == 1:
data_repaired = _clean_repeated_timestamp_pressure(data, value_cols[0], keep_cols)
else:
data_repaired = _clean_snapshot_pressure(data, value_cols, keep_cols, fill_gaps=True)
# 保存 time 列用于最后合并 if show_plot and value_cols:
time_col_series = None plt.rcParams["font.sans-serif"] = ["SimHei"]
if "time" in data_filled.columns: plt.rcParams["axes.unicode_minus"] = False
time_col_series = data_filled["time"] plot_col = value_cols[0]
x = pd.to_datetime(data_repaired["time"], errors="coerce") if "time" in data_repaired.columns else np.arange(len(data_repaired))
# 移除 time 列用于后续清洗 plt.figure(figsize=(12, 6))
data_filled = data_filled.drop(columns=["time"]) plt.plot(x, pd.to_numeric(data_repaired[plot_col], errors="coerce"), label="cleaned")
plt.xlabel("时间" if "time" in data_repaired.columns else "序号")
# 标准化(使用填充后的数据)
data_norm = (data_filled - data_filled.mean()) / data_filled.std()
# 添加:处理标准化后的 NaN(例如,标准差为0的列),防止异常数据,时间段内所有数据都相同导致计算结果为 NaN
imputer = SimpleImputer(
strategy="constant", fill_value=0, keep_empty_features=True
) # 用 0 填充 NaN,包括全 NaN,并保留空特征
data_norm = pd.DataFrame(
imputer.fit_transform(data_norm),
columns=data_norm.columns,
index=data_norm.index,
)
# 聚类与异常检测
k = 3
kmeans = KMeans(n_clusters=k, init="k-means++", n_init=50, random_state=42)
clusters = kmeans.fit_predict(data_norm)
centers = kmeans.cluster_centers_
distances = np.linalg.norm(data_norm.values - centers[clusters], axis=1)
threshold = distances.mean() + 3 * distances.std()
anomaly_pos = np.where(distances > threshold)[0]
anomaly_indices = data_filled.index[anomaly_pos]
anomaly_details = {}
for pos in anomaly_pos:
row_norm = data_norm.iloc[pos]
cluster_idx = clusters[pos]
center = centers[cluster_idx]
diff = abs(row_norm - center)
main_sensor = diff.idxmax()
anomaly_details[data_filled.index[pos]] = main_sensor
# 修复:滚动平均(窗口可调)
data_rolled = data_filled.rolling(window=13, center=True, min_periods=1).mean()
data_repaired = data_filled.copy()
for pos in anomaly_pos:
label = data_filled.index[pos]
sensor = anomaly_details[label]
data_repaired.loc[label, sensor] = data_rolled.loc[label, sensor]
# 可选可视化(使用位置作为 x 轴)
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False
if show_plot and len(data.columns) > 0:
n = len(data)
time = np.arange(n)
n_filled = len(data_filled)
time_filled = np.arange(n_filled)
plt.figure(figsize=(12, 8))
for col in data.columns:
plt.plot(
time, data[col].values, marker="o", markersize=3, label=col, alpha=0.5
)
for col in data_filled.columns:
plt.plot(
time_filled,
data_filled[col].values,
marker="x",
markersize=3,
label=f"{col}_filled",
linestyle="--",
)
for pos in anomaly_pos:
sensor = anomaly_details[data_filled.index[pos]]
plt.plot(pos, data_filled.iloc[pos][sensor], "ro", markersize=8)
plt.xlabel("时间点(序号)")
plt.ylabel("压力监测值") plt.ylabel("压力监测值")
plt.title("各传感器折线图(红色标记主要异常点,虚线为0值填充后)") plt.title(f"{plot_col} 清洗结果")
plt.legend() plt.legend()
plt.show() plt.show()
plt.figure(figsize=(12, 8))
for col in data_repaired.columns:
plt.plot(
time_filled, data_repaired[col].values, marker="o", markersize=3, label=col
)
for pos in anomaly_pos:
sensor = anomaly_details[data_filled.index[pos]]
plt.plot(pos, data_repaired.iloc[pos][sensor], "go", markersize=8)
plt.xlabel("时间点(序号)")
plt.ylabel("修复后压力监测值")
plt.title("修复后各传感器折线图(绿色标记修复值)")
plt.legend()
plt.show()
# 将 time 列添加回结果
if time_col_series is not None:
data_repaired.insert(0, "time", time_col_series)
# 返回清洗后的字典
return data_repaired return data_repaired
+19 -57
View File
@@ -6,23 +6,21 @@ import sklearn.cluster
import os import os
class QD_KMeans(object): class QD_KMeans(object):
def __init__(self, wn, num_monitors): def __init__(self, wn, num_monitors):
# self.inp = inp # self.inp = inp
self.cluster_num = num_monitors # 聚类中心个数,也即测压点个数 self.cluster_num = num_monitors # 聚类中心个数,也即测压点个数
self.wn=wn self.wn = wn
self.monitor_nodes = [] self.monitor_nodes = []
self.coords = [] self.coords = []
self.junction_nodes = {} # Added missing initialization self.junction_nodes = {} # Added missing initialization
def get_junctions_coordinates(self): def get_junctions_coordinates(self):
for junction_name in self.wn.junction_name_list: for junction_name in self.wn.junction_name_list:
junction = self.wn.get_node(junction_name) junction = self.wn.get_node(junction_name)
self.junction_nodes[junction_name] = junction.coordinates self.junction_nodes[junction_name] = junction.coordinates
self.coords.append(junction.coordinates ) self.coords.append(junction.coordinates)
# print(f"Total junctions: {self.junction_coordinates}") # print(f"Total junctions: {self.junction_coordinates}")
@@ -30,8 +28,10 @@ class QD_KMeans(object):
if not self.coords: # Add check if coordinates are collected if not self.coords: # Add check if coordinates are collected
self.get_junctions_coordinates() self.get_junctions_coordinates()
coords = np.array(self.coords) coords = np.array(self.coords)
coords_normalized = (coords - coords.min(axis=0)) / (coords.max(axis=0) - coords.min(axis=0)) coords_normalized = (coords - coords.min(axis=0)) / (
kmeans = sklearn.cluster.KMeans(n_clusters= self.cluster_num, random_state=42) coords.max(axis=0) - coords.min(axis=0)
)
kmeans = sklearn.cluster.KMeans(n_clusters=self.cluster_num, random_state=42)
kmeans.fit(coords_normalized) kmeans.fit(coords_normalized)
for center in kmeans.cluster_centers_: for center in kmeans.cluster_centers_:
@@ -41,69 +41,31 @@ class QD_KMeans(object):
return self.monitor_nodes return self.monitor_nodes
def visualize_network(self): def visualize_network(self):
"""Visualize network with monitoring points""" """Visualize network with monitoring points"""
ax=wntr.graphics.plot_network(self.wn, ax = wntr.graphics.plot_network(
node_attribute=self.monitor_nodes, self.wn,
node_size=30, node_attribute=self.monitor_nodes,
title='Optimal sensor') node_size=30,
title="Optimal sensor",
)
plt.show() plt.show()
def kmeans_sensor_placement(name: str, sensor_num: int, min_diameter: int) -> list: def kmeans_sensor_placement(name: str, sensor_num: int, min_diameter: int) -> list:
inp_name = f'./db_inp/{name}.db.inp' inp_name = f"./db_inp/{name}.db.inp"
wn= wntr.network.WaterNetworkModel(inp_name) wn = wntr.network.WaterNetworkModel(inp_name)
wn_cluster=QD_KMeans(wn, sensor_num) wn_cluster = QD_KMeans(wn, sensor_num)
# Select monitoring pointse # Select monitoring pointse
sensor_ids= wn_cluster.select_monitoring_points() sensor_ids = wn_cluster.select_monitoring_points()
# wn_cluster.visualize_network() # wn_cluster.visualize_network()
return sensor_ids return sensor_ids
if __name__ == "__main__": if __name__ == "__main__":
#sensorindex = get_ID(name='suzhouhe_2024_cloud_0817', sensor_num=30, min_diameter=500) # sensorindex = get_ID(name='suzhouhe_2024_cloud_0817', sensor_num=30, min_diameter=500)
sensorindex = kmeans_sensor_placement(name='szh', sensor_num=50, min_diameter=300) sensorindex = kmeans_sensor_placement(name="szh", sensor_num=50, min_diameter=300)
print(sensorindex) print(sensorindex)
+92 -44
View File
@@ -20,6 +20,7 @@ import geopandas as gpd
from sklearn.metrics import pairwise_distances from sklearn.metrics import pairwise_distances
import app.services.project_info as project_info import app.services.project_info as project_info
# 2025/03/12 # 2025/03/12
# Step1: 获取节点坐标 # Step1: 获取节点坐标
def getCoor(wn: wntr.network.WaterNetworkModel) -> pandas.DataFrame: def getCoor(wn: wntr.network.WaterNetworkModel) -> pandas.DataFrame:
@@ -31,7 +32,7 @@ def getCoor(wn: wntr.network.WaterNetworkModel) -> pandas.DataFrame:
# site: pandas.Series # site: pandas.Series
# index:节点名称(wn.node_name_list # index:节点名称(wn.node_name_list
# values:每个节点的坐标,格式为 tuple(如 (x, y) 或 (x, y, z) # values:每个节点的坐标,格式为 tuple(如 (x, y) 或 (x, y, z)
site = wn.query_node_attribute('coordinates') site = wn.query_node_attribute("coordinates")
# Coor: pandas.Series # Coor: pandas.Series
# index:与site相同(节点名称)。 # index:与site相同(节点名称)。
# values:坐标转换为numpy.ndarray(如array([10.5, 20.3]) # values:坐标转换为numpy.ndarray(如array([10.5, 20.3])
@@ -43,9 +44,9 @@ def getCoor(wn: wntr.network.WaterNetworkModel) -> pandas.DataFrame:
x.append(Coor.values[i][0]) # 将 x 坐标存入 x 列表。 x.append(Coor.values[i][0]) # 将 x 坐标存入 x 列表。
y.append(Coor.values[i][1]) # 将 y 坐标存入 y 列表 y.append(Coor.values[i][1]) # 将 y 坐标存入 y 列表
# xy: dict[str, list], x、y 坐标的字典 # xy: dict[str, list], x、y 坐标的字典
xy = {'x': x, 'y': y} xy = {"x": x, "y": y}
# Coor_node: pandas.DataFrame, 存储节点 x, y 坐标的 DataFrame # Coor_node: pandas.DataFrame, 存储节点 x, y 坐标的 DataFrame
Coor_node = pd.DataFrame(xy, index=wn.node_name_list, columns=['x', 'y']) Coor_node = pd.DataFrame(xy, index=wn.node_name_list, columns=["x", "y"])
return Coor_node return Coor_node
@@ -87,23 +88,23 @@ def skater_partition(G, n_clusters):
字典形式的聚类结果,键为区域编号,值为该区域内的节点列表。 字典形式的聚类结果,键为区域编号,值为该区域内的节点列表。
""" """
# 1. 获取所有节点坐标,假设每个节点都有 'pos' 属性 # 1. 获取所有节点坐标,假设每个节点都有 'pos' 属性
pos = nx.get_node_attributes(G, 'pos') pos = nx.get_node_attributes(G, "pos")
nodes = list(G.nodes()) nodes = list(G.nodes())
# 构造坐标数组:每行为 [x, y] # 构造坐标数组:每行为 [x, y]
coords = np.array([pos[node] for node in nodes]) coords = np.array([pos[node] for node in nodes])
# 2. 构造 GeoDataFrame:创建 DataFrame 并生成 geometry 列 # 2. 构造 GeoDataFrame:创建 DataFrame 并生成 geometry 列
df = pd.DataFrame(coords, columns=['x', 'y'], index=nodes) df = pd.DataFrame(coords, columns=["x", "y"], index=nodes)
# 利用 shapely 的 Point 构造空间位置 # 利用 shapely 的 Point 构造空间位置
df['geometry'] = df.apply(lambda row: Point(row['x'], row['y']), axis=1) df["geometry"] = df.apply(lambda row: Point(row["x"], row["y"]), axis=1)
gdf = gpd.GeoDataFrame(df, geometry='geometry') gdf = gpd.GeoDataFrame(df, geometry="geometry")
# 3. 构造空间权重矩阵,使用 4 近邻方法(k=4,可根据实际情况调整) # 3. 构造空间权重矩阵,使用 4 近邻方法(k=4,可根据实际情况调整)
w = ps.weights.KNN.from_array(coords, k=4) w = ps.weights.KNN.from_array(coords, k=4)
w.transform = 'R' w.transform = "R"
# 4. 调用 SKATER:新版本 API 要求传入 gdf, w 以及 attrs_name(这里使用 'x' 和 'y' 作为属性) # 4. 调用 SKATER:新版本 API 要求传入 gdf, w 以及 attrs_name(这里使用 'x' 和 'y' 作为属性)
skater = Skater(gdf, w, attrs_name=['x', 'y'], n_clusters=n_clusters) skater = Skater(gdf, w, attrs_name=["x", "y"], n_clusters=n_clusters)
skater.solve() skater.solve()
# 5. 获取聚类标签,构造成字典格式 # 5. 获取聚类标签,构造成字典格式
@@ -133,24 +134,24 @@ def spectral_partition(G, n_clusters):
键为聚类标签,值为该聚类对应的节点列表。 键为聚类标签,值为该聚类对应的节点列表。
""" """
# 1. 获取节点空间坐标,注意保证每个节点都有 'pos' 属性 # 1. 获取节点空间坐标,注意保证每个节点都有 'pos' 属性
pos_dict = nx.get_node_attributes(G, 'pos') pos_dict = nx.get_node_attributes(G, "pos")
nodes = list(G.nodes()) nodes = list(G.nodes())
coords = np.array([pos_dict[node] for node in nodes]) coords = np.array([pos_dict[node] for node in nodes])
# 2. 计算节点之间的欧氏距离矩阵 # 2. 计算节点之间的欧氏距离矩阵
D = pairwise_distances(coords, metric='euclidean') D = pairwise_distances(coords, metric="euclidean")
# 3. 计算 sigma 值:这里取所有距离的均值,当然也可以根据实际情况调整 # 3. 计算 sigma 值:这里取所有距离的均值,当然也可以根据实际情况调整
sigma = np.mean(D) sigma = np.mean(D)
# 4. 构造相似度矩阵:使用高斯核函数 # 4. 构造相似度矩阵:使用高斯核函数
# A(i, j) = exp( -d(i,j)^2 / (2*sigma^2) ) # A(i, j) = exp( -d(i,j)^2 / (2*sigma^2) )
A = np.exp(- (D ** 2) / (2 * sigma ** 2)) A = np.exp(-(D**2) / (2 * sigma**2))
# 5. 使用谱聚类进行图分区 # 5. 使用谱聚类进行图分区
clustering = SpectralClustering(n_clusters=n_clusters, clustering = SpectralClustering(
affinity='precomputed', n_clusters=n_clusters, affinity="precomputed", random_state=0
random_state=0) )
labels = clustering.fit_predict(A) labels = clustering.fit_predict(A)
# 6. 构造字典形式的分区结果 # 6. 构造字典形式的分区结果
@@ -160,6 +161,7 @@ def spectral_partition(G, n_clusters):
return groups return groups
# 2025/03/12 # 2025/03/12
# Step3: wn_func类,水力计算 # Step3: wn_func类,水力计算
# wn_func 主要用于计算: # wn_func 主要用于计算:
@@ -181,7 +183,7 @@ class wn_func(object):
self.results = wntr.sim.EpanetSimulator(wn).run_sim() # 存储运行结果 self.results = wntr.sim.EpanetSimulator(wn).run_sim() # 存储运行结果
self.wn = wn self.wn = wn
# self.qpandas.DataFrame,管道流量,索引为时间步长,列为管道名称 # self.qpandas.DataFrame,管道流量,索引为时间步长,列为管道名称
self.q = self.results.link['flowrate'] self.q = self.results.link["flowrate"]
# ReservoirIndex / Tankindex: list[str],水库 / 水箱节点名称列表 # ReservoirIndex / Tankindex: list[str],水库 / 水箱节点名称列表
ReservoirIndex = wn.reservoir_name_list ReservoirIndex = wn.reservoir_name_list
Tankindex = wn.tank_name_list Tankindex = wn.tank_name_list
@@ -191,7 +193,7 @@ class wn_func(object):
# self.nodes: list[str],所有节点的名称 # self.nodes: list[str],所有节点的名称
self.nodes = wn.node_name_list self.nodes = wn.node_name_list
# self.coordinatespandas.Series,节点坐标,索引为节点名,值为 (x, y) 坐标的 tuple # self.coordinatespandas.Series,节点坐标,索引为节点名,值为 (x, y) 坐标的 tuple
self.coordinates = wn.query_node_attribute('coordinates') self.coordinates = wn.query_node_attribute("coordinates")
# allpumps / allvalves: list[str],所有泵/阀门名称列表 # allpumps / allvalves: list[str],所有泵/阀门名称列表
allpumps = wn.pump_name_list allpumps = wn.pump_name_list
allvalves = wn.valve_name_list allvalves = wn.valve_name_list
@@ -222,17 +224,27 @@ class wn_func(object):
# 泵的起终点、tank、reservoir # 泵的起终点、tank、reservoir
# self.delnodes: list[str],需要删除的节点(包括水库、泵、阀门连接的节点) # self.delnodes: list[str],需要删除的节点(包括水库、泵、阀门连接的节点)
self.delnodes = list( self.delnodes = list(
set(ReservoirIndex).union(Tankindex, pumpstnode, pumpednode, valvestnode, valveednode, Reservoirednode)) set(ReservoirIndex).union(
Tankindex,
pumpstnode,
pumpednode,
valvestnode,
valveednode,
Reservoirednode,
)
)
# 泵、起终点为tank、reservoir的管道 # 泵、起终点为tank、reservoir的管道
# self.delpipes: list[str],需要删除的管道(包括水库、泵、阀门连接的管道) # self.delpipes: list[str],需要删除的管道(包括水库、泵、阀门连接的管道)
self.delpipes = list(set(wn.pump_name_list).union(wn.valve_name_list).union(Reservoirpipe)) self.delpipes = list(
set(wn.pump_name_list).union(wn.valve_name_list).union(Reservoirpipe)
)
self.pipes = [pipe for pipe in wn.pipe_name_list if pipe not in self.delpipes] self.pipes = [pipe for pipe in wn.pipe_name_list if pipe not in self.delpipes]
# self.L: list[float],所有管道的长度(以米为单位) # self.L: list[float],所有管道的长度(以米为单位)
self.L = wn.query_link_attribute('length')[self.pipes].tolist() self.L = wn.query_link_attribute("length")[self.pipes].tolist()
self.n = len(self.nodes) self.n = len(self.nodes)
self.m = len(self.pipes) self.m = len(self.pipes)
# self.unit_headloss: list[float],单位水头损失(headloss 数据的第一行,单位:米/km) # self.unit_headloss: list[float],单位水头损失(headloss 数据的第一行,单位:米/km)
self.unit_headloss = self.results.link['headloss'].iloc[0, :].tolist() self.unit_headloss = self.results.link["headloss"].iloc[0, :].tolist()
## ##
self.delnodes1 = list(set(ReservoirIndex).union(Tankindex)) self.delnodes1 = list(set(ReservoirIndex).union(Tankindex))
@@ -245,7 +257,9 @@ class wn_func(object):
end_node = wn.links[pipe].end_node.name end_node = wn.links[pipe].end_node.name
self.less_than_min_diameter_junction_list.extend([start_node, end_node]) self.less_than_min_diameter_junction_list.extend([start_node, end_node])
# 去重 # 去重
self.less_than_min_diameter_junction_list = list(set(self.less_than_min_diameter_junction_list)) self.less_than_min_diameter_junction_list = list(
set(self.less_than_min_diameter_junction_list)
)
# Step3.2: 计算水力距离 # Step3.2: 计算水力距离
def CtoS(self): def CtoS(self):
@@ -266,7 +280,7 @@ class wn_func(object):
q = self.q q = self.q
L = self.L L = self.L
# H1pandas.DataFrame,水头数据,索引为时间步长,列为节点名 # H1pandas.DataFrame,水头数据,索引为时间步长,列为节点名
H1 = self.results.node['head'].T H1 = self.results.node["head"].T
# hhlist[float],计算管道两端水头之差 # hhlist[float],计算管道两端水头之差
hh = [] hh = []
# 水头损失 # 水头损失
@@ -280,8 +294,18 @@ class wn_func(object):
# headlosspandas.DataFrame,管道水头损失矩阵 # headlosspandas.DataFrame,管道水头损失矩阵
headloss = pd.DataFrame(hh, index=pipes).T headloss = pd.DataFrame(hh, index=pipes).T
# s1:管道阻力系数,s2:将管道阻力系数与管道的起始节点和终止节点对应 # s1:管道阻力系数,s2:将管道阻力系数与管道的起始节点和终止节点对应
hf = pd.DataFrame(np.array([0] * (n ** 2)).reshape(n, n), index=nodes, columns=nodes, dtype=float) hf = pd.DataFrame(
weightL = pd.DataFrame(np.array([0] * (n ** 2)).reshape(n, n), index=nodes, columns=nodes, dtype=float) np.array([0] * (n**2)).reshape(n, n),
index=nodes,
columns=nodes,
dtype=float,
)
weightL = pd.DataFrame(
np.array([0] * (n**2)).reshape(n, n),
index=nodes,
columns=nodes,
dtype=float,
)
# s2为对应管道起始节点与终止节点的粗糙度系数矩阵,index代表起始节点,columns代表终止节点 # s2为对应管道起始节点与终止节点的粗糙度系数矩阵,index代表起始节点,columns代表终止节点
G = nx.DiGraph() G = nx.DiGraph()
for i in range(0, m): for i in range(0, m):
@@ -298,11 +322,16 @@ class wn_func(object):
weightL.loc[b, a] = headloss.loc[0, pipe] * L[i] weightL.loc[b, a] = headloss.loc[0, pipe] * L[i]
G.add_weighted_edges_from([(b, a, weightL.loc[b, a])]) G.add_weighted_edges_from([(b, a, weightL.loc[b, a])])
hydraulicL = pd.DataFrame(np.array([0] * (n ** 2)).reshape(n, n), index=nodes, columns=nodes, dtype=float) hydraulicL = pd.DataFrame(
np.array([0] * (n**2)).reshape(n, n),
index=nodes,
columns=nodes,
dtype=float,
)
for a in nodes: for a in nodes:
if a in G.nodes: if a in G.nodes:
d = nx.shortest_path_length(G, source=a, weight='weight') d = nx.shortest_path_length(G, source=a, weight="weight")
for b in list(d.keys()): for b in list(d.keys()):
hydraulicL.loc[a, b] = d[b] hydraulicL.loc[a, b] = d[b]
@@ -331,11 +360,17 @@ class wn_func(object):
for t in self.wn.tanks(): for t in self.wn.tanks():
self.nonjunc_index.append(t[0]) self.nonjunc_index.append(t[0])
# Connnumpy.matrix,节点-管道连接矩阵,起点 -1,终点 1 # Connnumpy.matrix,节点-管道连接矩阵,起点 -1,终点 1
Conn = np.mat(np.zeros([n, m - p - v])) # 节点和管道的关系矩阵,行为节点,列为管道,起点为-1,终点为1 Conn = np.mat(
np.zeros([n, m - p - v])
) # 节点和管道的关系矩阵,行为节点,列为管道,起点为-1,终点为1
# NConnnumpy.matrix,节点-节点连接矩阵,有管道相连的地方设为 1 # NConnnumpy.matrix,节点-节点连接矩阵,有管道相连的地方设为 1
NConn = np.mat(np.zeros([n, n])) # 节点之间的关系,之间有管道为1,反之为0 NConn = np.mat(np.zeros([n, n])) # 节点之间的关系,之间有管道为1,反之为0
# pipeslist[str],去除泵和阀门的管道列表 # pipeslist[str],去除泵和阀门的管道列表
pipes = [pipe for pipe in self.wn.pipes() if pipe not in self.wn.pumps() and pipe not in self.wn.valves()] pipes = [
pipe
for pipe in self.wn.pipes()
if pipe not in self.wn.pumps() and pipe not in self.wn.valves()
]
for pipe_name, pipe in pipes: for pipe_name, pipe in pipes:
start = self.wn.node_name_list.index(pipe.start_node_name) start = self.wn.node_name_list.index(pipe.start_node_name)
end = self.wn.node_name_list.index(pipe.end_node_name) end = self.wn.node_name_list.index(pipe.end_node_name)
@@ -345,12 +380,21 @@ class wn_func(object):
NConn[start, end] = 1 NConn[start, end] = 1
NConn[end, start] = 1 NConn[end, start] = 1
self.A = Conn self.A = Conn
link_name_list = [link for link in self.wn.link_name_list if link_name_list = [
link not in self.wn.pump_name_list and link not in self.wn.valve_name_list] link
self.A2 = pd.DataFrame(self.A, index=self.wn.node_name_list, columns=link_name_list) for link in self.wn.link_name_list
if link not in self.wn.pump_name_list
and link not in self.wn.valve_name_list
]
self.A2 = pd.DataFrame(
self.A, index=self.wn.node_name_list, columns=link_name_list
)
self.A2 = self.A2.drop(self.delnodes) self.A2 = self.A2.drop(self.delnodes)
for pipe in self.delpipes: for pipe in self.delpipes:
if pipe not in self.wn.pump_name_list and pipe not in self.wn.valve_name_list: if (
pipe not in self.wn.pump_name_list
and pipe not in self.wn.valve_name_list
):
self.A2 = self.A2.drop(columns=pipe) self.A2 = self.A2.drop(columns=pipe)
self.junc_list = self.A2.index self.junc_list = self.A2.index
self.A2 = np.mat(self.A2) # 节点管道关系 self.A2 = np.mat(self.A2) # 节点管道关系
@@ -372,10 +416,10 @@ class wn_func(object):
except EpanetException: except EpanetException:
pass pass
finally: finally:
h = result.link['headloss'][self.pipes].values[0] h = result.link["headloss"][self.pipes].values[0]
q = result.link['flowrate'][self.pipes].values[0] q = result.link["flowrate"][self.pipes].values[0]
l = self.wn.query_link_attribute('length')[self.pipes] l = self.wn.query_link_attribute("length")[self.pipes]
C = self.wn.query_link_attribute('roughness')[self.pipes] C = self.wn.query_link_attribute("roughness")[self.pipes]
# headlossnumpy.ndarray,水头损失数组 # headlossnumpy.ndarray,水头损失数组
headloss = np.array(h) headloss = np.array(h)
# 调整流量方向 # 调整流量方向
@@ -393,7 +437,7 @@ class wn_func(object):
try: try:
det = np.linalg.det(X) det = np.linalg.det(X)
except RuntimeError as e: except RuntimeError as e:
sign, logdet = slogdet(X) # 防止溢出 sign, logdet = slogdet(X) # 防止溢出
det = sign * np.exp(logdet) det = sign * np.exp(logdet)
if det != 0: if det != 0:
J_H_Cw = X.I * A * S J_H_Cw = X.I * A * S
@@ -430,7 +474,10 @@ class Sensorplacement(wn_func):
""" """
Sensorplacement 类继承了 wn_func 类,并且用于计算和优化传感器布置的位置。 Sensorplacement 类继承了 wn_func 类,并且用于计算和优化传感器布置的位置。
""" """
def __init__(self, wn: wntr.network.WaterNetworkModel, sensornum: int, min_diameter: int):
def __init__(
self, wn: wntr.network.WaterNetworkModel, sensornum: int, min_diameter: int
):
""" """
:param wn: 由wntr生成的模型 :param wn: 由wntr生成的模型
@@ -442,7 +489,9 @@ class Sensorplacement(wn_func):
# 1.某个节点到所有节点的加权距离之和 # 1.某个节点到所有节点的加权距离之和
# 2.某个节点到该组内所有节点的加权距离之和 # 2.某个节点到该组内所有节点的加权距离之和
def sensor(self, SS: pandas.DataFrame, G: networkx.Graph, group: dict[int, list[str]]): def sensor(
self, SS: pandas.DataFrame, G: networkx.Graph, group: dict[int, list[str]]
):
""" """
sensor 方法是用来根据灵敏度矩阵 SS 和加权图 G 来确定传感器布置位置的 sensor 方法是用来根据灵敏度矩阵 SS 和加权图 G 来确定传感器布置位置的
:param SS: 灵敏度矩阵,每个节点的行和列代表不同节点,矩阵元素表示节点间的灵敏度。SS.iloc[i, :] 表示第 i 行对应节点 i 到所有其他节点的灵敏度 :param SS: 灵敏度矩阵,每个节点的行和列代表不同节点,矩阵元素表示节点间的灵敏度。SS.iloc[i, :] 表示第 i 行对应节点 i 到所有其他节点的灵敏度
@@ -527,7 +576,7 @@ def get_ID(name: str, sensor_num: int, min_diameter: int) -> list[str]:
:return: 测压点节点ID :return: 测压点节点ID
""" """
# inp_file_realstr,输入文件名,表示原始水力模型文件的路径,该文件格式为 EPANET 输入文件(.inp),包含管网的结构信息、节点、管道、泵等数据 # inp_file_realstr,输入文件名,表示原始水力模型文件的路径,该文件格式为 EPANET 输入文件(.inp),包含管网的结构信息、节点、管道、泵等数据
inp_file_real = f'./db_inp/{name}.db.inp' inp_file_real = f"./db_inp/{name}.db.inp"
# sensornumint,需要布置的传感器数量 # sensornumint,需要布置的传感器数量
# sensornum = sensor_num # sensornum = sensor_num
# wn_realwntr.network.WaterNetworkModel,加载 EPANET 水力模型 # wn_realwntr.network.WaterNetworkModel,加载 EPANET 水力模型
@@ -538,7 +587,7 @@ def get_ID(name: str, sensor_num: int, min_diameter: int) -> list[str]:
results_real = sim_real.run_sim() results_real = sim_real.run_sim()
# real_Clist[float],包含所有管道粗糙度的列表 # real_Clist[float],包含所有管道粗糙度的列表
real_C = wn_real.query_link_attribute('roughness').tolist() real_C = wn_real.query_link_attribute("roughness").tolist()
# wn_fun1wn_func(继承自 object),创建 wn_func 类的实例,传入 wn_real 水力模型对象。wn_func 用于计算管网相关的水力属性,比如水力距离、灵敏度等 # wn_fun1wn_func(继承自 object),创建 wn_func 类的实例,传入 wn_real 水力模型对象。wn_func 用于计算管网相关的水力属性,比如水力距离、灵敏度等
wn_fun1 = wn_func(wn_real, min_diameter=min_diameter) wn_fun1 = wn_func(wn_real, min_diameter=min_diameter)
# nodeslist[str],管网的节点名称列表 # nodeslist[str],管网的节点名称列表
@@ -598,7 +647,6 @@ def get_ID(name: str, sensor_num: int, min_diameter: int) -> list[str]:
sensorindex, sensorindex_2 = wn_fun.sensor(SS, G, group) # 初始的sensorindex sensorindex, sensorindex_2 = wn_fun.sensor(SS, G, group) # 初始的sensorindex
# print(str(sensor_num), "个测压点,测压点位置:", sensorindex) # print(str(sensor_num), "个测压点,测压点位置:", sensorindex)
# 重新打开数据库 # 重新打开数据库
# if is_project_open(name=name): # if is_project_open(name=name):
# close_project(name=name) # close_project(name=name)
@@ -637,7 +685,7 @@ def get_ID(name: str, sensor_num: int, min_diameter: int) -> list[str]:
return sensorindex return sensorindex
if __name__ == '__main__': if __name__ == "__main__":
sensorindex = get_ID(name=project_info.name, sensor_num=20, min_diameter=300) sensorindex = get_ID(name=project_info.name, sensor_num=20, min_diameter=300)
print(sensorindex) print(sensorindex)
+1 -3
View File
@@ -6,7 +6,6 @@ import shutil
import threading import threading
from fastapi import APIRouter, HTTPException, File, UploadFile, Query, Path, Body from fastapi import APIRouter, HTTPException, File, UploadFile, Query, Path, Body
from fastapi.responses import PlainTextResponse from fastapi.responses import PlainTextResponse
import app.infra.db.influxdb.api as influxdb_api
import app.services.simulation as simulation import app.services.simulation as simulation
import app.services.globals as globals import app.services.globals as globals
from app.services.tjnetwork import ( from app.services.tjnetwork import (
@@ -28,8 +27,7 @@ from app.algorithms.sensor import (
pressure_sensor_placement_sensitivity, pressure_sensor_placement_sensitivity,
pressure_sensor_placement_kmeans, pressure_sensor_placement_kmeans,
) )
import app.algorithms.cleaning.flow as flow_data_clean
import app.algorithms.cleaning.pressure as pressure_data_clean
from app.services.network_import import network_update from app.services.network_import import network_update
from app.services.simulation_ops import ( from app.services.simulation_ops import (
project_management, project_management,
@@ -89,12 +89,17 @@ class ScadaRepository:
if field not in valid_fields: if field not in valid_fields:
raise ValueError(f"Invalid field: {field}") raise ValueError(f"Invalid field: {field}")
query = sql.SQL( update_query = sql.SQL(
"UPDATE scada.scada_data SET {} = %s WHERE time = %s AND device_id = %s" "UPDATE scada.scada_data SET {} = %s WHERE time = %s AND device_id = %s"
).format(sql.Identifier(field)) ).format(sql.Identifier(field))
insert_query = sql.SQL(
"INSERT INTO scada.scada_data (time, device_id, {}) VALUES (%s, %s, %s)"
).format(sql.Identifier(field))
async with conn.cursor() as cur: async with conn.cursor() as cur:
await cur.execute(query, (value, time, device_id)) await cur.execute(update_query, (value, time, device_id))
if cur.rowcount == 0:
await cur.execute(insert_query, (time, device_id, value))
@staticmethod @staticmethod
async def delete_scada_by_id_time_range( async def delete_scada_by_id_time_range(
+108
View File
@@ -0,0 +1,108 @@
import importlib.util
import sys
import types
from pathlib import Path
import numpy as np
import pandas as pd
def _load_pressure_cleaning_module():
project_root = Path(__file__).resolve().parents[2]
utils_path = project_root / "app" / "algorithms" / "_utils.py"
pressure_path = project_root / "app" / "algorithms" / "cleaning" / "pressure.py"
app_module = sys.modules.setdefault("app", types.ModuleType("app"))
algorithms_module = sys.modules.setdefault(
"app.algorithms",
types.ModuleType("app.algorithms"),
)
setattr(app_module, "algorithms", algorithms_module)
utils_spec = importlib.util.spec_from_file_location("app.algorithms._utils", utils_path)
assert utils_spec and utils_spec.loader
utils_module = importlib.util.module_from_spec(utils_spec)
sys.modules["app.algorithms._utils"] = utils_module
utils_spec.loader.exec_module(utils_module)
pressure_spec = importlib.util.spec_from_file_location(
"tests_pressure_under_test",
pressure_path,
)
assert pressure_spec and pressure_spec.loader
pressure_module = importlib.util.module_from_spec(pressure_spec)
pressure_spec.loader.exec_module(pressure_module)
return pressure_module
def test_clean_pressure_data_df_km_repairs_long_form_pressure_series():
module = _load_pressure_cleaning_module()
repo_root = Path(__file__).resolve().parents[3]
raw_df = pd.read_csv(repo_root / "data" / "node_simulation.csv")
noisy_df = pd.read_csv(repo_root / "data" / "node_simulation_noisy.csv")
cleaned_df = module.clean_pressure_data_df_km(noisy_df)
for df in (raw_df, noisy_df, cleaned_df):
df["time"] = pd.to_datetime(df["time"])
assert len(cleaned_df) == len(raw_df)
assert set(cleaned_df.columns) == {"time", "id", "pressure"}
assert cleaned_df["pressure"].isna().sum() == 0
noisy_joined = raw_df.merge(noisy_df, on=["time", "id"], how="inner", suffixes=("_raw", "_noisy"))
cleaned_joined = raw_df.merge(
cleaned_df,
on=["time", "id"],
how="inner",
suffixes=("_raw", "_clean"),
)
noisy_rmse = float(
np.sqrt(np.mean((noisy_joined["pressure_raw"] - noisy_joined["pressure_noisy"]) ** 2))
)
cleaned_rmse = float(
np.sqrt(np.mean((cleaned_joined["pressure_raw"] - cleaned_joined["pressure_clean"]) ** 2))
)
noisy_mae = float(
np.mean(np.abs(noisy_joined["pressure_raw"] - noisy_joined["pressure_noisy"]))
)
cleaned_mae = float(
np.mean(np.abs(cleaned_joined["pressure_raw"] - cleaned_joined["pressure_clean"]))
)
assert cleaned_rmse < 0.35
assert cleaned_rmse < noisy_rmse * 0.5
assert cleaned_mae < noisy_mae
repaired_gap = cleaned_df[
(cleaned_df["id"] == 170490)
& (cleaned_df["time"] == pd.Timestamp("2026-01-01T05:00:00+08:00"))
]["pressure"].iloc[0]
assert abs(repaired_gap - 30.62433433532715) < 1.0
spike_row = cleaned_df[
(cleaned_df["id"] == 42563)
& (cleaned_df["time"] == pd.Timestamp("2026-01-01T03:45:00+08:00"))
]["pressure"].iloc[0]
assert abs(spike_row - 28.018701553344727) < 2.0
def test_clean_pressure_data_df_km_accepts_single_sensor_wide_frame_with_utc_strings():
module = _load_pressure_cleaning_module()
repo_root = Path(__file__).resolve().parents[3]
noisy_df = pd.read_csv(repo_root / "data" / "node_simulation_noisy.csv")
single_sensor = (
noisy_df[noisy_df["id"] == 170490][["time", "pressure"]]
.rename(columns={"pressure": "170490"})
.copy()
)
single_sensor["time"] = (
pd.to_datetime(single_sensor["time"], utc=True).dt.strftime("%Y-%m-%dT%H:%M:%SZ")
)
cleaned_df = module.clean_pressure_data_df_km(single_sensor)
assert len(cleaned_df) == 192
assert cleaned_df["170490"].isna().sum() == 0
+87
View File
@@ -0,0 +1,87 @@
from datetime import datetime, timezone
import importlib.util
from pathlib import Path
import pytest
def _load_scada_repository():
module_path = (
Path(__file__).resolve().parents[2]
/ "app"
/ "infra"
/ "db"
/ "timescaledb"
/ "repositories"
/ "scada.py"
)
spec = importlib.util.spec_from_file_location("tests_scada_repo_under_test", module_path)
module = importlib.util.module_from_spec(spec)
assert spec and spec.loader
spec.loader.exec_module(module)
return module.ScadaRepository
class _FakeCursor:
def __init__(self, initial_rowcount: int):
self.initial_rowcount = initial_rowcount
self.rowcount = 0
self.calls: list[tuple[str, tuple]] = []
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc, tb):
return False
async def execute(self, query, params):
self.calls.append((str(query), params))
if len(self.calls) == 1:
self.rowcount = self.initial_rowcount
else:
self.rowcount = 1
class _FakeConnection:
def __init__(self, initial_rowcount: int):
self.cursor_instance = _FakeCursor(initial_rowcount)
def cursor(self):
return self.cursor_instance
@pytest.mark.asyncio
async def test_update_scada_field_inserts_when_update_hits_no_rows():
ScadaRepository = _load_scada_repository()
conn = _FakeConnection(initial_rowcount=0)
point_time = datetime(2026, 1, 1, 0, 0, tzinfo=timezone.utc)
await ScadaRepository.update_scada_field(
conn,
point_time,
"170490",
"cleaned_value",
26.5,
)
assert len(conn.cursor_instance.calls) == 2
assert "UPDATE scada.scada_data SET" in conn.cursor_instance.calls[0][0]
assert "INSERT INTO scada.scada_data" in conn.cursor_instance.calls[1][0]
@pytest.mark.asyncio
async def test_update_scada_field_skips_insert_when_update_succeeds():
ScadaRepository = _load_scada_repository()
conn = _FakeConnection(initial_rowcount=1)
point_time = datetime(2026, 1, 1, 0, 0, tzinfo=timezone.utc)
await ScadaRepository.update_scada_field(
conn,
point_time,
"170490",
"cleaned_value",
26.5,
)
assert len(conn.cursor_instance.calls) == 1
assert "UPDATE scada.scada_data SET" in conn.cursor_instance.calls[0][0]