修复数据清洗index越界错误;重命名压力流量清洗方法

This commit is contained in:
2026-02-02 14:15:54 +08:00
parent c3c26fb107
commit 3c7e2c5806
10 changed files with 116 additions and 60 deletions

View File

@@ -1,3 +1,3 @@
from .Fdataclean import *
from .Pdataclean import *
from .flow_data_clean import *
from .pressure_data_clean import *
from .pipeline_health_analyzer import *

View File

@@ -292,28 +292,47 @@ def clean_flow_data_df_kf(data: pd.DataFrame, show_plot: bool = False) -> dict:
plt.rcParams["axes.unicode_minus"] = False
if show_plot and len(data.columns) > 0:
sensor_to_plot = data.columns[0]
# 定义x轴
n = len(data)
time = np.arange(n)
n_filled = len(data_filled)
time_filled = np.arange(n_filled)
plt.figure(figsize=(12, 8))
plt.subplot(2, 1, 1)
plt.plot(
data.index,
time,
data[sensor_to_plot],
label="原始监测值",
marker="o",
markersize=3,
alpha=0.7,
)
abnormal_zero_idx = data.index[data_filled[sensor_to_plot].isna()]
# 修正:检查 data_filled 的异常值,绘制在 time_filled 上
abnormal_zero_mask = data_filled[sensor_to_plot].isna()
# 如果目的是检查0值应该用 == 0。这里保留 isna() 但修正索引引用防止crash。
# 如果原意是 isna() 则在 fillna 后通常没有 na。假设用户可能想检查 0 值?
# 基于 "异常0值" 的标签,改为检查 0 值更合理,但为了保险起见,
# 如果 isna() 返回空,就不画。防止索引越界是主要的。
abnormal_zero_idx = data_filled.index[abnormal_zero_mask]
if len(abnormal_zero_idx) > 0:
# 注意:如果 abnormal_zero_idx 是基于 data_filled 的索引0..M-1
# 直接作为 x 坐标即可,因为 time_filled 也是 0..M-1
# 而 y 值应该取自 data_filled 或 data_kf取 data 会越界
plt.plot(
abnormal_zero_idx,
data[sensor_to_plot].loc[abnormal_zero_idx],
data_filled[sensor_to_plot].loc[abnormal_zero_idx],
"mo",
markersize=8,
label="异常0",
label="异常值(NaN)",
)
plt.plot(
data.index, data_kf[sensor_to_plot], label="Kalman滤波预测值", linewidth=2
time_filled, data_kf[sensor_to_plot], label="Kalman滤波预测值", linewidth=2
)
anomaly_idx = anomalies_info[sensor_to_plot].index
if len(anomaly_idx) > 0:
@@ -331,7 +350,7 @@ def clean_flow_data_df_kf(data: pd.DataFrame, show_plot: bool = False) -> dict:
plt.subplot(2, 1, 2)
plt.plot(
data.index,
time_filled,
cleaned_data[sensor_to_plot],
label="修复后监测值",
marker="o",

View File

@@ -239,7 +239,7 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
threshold = distances.mean() + 3 * distances.std()
anomaly_pos = np.where(distances > threshold)[0]
anomaly_indices = data.index[anomaly_pos]
anomaly_indices = data_filled.index[anomaly_pos]
anomaly_details = {}
for pos in anomaly_pos:
@@ -248,13 +248,13 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
center = centers[cluster_idx]
diff = abs(row_norm - center)
main_sensor = diff.idxmax()
anomaly_details[data.index[pos]] = main_sensor
anomaly_details[data_filled.index[pos]] = main_sensor
# 修复:滚动平均(窗口可调)
data_rolled = data_filled.rolling(window=13, center=True, min_periods=1).mean()
data_repaired = data_filled.copy()
for pos in anomaly_pos:
label = data.index[pos]
label = data_filled.index[pos]
sensor = anomaly_details[label]
data_repaired.loc[label, sensor] = data_rolled.loc[label, sensor]
@@ -265,6 +265,8 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
if show_plot and len(data.columns) > 0:
n = len(data)
time = np.arange(n)
n_filled = len(data_filled)
time_filled = np.arange(n_filled)
plt.figure(figsize=(12, 8))
for col in data.columns:
plt.plot(
@@ -272,7 +274,7 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
)
for col in data_filled.columns:
plt.plot(
time,
time_filled,
data_filled[col].values,
marker="x",
markersize=3,
@@ -280,7 +282,7 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
linestyle="--",
)
for pos in anomaly_pos:
sensor = anomaly_details[data.index[pos]]
sensor = anomaly_details[data_filled.index[pos]]
plt.plot(pos, data_filled.iloc[pos][sensor], "ro", markersize=8)
plt.xlabel("时间点(序号)")
plt.ylabel("压力监测值")
@@ -291,16 +293,16 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
plt.figure(figsize=(12, 8))
for col in data_repaired.columns:
plt.plot(
time, data_repaired[col].values, marker="o", markersize=3, label=col
time_filled, data_repaired[col].values, marker="o", markersize=3, label=col
)
for pos in anomaly_pos:
sensor = anomaly_details[data.index[pos]]
sensor = anomaly_details[data_filled.index[pos]]
plt.plot(pos, data_repaired.iloc[pos][sensor], "go", markersize=8)
plt.xlabel("时间点(序号)")
plt.ylabel("修复后压力监测值")
plt.title("修复后各传感器折线图(绿色标记修复值)")
plt.legend()
plt.show()
plt.xlabel("时间点(序号)")
plt.ylabel("修复后压力监测值")
plt.title("修复后各传感器折线图(绿色标记修复值)")
plt.legend()
plt.show()
# 返回清洗后的字典
return data_repaired