新增压力、流量数据清洗前对异常值0的预处理方法

This commit is contained in:
JIANG
2025-11-12 15:06:14 +08:00
parent 57f6a96658
commit a0519295cb
2 changed files with 171 additions and 69 deletions

View File

@@ -105,8 +105,15 @@ def clean_pressure_data_dict_km(data_dict: dict, show_plot: bool = False) -> dic
data = pd.DataFrame(data_dict)
# 填充NaN值
data = data.ffill().bfill()
# 标准化
data_norm = (data - data.mean()) / data.std()
# 异常值预处理
# 将0值替换为NaN然后用线性插值填充
data_filled = data.replace(0, np.nan)
data_filled = data_filled.interpolate(method="linear", limit_direction="both")
# 如果仍有NaN(例如整列全为0的列),再用前后值填充;注意:整列全为NaN时ffill/bfill无法填充,该列仍为NaN
data_filled = data_filled.ffill().bfill()
# 标准化(使用填充后的数据)
data_norm = (data_filled - data_filled.mean()) / data_filled.std()
# 聚类与异常检测
k = 3
@@ -130,46 +137,59 @@ def clean_pressure_data_dict_km(data_dict: dict, show_plot: bool = False) -> dic
anomaly_details[data.index[pos]] = main_sensor
# 修复:滚动平均(窗口可调)
data_rolled = data.rolling(window=13, center=True, min_periods=1).mean()
data_repaired = data.copy()
data_rolled = data_filled.rolling(window=13, center=True, min_periods=1).mean()
data_repaired = data_filled.copy()
for pos in anomaly_pos:
label = data.index[pos]
sensor = anomaly_details[label]
data_repaired.loc[label, sensor] = data_rolled.loc[label, sensor]
# 可选可视化(使用位置作为 x 轴)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False
if show_plot and len(data.columns) > 0:
n = len(data)
time = np.arange(n)
plt.figure(figsize=(12, 8))
for col in data.columns:
plt.plot(time, data[col].values, marker='o', markersize=3, label=col)
plt.plot(
time, data[col].values, marker="o", markersize=3, label=col, alpha=0.5
)
for col in data_filled.columns:
plt.plot(
time,
data_filled[col].values,
marker="x",
markersize=3,
label=f"{col}_filled",
linestyle="--",
)
for pos in anomaly_pos:
sensor = anomaly_details[data.index[pos]]
plt.plot(pos, data.iloc[pos][sensor], 'ro', markersize=8)
plt.plot(pos, data_filled.iloc[pos][sensor], "ro", markersize=8)
plt.xlabel("时间点(序号)")
plt.ylabel("压力监测值")
plt.title("各传感器折线图(红色标记主要异常点)")
plt.title("各传感器折线图(红色标记主要异常点虚线为0值填充后")
plt.legend()
plt.show()
plt.figure(figsize=(12, 8))
for col in data_repaired.columns:
plt.plot(time, data_repaired[col].values, marker='o', markersize=3, label=col)
plt.plot(
time, data_repaired[col].values, marker="o", markersize=3, label=col
)
for pos in anomaly_pos:
sensor = anomaly_details[data.index[pos]]
plt.plot(pos, data_repaired.iloc[pos][sensor], 'go', markersize=8)
plt.xlabel("时间点(序号)")
plt.ylabel("修复后压力监测值")
plt.title("修复后各传感器折线图(绿色标记修复值)")
plt.legend()
plt.show()
plt.plot(pos, data_repaired.iloc[pos][sensor], "go", markersize=8)
plt.xlabel("时间点(序号)")
plt.ylabel("修复后压力监测值")
plt.title("修复后各传感器折线图(绿色标记修复值)")
plt.legend()
plt.show()
# 返回清洗后的字典
return data_repaired.to_dict(orient='list')
return data_repaired.to_dict(orient="list")
# 测试