import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer


def fill_time_gaps(
    data: pd.DataFrame,
    time_col: str = "time",
    freq: str = "1min",
    short_gap_threshold: int = 10,
) -> pd.DataFrame:
    """Fill missing timestamps and repair the resulting data gaps.

    Args:
        data: DataFrame containing a time column.
        time_col: name of the time column (default 'time').
        freq: resampling frequency (default '1min').
        short_gap_threshold: gaps of at most this many samples are linearly
            interpolated; longer gaps are forward-filled.

    Returns:
        DataFrame re-indexed onto the complete time range; the time column is
        restored as ISO-8601 strings with a '+00:00'-style UTC offset.

    Raises:
        ValueError: if *time_col* is missing from *data*.
    """
    if time_col not in data.columns:
        raise ValueError(f"时间列 '{time_col}' 不存在于数据中")

    # Parse the time column (forced to UTC) and make it the index.
    data = data.copy()
    data[time_col] = pd.to_datetime(data[time_col], utc=True)
    data_indexed = data.set_index(time_col)

    # Build the complete expected time range and re-index onto it so every
    # missing timestamp shows up as an all-NaN row.
    full_range = pd.date_range(
        start=data_indexed.index.min(), end=data_indexed.index.max(), freq=freq
    )
    data_reindexed = data_indexed.reindex(full_range)

    # Fill gaps column by column, choosing the strategy per gap length.
    for col in data_reindexed.columns:
        is_missing = data_reindexed[col].isna()
        # Label each run of consecutive missing/non-missing values, then
        # broadcast each run's length back onto its members.
        missing_groups = (is_missing != is_missing.shift()).cumsum()
        gap_lengths = is_missing.groupby(missing_groups).transform("sum")

        # Short gaps: linear interpolation (interior only, so leading and
        # trailing NaN runs are left untouched here).
        short_gap_mask = is_missing & (gap_lengths <= short_gap_threshold)
        if short_gap_mask.any():
            data_reindexed.loc[short_gap_mask, col] = (
                data_reindexed[col]
                .interpolate(method="linear", limit_area="inside")
                .loc[short_gap_mask]
            )

        # Long gaps: forward fill from the last observed value.
        long_gap_mask = is_missing & (gap_lengths > short_gap_threshold)
        if long_gap_mask.any():
            data_reindexed.loc[long_gap_mask, col] = (
                data_reindexed[col].ffill().loc[long_gap_mask]
            )

    # Restore the time column as a regular column, formatted as ISO strings.
    data_result = data_reindexed.reset_index()
    data_result.rename(columns={"index": time_col}, inplace=True)
    data_result[time_col] = data_result[time_col].dt.strftime("%Y-%m-%dT%H:%M:%S%z")
    # Python's %z emits '+0000'; insert the colon to get '+00:00'.  The
    # pattern also accepts negative offsets, although utc=True always
    # produces '+0000' here.
    data_result[time_col] = data_result[time_col].str.replace(
        r"([+-]\d{2})(\d{2})$", r"\1:\2", regex=True
    )
    return data_result


def clean_pressure_data_km(
    input_csv_path: str, show_plot: bool = False, fill_gaps: bool = True
) -> str:
    """Detect anomalies with KMeans and repair them with a rolling mean.

    Reads the input CSV, flags rows whose distance to their cluster centre
    exceeds mean + 3*std of all distances, replaces the most deviant sensor
    value of each flagged row with a centred rolling mean, and writes an
    ``_cleaned.xlsx`` next to the input file.  Raw data goes to sheet
    'raw_pressure_data', repaired data to sheet 'cleaned_pressusre_data'
    (NOTE(review): sheet name has a typo, kept byte-identical for backward
    compatibility with downstream readers).

    Args:
        input_csv_path: path to the input CSV file.
        show_plot: whether to display diagnostic plots.
        fill_gaps: whether to fill time gaps first (default True).

    Returns:
        Absolute path of the written Excel file.
    """
    input_csv_path = os.path.abspath(input_csv_path)
    data = pd.read_csv(input_csv_path, header=0, index_col=None, encoding="utf-8")

    # Optionally fill time gaps when a 'time' column is present.
    if fill_gaps and "time" in data.columns:
        data = fill_time_gaps(
            data, time_col="time", freq="1min", short_gap_threshold=10
        )

    # Separate the time column from the numeric sensor columns.
    time_col_data = None
    if "time" in data.columns:
        time_col_data = data["time"]
        data = data.drop(columns=["time"])

    # Standardise each column.  A constant column has std == 0 and becomes
    # all-NaN, which would crash KMeans; fill with 0 (same convention as
    # clean_pressure_data_df_km's imputer).
    data_norm = (data - data.mean()) / data.std()
    data_norm = data_norm.fillna(0.0)

    # Cluster and measure each row's distance to its assigned centre.
    k = 3
    kmeans = KMeans(n_clusters=k, init="k-means++", n_init=50, random_state=42)
    clusters = kmeans.fit_predict(data_norm)
    centers = kmeans.cluster_centers_
    distances = np.linalg.norm(data_norm.values - centers[clusters], axis=1)
    threshold = distances.mean() + 3 * distances.std()
    anomaly_pos = np.where(distances > threshold)[0]

    # For each anomalous row, record the sensor deviating most from the
    # cluster centre — that is the value we will repair.
    anomaly_details = {}
    for pos in anomaly_pos:
        row_norm = data_norm.iloc[pos]
        center = centers[clusters[pos]]
        diff = abs(row_norm - center)
        anomaly_details[data.index[pos]] = diff.idxmax()

    # Repair: replace each anomalous sensor value with a centred rolling
    # mean (window tunable).
    data_rolled = data.rolling(window=13, center=True, min_periods=1).mean()
    data_repaired = data.copy()
    for pos in anomaly_pos:
        label = data.index[pos]
        sensor = anomaly_details[label]
        data_repaired.loc[label, sensor] = data_rolled.loc[label, sensor]

    # Optional visualisation (positional x axis).
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    plt.rcParams["axes.unicode_minus"] = False
    if show_plot and len(data.columns) > 0:
        n = len(data)
        time = np.arange(n)

        plt.figure(figsize=(12, 8))
        for col in data.columns:
            plt.plot(time, data[col].values, marker="o", markersize=3, label=col)
        for pos in anomaly_pos:
            sensor = anomaly_details[data.index[pos]]
            plt.plot(pos, data.iloc[pos][sensor], "ro", markersize=8)
        plt.xlabel("时间点(序号)")
        plt.ylabel("压力监测值")
        plt.title("各传感器折线图(红色标记主要异常点)")
        plt.legend()
        plt.show()

        plt.figure(figsize=(12, 8))
        for col in data_repaired.columns:
            plt.plot(
                time, data_repaired[col].values, marker="o", markersize=3, label=col
            )
        for pos in anomaly_pos:
            sensor = anomaly_details[data.index[pos]]
            plt.plot(pos, data_repaired.iloc[pos][sensor], "go", markersize=8)
        plt.xlabel("时间点(序号)")
        plt.ylabel("修复后压力监测值")
        plt.title("修复后各传感器折线图(绿色标记修复值)")
        plt.legend()
        plt.show()

    # Write raw and repaired data to Excel, one sheet each, next to input.
    input_dir = os.path.dirname(input_csv_path)
    input_base = os.path.splitext(os.path.basename(input_csv_path))[0]
    output_path = os.path.join(input_dir, f"{input_base}_cleaned.xlsx")

    # Re-attach the time column, if there was one.
    data_for_save = data.copy()
    data_repaired_for_save = data_repaired.copy()
    if time_col_data is not None:
        data_for_save.insert(0, "time", time_col_data)
        data_repaired_for_save.insert(0, "time", time_col_data)

    if os.path.exists(output_path):
        os.remove(output_path)  # overwrite any previous output
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        data_for_save.to_excel(writer, sheet_name="raw_pressure_data", index=False)
        data_repaired_for_save.to_excel(
            writer, sheet_name="cleaned_pressusre_data", index=False
        )

    return os.path.abspath(output_path)


def clean_pressure_data_df_km(data, show_plot: bool = False) -> pd.DataFrame:
    """Detect anomalies with KMeans and repair them with a rolling mean.

    Accepts a DataFrame or any DataFrame-constructible mapping (e.g. a dict
    of column -> list, as used by the self-test below).  If a 'time' column
    is present, time gaps are filled first via :func:`fill_time_gaps`.

    Args:
        data: input DataFrame or column mapping (may contain a 'time' column).
        show_plot: whether to display diagnostic plots.

    Returns:
        The repaired DataFrame (dict-style access by column name works).
    """
    # Normalise the input to a private DataFrame copy.  The original code
    # assumed a DataFrame and crashed on the dict the __main__ test passes.
    if isinstance(data, pd.DataFrame):
        data = data.copy()
    else:
        data = pd.DataFrame(data)

    # Fill time gaps only when a 'time' column exists; calling
    # fill_time_gaps unconditionally (as before) raised ValueError for
    # time-less input.
    if "time" in data.columns:
        data_filled = fill_time_gaps(
            data, time_col="time", freq="1min", short_gap_threshold=10
        )
        # Drop the time column for the numeric cleaning steps.
        data_filled = data_filled.drop(columns=["time"])
    else:
        data_filled = data.copy()

    # Standardise the gap-filled data.
    data_norm = (data_filled - data_filled.mean()) / data_filled.std()

    # A constant column (std == 0) becomes all-NaN after standardisation;
    # fill NaN with 0 and keep empty features so KMeans can run.
    imputer = SimpleImputer(
        strategy="constant", fill_value=0, keep_empty_features=True
    )
    data_norm = pd.DataFrame(
        imputer.fit_transform(data_norm),
        columns=data_norm.columns,
        index=data_norm.index,
    )

    # Cluster and flag rows far from their centre (mean + 3*std rule).
    k = 3
    kmeans = KMeans(n_clusters=k, init="k-means++", n_init=50, random_state=42)
    clusters = kmeans.fit_predict(data_norm)
    centers = kmeans.cluster_centers_
    distances = np.linalg.norm(data_norm.values - centers[clusters], axis=1)
    threshold = distances.mean() + 3 * distances.std()
    anomaly_pos = np.where(distances > threshold)[0]

    # Labels must come from data_filled, the frame actually being repaired.
    # The original indexed data.index, which can be shorter than the
    # gap-filled frame and raise IndexError / mislabel rows.
    anomaly_details = {}
    for pos in anomaly_pos:
        row_norm = data_norm.iloc[pos]
        center = centers[clusters[pos]]
        diff = abs(row_norm - center)
        anomaly_details[data_filled.index[pos]] = diff.idxmax()

    # Repair: replace each anomalous sensor value with a centred rolling mean.
    data_rolled = data_filled.rolling(window=13, center=True, min_periods=1).mean()
    data_repaired = data_filled.copy()
    for pos in anomaly_pos:
        label = data_filled.index[pos]
        sensor = anomaly_details[label]
        data_repaired.loc[label, sensor] = data_rolled.loc[label, sensor]

    # Optional visualisation (positional x axis).
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    plt.rcParams["axes.unicode_minus"] = False
    if show_plot and len(data_filled.columns) > 0:
        # Raw and gap-filled frames may differ in length, so each gets its
        # own x axis (the original reused one and could crash on a length
        # mismatch); the string 'time' column is skipped when plotting raw.
        x_raw = np.arange(len(data))
        x_filled = np.arange(len(data_filled))

        plt.figure(figsize=(12, 8))
        for col in data.columns:
            if col == "time":
                continue
            plt.plot(
                x_raw, data[col].values, marker="o", markersize=3, label=col,
                alpha=0.5,
            )
        for col in data_filled.columns:
            plt.plot(
                x_filled,
                data_filled[col].values,
                marker="x",
                markersize=3,
                label=f"{col}_filled",
                linestyle="--",
            )
        for pos in anomaly_pos:
            sensor = anomaly_details[data_filled.index[pos]]
            plt.plot(pos, data_filled.iloc[pos][sensor], "ro", markersize=8)
        plt.xlabel("时间点(序号)")
        plt.ylabel("压力监测值")
        plt.title("各传感器折线图(红色标记主要异常点,虚线为0值填充后)")
        plt.legend()
        plt.show()

        plt.figure(figsize=(12, 8))
        for col in data_repaired.columns:
            plt.plot(
                x_filled,
                data_repaired[col].values,
                marker="o",
                markersize=3,
                label=col,
            )
        for pos in anomaly_pos:
            sensor = anomaly_details[data_filled.index[pos]]
            plt.plot(pos, data_repaired.iloc[pos][sensor], "go", markersize=8)
        plt.xlabel("时间点(序号)")
        plt.ylabel("修复后压力监测值")
        plt.title("修复后各传感器折线图(绿色标记修复值)")
        plt.legend()
        plt.show()

    # Return the repaired DataFrame.
    return data_repaired


# Self-test for clean_pressure_data_df_km.
if __name__ == "__main__":
    import random

    # Load the sample SCADA pressure data next to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(script_dir, "szh_pressure_scada.csv")
    data = pd.read_csv(csv_path, header=0, index_col=None, encoding="utf-8")

    # Pick up to 5 random sensor columns, excluding 'Time' (clamped so
    # random.sample does not raise when fewer than 5 columns exist).
    columns_to_exclude = ["Time"]
    available_columns = [
        col for col in data.columns if col not in columns_to_exclude
    ]
    selected_columns = random.sample(
        available_columns, min(5, len(available_columns))
    )

    # Convert the selected columns to a plain dict of lists.
    data_dict = {col: data[col].tolist() for col in selected_columns}
    print("选中的列:", selected_columns)
    print("原始数据长度:", len(data_dict[selected_columns[0]]))

    # Run the cleaning pipeline and report the result shape.
    cleaned_dict = clean_pressure_data_df_km(data_dict, show_plot=True)
    print("清洗后的字典键:", list(cleaned_dict.keys()))
    print("清洗后的数据长度:", len(cleaned_dict[selected_columns[0]]))
    print("测试完成:函数运行正常")