修复数据清洗时间轴填补后的对齐问题

This commit is contained in:
2026-02-02 15:16:23 +08:00
parent 3c7e2c5806
commit 9be2028e4c
3 changed files with 38 additions and 20 deletions

View File

@@ -37,8 +37,9 @@ def fill_time_gaps(
start=data_indexed.index.min(), end=data_indexed.index.max(), freq=freq
)
-# 重索引以补齐缺失时间点
-data_reindexed = data_indexed.reindex(full_range)
+# 重索引以补齐缺失时间点,同时保留原始时间戳
+combined_index = data_indexed.index.union(full_range).sort_values().unique()
+data_reindexed = data_indexed.reindex(combined_index)
# 按列处理缺口
for col in data_reindexed.columns:
@@ -49,12 +50,12 @@ def fill_time_gaps(
missing_groups = (is_missing != is_missing.shift()).cumsum()
gap_lengths = is_missing.groupby(missing_groups).transform("sum")
-# 短缺口:线性插值
+# 短缺口:时间插值
short_gap_mask = is_missing & (gap_lengths <= short_gap_threshold)
if short_gap_mask.any():
data_reindexed.loc[short_gap_mask, col] = (
data_reindexed[col]
-.interpolate(method="linear", limit_area="inside")
+.interpolate(method="time", limit_area="inside")
.loc[short_gap_mask]
)
@@ -213,6 +214,12 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
data_filled = fill_time_gaps(
data, time_col="time", freq="1min", short_gap_threshold=10
)
+# 保存 time 列用于最后合并
+time_col_series = None
+if "time" in data_filled.columns:
+time_col_series = data_filled["time"]
+# 移除 time 列用于后续清洗
+data_filled = data_filled.drop(columns=["time"])
@@ -304,6 +311,10 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
plt.legend()
plt.show()
+# 将 time 列添加回结果
+if time_col_series is not None:
+data_repaired.insert(0, "time", time_col_series)
# 返回清洗后的字典
return data_repaired