修复数据清洗index越界错误;重命名压力流量清洗方法

This commit is contained in:
2026-02-02 14:15:54 +08:00
parent c3c26fb107
commit 3c7e2c5806
10 changed files with 116 additions and 60 deletions

View File

@@ -1,3 +1,3 @@
from .Fdataclean import * from .flow_data_clean import *
from .Pdataclean import * from .pressure_data_clean import *
from .pipeline_health_analyzer import * from .pipeline_health_analyzer import *

View File

@@ -292,28 +292,47 @@ def clean_flow_data_df_kf(data: pd.DataFrame, show_plot: bool = False) -> dict:
plt.rcParams["axes.unicode_minus"] = False plt.rcParams["axes.unicode_minus"] = False
if show_plot and len(data.columns) > 0: if show_plot and len(data.columns) > 0:
sensor_to_plot = data.columns[0] sensor_to_plot = data.columns[0]
# 定义x轴
n = len(data)
time = np.arange(n)
n_filled = len(data_filled)
time_filled = np.arange(n_filled)
plt.figure(figsize=(12, 8)) plt.figure(figsize=(12, 8))
plt.subplot(2, 1, 1) plt.subplot(2, 1, 1)
plt.plot( plt.plot(
data.index, time,
data[sensor_to_plot], data[sensor_to_plot],
label="原始监测值", label="原始监测值",
marker="o", marker="o",
markersize=3, markersize=3,
alpha=0.7, alpha=0.7,
) )
abnormal_zero_idx = data.index[data_filled[sensor_to_plot].isna()]
# 修正:在 data_filled 上检测异常值,并用 data_filled 自身的索引引用,
# 防止与 data 长度不一致时索引越界(这是本次修复的核心)。
# 注意:fillna 之后 isna() 通常为空集;若原意是标记"异常0值",
# 应改用 == 0 判断——待与原作者确认。返回空集时下方 if 会跳过绘制,不会崩溃。
abnormal_zero_mask = data_filled[sensor_to_plot].isna()
abnormal_zero_idx = data_filled.index[abnormal_zero_mask]
if len(abnormal_zero_idx) > 0: if len(abnormal_zero_idx) > 0:
# abnormal_zero_idx 基于 data_filled 的索引(0..M-1),与 time_filled 同域,
# 可直接作为 x 坐标;y 值必须取自 data_filled(取 data 可能越界)。
plt.plot( plt.plot(
abnormal_zero_idx, abnormal_zero_idx,
data[sensor_to_plot].loc[abnormal_zero_idx], data_filled[sensor_to_plot].loc[abnormal_zero_idx],
"mo", "mo",
markersize=8, markersize=8,
label="异常0", label="异常值(NaN)",
) )
plt.plot( plt.plot(
data.index, data_kf[sensor_to_plot], label="Kalman滤波预测值", linewidth=2 time_filled, data_kf[sensor_to_plot], label="Kalman滤波预测值", linewidth=2
) )
anomaly_idx = anomalies_info[sensor_to_plot].index anomaly_idx = anomalies_info[sensor_to_plot].index
if len(anomaly_idx) > 0: if len(anomaly_idx) > 0:
@@ -331,7 +350,7 @@ def clean_flow_data_df_kf(data: pd.DataFrame, show_plot: bool = False) -> dict:
plt.subplot(2, 1, 2) plt.subplot(2, 1, 2)
plt.plot( plt.plot(
data.index, time_filled,
cleaned_data[sensor_to_plot], cleaned_data[sensor_to_plot],
label="修复后监测值", label="修复后监测值",
marker="o", marker="o",

View File

@@ -239,7 +239,7 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
threshold = distances.mean() + 3 * distances.std() threshold = distances.mean() + 3 * distances.std()
anomaly_pos = np.where(distances > threshold)[0] anomaly_pos = np.where(distances > threshold)[0]
anomaly_indices = data.index[anomaly_pos] anomaly_indices = data_filled.index[anomaly_pos]
anomaly_details = {} anomaly_details = {}
for pos in anomaly_pos: for pos in anomaly_pos:
@@ -248,13 +248,13 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
center = centers[cluster_idx] center = centers[cluster_idx]
diff = abs(row_norm - center) diff = abs(row_norm - center)
main_sensor = diff.idxmax() main_sensor = diff.idxmax()
anomaly_details[data.index[pos]] = main_sensor anomaly_details[data_filled.index[pos]] = main_sensor
# 修复:滚动平均(窗口可调) # 修复:滚动平均(窗口可调)
data_rolled = data_filled.rolling(window=13, center=True, min_periods=1).mean() data_rolled = data_filled.rolling(window=13, center=True, min_periods=1).mean()
data_repaired = data_filled.copy() data_repaired = data_filled.copy()
for pos in anomaly_pos: for pos in anomaly_pos:
label = data.index[pos] label = data_filled.index[pos]
sensor = anomaly_details[label] sensor = anomaly_details[label]
data_repaired.loc[label, sensor] = data_rolled.loc[label, sensor] data_repaired.loc[label, sensor] = data_rolled.loc[label, sensor]
@@ -265,6 +265,8 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
if show_plot and len(data.columns) > 0: if show_plot and len(data.columns) > 0:
n = len(data) n = len(data)
time = np.arange(n) time = np.arange(n)
n_filled = len(data_filled)
time_filled = np.arange(n_filled)
plt.figure(figsize=(12, 8)) plt.figure(figsize=(12, 8))
for col in data.columns: for col in data.columns:
plt.plot( plt.plot(
@@ -272,7 +274,7 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
) )
for col in data_filled.columns: for col in data_filled.columns:
plt.plot( plt.plot(
time, time_filled,
data_filled[col].values, data_filled[col].values,
marker="x", marker="x",
markersize=3, markersize=3,
@@ -280,7 +282,7 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
linestyle="--", linestyle="--",
) )
for pos in anomaly_pos: for pos in anomaly_pos:
sensor = anomaly_details[data.index[pos]] sensor = anomaly_details[data_filled.index[pos]]
plt.plot(pos, data_filled.iloc[pos][sensor], "ro", markersize=8) plt.plot(pos, data_filled.iloc[pos][sensor], "ro", markersize=8)
plt.xlabel("时间点(序号)") plt.xlabel("时间点(序号)")
plt.ylabel("压力监测值") plt.ylabel("压力监测值")
@@ -291,10 +293,10 @@ def clean_pressure_data_df_km(data: pd.DataFrame, show_plot: bool = False) -> di
plt.figure(figsize=(12, 8)) plt.figure(figsize=(12, 8))
for col in data_repaired.columns: for col in data_repaired.columns:
plt.plot( plt.plot(
time, data_repaired[col].values, marker="o", markersize=3, label=col time_filled, data_repaired[col].values, marker="o", markersize=3, label=col
) )
for pos in anomaly_pos: for pos in anomaly_pos:
sensor = anomaly_details[data.index[pos]] sensor = anomaly_details[data_filled.index[pos]]
plt.plot(pos, data_repaired.iloc[pos][sensor], "go", markersize=8) plt.plot(pos, data_repaired.iloc[pos][sensor], "go", markersize=8)
plt.xlabel("时间点(序号)") plt.xlabel("时间点(序号)")
plt.ylabel("修复后压力监测值") plt.ylabel("修复后压力监测值")

View File

@@ -1,7 +1,7 @@
import os import os
import app.algorithms.api_ex.Fdataclean as Fdataclean import app.algorithms.api_ex.flow_data_clean as flow_data_clean
import app.algorithms.api_ex.Pdataclean as Pdataclean import app.algorithms.api_ex.pressure_data_clean as pressure_data_clean
############################################################ ############################################################
@@ -26,7 +26,7 @@ def flow_data_clean(input_csv_file: str) -> str:
if not os.path.exists(input_csv_path): if not os.path.exists(input_csv_path):
raise FileNotFoundError(f"指定的文件不存在: {input_csv_path}") raise FileNotFoundError(f"指定的文件不存在: {input_csv_path}")
# 调用 Fdataclean.clean_flow_data_kf 函数进行数据清洗 # 调用 Fdataclean.clean_flow_data_kf 函数进行数据清洗
out_xlsx_path = Fdataclean.clean_flow_data_kf(input_csv_path) out_xlsx_path = flow_data_clean.clean_flow_data_kf(input_csv_path)
print("清洗后的数据已保存到:", out_xlsx_path) print("清洗后的数据已保存到:", out_xlsx_path)
@@ -53,5 +53,5 @@ def pressure_data_clean(input_csv_file: str) -> str:
if not os.path.exists(input_csv_path): if not os.path.exists(input_csv_path):
raise FileNotFoundError(f"指定的文件不存在: {input_csv_path}") raise FileNotFoundError(f"指定的文件不存在: {input_csv_path}")
# 调用 Fdataclean.clean_flow_data_kf 函数进行数据清洗 # 调用 Fdataclean.clean_flow_data_kf 函数进行数据清洗
out_xlsx_path = Pdataclean.clean_pressure_data_km(input_csv_path) out_xlsx_path = pressure_data_clean.clean_pressure_data_km(input_csv_path)
print("清洗后的数据已保存到:", out_xlsx_path) print("清洗后的数据已保存到:", out_xlsx_path)

View File

@@ -30,8 +30,8 @@ from app.algorithms.sensors import (
pressure_sensor_placement_sensitivity, pressure_sensor_placement_sensitivity,
pressure_sensor_placement_kmeans, pressure_sensor_placement_kmeans,
) )
import app.algorithms.api_ex.Fdataclean as Fdataclean import app.algorithms.api_ex.flow_data_clean as flow_data_clean
import app.algorithms.api_ex.Pdataclean as Pdataclean import app.algorithms.api_ex.pressure_data_clean as pressure_data_clean
from app.services.network_import import network_update from app.services.network_import import network_update
from app.services.simulation_ops import ( from app.services.simulation_ops import (
project_management, project_management,
@@ -588,9 +588,9 @@ async def fastapi_scada_device_data_cleaning(
values = [record["value"] for record in type_scada_data[device_id]] values = [record["value"] for record in type_scada_data[device_id]]
df[device_id] = values df[device_id] = values
if device_type == "pressure": if device_type == "pressure":
cleaned_value_df = Pdataclean.clean_pressure_data_df_km(df) cleaned_value_df = pressure_data_clean.clean_pressure_data_df_km(df)
elif device_type == "pipe_flow": elif device_type == "pipe_flow":
cleaned_value_df = Fdataclean.clean_flow_data_df_kf(df) cleaned_value_df = flow_data_clean.clean_flow_data_df_kf(df)
cleaned_value_df = pd.DataFrame(cleaned_value_df) cleaned_value_df = pd.DataFrame(cleaned_value_df)
cleaned_df = pd.concat([df["time"], cleaned_value_df], axis=1) cleaned_df = pd.concat([df["time"], cleaned_value_df], axis=1)
influxdb_api.import_multicolumn_data_from_dict( influxdb_api.import_multicolumn_data_from_dict(

View File

@@ -3,14 +3,17 @@
记录系统关键操作,用于安全审计和合规追踪 记录系统关键操作,用于安全审计和合规追踪
""" """
from typing import Optional from typing import Optional
from datetime import datetime from datetime import datetime
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class AuditAction: class AuditAction:
"""审计操作类型常量""" """审计操作类型常量"""
# 认证相关 # 认证相关
LOGIN = "LOGIN" LOGIN = "LOGIN"
LOGOUT = "LOGOUT" LOGOUT = "LOGOUT"
@@ -32,6 +35,7 @@ class AuditAction:
SYSTEM_START = "SYSTEM_START" SYSTEM_START = "SYSTEM_START"
SYSTEM_STOP = "SYSTEM_STOP" SYSTEM_STOP = "SYSTEM_STOP"
async def log_audit_event( async def log_audit_event(
action: str, action: str,
user_id: Optional[int] = None, user_id: Optional[int] = None,
@@ -45,7 +49,7 @@ async def log_audit_event(
request_data: Optional[dict] = None, request_data: Optional[dict] = None,
response_status: Optional[int] = None, response_status: Optional[int] = None,
error_message: Optional[str] = None, error_message: Optional[str] = None,
db = None # 新增:可选的数据库实例 db=None, # 新增:可选的数据库实例
): ):
""" """
记录审计日志 记录审计日志
@@ -72,7 +76,18 @@ async def log_audit_event(
if request_data: if request_data:
request_data = sanitize_sensitive_data(request_data) request_data = sanitize_sensitive_data(request_data)
# 如果没有提供数据库实例,尝试获取(这在中间件中可能不可用) # 如果没有提供数据库实例,尝试从全局获取
if db is None:
try:
from app.infra.db.postgresql.database import db as default_db
# 仅当连接池已初始化时使用
if default_db.pool:
db = default_db
except ImportError:
pass
# 如果仍然没有数据库实例
if db is None: if db is None:
# 在某些上下文中可能无法获取,此时静默失败 # 在某些上下文中可能无法获取,此时静默失败
logger.warning("No database instance provided for audit logging") logger.warning("No database instance provided for audit logging")
@@ -92,7 +107,7 @@ async def log_audit_event(
request_path=request_path, request_path=request_path,
request_data=request_data, request_data=request_data,
response_status=response_status, response_status=response_status,
error_message=error_message error_message=error_message,
) )
logger.info( logger.info(
@@ -104,6 +119,7 @@ async def log_audit_event(
# 审计日志失败不应影响业务流程 # 审计日志失败不应影响业务流程
logger.error(f"Failed to create audit log: {e}", exc_info=True) logger.error(f"Failed to create audit log: {e}", exc_info=True)
def sanitize_sensitive_data(data: dict) -> dict: def sanitize_sensitive_data(data: dict) -> dict:
""" """
脱敏敏感数据 脱敏敏感数据
@@ -115,9 +131,16 @@ def sanitize_sensitive_data(data: dict) -> dict:
脱敏后的数据 脱敏后的数据
""" """
sensitive_fields = [ sensitive_fields = [
'password', 'passwd', 'pwd', "password",
'secret', 'token', 'api_key', 'apikey', "passwd",
'credit_card', 'ssn', 'social_security' "pwd",
"secret",
"token",
"api_key",
"apikey",
"credit_card",
"ssn",
"social_security",
] ]
sanitized = data.copy() sanitized = data.copy()

View File

@@ -1,11 +1,14 @@
from pydantic_settings import BaseSettings from pydantic_settings import BaseSettings
class Settings(BaseSettings): class Settings(BaseSettings):
PROJECT_NAME: str = "TJWater Server" PROJECT_NAME: str = "TJWater Server"
API_V1_STR: str = "/api/v1" API_V1_STR: str = "/api/v1"
# JWT 配置 # JWT 配置
SECRET_KEY: str = "your-secret-key-here-change-in-production-use-openssl-rand-hex-32" SECRET_KEY: str = (
"your-secret-key-here-change-in-production-use-openssl-rand-hex-32"
)
ALGORITHM: str = "HS256" ALGORITHM: str = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES: int = 30 ACCESS_TOKEN_EXPIRE_MINUTES: int = 30
REFRESH_TOKEN_EXPIRE_DAYS: int = 7 REFRESH_TOKEN_EXPIRE_DAYS: int = 7
@@ -20,6 +23,12 @@ class Settings(BaseSettings):
DB_USER: str = "postgres" DB_USER: str = "postgres"
DB_PASSWORD: str = "password" DB_PASSWORD: str = "password"
# Database Config (TimescaleDB)
TIMESCALEDB_DB_NAME: str = "tjwater"
TIMESCALEDB_DB_HOST: str = "localhost"
TIMESCALEDB_DB_PORT: str = "5433"
TIMESCALEDB_DB_USER: str = "postgres"
TIMESCALEDB_DB_PASSWORD: str = "password"
# InfluxDB # InfluxDB
INFLUXDB_URL: str = "http://localhost:8086" INFLUXDB_URL: str = "http://localhost:8086"
INFLUXDB_TOKEN: str = "token" INFLUXDB_TOKEN: str = "token"
@@ -34,4 +43,5 @@ class Settings(BaseSettings):
env_file = ".env" env_file = ".env"
extra = "ignore" extra = "ignore"
settings = Settings() settings = Settings()

View File

@@ -9,6 +9,7 @@ import json
from typing import Callable from typing import Callable
from fastapi import Request, Response from fastapi import Request, Response
from starlette.middleware.base import BaseHTTPMiddleware from starlette.middleware.base import BaseHTTPMiddleware
from app.infra.db.postgresql.database import db as default_db
from app.core.audit import log_audit_event, AuditAction from app.core.audit import log_audit_event, AuditAction
import logging import logging
@@ -135,6 +136,7 @@ class AuditMiddleware(BaseHTTPMiddleware):
if response.status_code < 400 if response.status_code < 400
else f"HTTP {response.status_code}" else f"HTTP {response.status_code}"
), ),
db=default_db,
) )
except Exception as e: except Exception as e:
# 审计失败不应影响响应 # 审计失败不应影响响应

View File

@@ -4,8 +4,8 @@ from datetime import datetime, timedelta
from psycopg import AsyncConnection from psycopg import AsyncConnection
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from app.algorithms.api_ex.Fdataclean import clean_flow_data_df_kf from app.algorithms.api_ex.flow_data_clean import clean_flow_data_df_kf
from app.algorithms.api_ex.Pdataclean import clean_pressure_data_df_km from app.algorithms.api_ex.pressure_data_clean import clean_pressure_data_df_km
from app.algorithms.api_ex.pipeline_health_analyzer import PipelineHealthAnalyzer from app.algorithms.api_ex.pipeline_health_analyzer import PipelineHealthAnalyzer
from app.infra.db.postgresql.internal_queries import InternalQueries from app.infra.db.postgresql.internal_queries import InternalQueries

View File

@@ -19,8 +19,8 @@ from sqlalchemy import create_engine
import ast import ast
import app.services.project_info as project_info import app.services.project_info as project_info
import app.algorithms.api_ex.kmeans_sensor as kmeans_sensor import app.algorithms.api_ex.kmeans_sensor as kmeans_sensor
import app.algorithms.api_ex.Fdataclean as Fdataclean import app.algorithms.api_ex.flow_data_clean as flow_data_clean
import app.algorithms.api_ex.Pdataclean as Pdataclean import app.algorithms.api_ex.pressure_data_clean as pressure_data_clean
import app.algorithms.api_ex.sensitivity as sensitivity import app.algorithms.api_ex.sensitivity as sensitivity
from app.native.api.postgresql_info import get_pgconn_string from app.native.api.postgresql_info import get_pgconn_string
@@ -1475,7 +1475,7 @@ def flow_data_clean(input_csv_file: str) -> str:
if not os.path.exists(input_csv_path): if not os.path.exists(input_csv_path):
raise FileNotFoundError(f"指定的文件不存在: {input_csv_path}") raise FileNotFoundError(f"指定的文件不存在: {input_csv_path}")
# 调用 Fdataclean.clean_flow_data_kf 函数进行数据清洗 # 调用 Fdataclean.clean_flow_data_kf 函数进行数据清洗
out_xlsx_path = Fdataclean.clean_flow_data_kf(input_csv_path) out_xlsx_path = flow_data_clean.clean_flow_data_kf(input_csv_path)
print("清洗后的数据已保存到:", out_xlsx_path) print("清洗后的数据已保存到:", out_xlsx_path)
@@ -1502,7 +1502,7 @@ def pressure_data_clean(input_csv_file: str) -> str:
if not os.path.exists(input_csv_path): if not os.path.exists(input_csv_path):
raise FileNotFoundError(f"指定的文件不存在: {input_csv_path}") raise FileNotFoundError(f"指定的文件不存在: {input_csv_path}")
# 调用 Fdataclean.clean_flow_data_kf 函数进行数据清洗 # 调用 Fdataclean.clean_flow_data_kf 函数进行数据清洗
out_xlsx_path = Pdataclean.clean_pressure_data_km(input_csv_path) out_xlsx_path = pressure_data_clean.clean_pressure_data_km(input_csv_path)
print("清洗后的数据已保存到:", out_xlsx_path) print("清洗后的数据已保存到:", out_xlsx_path)