from __future__ import annotations from datetime import datetime from typing import Any import pandas as pd from app.algorithms.burst_detection.burst_detector import BurstDetector from app.infra.db.timescaledb.internal_queries import InternalQueries from app.services.scheme_management import ( query_burst_detection_scheme_detail, query_burst_detection_schemes, scheme_name_exists, store_scheme_info, ) from app.services.tjnetwork import get_all_scada_info from app.services.time_api import extract_date, parse_utc_time, utc_now def run_burst_detection( *, network: str, username: str, observed_pressure_data: ( pd.DataFrame | dict[str, list[Any]] | list[dict[str, Any]] | list[list[Any]] | None ) = None, points_per_day: int = 1440, mu: int = 100, iforest_params: dict[str, Any] | None = None, scada_start: datetime | str | None = None, scada_end: datetime | str | None = None, sensor_nodes: list[str] | None = None, scheme_name: str | None = None, data_source: str = "monitoring", simulation_scheme_name: str | None = None, simulation_scheme_type: str | None = None, ) -> dict[str, Any]: """ 运行爆管侦测服务入口。 调用方式二选一: - 直接传 `observed_pressure_data` - 或传 `scada_start/scada_end` 让后端自动查询 SCADA 压力数据 `observed_pressure_data` 支持格式: - `pd.DataFrame` 行表示时间点,列表示传感器;列名应为传感器/节点 ID。 - `dict[str, list[Any]]` 键为传感器/节点 ID,值为按时间顺序排列的压力序列。 例如:`{"J1": [101.2, 101.0], "J2": [99.8, 99.7]}`。 - `list[dict[str, Any]]` 每个元素代表一个时间点的多传感器观测。 例如:`[{"J1": 101.2, "J2": 99.8}, {"J1": 101.0, "J2": 99.7}]`。 - `list[list[Any]]` 二维数组式 JSON,格式为 `(时间点数, 传感器数)`。 这是最接近原始 `burst_detector` 示例代码的调用方式。 数据约束: - 统一要求“行=时间点,列=传感器”。 - 总样本点数必须能被 `points_per_day` 整除。 - 至少要有 2 天数据,即 `sample_count >= 2 * points_per_day`。 - 若传入 `sensor_nodes`,输入数据必须包含这些列;SCADA 模式下也会只按这些节点取数。 """ if not network: raise ValueError("network is required.") selected_sensor_nodes = ( list(dict.fromkeys([node for node in (sensor_nodes or []) if node])) if sensor_nodes else None ) use_scada_source = scada_start is not None or scada_end is not None if use_scada_source: scada_sensor_nodes = ( selected_sensor_nodes if selected_sensor_nodes is not None else _get_pressure_sensor_nodes(network) ) if data_source == "simulation": if not simulation_scheme_name: raise ValueError("模拟方案模式必须提供 simulation_scheme_name。") observed_df = _build_observed_pressure_from_simulation( network=network, sensor_nodes=scada_sensor_nodes, scada_start=scada_start, scada_end=scada_end, simulation_scheme_name=simulation_scheme_name, simulation_scheme_type=simulation_scheme_type, ) observed_input = observed_df observed_source = "simulation_scheme_timerange" else: observed_df = _build_observed_pressure_from_scada( network=network, sensor_nodes=scada_sensor_nodes, scada_start=scada_start, scada_end=scada_end, ) observed_input = observed_df observed_source = "backend_timerange" else: if observed_pressure_data is None: raise ValueError( "未提供 observed_pressure_data,且未提供 scada_start/scada_end。" ) observed_input = observed_pressure_data observed_source = "request_payload" detector = BurstDetector( mu=mu, points_per_day=points_per_day, iforest_params=iforest_params, ) result_df = detector.run_detection( observed_input, sensor_nodes=selected_sensor_nodes, ) resolved_sensor_nodes = list(result_df.attrs.get("sensor_nodes", [])) rows = _serialize_result_rows(result_df) payload: dict[str, Any] = { "network": network, "sensor_nodes": resolved_sensor_nodes, "observed_source": observed_source, "sample_count": int(result_df.attrs.get("sample_count", 0)), "points_per_day": int(result_df.attrs.get("points_per_day", points_per_day)), "day_count": int(result_df.attrs.get("day_count", len(result_df))), "rows": rows, "summary": _build_detection_summary(result_df), } if data_source == "simulation": payload["data_source"] = "simulation" payload["simulation_scheme"] = { "name": simulation_scheme_name, "type": simulation_scheme_type, } else: payload["data_source"] = "monitoring" if use_scada_source: payload["scada_window"] = { "start": _to_datetime(scada_start).isoformat(), "end": _to_datetime(scada_end).isoformat(), } if scheme_name: _store_burst_detection_scheme( network=network, scheme_name=scheme_name, username=username, payload=payload, mu=mu, points_per_day=points_per_day, iforest_params=detector.iforest_params, ) payload["scheme_name"] = scheme_name return payload def _build_observed_pressure_from_simulation( *, network: str, sensor_nodes: list[str], scada_start: datetime | str | None, scada_end: datetime | str | None, simulation_scheme_name: str | None, simulation_scheme_type: str | None = None, ) -> pd.DataFrame: if scada_start is None or scada_end is None: raise ValueError("使用模拟方案查询时必须同时提供 scada_start 与 scada_end。") start_dt = _to_datetime(scada_start) end_dt = _to_datetime(scada_end) if start_dt >= end_dt: raise ValueError("SCADA 时间窗非法:scada_start 必须早于 scada_end。") # Reuse burst_location logic partially here or call internal queries directly # For burst detection, we need time series for all sensor nodes. # Check for missing nodes in simulation result if needed, but InternalQueries handles some of it. # We assume sensor_nodes are valid pressure nodes. scheme_type = simulation_scheme_type or "burst_analysis" simulation_data = InternalQueries.query_scheme_simulation_by_ids_timerange( db_name=network, scheme_type=scheme_type, scheme_name=simulation_scheme_name, element_ids=sensor_nodes, start_time=start_dt.isoformat(), end_time=end_dt.isoformat(), element_type="node", field="pressure", ) # simulation_data is {sensor_id: [{time, value}, ...]} # Convert to DataFrame: index=time, columns=sensor_ids data_dict = {} timestamps = set() for sensor_id in sensor_nodes: records = simulation_data.get(sensor_id, []) if not records: continue # Convert records to Series with time index ts_values = [] ts_index = [] for r in records: if r.get("value") is not None: ts_values.append(float(r["value"])) ts_index.append(pd.to_datetime(r["time"])) if ts_values: s = pd.Series(ts_values, index=ts_index) data_dict[sensor_id] = s timestamps.update(ts_index) if not data_dict: raise ValueError("指定时间窗内未查询到模拟压力数据。") observation_df = pd.DataFrame(data_dict) # Handle missing timestamps if any (though simulation usually has uniform steps) # Forward fill or interpolate might be needed if steps differ, but typically they align. observation_df = observation_df.sort_index() # Fill NaN if any missing points for some sensors observation_df = observation_df.fillna(method="ffill").fillna(method="bfill") if observation_df.empty: raise ValueError("模拟压力数据无法构建观测矩阵。") return observation_df def list_burst_detection_schemes( network: str, query_date: datetime | str | None = None, ) -> list[dict[str, Any]]: parsed_date = extract_date(query_date, field_name="query_date") if query_date is not None else None return query_burst_detection_schemes( name=network, network=network, query_date=parsed_date, ) def get_burst_detection_scheme_detail(network: str, scheme_name: str) -> dict[str, Any]: result = query_burst_detection_scheme_detail(network, scheme_name) if not result: raise ValueError(f"未找到爆管侦测方案: {scheme_name}") return result def _store_burst_detection_scheme( *, network: str, scheme_name: str, username: str, payload: dict[str, Any], mu: int, points_per_day: int, iforest_params: dict[str, Any], ) -> None: if scheme_name_exists(network, scheme_name): raise ValueError(f"方案名称已存在: {scheme_name}") now_iso = utc_now().isoformat() scheme_detail = { "network": network, "sensor_nodes": payload.get("sensor_nodes", []), "observed_source": payload.get("observed_source"), "scada_window": payload.get("scada_window"), "algorithm_params": { "mu": mu, "points_per_day": points_per_day, "iforest_params": iforest_params, }, "result_summary": payload.get("summary", {}), "result_payload": payload, } store_scheme_info( name=network, scheme_name=scheme_name, scheme_type="burst_detection", username=username, scheme_start_time=now_iso, scheme_detail=scheme_detail, ) def _serialize_result_rows(result_df: pd.DataFrame) -> list[dict[str, Any]]: rows: list[dict[str, Any]] = [] for row in result_df.to_dict(orient="records"): rows.append( { "Day": int(row["Day"]), "Score": float(row["Score"]), "Prediction": int(row["Prediction"]), "IsBurst": bool(row["IsBurst"]), } ) return rows def _build_detection_summary(result_df: pd.DataFrame) -> dict[str, Any]: rows = _serialize_result_rows(result_df) if not rows: raise ValueError("爆管侦测结果为空。") score_series = result_df["Score"] most_anomalous_index = int(score_series.idxmin()) latest_row = rows[-1] anomaly_days = [row["Day"] for row in rows if row["IsBurst"]] return { "burst_detected": bool(latest_row["IsBurst"]), "latest_day": latest_row, "most_anomalous_day": int(result_df.iloc[most_anomalous_index]["Day"]), "anomaly_days": anomaly_days, "anomaly_day_count": len(anomaly_days), "latest_sensor_rankings": _build_latest_sensor_rankings(result_df), } def _build_latest_sensor_rankings(result_df: pd.DataFrame) -> list[dict[str, Any]]: feature_matrix = result_df.attrs.get("high_freq_features") sensor_nodes = list(result_df.attrs.get("sensor_nodes", [])) if feature_matrix is None or len(sensor_nodes) == 0: return [] latest_values = feature_matrix[-1] ranking = sorted( zip(sensor_nodes, latest_values, strict=False), key=lambda item: item[1], ) return [ { "sensor_node": sensor_id, "latest_high_frequency_value": float(value), } for sensor_id, value in ranking[: min(10, len(ranking))] ] def _get_pressure_sensor_nodes(network: str) -> list[str]: sensor_nodes: list[str] = [] for item in get_all_scada_info(network): if str(item.get("type", "")).lower() != "pressure": continue node_id = item.get("associated_element_id") if isinstance(node_id, str) and node_id: sensor_nodes.append(node_id) sensor_nodes = list(dict.fromkeys(sensor_nodes)) if not sensor_nodes: raise ValueError("未找到压力传感器对应节点(scada_info.type=pressure)。") return sensor_nodes def _build_observed_pressure_from_scada( *, network: str, sensor_nodes: list[str], scada_start: datetime | str | None, scada_end: datetime | str | None, ) -> pd.DataFrame: if scada_start is None or scada_end is None: raise ValueError("使用后端 SCADA 查询时必须同时提供 scada_start 与 scada_end。") start_dt = _to_datetime(scada_start) end_dt = _to_datetime(scada_end) if start_dt >= end_dt: raise ValueError("SCADA 时间窗非法:scada_start 必须早于 scada_end。") node_query_id: dict[str, str] = {} for item in get_all_scada_info(network): if str(item.get("type", "")).lower() != "pressure": continue node_id = item.get("associated_element_id") query_id = item.get("api_query_id") if ( isinstance(node_id, str) and node_id and isinstance(query_id, str) and query_id ): node_query_id[node_id] = query_id missing_nodes = [node_id for node_id in sensor_nodes if node_id not in node_query_id] if missing_nodes: preview = ", ".join(missing_nodes[:10]) raise ValueError(f"未找到可用于压力观测的 SCADA api_query_id: {preview}") query_ids = [node_query_id[node_id] for node_id in sensor_nodes] scada_data = InternalQueries.query_scada_by_ids_timerange( db_name=network, device_ids=query_ids, start_time=start_dt.isoformat(), end_time=end_dt.isoformat(), ) available_lengths = [ len(scada_data.get(query_id, [])) for query_id in query_ids if len(scada_data.get(query_id, [])) > 0 ] if not available_lengths: raise ValueError("指定时间窗内未查询到压力 SCADA 数据。") min_len = min(available_lengths) observation_df = pd.DataFrame() for node_id in sensor_nodes: query_id = node_query_id[node_id] records = scada_data.get(query_id, [])[:min_len] if len(records) < min_len: continue observation_df[node_id] = [float(item["value"]) for item in records] if observation_df.empty: raise ValueError("SCADA 压力数据无法构建观测矩阵。") return observation_df def _to_datetime(value: datetime | str) -> datetime: return parse_utc_time(value)