TJWaterServerBinary/app/services/burst_detection.py

from __future__ import annotations

from datetime import datetime
from typing import Any

import pandas as pd

from app.algorithms.burst_detection.burst_detector import BurstDetector
from app.infra.db.timescaledb.internal_queries import InternalQueries
from app.services.scheme_management import (
    query_burst_detection_scheme_detail,
    query_burst_detection_schemes,
    scheme_name_exists,
    store_scheme_info,
)
from app.services.tjnetwork import get_all_scada_info
from app.services.time_api import extract_date, parse_utc_time, utc_now


def run_burst_detection(
    *,
    network: str,
    username: str,
    observed_pressure_data: (
        pd.DataFrame
        | dict[str, list[Any]]
        | list[dict[str, Any]]
        | list[list[Any]]
        | None
    ) = None,
    points_per_day: int = 1440,
    mu: int = 100,
    iforest_params: dict[str, Any] | None = None,
    scada_start: datetime | str | None = None,
    scada_end: datetime | str | None = None,
    sensor_nodes: list[str] | None = None,
    scheme_name: str | None = None,
    data_source: str = "monitoring",
    simulation_scheme_name: str | None = None,
    simulation_scheme_type: str | None = None,
) -> dict[str, Any]:
    """
    运行爆管侦测服务入口。

    调用方式二选一：
    - 直接传 `observed_pressure_data`
    - 或传 `scada_start/scada_end` 让后端自动查询 SCADA 压力数据

    `observed_pressure_data` 支持格式：
    - `pd.DataFrame`
      行表示时间点，列表示传感器；列名应为传感器/节点 ID。
    - `dict[str, list[Any]]`
      键为传感器/节点 ID，值为按时间顺序排列的压力序列。
      例如：`{"J1": [101.2, 101.0], "J2": [99.8, 99.7]}`。
    - `list[dict[str, Any]]`
      每个元素代表一个时间点的多传感器观测。
      例如：`[{"J1": 101.2, "J2": 99.8}, {"J1": 101.0, "J2": 99.7}]`。
    - `list[list[Any]]`
      二维数组式 JSON，格式为 `(时间点数, 传感器数)`。
      这是最接近原始 `burst_detector` 示例代码的调用方式。

    数据约束：
    - 统一要求“行=时间点，列=传感器”。
    - 总样本点数必须能被 `points_per_day` 整除。
    - 至少要有 2 天数据，即 `sample_count >= 2 * points_per_day`。
    - 若传入 `sensor_nodes`，输入数据必须包含这些列；SCADA 模式下也会只按这些节点取数。
    """
    if not network:
        raise ValueError("network is required.")

    selected_sensor_nodes = (
        list(dict.fromkeys([node for node in (sensor_nodes or []) if node]))
        if sensor_nodes
        else None
    )
    use_scada_source = scada_start is not None or scada_end is not None

    if use_scada_source:
        scada_sensor_nodes = (
            selected_sensor_nodes
            if selected_sensor_nodes is not None
            else _get_pressure_sensor_nodes(network)
        )
        if data_source == "simulation":
            if not simulation_scheme_name:
                raise ValueError("模拟方案模式必须提供 simulation_scheme_name。")
            observed_df = _build_observed_pressure_from_simulation(
                network=network,
                sensor_nodes=scada_sensor_nodes,
                scada_start=scada_start,
                scada_end=scada_end,
                simulation_scheme_name=simulation_scheme_name,
                simulation_scheme_type=simulation_scheme_type,
            )
            observed_input = observed_df
            observed_source = "simulation_scheme_timerange"
        else:
            observed_df = _build_observed_pressure_from_scada(
                network=network,
                sensor_nodes=scada_sensor_nodes,
                scada_start=scada_start,
                scada_end=scada_end,
            )
            observed_input = observed_df
            observed_source = "backend_timerange"
    else:
        if observed_pressure_data is None:
            raise ValueError(
                "未提供 observed_pressure_data，且未提供 scada_start/scada_end。"
            )
        observed_input = observed_pressure_data
        observed_source = "request_payload"

    detector = BurstDetector(
        mu=mu,
        points_per_day=points_per_day,
        iforest_params=iforest_params,
    )
    result_df = detector.run_detection(
        observed_input,
        sensor_nodes=selected_sensor_nodes,
    )
    resolved_sensor_nodes = list(result_df.attrs.get("sensor_nodes", []))
    rows = _serialize_result_rows(result_df)
    payload: dict[str, Any] = {
        "network": network,
        "sensor_nodes": resolved_sensor_nodes,
        "observed_source": observed_source,
        "sample_count": int(result_df.attrs.get("sample_count", 0)),
        "points_per_day": int(result_df.attrs.get("points_per_day", points_per_day)),
        "day_count": int(result_df.attrs.get("day_count", len(result_df))),
        "rows": rows,
        "summary": _build_detection_summary(result_df),
    }
    if data_source == "simulation":
        payload["data_source"] = "simulation"
        payload["simulation_scheme"] = {
            "name": simulation_scheme_name,
            "type": simulation_scheme_type,
        }
    else:
        payload["data_source"] = "monitoring"

    if use_scada_source:
        payload["scada_window"] = {
            "start": _to_datetime(scada_start).isoformat(),
            "end": _to_datetime(scada_end).isoformat(),
        }
    if scheme_name:
        _store_burst_detection_scheme(
            network=network,
            scheme_name=scheme_name,
            username=username,
            payload=payload,
            mu=mu,
            points_per_day=points_per_day,
            iforest_params=detector.iforest_params,
        )
        payload["scheme_name"] = scheme_name
    return payload


def _build_observed_pressure_from_simulation(
    *,
    network: str,
    sensor_nodes: list[str],
    scada_start: datetime | str | None,
    scada_end: datetime | str | None,
    simulation_scheme_name: str | None,
    simulation_scheme_type: str | None = None,
) -> pd.DataFrame:
    if scada_start is None or scada_end is None:
        raise ValueError("使用模拟方案查询时必须同时提供 scada_start 与 scada_end。")

    start_dt = _to_datetime(scada_start)
    end_dt = _to_datetime(scada_end)
    if start_dt >= end_dt:
        raise ValueError("SCADA 时间窗非法：scada_start 必须早于 scada_end。")

    # Reuse burst_location logic partially here or call internal queries directly
    # For burst detection, we need time series for all sensor nodes.

    # Check for missing nodes in simulation result if needed, but InternalQueries handles some of it.
    # We assume sensor_nodes are valid pressure nodes.

    scheme_type = simulation_scheme_type or "burst_analysis"

    simulation_data = InternalQueries.query_scheme_simulation_by_ids_timerange(
        db_name=network,
        scheme_type=scheme_type,
        scheme_name=simulation_scheme_name,
        element_ids=sensor_nodes,
        start_time=start_dt.isoformat(),
        end_time=end_dt.isoformat(),
        element_type="node",
        field="pressure",
    )

    # simulation_data is {sensor_id: [{time, value}, ...]}
    # Convert to DataFrame: index=time, columns=sensor_ids

    data_dict = {}
    timestamps = set()

    for sensor_id in sensor_nodes:
        records = simulation_data.get(sensor_id, [])
        if not records:
             continue

        # Convert records to Series with time index
        ts_values = []
        ts_index = []
        for r in records:
            if r.get("value") is not None:
                ts_values.append(float(r["value"]))
                ts_index.append(pd.to_datetime(r["time"]))

        if ts_values:
            s = pd.Series(ts_values, index=ts_index)
            data_dict[sensor_id] = s
            timestamps.update(ts_index)

    if not data_dict:
        raise ValueError("指定时间窗内未查询到模拟压力数据。")

    observation_df = pd.DataFrame(data_dict)

    # Handle missing timestamps if any (though simulation usually has uniform steps)
    # Forward fill or interpolate might be needed if steps differ, but typically they align.
    observation_df = observation_df.sort_index()

    # Fill NaN if any missing points for some sensors
    observation_df = observation_df.fillna(method="ffill").fillna(method="bfill")

    if observation_df.empty:
        raise ValueError("模拟压力数据无法构建观测矩阵。")

    return observation_df


def list_burst_detection_schemes(
    network: str,
    query_date: datetime | str | None = None,
) -> list[dict[str, Any]]:
    parsed_date = extract_date(query_date, field_name="query_date") if query_date is not None else None
    return query_burst_detection_schemes(
        name=network,
        network=network,
        query_date=parsed_date,
    )


def get_burst_detection_scheme_detail(network: str, scheme_name: str) -> dict[str, Any]:
    result = query_burst_detection_scheme_detail(network, scheme_name)
    if not result:
        raise ValueError(f"未找到爆管侦测方案: {scheme_name}")
    return result


def _store_burst_detection_scheme(
    *,
    network: str,
    scheme_name: str,
    username: str,
    payload: dict[str, Any],
    mu: int,
    points_per_day: int,
    iforest_params: dict[str, Any],
) -> None:
    if scheme_name_exists(network, scheme_name):
        raise ValueError(f"方案名称已存在: {scheme_name}")

    now_iso = utc_now().isoformat()
    scheme_detail = {
        "network": network,
        "sensor_nodes": payload.get("sensor_nodes", []),
        "observed_source": payload.get("observed_source"),
        "scada_window": payload.get("scada_window"),
        "algorithm_params": {
            "mu": mu,
            "points_per_day": points_per_day,
            "iforest_params": iforest_params,
        },
        "result_summary": payload.get("summary", {}),
        "result_payload": payload,
    }
    store_scheme_info(
        name=network,
        scheme_name=scheme_name,
        scheme_type="burst_detection",
        username=username,
        scheme_start_time=now_iso,
        scheme_detail=scheme_detail,
    )


def _serialize_result_rows(result_df: pd.DataFrame) -> list[dict[str, Any]]:
    rows: list[dict[str, Any]] = []
    for row in result_df.to_dict(orient="records"):
        rows.append(
            {
                "Day": int(row["Day"]),
                "Score": float(row["Score"]),
                "Prediction": int(row["Prediction"]),
                "IsBurst": bool(row["IsBurst"]),
            }
        )
    return rows


def _build_detection_summary(result_df: pd.DataFrame) -> dict[str, Any]:
    rows = _serialize_result_rows(result_df)
    if not rows:
        raise ValueError("爆管侦测结果为空。")

    score_series = result_df["Score"]
    most_anomalous_index = int(score_series.idxmin())
    latest_row = rows[-1]
    anomaly_days = [row["Day"] for row in rows if row["IsBurst"]]

    return {
        "burst_detected": bool(latest_row["IsBurst"]),
        "latest_day": latest_row,
        "most_anomalous_day": int(result_df.iloc[most_anomalous_index]["Day"]),
        "anomaly_days": anomaly_days,
        "anomaly_day_count": len(anomaly_days),
        "latest_sensor_rankings": _build_latest_sensor_rankings(result_df),
    }


def _build_latest_sensor_rankings(result_df: pd.DataFrame) -> list[dict[str, Any]]:
    feature_matrix = result_df.attrs.get("high_freq_features")
    sensor_nodes = list(result_df.attrs.get("sensor_nodes", []))
    if feature_matrix is None or len(sensor_nodes) == 0:
        return []

    latest_values = feature_matrix[-1]
    ranking = sorted(
        zip(sensor_nodes, latest_values, strict=False),
        key=lambda item: item[1],
    )
    return [
        {
            "sensor_node": sensor_id,
            "latest_high_frequency_value": float(value),
        }
        for sensor_id, value in ranking[: min(10, len(ranking))]
    ]


def _get_pressure_sensor_nodes(network: str) -> list[str]:
    sensor_nodes: list[str] = []
    for item in get_all_scada_info(network):
        if str(item.get("type", "")).lower() != "pressure":
            continue
        node_id = item.get("associated_element_id")
        if isinstance(node_id, str) and node_id:
            sensor_nodes.append(node_id)
    sensor_nodes = list(dict.fromkeys(sensor_nodes))
    if not sensor_nodes:
        raise ValueError("未找到压力传感器对应节点（scada_info.type=pressure）。")
    return sensor_nodes


def _build_observed_pressure_from_scada(
    *,
    network: str,
    sensor_nodes: list[str],
    scada_start: datetime | str | None,
    scada_end: datetime | str | None,
) -> pd.DataFrame:
    if scada_start is None or scada_end is None:
        raise ValueError("使用后端 SCADA 查询时必须同时提供 scada_start 与 scada_end。")

    start_dt = _to_datetime(scada_start)
    end_dt = _to_datetime(scada_end)
    if start_dt >= end_dt:
        raise ValueError("SCADA 时间窗非法：scada_start 必须早于 scada_end。")

    node_query_id: dict[str, str] = {}
    for item in get_all_scada_info(network):
        if str(item.get("type", "")).lower() != "pressure":
            continue
        node_id = item.get("associated_element_id")
        query_id = item.get("api_query_id")
        if (
            isinstance(node_id, str)
            and node_id
            and isinstance(query_id, str)
            and query_id
        ):
            node_query_id[node_id] = query_id

    missing_nodes = [node_id for node_id in sensor_nodes if node_id not in node_query_id]
    if missing_nodes:
        preview = ", ".join(missing_nodes[:10])
        raise ValueError(f"未找到可用于压力观测的 SCADA api_query_id: {preview}")

    query_ids = [node_query_id[node_id] for node_id in sensor_nodes]
    scada_data = InternalQueries.query_scada_by_ids_timerange(
        db_name=network,
        device_ids=query_ids,
        start_time=start_dt.isoformat(),
        end_time=end_dt.isoformat(),
    )

    available_lengths = [
        len(scada_data.get(query_id, []))
        for query_id in query_ids
        if len(scada_data.get(query_id, [])) > 0
    ]
    if not available_lengths:
        raise ValueError("指定时间窗内未查询到压力 SCADA 数据。")

    min_len = min(available_lengths)
    observation_df = pd.DataFrame()
    for node_id in sensor_nodes:
        query_id = node_query_id[node_id]
        records = scada_data.get(query_id, [])[:min_len]
        if len(records) < min_len:
            continue
        observation_df[node_id] = [float(item["value"]) for item in records]

    if observation_df.empty:
        raise ValueError("SCADA 压力数据无法构建观测矩阵。")
    return observation_df


def _to_datetime(value: datetime | str) -> datetime:
    return parse_utc_time(value)