# sub2api/tools/perf/openai_oauth_gray_guard.py

#!/usr/bin/env python3
"""OpenAI OAuth 灰度阈值守护脚本。

用途：
- 拉取 Ops 指标阈值配置与 Dashboard Overview 实时数据
- 对比 P99 TTFT / 错误率 / SLA
- 作为 6.2 灰度守护的自动化门禁（退出码可直接用于 CI/CD）

退出码：
- 0: 指标通过
- 1: 请求失败/参数错误
- 2: 指标超阈值（建议停止扩量并回滚）
"""

from __future__ import annotations

import argparse
import json
import math
import sys
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class GuardThresholds:
    """Limits that the live snapshot is compared against by ``evaluate``.

    Populated by ``parse_thresholds`` from response keys of the same names.
    Every field may be ``None``; ``evaluate`` skips a check whose limit is
    ``None``.
    """

    # Lower bound: a violation is reported when the snapshot SLA is below it.
    sla_percent_min: Optional[float]
    # Upper bound for the snapshot's P99 TTFT (milliseconds, per the name).
    ttft_p99_ms_max: Optional[float]
    # Upper bound for the snapshot's request error rate.
    request_error_rate_percent_max: Optional[float]
    # Upper bound for the snapshot's upstream error rate.
    upstream_error_rate_percent_max: Optional[float]


@dataclass
class GuardSnapshot:
    """Current metric values extracted from the dashboard overview response.

    Populated by ``parse_snapshot``. Every field may be ``None``; ``evaluate``
    skips a check whose observed value is ``None``.

    NOTE(review): the values are compared directly against thresholds whose
    names say "percent" -- confirm the overview endpoint reports ``sla`` and
    the error rates as percentages rather than 0-1 ratios.
    """

    # Read from the "sla" key of the overview data.
    sla: Optional[float]
    # Read from the nested "ttft" -> "p99_ms" key of the overview data.
    ttft_p99_ms: Optional[float]
    # Read from the "error_rate" key of the overview data.
    request_error_rate_percent: Optional[float]
    # Read from the "upstream_error_rate" key of the overview data.
    upstream_error_rate_percent: Optional[float]


def build_headers(token: str) -> Dict[str, str]:
    """Build the HTTP headers used for the admin API requests.

    Args:
        token: Bearer token. Surrounding whitespace is ignored; an empty or
            whitespace-only value means no ``Authorization`` header is added.

    Returns:
        A new dict that always carries ``Accept: application/json`` and, when
        a token was supplied, ``Authorization: Bearer <token>``.
    """
    bearer = token.strip()
    if not bearer:
        return {"Accept": "application/json"}
    return {"Accept": "application/json", "Authorization": f"Bearer {bearer}"}


def request_json(url: str, headers: Dict[str, str]) -> Dict[str, Any]:
    """Issue a GET request and return the decoded JSON body.

    Every failure mode is reported as ``RuntimeError`` so callers get one
    consistent, readable error instead of a mix of low-level exceptions.

    Args:
        url: Absolute URL to fetch.
        headers: Request headers to send.

    Returns:
        The parsed JSON document. The value is whatever the server sent;
        callers are expected to validate its shape.

    Raises:
        RuntimeError: On an HTTP error status (message includes the status
            code and response body), on a connection/transport failure or
            timeout, or when the body is not valid UTF-8 encoded JSON.
    """
    req = urllib.request.Request(url=url, method="GET", headers=headers)
    try:
        # 15 second timeout applies to connecting and to each blocking read.
        with urllib.request.urlopen(req, timeout=15) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"HTTP {e.code}: {body}") from e
    except urllib.error.URLError as e:
        raise RuntimeError(f"request failed: {e}") from e
    except OSError as e:
        # Timeouts and resets while reading the body are plain OSError
        # subclasses, not URLError, so they need their own handler.
        raise RuntimeError(f"request failed: {e}") from e

    try:
        return json.loads(raw.decode("utf-8"))
    except ValueError as e:
        # Covers both UnicodeDecodeError and json.JSONDecodeError, e.g. an
        # HTML error page returned by a proxy with a 200 status.
        raise RuntimeError(f"invalid JSON response: {e}") from e


def parse_envelope_data(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Unwrap the ``data`` object from an API response envelope.

    The envelope is expected to be a dict with ``code`` equal to 0 and a
    dict-valued ``data`` entry.

    Args:
        payload: Decoded JSON response.

    Returns:
        The ``data`` dict from the envelope.

    Raises:
        RuntimeError: If ``payload`` is not a dict, if ``code`` is not 0
            (message includes the envelope's ``code`` and ``message``), or
            if ``data`` is not a dict.
    """
    if isinstance(payload, dict):
        status = payload.get("code")
        if status != 0:
            detail = payload.get("message")
            raise RuntimeError(f"api error: code={status} message={detail}")

        body = payload.get("data")
        if isinstance(body, dict):
            return body
        raise RuntimeError("invalid response data")

    raise RuntimeError("invalid response payload")


def parse_thresholds(data: Dict[str, Any]) -> GuardThresholds:
    """Build ``GuardThresholds`` from the metric-thresholds response data.

    Each field is read from the key of the same name and converted with
    ``to_float_or_none``, so absent or non-numeric entries become ``None``.

    Args:
        data: The ``data`` dict of the thresholds response.

    Returns:
        The populated ``GuardThresholds``.
    """
    field_names = (
        "sla_percent_min",
        "ttft_p99_ms_max",
        "request_error_rate_percent_max",
        "upstream_error_rate_percent_max",
    )
    limits = {name: to_float_or_none(data.get(name)) for name in field_names}
    return GuardThresholds(**limits)


def parse_snapshot(data: Dict[str, Any]) -> GuardSnapshot:
    """Build ``GuardSnapshot`` from the dashboard overview response data.

    Reads ``sla``, ``error_rate`` and ``upstream_error_rate`` from the top
    level and ``p99_ms`` from the nested ``ttft`` object. Values are converted
    with ``to_float_or_none``, so absent or non-numeric entries become
    ``None``. A ``ttft`` entry that is not a dict is treated as empty.

    NOTE(review): these values are later compared against thresholds named
    ``*_percent_*`` -- confirm the endpoint reports percentages, not ratios.

    Args:
        data: The ``data`` dict of the overview response.

    Returns:
        The populated ``GuardSnapshot``.
    """
    ttft_section = data.get("ttft")
    if not isinstance(ttft_section, dict):
        ttft_section = {}

    sla_value = to_float_or_none(data.get("sla"))
    ttft_p99 = to_float_or_none(ttft_section.get("p99_ms"))
    request_errors = to_float_or_none(data.get("error_rate"))
    upstream_errors = to_float_or_none(data.get("upstream_error_rate"))

    return GuardSnapshot(
        sla=sla_value,
        ttft_p99_ms=ttft_p99,
        request_error_rate_percent=request_errors,
        upstream_error_rate_percent=upstream_errors,
    )


def to_float_or_none(v: Any) -> Optional[float]:
    """Convert ``v`` to a finite float, or return ``None`` if that fails.

    Args:
        v: Any value, typically a decoded JSON scalar.

    Returns:
        ``float(v)`` when the conversion succeeds and the result is finite;
        ``None`` when ``v`` is ``None``, cannot be converted, is too large to
        represent as a float, or converts to NaN / infinity.
    """
    if v is None:
        return None
    try:
        number = float(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: ints too large for a float, e.g. 10 ** 400.
        return None
    # NaN compares False against every threshold, which would make a check
    # pass silently; treat non-finite values as "no data" explicitly.
    return number if math.isfinite(number) else None


def evaluate(snapshot: GuardSnapshot, thresholds: GuardThresholds) -> List[str]:
    """Compare a snapshot against the thresholds and describe every breach.

    SLA is breached when it is strictly below its limit; TTFT P99 and both
    error rates are breached when strictly above theirs. A check is skipped
    (counts as passing) when either the observed value or the limit is
    ``None``.

    Args:
        snapshot: Observed metric values.
        thresholds: Configured limits.

    Returns:
        One message per breached threshold, in the order SLA, TTFT P99,
        request error rate, upstream error rate. Empty when nothing is
        breached.
    """
    # Each row: (label, observed value, limit, limit is a floor?, unit suffix)
    checks = (
        ("SLA 低于阈值", snapshot.sla, thresholds.sla_percent_min, True, "%"),
        ("TTFT P99 超阈值", snapshot.ttft_p99_ms, thresholds.ttft_p99_ms_max, False, "ms"),
        (
            "请求错误率超阈值",
            snapshot.request_error_rate_percent,
            thresholds.request_error_rate_percent_max,
            False,
            "%",
        ),
        (
            "上游错误率超阈值",
            snapshot.upstream_error_rate_percent,
            thresholds.upstream_error_rate_percent_max,
            False,
            "%",
        ),
    )

    breaches: List[str] = []
    for label, actual, limit, is_floor, unit in checks:
        if actual is None or limit is None:
            continue
        breached = actual < limit if is_floor else actual > limit
        if breached:
            breaches.append(
                f"{label}: actual={actual:.2f}{unit} threshold={limit:.2f}{unit}"
            )
    return breaches


def main() -> int:
    """Run the guard once and return the process exit code.

    Parses the command line, fetches the metric thresholds and the dashboard
    overview from the admin API, prints both as JSON, and evaluates the
    snapshot against the thresholds.

    Returns:
        0 when no threshold is violated (also after ``--help``),
        1 when a request fails, a response is invalid, or the command-line
        arguments are invalid,
        2 when at least one threshold is violated.
    """
    parser = argparse.ArgumentParser(description="OpenAI OAuth 灰度阈值守护")
    parser.add_argument("--base-url", required=True, help="服务地址，例如 http://127.0.0.1:5231")
    parser.add_argument("--admin-token", default="", help="Admin JWT（可选，按部署策略）")
    parser.add_argument("--platform", default="openai", help="平台过滤，默认 openai")
    parser.add_argument("--time-range", default="30m", help="时间窗口: 5m/30m/1h/6h/24h/7d/30d")
    parser.add_argument("--group-id", default="", help="可选 group_id")
    try:
        args = parser.parse_args()
    except SystemExit as exc:
        # argparse exits with status 2 on usage errors, which would be
        # indistinguishable from this script's "threshold violated" code and
        # could trigger a rollback in CI. Report usage errors as 1 instead;
        # --help exits with 0 and stays 0.
        return 0 if exc.code in (0, None) else 1

    base = args.base_url.rstrip("/")
    headers = build_headers(args.admin_token)

    try:
        threshold_url = f"{base}/api/v1/admin/ops/settings/metric-thresholds"
        thresholds_raw = request_json(threshold_url, headers)
        thresholds = parse_thresholds(parse_envelope_data(thresholds_raw))

        query = {"platform": args.platform, "time_range": args.time_range}
        # group_id is only sent when a non-blank value was supplied.
        if args.group_id.strip():
            query["group_id"] = args.group_id.strip()
        overview_url = (
            f"{base}/api/v1/admin/ops/dashboard/overview?"
            + urllib.parse.urlencode(query)
        )
        overview_raw = request_json(overview_url, headers)
        snapshot = parse_snapshot(parse_envelope_data(overview_raw))

        print("[OpenAI OAuth Gray Guard] 当前快照:")
        print(
            json.dumps(
                {
                    "sla": snapshot.sla,
                    "ttft_p99_ms": snapshot.ttft_p99_ms,
                    "request_error_rate_percent": snapshot.request_error_rate_percent,
                    "upstream_error_rate_percent": snapshot.upstream_error_rate_percent,
                },
                ensure_ascii=False,
                indent=2,
            )
        )
        print("[OpenAI OAuth Gray Guard] 阈值配置:")
        print(
            json.dumps(
                {
                    "sla_percent_min": thresholds.sla_percent_min,
                    "ttft_p99_ms_max": thresholds.ttft_p99_ms_max,
                    "request_error_rate_percent_max": thresholds.request_error_rate_percent_max,
                    "upstream_error_rate_percent_max": thresholds.upstream_error_rate_percent_max,
                },
                ensure_ascii=False,
                indent=2,
            )
        )

        violations = evaluate(snapshot, thresholds)
        if violations:
            print("[OpenAI OAuth Gray Guard] 检测到阈值违例：")
            for idx, line in enumerate(violations, start=1):
                print(f"  {idx}. {line}")
            print("[OpenAI OAuth Gray Guard] 建议：停止扩量并执行回滚。")
            return 2

        print("[OpenAI OAuth Gray Guard] 指标通过，可继续观察或按计划扩量。")
        return 0

    except Exception as exc:
        # Top-level boundary: any fetch/parse failure maps to exit code 1.
        print(f"[OpenAI OAuth Gray Guard] 执行失败: {exc}", file=sys.stderr)
        return 1


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())