feat(handler): 实现运维监控 API 处理器和中间件

- 新增 ops 错误日志记录器(ops_error_logger.go)
- 新增 ops 主处理器(ops_handler.go)
- 新增告警管理处理器(ops_alerts_handler.go)
- 新增仪表板处理器(ops_dashboard_handler.go)
- 新增实时监控处理器(ops_realtime_handler.go)
- 新增配置管理处理器(ops_settings_handler.go)
- 新增 WebSocket 处理器(ops_ws_handler.go)
- 扩展设置 DTO 支持 ops 配置
- 新增客户端请求 ID 中间件(client_request_id.go)
- 新增 WebSocket 查询令牌认证中间件(ws_query_token_auth.go)
- 更新管理员认证中间件支持 ops 路由
- 注册 handler 依赖注入
This commit is contained in:
IanShaw027
2026-01-09 20:54:26 +08:00
parent 5baa8b5673
commit f3ed95d4de
12 changed files with 2854 additions and 0 deletions

View File

@@ -0,0 +1,433 @@
package admin
import (
"encoding/json"
"fmt"
"math"
"net/http"
"strconv"
"strings"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
"github.com/gin-gonic/gin/binding"
)
var validOpsAlertMetricTypes = []string{
"success_rate",
"error_rate",
"upstream_error_rate",
"p95_latency_ms",
"p99_latency_ms",
"cpu_usage_percent",
"memory_usage_percent",
"concurrency_queue_depth",
}
var validOpsAlertMetricTypeSet = func() map[string]struct{} {
set := make(map[string]struct{}, len(validOpsAlertMetricTypes))
for _, v := range validOpsAlertMetricTypes {
set[v] = struct{}{}
}
return set
}()
var validOpsAlertOperators = []string{">", "<", ">=", "<=", "==", "!="}
var validOpsAlertOperatorSet = func() map[string]struct{} {
set := make(map[string]struct{}, len(validOpsAlertOperators))
for _, v := range validOpsAlertOperators {
set[v] = struct{}{}
}
return set
}()
var validOpsAlertSeverities = []string{"P0", "P1", "P2", "P3"}
var validOpsAlertSeveritySet = func() map[string]struct{} {
set := make(map[string]struct{}, len(validOpsAlertSeverities))
for _, v := range validOpsAlertSeverities {
set[v] = struct{}{}
}
return set
}()
type opsAlertRuleValidatedInput struct {
Name string
MetricType string
Operator string
Threshold float64
Severity string
WindowMinutes int
SustainedMinutes int
CooldownMinutes int
Enabled bool
NotifyEmail bool
WindowProvided bool
SustainedProvided bool
CooldownProvided bool
SeverityProvided bool
EnabledProvided bool
NotifyProvided bool
}
func isPercentOrRateMetric(metricType string) bool {
switch metricType {
case "success_rate",
"error_rate",
"upstream_error_rate",
"cpu_usage_percent",
"memory_usage_percent":
return true
default:
return false
}
}
func validateOpsAlertRulePayload(raw map[string]json.RawMessage) (*opsAlertRuleValidatedInput, error) {
if raw == nil {
return nil, fmt.Errorf("invalid request body")
}
requiredFields := []string{"name", "metric_type", "operator", "threshold"}
for _, field := range requiredFields {
if _, ok := raw[field]; !ok {
return nil, fmt.Errorf("%s is required", field)
}
}
var name string
if err := json.Unmarshal(raw["name"], &name); err != nil || strings.TrimSpace(name) == "" {
return nil, fmt.Errorf("name is required")
}
name = strings.TrimSpace(name)
var metricType string
if err := json.Unmarshal(raw["metric_type"], &metricType); err != nil || strings.TrimSpace(metricType) == "" {
return nil, fmt.Errorf("metric_type is required")
}
metricType = strings.TrimSpace(metricType)
if _, ok := validOpsAlertMetricTypeSet[metricType]; !ok {
return nil, fmt.Errorf("metric_type must be one of: %s", strings.Join(validOpsAlertMetricTypes, ", "))
}
var operator string
if err := json.Unmarshal(raw["operator"], &operator); err != nil || strings.TrimSpace(operator) == "" {
return nil, fmt.Errorf("operator is required")
}
operator = strings.TrimSpace(operator)
if _, ok := validOpsAlertOperatorSet[operator]; !ok {
return nil, fmt.Errorf("operator must be one of: %s", strings.Join(validOpsAlertOperators, ", "))
}
var threshold float64
if err := json.Unmarshal(raw["threshold"], &threshold); err != nil {
return nil, fmt.Errorf("threshold must be a number")
}
if math.IsNaN(threshold) || math.IsInf(threshold, 0) {
return nil, fmt.Errorf("threshold must be a finite number")
}
if isPercentOrRateMetric(metricType) {
if threshold < 0 || threshold > 100 {
return nil, fmt.Errorf("threshold must be between 0 and 100 for metric_type %s", metricType)
}
} else if threshold < 0 {
return nil, fmt.Errorf("threshold must be >= 0")
}
validated := &opsAlertRuleValidatedInput{
Name: name,
MetricType: metricType,
Operator: operator,
Threshold: threshold,
}
if v, ok := raw["severity"]; ok {
validated.SeverityProvided = true
var sev string
if err := json.Unmarshal(v, &sev); err != nil {
return nil, fmt.Errorf("severity must be a string")
}
sev = strings.ToUpper(strings.TrimSpace(sev))
if sev != "" {
if _, ok := validOpsAlertSeveritySet[sev]; !ok {
return nil, fmt.Errorf("severity must be one of: %s", strings.Join(validOpsAlertSeverities, ", "))
}
validated.Severity = sev
}
}
if validated.Severity == "" {
validated.Severity = "P2"
}
if v, ok := raw["enabled"]; ok {
validated.EnabledProvided = true
if err := json.Unmarshal(v, &validated.Enabled); err != nil {
return nil, fmt.Errorf("enabled must be a boolean")
}
} else {
validated.Enabled = true
}
if v, ok := raw["notify_email"]; ok {
validated.NotifyProvided = true
if err := json.Unmarshal(v, &validated.NotifyEmail); err != nil {
return nil, fmt.Errorf("notify_email must be a boolean")
}
} else {
validated.NotifyEmail = true
}
if v, ok := raw["window_minutes"]; ok {
validated.WindowProvided = true
if err := json.Unmarshal(v, &validated.WindowMinutes); err != nil {
return nil, fmt.Errorf("window_minutes must be an integer")
}
switch validated.WindowMinutes {
case 1, 5, 60:
default:
return nil, fmt.Errorf("window_minutes must be one of: 1, 5, 60")
}
} else {
validated.WindowMinutes = 1
}
if v, ok := raw["sustained_minutes"]; ok {
validated.SustainedProvided = true
if err := json.Unmarshal(v, &validated.SustainedMinutes); err != nil {
return nil, fmt.Errorf("sustained_minutes must be an integer")
}
if validated.SustainedMinutes < 1 || validated.SustainedMinutes > 1440 {
return nil, fmt.Errorf("sustained_minutes must be between 1 and 1440")
}
} else {
validated.SustainedMinutes = 1
}
if v, ok := raw["cooldown_minutes"]; ok {
validated.CooldownProvided = true
if err := json.Unmarshal(v, &validated.CooldownMinutes); err != nil {
return nil, fmt.Errorf("cooldown_minutes must be an integer")
}
if validated.CooldownMinutes < 0 || validated.CooldownMinutes > 1440 {
return nil, fmt.Errorf("cooldown_minutes must be between 0 and 1440")
}
} else {
validated.CooldownMinutes = 0
}
return validated, nil
}
// ListAlertRules returns all ops alert rules.
// GET /api/v1/admin/ops/alert-rules
func (h *OpsHandler) ListAlertRules(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
rules, err := h.opsService.ListAlertRules(c.Request.Context())
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, rules)
}
// CreateAlertRule creates an ops alert rule.
// POST /api/v1/admin/ops/alert-rules
func (h *OpsHandler) CreateAlertRule(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
var raw map[string]json.RawMessage
if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil {
response.BadRequest(c, "Invalid request body")
return
}
validated, err := validateOpsAlertRulePayload(raw)
if err != nil {
response.BadRequest(c, err.Error())
return
}
var rule service.OpsAlertRule
if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
response.BadRequest(c, "Invalid request body")
return
}
rule.Name = validated.Name
rule.MetricType = validated.MetricType
rule.Operator = validated.Operator
rule.Threshold = validated.Threshold
rule.WindowMinutes = validated.WindowMinutes
rule.SustainedMinutes = validated.SustainedMinutes
rule.CooldownMinutes = validated.CooldownMinutes
rule.Severity = validated.Severity
rule.Enabled = validated.Enabled
rule.NotifyEmail = validated.NotifyEmail
created, err := h.opsService.CreateAlertRule(c.Request.Context(), &rule)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, created)
}
// UpdateAlertRule updates an existing ops alert rule.
// PUT /api/v1/admin/ops/alert-rules/:id
func (h *OpsHandler) UpdateAlertRule(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
id, err := strconv.ParseInt(c.Param("id"), 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid rule ID")
return
}
var raw map[string]json.RawMessage
if err := c.ShouldBindBodyWith(&raw, binding.JSON); err != nil {
response.BadRequest(c, "Invalid request body")
return
}
validated, err := validateOpsAlertRulePayload(raw)
if err != nil {
response.BadRequest(c, err.Error())
return
}
var rule service.OpsAlertRule
if err := c.ShouldBindBodyWith(&rule, binding.JSON); err != nil {
response.BadRequest(c, "Invalid request body")
return
}
rule.ID = id
rule.Name = validated.Name
rule.MetricType = validated.MetricType
rule.Operator = validated.Operator
rule.Threshold = validated.Threshold
rule.WindowMinutes = validated.WindowMinutes
rule.SustainedMinutes = validated.SustainedMinutes
rule.CooldownMinutes = validated.CooldownMinutes
rule.Severity = validated.Severity
rule.Enabled = validated.Enabled
rule.NotifyEmail = validated.NotifyEmail
updated, err := h.opsService.UpdateAlertRule(c.Request.Context(), &rule)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, updated)
}
// DeleteAlertRule deletes an ops alert rule.
// DELETE /api/v1/admin/ops/alert-rules/:id
func (h *OpsHandler) DeleteAlertRule(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
id, err := strconv.ParseInt(c.Param("id"), 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid rule ID")
return
}
if err := h.opsService.DeleteAlertRule(c.Request.Context(), id); err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, gin.H{"deleted": true})
}
// ListAlertEvents lists recent ops alert events.
// GET /api/v1/admin/ops/alert-events
func (h *OpsHandler) ListAlertEvents(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
limit := 100
if raw := strings.TrimSpace(c.Query("limit")); raw != "" {
n, err := strconv.Atoi(raw)
if err != nil || n <= 0 {
response.BadRequest(c, "Invalid limit")
return
}
limit = n
}
filter := &service.OpsAlertEventFilter{
Limit: limit,
Status: strings.TrimSpace(c.Query("status")),
Severity: strings.TrimSpace(c.Query("severity")),
}
// Optional global filter support (platform/group/time range).
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
filter.Platform = platform
}
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
filter.GroupID = &id
}
if startTime, endTime, err := parseOpsTimeRange(c, "24h"); err == nil {
// Only apply when explicitly provided to avoid surprising default narrowing.
if strings.TrimSpace(c.Query("start_time")) != "" || strings.TrimSpace(c.Query("end_time")) != "" || strings.TrimSpace(c.Query("time_range")) != "" {
filter.StartTime = &startTime
filter.EndTime = &endTime
}
} else {
response.BadRequest(c, err.Error())
return
}
events, err := h.opsService.ListAlertEvents(c.Request.Context(), filter)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, events)
}

View File

@@ -0,0 +1,243 @@
package admin
import (
"net/http"
"strconv"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// GetDashboardOverview returns vNext ops dashboard overview (raw path).
// GET /api/v1/admin/ops/dashboard/overview
func (h *OpsHandler) GetDashboardOverview(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
startTime, endTime, err := parseOpsTimeRange(c, "1h")
if err != nil {
response.BadRequest(c, err.Error())
return
}
filter := &service.OpsDashboardFilter{
StartTime: startTime,
EndTime: endTime,
Platform: strings.TrimSpace(c.Query("platform")),
QueryMode: parseOpsQueryMode(c),
}
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
filter.GroupID = &id
}
data, err := h.opsService.GetDashboardOverview(c.Request.Context(), filter)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, data)
}
// GetDashboardThroughputTrend returns throughput time series (raw path).
// GET /api/v1/admin/ops/dashboard/throughput-trend
func (h *OpsHandler) GetDashboardThroughputTrend(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
startTime, endTime, err := parseOpsTimeRange(c, "1h")
if err != nil {
response.BadRequest(c, err.Error())
return
}
filter := &service.OpsDashboardFilter{
StartTime: startTime,
EndTime: endTime,
Platform: strings.TrimSpace(c.Query("platform")),
QueryMode: parseOpsQueryMode(c),
}
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
filter.GroupID = &id
}
bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
data, err := h.opsService.GetThroughputTrend(c.Request.Context(), filter, bucketSeconds)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, data)
}
// GetDashboardLatencyHistogram returns the latency distribution histogram (success requests).
// GET /api/v1/admin/ops/dashboard/latency-histogram
func (h *OpsHandler) GetDashboardLatencyHistogram(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
startTime, endTime, err := parseOpsTimeRange(c, "1h")
if err != nil {
response.BadRequest(c, err.Error())
return
}
filter := &service.OpsDashboardFilter{
StartTime: startTime,
EndTime: endTime,
Platform: strings.TrimSpace(c.Query("platform")),
QueryMode: parseOpsQueryMode(c),
}
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
filter.GroupID = &id
}
data, err := h.opsService.GetLatencyHistogram(c.Request.Context(), filter)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, data)
}
// GetDashboardErrorTrend returns error counts time series (raw path).
// GET /api/v1/admin/ops/dashboard/error-trend
func (h *OpsHandler) GetDashboardErrorTrend(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
startTime, endTime, err := parseOpsTimeRange(c, "1h")
if err != nil {
response.BadRequest(c, err.Error())
return
}
filter := &service.OpsDashboardFilter{
StartTime: startTime,
EndTime: endTime,
Platform: strings.TrimSpace(c.Query("platform")),
QueryMode: parseOpsQueryMode(c),
}
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
filter.GroupID = &id
}
bucketSeconds := pickThroughputBucketSeconds(endTime.Sub(startTime))
data, err := h.opsService.GetErrorTrend(c.Request.Context(), filter, bucketSeconds)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, data)
}
// GetDashboardErrorDistribution returns error distribution by status code (raw path).
// GET /api/v1/admin/ops/dashboard/error-distribution
func (h *OpsHandler) GetDashboardErrorDistribution(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
startTime, endTime, err := parseOpsTimeRange(c, "1h")
if err != nil {
response.BadRequest(c, err.Error())
return
}
filter := &service.OpsDashboardFilter{
StartTime: startTime,
EndTime: endTime,
Platform: strings.TrimSpace(c.Query("platform")),
QueryMode: parseOpsQueryMode(c),
}
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
filter.GroupID = &id
}
data, err := h.opsService.GetErrorDistribution(c.Request.Context(), filter)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, data)
}
func pickThroughputBucketSeconds(window time.Duration) int {
// Keep buckets predictable and avoid huge responses.
switch {
case window <= 2*time.Hour:
return 60
case window <= 24*time.Hour:
return 300
default:
return 3600
}
}
func parseOpsQueryMode(c *gin.Context) service.OpsQueryMode {
if c == nil {
return ""
}
raw := strings.TrimSpace(c.Query("mode"))
if raw == "" {
// Empty means "use server default" (DB setting ops_query_mode_default).
return ""
}
return service.ParseOpsQueryMode(raw)
}

View File

@@ -0,0 +1,364 @@
package admin
import (
"errors"
"fmt"
"io"
"net/http"
"strconv"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/server/middleware"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
type OpsHandler struct {
opsService *service.OpsService
}
func NewOpsHandler(opsService *service.OpsService) *OpsHandler {
return &OpsHandler{opsService: opsService}
}
// GetErrorLogs lists ops error logs.
// GET /api/v1/admin/ops/errors
func (h *OpsHandler) GetErrorLogs(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
page, pageSize := response.ParsePagination(c)
// Ops list can be larger than standard admin tables.
if pageSize > 500 {
pageSize = 500
}
startTime, endTime, err := parseOpsTimeRange(c, "1h")
if err != nil {
response.BadRequest(c, err.Error())
return
}
filter := &service.OpsErrorLogFilter{
Page: page,
PageSize: pageSize,
}
if !startTime.IsZero() {
filter.StartTime = &startTime
}
if !endTime.IsZero() {
filter.EndTime = &endTime
}
if platform := strings.TrimSpace(c.Query("platform")); platform != "" {
filter.Platform = platform
}
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
filter.GroupID = &id
}
if v := strings.TrimSpace(c.Query("account_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid account_id")
return
}
filter.AccountID = &id
}
if phase := strings.TrimSpace(c.Query("phase")); phase != "" {
filter.Phase = phase
}
if q := strings.TrimSpace(c.Query("q")); q != "" {
filter.Query = q
}
if statusCodesStr := strings.TrimSpace(c.Query("status_codes")); statusCodesStr != "" {
parts := strings.Split(statusCodesStr, ",")
out := make([]int, 0, len(parts))
for _, part := range parts {
p := strings.TrimSpace(part)
if p == "" {
continue
}
n, err := strconv.Atoi(p)
if err != nil || n < 0 {
response.BadRequest(c, "Invalid status_codes")
return
}
out = append(out, n)
}
filter.StatusCodes = out
}
result, err := h.opsService.GetErrorLogs(c.Request.Context(), filter)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Paginated(c, result.Errors, int64(result.Total), result.Page, result.PageSize)
}
// GetErrorLogByID returns a single error log detail.
// GET /api/v1/admin/ops/errors/:id
func (h *OpsHandler) GetErrorLogByID(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
idStr := strings.TrimSpace(c.Param("id"))
id, err := strconv.ParseInt(idStr, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid error id")
return
}
detail, err := h.opsService.GetErrorLogByID(c.Request.Context(), id)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, detail)
}
// ListRequestDetails returns a request-level list (success + error) for drill-down.
// GET /api/v1/admin/ops/requests
func (h *OpsHandler) ListRequestDetails(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
page, pageSize := response.ParsePagination(c)
if pageSize > 100 {
pageSize = 100
}
startTime, endTime, err := parseOpsTimeRange(c, "1h")
if err != nil {
response.BadRequest(c, err.Error())
return
}
filter := &service.OpsRequestDetailFilter{
Page: page,
PageSize: pageSize,
StartTime: &startTime,
EndTime: &endTime,
}
filter.Kind = strings.TrimSpace(c.Query("kind"))
filter.Platform = strings.TrimSpace(c.Query("platform"))
filter.Model = strings.TrimSpace(c.Query("model"))
filter.RequestID = strings.TrimSpace(c.Query("request_id"))
filter.Query = strings.TrimSpace(c.Query("q"))
filter.Sort = strings.TrimSpace(c.Query("sort"))
if v := strings.TrimSpace(c.Query("user_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid user_id")
return
}
filter.UserID = &id
}
if v := strings.TrimSpace(c.Query("api_key_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid api_key_id")
return
}
filter.APIKeyID = &id
}
if v := strings.TrimSpace(c.Query("account_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid account_id")
return
}
filter.AccountID = &id
}
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
filter.GroupID = &id
}
if v := strings.TrimSpace(c.Query("min_duration_ms")); v != "" {
parsed, err := strconv.Atoi(v)
if err != nil || parsed < 0 {
response.BadRequest(c, "Invalid min_duration_ms")
return
}
filter.MinDurationMs = &parsed
}
if v := strings.TrimSpace(c.Query("max_duration_ms")); v != "" {
parsed, err := strconv.Atoi(v)
if err != nil || parsed < 0 {
response.BadRequest(c, "Invalid max_duration_ms")
return
}
filter.MaxDurationMs = &parsed
}
out, err := h.opsService.ListRequestDetails(c.Request.Context(), filter)
if err != nil {
// Invalid sort/kind/platform etc should be a bad request; keep it simple.
if strings.Contains(strings.ToLower(err.Error()), "invalid") {
response.BadRequest(c, err.Error())
return
}
response.Error(c, http.StatusInternalServerError, "Failed to list request details")
return
}
response.Paginated(c, out.Items, out.Total, out.Page, out.PageSize)
}
type opsRetryRequest struct {
Mode string `json:"mode"`
PinnedAccountID *int64 `json:"pinned_account_id"`
}
// RetryErrorRequest retries a failed request using stored request_body.
// POST /api/v1/admin/ops/errors/:id/retry
func (h *OpsHandler) RetryErrorRequest(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
subject, ok := middleware.GetAuthSubjectFromContext(c)
if !ok || subject.UserID <= 0 {
response.Error(c, http.StatusUnauthorized, "Unauthorized")
return
}
idStr := strings.TrimSpace(c.Param("id"))
id, err := strconv.ParseInt(idStr, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid error id")
return
}
req := opsRetryRequest{Mode: service.OpsRetryModeClient}
if err := c.ShouldBindJSON(&req); err != nil && !errors.Is(err, io.EOF) {
response.BadRequest(c, "Invalid request: "+err.Error())
return
}
if strings.TrimSpace(req.Mode) == "" {
req.Mode = service.OpsRetryModeClient
}
result, err := h.opsService.RetryError(c.Request.Context(), subject.UserID, id, req.Mode, req.PinnedAccountID)
if err != nil {
response.ErrorFrom(c, err)
return
}
response.Success(c, result)
}
func parseOpsTimeRange(c *gin.Context, defaultRange string) (time.Time, time.Time, error) {
startStr := strings.TrimSpace(c.Query("start_time"))
endStr := strings.TrimSpace(c.Query("end_time"))
parseTS := func(s string) (time.Time, error) {
if s == "" {
return time.Time{}, nil
}
if t, err := time.Parse(time.RFC3339Nano, s); err == nil {
return t, nil
}
return time.Parse(time.RFC3339, s)
}
start, err := parseTS(startStr)
if err != nil {
return time.Time{}, time.Time{}, err
}
end, err := parseTS(endStr)
if err != nil {
return time.Time{}, time.Time{}, err
}
// start/end explicitly provided (even partially)
if startStr != "" || endStr != "" {
if end.IsZero() {
end = time.Now()
}
if start.IsZero() {
dur, _ := parseOpsDuration(defaultRange)
start = end.Add(-dur)
}
if start.After(end) {
return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: start_time must be <= end_time")
}
if end.Sub(start) > 30*24*time.Hour {
return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
}
return start, end, nil
}
// time_range fallback
tr := strings.TrimSpace(c.Query("time_range"))
if tr == "" {
tr = defaultRange
}
dur, ok := parseOpsDuration(tr)
if !ok {
dur, _ = parseOpsDuration(defaultRange)
}
end = time.Now()
start = end.Add(-dur)
if end.Sub(start) > 30*24*time.Hour {
return time.Time{}, time.Time{}, fmt.Errorf("invalid time range: max window is 30 days")
}
return start, end, nil
}
func parseOpsDuration(v string) (time.Duration, bool) {
switch strings.TrimSpace(v) {
case "5m":
return 5 * time.Minute, true
case "30m":
return 30 * time.Minute, true
case "1h":
return time.Hour, true
case "6h":
return 6 * time.Hour, true
case "24h":
return 24 * time.Hour, true
default:
return 0, false
}
}

View File

@@ -0,0 +1,120 @@
package admin
import (
"net/http"
"strconv"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// GetConcurrencyStats returns real-time concurrency usage aggregated by platform/group/account.
// GET /api/v1/admin/ops/concurrency
func (h *OpsHandler) GetConcurrencyStats(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
response.Success(c, gin.H{
"enabled": false,
"platform": map[string]*service.PlatformConcurrencyInfo{},
"group": map[int64]*service.GroupConcurrencyInfo{},
"account": map[int64]*service.AccountConcurrencyInfo{},
"timestamp": time.Now().UTC(),
})
return
}
platformFilter := strings.TrimSpace(c.Query("platform"))
var groupID *int64
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
groupID = &id
}
platform, group, account, collectedAt, err := h.opsService.GetConcurrencyStats(c.Request.Context(), platformFilter, groupID)
if err != nil {
response.ErrorFrom(c, err)
return
}
payload := gin.H{
"enabled": true,
"platform": platform,
"group": group,
"account": account,
}
if collectedAt != nil {
payload["timestamp"] = collectedAt.UTC()
}
response.Success(c, payload)
}
// GetAccountAvailability returns account availability statistics.
// GET /api/v1/admin/ops/account-availability
//
// Query params:
// - platform: optional
// - group_id: optional
func (h *OpsHandler) GetAccountAvailability(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
response.Success(c, gin.H{
"enabled": false,
"platform": map[string]*service.PlatformAvailability{},
"group": map[int64]*service.GroupAvailability{},
"account": map[int64]*service.AccountAvailability{},
"timestamp": time.Now().UTC(),
})
return
}
platform := strings.TrimSpace(c.Query("platform"))
var groupID *int64
if v := strings.TrimSpace(c.Query("group_id")); v != "" {
id, err := strconv.ParseInt(v, 10, 64)
if err != nil || id <= 0 {
response.BadRequest(c, "Invalid group_id")
return
}
groupID = &id
}
platformStats, groupStats, accountStats, collectedAt, err := h.opsService.GetAccountAvailabilityStats(c.Request.Context(), platform, groupID)
if err != nil {
response.ErrorFrom(c, err)
return
}
payload := gin.H{
"enabled": true,
"platform": platformStats,
"group": groupStats,
"account": accountStats,
}
if collectedAt != nil {
payload["timestamp"] = collectedAt.UTC()
}
response.Success(c, payload)
}

View File

@@ -0,0 +1,103 @@
package admin
import (
"net/http"
"github.com/Wei-Shaw/sub2api/internal/pkg/response"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
// GetEmailNotificationConfig returns Ops email notification config (DB-backed).
// GET /api/v1/admin/ops/email-notification/config
func (h *OpsHandler) GetEmailNotificationConfig(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
cfg, err := h.opsService.GetEmailNotificationConfig(c.Request.Context())
if err != nil {
response.Error(c, http.StatusInternalServerError, "Failed to get email notification config")
return
}
response.Success(c, cfg)
}
// UpdateEmailNotificationConfig updates Ops email notification config (DB-backed).
// PUT /api/v1/admin/ops/email-notification/config
func (h *OpsHandler) UpdateEmailNotificationConfig(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
var req service.OpsEmailNotificationConfigUpdateRequest
if err := c.ShouldBindJSON(&req); err != nil {
response.BadRequest(c, "Invalid request body")
return
}
updated, err := h.opsService.UpdateEmailNotificationConfig(c.Request.Context(), &req)
if err != nil {
// Most failures here are validation errors from request payload; treat as 400.
response.Error(c, http.StatusBadRequest, err.Error())
return
}
response.Success(c, updated)
}
// GetAlertRuntimeSettings returns Ops alert evaluator runtime settings (DB-backed).
// GET /api/v1/admin/ops/runtime/alert
func (h *OpsHandler) GetAlertRuntimeSettings(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
cfg, err := h.opsService.GetOpsAlertRuntimeSettings(c.Request.Context())
if err != nil {
response.Error(c, http.StatusInternalServerError, "Failed to get alert runtime settings")
return
}
response.Success(c, cfg)
}
// UpdateAlertRuntimeSettings updates Ops alert evaluator runtime settings (DB-backed).
// PUT /api/v1/admin/ops/runtime/alert
func (h *OpsHandler) UpdateAlertRuntimeSettings(c *gin.Context) {
if h.opsService == nil {
response.Error(c, http.StatusServiceUnavailable, "Ops service not available")
return
}
if err := h.opsService.RequireMonitoringEnabled(c.Request.Context()); err != nil {
response.ErrorFrom(c, err)
return
}
var req service.OpsAlertRuntimeSettings
if err := c.ShouldBindJSON(&req); err != nil {
response.BadRequest(c, "Invalid request body")
return
}
updated, err := h.opsService.UpdateOpsAlertRuntimeSettings(c.Request.Context(), &req)
if err != nil {
response.Error(c, http.StatusBadRequest, err.Error())
return
}
response.Success(c, updated)
}

View File

@@ -0,0 +1,765 @@
package admin
import (
"context"
"encoding/json"
"log"
"math"
"net"
"net/http"
"net/netip"
"net/url"
"os"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
"github.com/gorilla/websocket"
)
type OpsWSProxyConfig struct {
TrustProxy bool
TrustedProxies []netip.Prefix
OriginPolicy string
}
const (
envOpsWSTrustProxy = "OPS_WS_TRUST_PROXY"
envOpsWSTrustedProxies = "OPS_WS_TRUSTED_PROXIES"
envOpsWSOriginPolicy = "OPS_WS_ORIGIN_POLICY"
envOpsWSMaxConns = "OPS_WS_MAX_CONNS"
envOpsWSMaxConnsPerIP = "OPS_WS_MAX_CONNS_PER_IP"
)
const (
OriginPolicyStrict = "strict"
OriginPolicyPermissive = "permissive"
)
var opsWSProxyConfig = loadOpsWSProxyConfigFromEnv()
var upgrader = websocket.Upgrader{
CheckOrigin: func(r *http.Request) bool {
return isAllowedOpsWSOrigin(r)
},
// Subprotocol negotiation:
// - The frontend passes ["sub2api-admin", "jwt.<token>"].
// - We always select "sub2api-admin" so the token is never echoed back in the handshake response.
Subprotocols: []string{"sub2api-admin"},
}
const (
qpsWSPushInterval = 2 * time.Second
qpsWSRefreshInterval = 5 * time.Second
qpsWSRequestCountWindow = 1 * time.Minute
defaultMaxWSConns = 100
defaultMaxWSConnsPerIP = 20
)
var wsConnCount atomic.Int32
var wsConnCountByIP sync.Map // map[string]*atomic.Int32
const qpsWSIdleStopDelay = 30 * time.Second
const (
opsWSCloseRealtimeDisabled = 4001
)
var qpsWSIdleStopMu sync.Mutex
var qpsWSIdleStopTimer *time.Timer
func cancelQPSWSIdleStop() {
qpsWSIdleStopMu.Lock()
if qpsWSIdleStopTimer != nil {
qpsWSIdleStopTimer.Stop()
qpsWSIdleStopTimer = nil
}
qpsWSIdleStopMu.Unlock()
}
func scheduleQPSWSIdleStop() {
qpsWSIdleStopMu.Lock()
if qpsWSIdleStopTimer != nil {
qpsWSIdleStopMu.Unlock()
return
}
qpsWSIdleStopTimer = time.AfterFunc(qpsWSIdleStopDelay, func() {
// Only stop if truly idle at fire time.
if wsConnCount.Load() == 0 {
qpsWSCache.Stop()
}
qpsWSIdleStopMu.Lock()
qpsWSIdleStopTimer = nil
qpsWSIdleStopMu.Unlock()
})
qpsWSIdleStopMu.Unlock()
}
type opsWSRuntimeLimits struct {
MaxConns int32
MaxConnsPerIP int32
}
var opsWSLimits = loadOpsWSRuntimeLimitsFromEnv()
const (
qpsWSWriteTimeout = 10 * time.Second
qpsWSPongWait = 60 * time.Second
qpsWSPingInterval = 30 * time.Second
// We don't expect clients to send application messages; we only read to process control frames (Pong/Close).
qpsWSMaxReadBytes = 1024
)
type opsWSQPSCache struct {
refreshInterval time.Duration
requestCountWindow time.Duration
lastUpdatedUnixNano atomic.Int64
payload atomic.Value // []byte
opsService *service.OpsService
cancel context.CancelFunc
done chan struct{}
mu sync.Mutex
running bool
}
var qpsWSCache = &opsWSQPSCache{
refreshInterval: qpsWSRefreshInterval,
requestCountWindow: qpsWSRequestCountWindow,
}
func (c *opsWSQPSCache) start(opsService *service.OpsService) {
if c == nil || opsService == nil {
return
}
for {
c.mu.Lock()
if c.running {
c.mu.Unlock()
return
}
// If a previous refresh loop is currently stopping, wait for it to fully exit.
done := c.done
if done != nil {
c.mu.Unlock()
<-done
c.mu.Lock()
if c.done == done && !c.running {
c.done = nil
}
c.mu.Unlock()
continue
}
c.opsService = opsService
ctx, cancel := context.WithCancel(context.Background())
c.cancel = cancel
c.done = make(chan struct{})
done = c.done
c.running = true
c.mu.Unlock()
go func() {
defer close(done)
c.refreshLoop(ctx)
}()
return
}
}
// Stop stops the background refresh loop.
// It is safe to call multiple times.
func (c *opsWSQPSCache) Stop() {
if c == nil {
return
}
c.mu.Lock()
if !c.running {
done := c.done
c.mu.Unlock()
if done != nil {
<-done
}
return
}
cancel := c.cancel
c.cancel = nil
c.running = false
c.opsService = nil
done := c.done
c.mu.Unlock()
if cancel != nil {
cancel()
}
if done != nil {
<-done
}
c.mu.Lock()
if c.done == done && !c.running {
c.done = nil
}
c.mu.Unlock()
}
func (c *opsWSQPSCache) refreshLoop(ctx context.Context) {
ticker := time.NewTicker(c.refreshInterval)
defer ticker.Stop()
c.refresh(ctx)
for {
select {
case <-ticker.C:
c.refresh(ctx)
case <-ctx.Done():
return
}
}
}
func (c *opsWSQPSCache) refresh(parentCtx context.Context) {
if c == nil {
return
}
c.mu.Lock()
opsService := c.opsService
c.mu.Unlock()
if opsService == nil {
return
}
if parentCtx == nil {
parentCtx = context.Background()
}
ctx, cancel := context.WithTimeout(parentCtx, 10*time.Second)
defer cancel()
now := time.Now().UTC()
stats, err := opsService.GetWindowStats(ctx, now.Add(-c.requestCountWindow), now)
if err != nil || stats == nil {
if err != nil {
log.Printf("[OpsWS] refresh: get window stats failed: %v", err)
}
return
}
requestCount := stats.SuccessCount + stats.ErrorCountTotal
qps := 0.0
tps := 0.0
if c.requestCountWindow > 0 {
seconds := c.requestCountWindow.Seconds()
qps = roundTo1DP(float64(requestCount) / seconds)
tps = roundTo1DP(float64(stats.TokenConsumed) / seconds)
}
payload := gin.H{
"type": "qps_update",
"timestamp": now.Format(time.RFC3339),
"data": gin.H{
"qps": qps,
"tps": tps,
"request_count": requestCount,
},
}
msg, err := json.Marshal(payload)
if err != nil {
log.Printf("[OpsWS] refresh: marshal payload failed: %v", err)
return
}
c.payload.Store(msg)
c.lastUpdatedUnixNano.Store(now.UnixNano())
}
func roundTo1DP(v float64) float64 {
return math.Round(v*10) / 10
}
func (c *opsWSQPSCache) getPayload() []byte {
if c == nil {
return nil
}
if cached, ok := c.payload.Load().([]byte); ok && cached != nil {
return cached
}
return nil
}
func closeWS(conn *websocket.Conn, code int, reason string) {
if conn == nil {
return
}
msg := websocket.FormatCloseMessage(code, reason)
_ = conn.WriteControl(websocket.CloseMessage, msg, time.Now().Add(qpsWSWriteTimeout))
_ = conn.Close()
}
// QPSWSHandler handles realtime QPS push via WebSocket.
// GET /api/v1/admin/ops/ws/qps
func (h *OpsHandler) QPSWSHandler(c *gin.Context) {
clientIP := requestClientIP(c.Request)
if h == nil || h.opsService == nil {
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "ops service not initialized"})
return
}
// If realtime monitoring is disabled, prefer a successful WS upgrade followed by a clean close
// with a deterministic close code. This prevents clients from spinning on 404/1006 reconnect loops.
if !h.opsService.IsRealtimeMonitoringEnabled(c.Request.Context()) {
conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
if err != nil {
c.JSON(http.StatusNotFound, gin.H{"error": "ops realtime monitoring is disabled"})
return
}
closeWS(conn, opsWSCloseRealtimeDisabled, "realtime_disabled")
return
}
cancelQPSWSIdleStop()
// Lazily start the background refresh loop so unit tests that never hit the
// websocket route don't spawn goroutines that depend on DB/Redis stubs.
qpsWSCache.start(h.opsService)
// Reserve a global slot before upgrading the connection to keep the limit strict.
if !tryAcquireOpsWSTotalSlot(opsWSLimits.MaxConns) {
log.Printf("[OpsWS] connection limit reached: %d/%d", wsConnCount.Load(), opsWSLimits.MaxConns)
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
return
}
defer func() {
if wsConnCount.Add(-1) == 0 {
scheduleQPSWSIdleStop()
}
}()
if opsWSLimits.MaxConnsPerIP > 0 && clientIP != "" {
if !tryAcquireOpsWSIPSlot(clientIP, opsWSLimits.MaxConnsPerIP) {
log.Printf("[OpsWS] per-ip connection limit reached: ip=%s limit=%d", clientIP, opsWSLimits.MaxConnsPerIP)
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "too many connections"})
return
}
defer releaseOpsWSIPSlot(clientIP)
}
conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
if err != nil {
log.Printf("[OpsWS] upgrade failed: %v", err)
return
}
defer func() {
_ = conn.Close()
}()
handleQPSWebSocket(c.Request.Context(), conn)
}
func tryAcquireOpsWSTotalSlot(limit int32) bool {
if limit <= 0 {
return true
}
for {
current := wsConnCount.Load()
if current >= limit {
return false
}
if wsConnCount.CompareAndSwap(current, current+1) {
return true
}
}
}
func tryAcquireOpsWSIPSlot(clientIP string, limit int32) bool {
if strings.TrimSpace(clientIP) == "" || limit <= 0 {
return true
}
v, _ := wsConnCountByIP.LoadOrStore(clientIP, &atomic.Int32{})
counter := v.(*atomic.Int32)
for {
current := counter.Load()
if current >= limit {
return false
}
if counter.CompareAndSwap(current, current+1) {
return true
}
}
}
func releaseOpsWSIPSlot(clientIP string) {
if strings.TrimSpace(clientIP) == "" {
return
}
v, ok := wsConnCountByIP.Load(clientIP)
if !ok {
return
}
counter := v.(*atomic.Int32)
next := counter.Add(-1)
if next <= 0 {
// Best-effort cleanup; safe even if a new slot was acquired concurrently.
wsConnCountByIP.Delete(clientIP)
}
}
func handleQPSWebSocket(parentCtx context.Context, conn *websocket.Conn) {
if conn == nil {
return
}
ctx, cancel := context.WithCancel(parentCtx)
defer cancel()
var closeOnce sync.Once
closeConn := func() {
closeOnce.Do(func() {
_ = conn.Close()
})
}
closeFrameCh := make(chan []byte, 1)
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
defer cancel()
conn.SetReadLimit(qpsWSMaxReadBytes)
if err := conn.SetReadDeadline(time.Now().Add(qpsWSPongWait)); err != nil {
log.Printf("[OpsWS] set read deadline failed: %v", err)
return
}
conn.SetPongHandler(func(string) error {
return conn.SetReadDeadline(time.Now().Add(qpsWSPongWait))
})
conn.SetCloseHandler(func(code int, text string) error {
select {
case closeFrameCh <- websocket.FormatCloseMessage(code, text):
default:
}
cancel()
return nil
})
for {
_, _, err := conn.ReadMessage()
if err != nil {
if websocket.IsUnexpectedCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway, websocket.CloseNoStatusReceived) {
log.Printf("[OpsWS] read failed: %v", err)
}
return
}
}
}()
// Push QPS data every 2 seconds (values are globally cached and refreshed at most once per qpsWSRefreshInterval).
pushTicker := time.NewTicker(qpsWSPushInterval)
defer pushTicker.Stop()
// Heartbeat ping every 30 seconds.
pingTicker := time.NewTicker(qpsWSPingInterval)
defer pingTicker.Stop()
writeWithTimeout := func(messageType int, data []byte) error {
if err := conn.SetWriteDeadline(time.Now().Add(qpsWSWriteTimeout)); err != nil {
return err
}
return conn.WriteMessage(messageType, data)
}
sendClose := func(closeFrame []byte) {
if closeFrame == nil {
closeFrame = websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")
}
_ = writeWithTimeout(websocket.CloseMessage, closeFrame)
}
for {
select {
case <-pushTicker.C:
msg := qpsWSCache.getPayload()
if msg == nil {
continue
}
if err := writeWithTimeout(websocket.TextMessage, msg); err != nil {
log.Printf("[OpsWS] write failed: %v", err)
cancel()
closeConn()
wg.Wait()
return
}
case <-pingTicker.C:
if err := writeWithTimeout(websocket.PingMessage, nil); err != nil {
log.Printf("[OpsWS] ping failed: %v", err)
cancel()
closeConn()
wg.Wait()
return
}
case closeFrame := <-closeFrameCh:
sendClose(closeFrame)
closeConn()
wg.Wait()
return
case <-ctx.Done():
var closeFrame []byte
select {
case closeFrame = <-closeFrameCh:
default:
}
sendClose(closeFrame)
closeConn()
wg.Wait()
return
}
}
}
func isAllowedOpsWSOrigin(r *http.Request) bool {
if r == nil {
return false
}
origin := strings.TrimSpace(r.Header.Get("Origin"))
if origin == "" {
switch strings.ToLower(strings.TrimSpace(opsWSProxyConfig.OriginPolicy)) {
case OriginPolicyStrict:
return false
case OriginPolicyPermissive, "":
return true
default:
return true
}
}
parsed, err := url.Parse(origin)
if err != nil || parsed.Hostname() == "" {
return false
}
originHost := strings.ToLower(parsed.Hostname())
trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r)
reqHost := hostWithoutPort(r.Host)
if trustProxyHeaders {
xfHost := strings.TrimSpace(r.Header.Get("X-Forwarded-Host"))
if xfHost != "" {
xfHost = strings.TrimSpace(strings.Split(xfHost, ",")[0])
if xfHost != "" {
reqHost = hostWithoutPort(xfHost)
}
}
}
reqHost = strings.ToLower(reqHost)
if reqHost == "" {
return false
}
return originHost == reqHost
}
func shouldTrustOpsWSProxyHeaders(r *http.Request) bool {
if r == nil {
return false
}
if !opsWSProxyConfig.TrustProxy {
return false
}
peerIP, ok := requestPeerIP(r)
if !ok {
return false
}
return isAddrInTrustedProxies(peerIP, opsWSProxyConfig.TrustedProxies)
}
func requestPeerIP(r *http.Request) (netip.Addr, bool) {
if r == nil {
return netip.Addr{}, false
}
host, _, err := net.SplitHostPort(strings.TrimSpace(r.RemoteAddr))
if err != nil {
host = strings.TrimSpace(r.RemoteAddr)
}
host = strings.TrimPrefix(host, "[")
host = strings.TrimSuffix(host, "]")
if host == "" {
return netip.Addr{}, false
}
addr, err := netip.ParseAddr(host)
if err != nil {
return netip.Addr{}, false
}
return addr.Unmap(), true
}
func requestClientIP(r *http.Request) string {
if r == nil {
return ""
}
trustProxyHeaders := shouldTrustOpsWSProxyHeaders(r)
if trustProxyHeaders {
xff := strings.TrimSpace(r.Header.Get("X-Forwarded-For"))
if xff != "" {
// Use the left-most entry (original client). If multiple proxies add values, they are comma-separated.
xff = strings.TrimSpace(strings.Split(xff, ",")[0])
xff = strings.TrimPrefix(xff, "[")
xff = strings.TrimSuffix(xff, "]")
if addr, err := netip.ParseAddr(xff); err == nil && addr.IsValid() {
return addr.Unmap().String()
}
}
}
if peer, ok := requestPeerIP(r); ok && peer.IsValid() {
return peer.String()
}
return ""
}
func isAddrInTrustedProxies(addr netip.Addr, trusted []netip.Prefix) bool {
if !addr.IsValid() {
return false
}
for _, p := range trusted {
if p.Contains(addr) {
return true
}
}
return false
}
func loadOpsWSProxyConfigFromEnv() OpsWSProxyConfig {
cfg := OpsWSProxyConfig{
TrustProxy: true,
TrustedProxies: defaultTrustedProxies(),
OriginPolicy: OriginPolicyPermissive,
}
if v := strings.TrimSpace(os.Getenv(envOpsWSTrustProxy)); v != "" {
if parsed, err := strconv.ParseBool(v); err == nil {
cfg.TrustProxy = parsed
} else {
log.Printf("[OpsWS] invalid %s=%q (expected bool); using default=%v", envOpsWSTrustProxy, v, cfg.TrustProxy)
}
}
if raw := strings.TrimSpace(os.Getenv(envOpsWSTrustedProxies)); raw != "" {
prefixes, invalid := parseTrustedProxyList(raw)
if len(invalid) > 0 {
log.Printf("[OpsWS] invalid %s entries ignored: %s", envOpsWSTrustedProxies, strings.Join(invalid, ", "))
}
cfg.TrustedProxies = prefixes
}
if v := strings.TrimSpace(os.Getenv(envOpsWSOriginPolicy)); v != "" {
normalized := strings.ToLower(v)
switch normalized {
case OriginPolicyStrict, OriginPolicyPermissive:
cfg.OriginPolicy = normalized
default:
log.Printf("[OpsWS] invalid %s=%q (expected %q or %q); using default=%q", envOpsWSOriginPolicy, v, OriginPolicyStrict, OriginPolicyPermissive, cfg.OriginPolicy)
}
}
return cfg
}
func loadOpsWSRuntimeLimitsFromEnv() opsWSRuntimeLimits {
cfg := opsWSRuntimeLimits{
MaxConns: defaultMaxWSConns,
MaxConnsPerIP: defaultMaxWSConnsPerIP,
}
if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConns)); v != "" {
if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
cfg.MaxConns = int32(parsed)
} else {
log.Printf("[OpsWS] invalid %s=%q (expected int>0); using default=%d", envOpsWSMaxConns, v, cfg.MaxConns)
}
}
if v := strings.TrimSpace(os.Getenv(envOpsWSMaxConnsPerIP)); v != "" {
if parsed, err := strconv.Atoi(v); err == nil && parsed >= 0 {
cfg.MaxConnsPerIP = int32(parsed)
} else {
log.Printf("[OpsWS] invalid %s=%q (expected int>=0); using default=%d", envOpsWSMaxConnsPerIP, v, cfg.MaxConnsPerIP)
}
}
return cfg
}
func defaultTrustedProxies() []netip.Prefix {
prefixes, _ := parseTrustedProxyList("127.0.0.0/8,::1/128")
return prefixes
}
func parseTrustedProxyList(raw string) (prefixes []netip.Prefix, invalid []string) {
for _, token := range strings.Split(raw, ",") {
item := strings.TrimSpace(token)
if item == "" {
continue
}
var (
p netip.Prefix
err error
)
if strings.Contains(item, "/") {
p, err = netip.ParsePrefix(item)
} else {
var addr netip.Addr
addr, err = netip.ParseAddr(item)
if err == nil {
addr = addr.Unmap()
bits := 128
if addr.Is4() {
bits = 32
}
p = netip.PrefixFrom(addr, bits)
}
}
if err != nil || !p.IsValid() {
invalid = append(invalid, item)
continue
}
prefixes = append(prefixes, p.Masked())
}
return prefixes, invalid
}
func hostWithoutPort(hostport string) string {
hostport = strings.TrimSpace(hostport)
if hostport == "" {
return ""
}
if host, _, err := net.SplitHostPort(hostport); err == nil {
return host
}
if strings.HasPrefix(hostport, "[") && strings.HasSuffix(hostport, "]") {
return strings.Trim(hostport, "[]")
}
parts := strings.Split(hostport, ":")
return parts[0]
}

View File

@@ -37,6 +37,11 @@ type SystemSettings struct {
// Identity patch configuration (Claude -> Gemini)
EnableIdentityPatch bool `json:"enable_identity_patch"`
IdentityPatchPrompt string `json:"identity_patch_prompt"`
// Ops monitoring (vNext)
OpsMonitoringEnabled bool `json:"ops_monitoring_enabled"`
OpsRealtimeMonitoringEnabled bool `json:"ops_realtime_monitoring_enabled"`
OpsQueryModeDefault string `json:"ops_query_mode_default"`
}
type PublicSettings struct {

View File

@@ -0,0 +1,681 @@
package handler
import (
"bytes"
"context"
"encoding/json"
"log"
"runtime"
"runtime/debug"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"unicode/utf8"
"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
middleware2 "github.com/Wei-Shaw/sub2api/internal/server/middleware"
"github.com/Wei-Shaw/sub2api/internal/service"
"github.com/gin-gonic/gin"
)
const (
opsModelKey = "ops_model"
opsStreamKey = "ops_stream"
opsRequestBodyKey = "ops_request_body"
opsAccountIDKey = "ops_account_id"
)
const (
opsErrorLogTimeout = 5 * time.Second
opsErrorLogDrainTimeout = 10 * time.Second
opsErrorLogMinWorkerCount = 4
opsErrorLogMaxWorkerCount = 32
opsErrorLogQueueSizePerWorker = 128
opsErrorLogMinQueueSize = 256
opsErrorLogMaxQueueSize = 8192
)
type opsErrorLogJob struct {
ops *service.OpsService
entry *service.OpsInsertErrorLogInput
requestBody []byte
}
var (
opsErrorLogOnce sync.Once
opsErrorLogQueue chan opsErrorLogJob
opsErrorLogStopOnce sync.Once
opsErrorLogWorkersWg sync.WaitGroup
opsErrorLogMu sync.RWMutex
opsErrorLogStopping bool
opsErrorLogQueueLen atomic.Int64
opsErrorLogEnqueued atomic.Int64
opsErrorLogDropped atomic.Int64
opsErrorLogProcessed atomic.Int64
opsErrorLogLastDropLogAt atomic.Int64
opsErrorLogShutdownCh = make(chan struct{})
opsErrorLogShutdownOnce sync.Once
opsErrorLogDrained atomic.Bool
)
func startOpsErrorLogWorkers() {
opsErrorLogMu.Lock()
defer opsErrorLogMu.Unlock()
if opsErrorLogStopping {
return
}
workerCount, queueSize := opsErrorLogConfig()
opsErrorLogQueue = make(chan opsErrorLogJob, queueSize)
opsErrorLogQueueLen.Store(0)
opsErrorLogWorkersWg.Add(workerCount)
for i := 0; i < workerCount; i++ {
go func() {
defer opsErrorLogWorkersWg.Done()
for job := range opsErrorLogQueue {
opsErrorLogQueueLen.Add(-1)
if job.ops == nil || job.entry == nil {
continue
}
func() {
defer func() {
if r := recover(); r != nil {
log.Printf("[OpsErrorLogger] worker panic: %v\n%s", r, debug.Stack())
}
}()
ctx, cancel := context.WithTimeout(context.Background(), opsErrorLogTimeout)
_ = job.ops.RecordError(ctx, job.entry, job.requestBody)
cancel()
opsErrorLogProcessed.Add(1)
}()
}
}()
}
}
func enqueueOpsErrorLog(ops *service.OpsService, entry *service.OpsInsertErrorLogInput, requestBody []byte) {
if ops == nil || entry == nil {
return
}
select {
case <-opsErrorLogShutdownCh:
return
default:
}
opsErrorLogMu.RLock()
stopping := opsErrorLogStopping
opsErrorLogMu.RUnlock()
if stopping {
return
}
opsErrorLogOnce.Do(startOpsErrorLogWorkers)
opsErrorLogMu.RLock()
defer opsErrorLogMu.RUnlock()
if opsErrorLogStopping || opsErrorLogQueue == nil {
return
}
select {
case opsErrorLogQueue <- opsErrorLogJob{ops: ops, entry: entry, requestBody: requestBody}:
opsErrorLogQueueLen.Add(1)
opsErrorLogEnqueued.Add(1)
default:
// Queue is full; drop to avoid blocking request handling.
opsErrorLogDropped.Add(1)
maybeLogOpsErrorLogDrop()
}
}
func StopOpsErrorLogWorkers() bool {
opsErrorLogStopOnce.Do(func() {
opsErrorLogShutdownOnce.Do(func() {
close(opsErrorLogShutdownCh)
})
opsErrorLogDrained.Store(stopOpsErrorLogWorkers())
})
return opsErrorLogDrained.Load()
}
func stopOpsErrorLogWorkers() bool {
opsErrorLogMu.Lock()
opsErrorLogStopping = true
ch := opsErrorLogQueue
if ch != nil {
close(ch)
}
opsErrorLogQueue = nil
opsErrorLogMu.Unlock()
if ch == nil {
opsErrorLogQueueLen.Store(0)
return true
}
done := make(chan struct{})
go func() {
opsErrorLogWorkersWg.Wait()
close(done)
}()
select {
case <-done:
opsErrorLogQueueLen.Store(0)
return true
case <-time.After(opsErrorLogDrainTimeout):
return false
}
}
func OpsErrorLogQueueLength() int64 {
return opsErrorLogQueueLen.Load()
}
func OpsErrorLogQueueCapacity() int {
opsErrorLogMu.RLock()
ch := opsErrorLogQueue
opsErrorLogMu.RUnlock()
if ch == nil {
return 0
}
return cap(ch)
}
func OpsErrorLogDroppedTotal() int64 {
return opsErrorLogDropped.Load()
}
func OpsErrorLogEnqueuedTotal() int64 {
return opsErrorLogEnqueued.Load()
}
func OpsErrorLogProcessedTotal() int64 {
return opsErrorLogProcessed.Load()
}
func maybeLogOpsErrorLogDrop() {
now := time.Now().Unix()
for {
last := opsErrorLogLastDropLogAt.Load()
if last != 0 && now-last < 60 {
return
}
if opsErrorLogLastDropLogAt.CompareAndSwap(last, now) {
break
}
}
queued := opsErrorLogQueueLen.Load()
queueCap := OpsErrorLogQueueCapacity()
log.Printf(
"[OpsErrorLogger] queue is full; dropping logs (queued=%d cap=%d enqueued_total=%d dropped_total=%d processed_total=%d)",
queued,
queueCap,
opsErrorLogEnqueued.Load(),
opsErrorLogDropped.Load(),
opsErrorLogProcessed.Load(),
)
}
func opsErrorLogConfig() (workerCount int, queueSize int) {
workerCount = runtime.GOMAXPROCS(0) * 2
if workerCount < opsErrorLogMinWorkerCount {
workerCount = opsErrorLogMinWorkerCount
}
if workerCount > opsErrorLogMaxWorkerCount {
workerCount = opsErrorLogMaxWorkerCount
}
queueSize = workerCount * opsErrorLogQueueSizePerWorker
if queueSize < opsErrorLogMinQueueSize {
queueSize = opsErrorLogMinQueueSize
}
if queueSize > opsErrorLogMaxQueueSize {
queueSize = opsErrorLogMaxQueueSize
}
return workerCount, queueSize
}
func setOpsRequestContext(c *gin.Context, model string, stream bool, requestBody []byte) {
if c == nil {
return
}
c.Set(opsModelKey, model)
c.Set(opsStreamKey, stream)
if len(requestBody) > 0 {
c.Set(opsRequestBodyKey, requestBody)
}
}
func setOpsSelectedAccount(c *gin.Context, accountID int64) {
if c == nil || accountID <= 0 {
return
}
c.Set(opsAccountIDKey, accountID)
}
type opsCaptureWriter struct {
gin.ResponseWriter
limit int
buf bytes.Buffer
}
func (w *opsCaptureWriter) Write(b []byte) (int, error) {
if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit {
remaining := w.limit - w.buf.Len()
if len(b) > remaining {
_, _ = w.buf.Write(b[:remaining])
} else {
_, _ = w.buf.Write(b)
}
}
return w.ResponseWriter.Write(b)
}
func (w *opsCaptureWriter) WriteString(s string) (int, error) {
if w.Status() >= 400 && w.limit > 0 && w.buf.Len() < w.limit {
remaining := w.limit - w.buf.Len()
if len(s) > remaining {
_, _ = w.buf.WriteString(s[:remaining])
} else {
_, _ = w.buf.WriteString(s)
}
}
return w.ResponseWriter.WriteString(s)
}
// OpsErrorLoggerMiddleware records error responses (status >= 400) into ops_error_logs.
//
// Notes:
// - It buffers response bodies only when status >= 400 to avoid overhead for successful traffic.
// - Streaming errors after the response has started (SSE) may still need explicit logging.
func OpsErrorLoggerMiddleware(ops *service.OpsService) gin.HandlerFunc {
return func(c *gin.Context) {
w := &opsCaptureWriter{ResponseWriter: c.Writer, limit: 64 * 1024}
c.Writer = w
c.Next()
status := c.Writer.Status()
if status < 400 {
return
}
if ops == nil {
return
}
if !ops.IsMonitoringEnabled(c.Request.Context()) {
return
}
body := w.buf.Bytes()
parsed := parseOpsErrorResponse(body)
apiKey, _ := middleware2.GetAPIKeyFromContext(c)
clientRequestID, _ := c.Request.Context().Value(ctxkey.ClientRequestID).(string)
model, _ := c.Get(opsModelKey)
streamV, _ := c.Get(opsStreamKey)
accountIDV, _ := c.Get(opsAccountIDKey)
var modelName string
if s, ok := model.(string); ok {
modelName = s
}
stream := false
if b, ok := streamV.(bool); ok {
stream = b
}
var accountID *int64
if v, ok := accountIDV.(int64); ok && v > 0 {
accountID = &v
}
fallbackPlatform := guessPlatformFromPath(c.Request.URL.Path)
platform := resolveOpsPlatform(apiKey, fallbackPlatform)
requestID := c.Writer.Header().Get("X-Request-Id")
if requestID == "" {
requestID = c.Writer.Header().Get("x-request-id")
}
phase := classifyOpsPhase(parsed.ErrorType, parsed.Message, parsed.Code)
isBusinessLimited := classifyOpsIsBusinessLimited(parsed.ErrorType, phase, parsed.Code, status, parsed.Message)
errorOwner := classifyOpsErrorOwner(phase, parsed.Message)
errorSource := classifyOpsErrorSource(phase, parsed.Message)
entry := &service.OpsInsertErrorLogInput{
RequestID: requestID,
ClientRequestID: clientRequestID,
AccountID: accountID,
Platform: platform,
Model: modelName,
RequestPath: func() string {
if c.Request != nil && c.Request.URL != nil {
return c.Request.URL.Path
}
return ""
}(),
Stream: stream,
UserAgent: c.GetHeader("User-Agent"),
ErrorPhase: phase,
ErrorType: normalizeOpsErrorType(parsed.ErrorType, parsed.Code),
Severity: classifyOpsSeverity(parsed.ErrorType, status),
StatusCode: status,
IsBusinessLimited: isBusinessLimited,
ErrorMessage: parsed.Message,
// Keep the full captured error body (capture is already capped at 64KB) so the
// service layer can sanitize JSON before truncating for storage.
ErrorBody: string(body),
ErrorSource: errorSource,
ErrorOwner: errorOwner,
IsRetryable: classifyOpsIsRetryable(parsed.ErrorType, status),
RetryCount: 0,
CreatedAt: time.Now(),
}
if apiKey != nil {
entry.APIKeyID = &apiKey.ID
if apiKey.User != nil {
entry.UserID = &apiKey.User.ID
}
if apiKey.GroupID != nil {
entry.GroupID = apiKey.GroupID
}
// Prefer group platform if present (more stable than inferring from path).
if apiKey.Group != nil && apiKey.Group.Platform != "" {
entry.Platform = apiKey.Group.Platform
}
}
var clientIP string
if ip := strings.TrimSpace(c.ClientIP()); ip != "" {
clientIP = ip
entry.ClientIP = &clientIP
}
var requestBody []byte
if v, ok := c.Get(opsRequestBodyKey); ok {
if b, ok := v.([]byte); ok && len(b) > 0 {
requestBody = b
}
}
// Persist only a minimal, whitelisted set of request headers to improve retry fidelity.
// Do NOT store Authorization/Cookie/etc.
entry.RequestHeadersJSON = extractOpsRetryRequestHeaders(c)
enqueueOpsErrorLog(ops, entry, requestBody)
}
}
var opsRetryRequestHeaderAllowlist = []string{
"anthropic-beta",
"anthropic-version",
}
func extractOpsRetryRequestHeaders(c *gin.Context) *string {
if c == nil || c.Request == nil {
return nil
}
headers := make(map[string]string, 4)
for _, key := range opsRetryRequestHeaderAllowlist {
v := strings.TrimSpace(c.GetHeader(key))
if v == "" {
continue
}
// Keep headers small even if a client sends something unexpected.
headers[key] = truncateString(v, 512)
}
if len(headers) == 0 {
return nil
}
raw, err := json.Marshal(headers)
if err != nil {
return nil
}
s := string(raw)
return &s
}
type parsedOpsError struct {
ErrorType string
Message string
Code string
}
func parseOpsErrorResponse(body []byte) parsedOpsError {
if len(body) == 0 {
return parsedOpsError{}
}
// Fast path: attempt to decode into a generic map.
var m map[string]any
if err := json.Unmarshal(body, &m); err != nil {
return parsedOpsError{Message: truncateString(string(body), 1024)}
}
// Claude/OpenAI-style gateway error: { type:"error", error:{ type, message } }
if errObj, ok := m["error"].(map[string]any); ok {
t, _ := errObj["type"].(string)
msg, _ := errObj["message"].(string)
// Gemini googleError also uses "error": { code, message, status }
if msg == "" {
if v, ok := errObj["message"]; ok {
msg, _ = v.(string)
}
}
if t == "" {
// Gemini error does not have "type" field.
t = "api_error"
}
// For gemini error, capture numeric code as string for business-limited mapping if needed.
var code string
if v, ok := errObj["code"]; ok {
switch n := v.(type) {
case float64:
code = strconvItoa(int(n))
case int:
code = strconvItoa(n)
}
}
return parsedOpsError{ErrorType: t, Message: msg, Code: code}
}
// APIKeyAuth-style: { code:"INSUFFICIENT_BALANCE", message:"..." }
code, _ := m["code"].(string)
msg, _ := m["message"].(string)
if code != "" || msg != "" {
return parsedOpsError{ErrorType: "api_error", Message: msg, Code: code}
}
return parsedOpsError{Message: truncateString(string(body), 1024)}
}
func resolveOpsPlatform(apiKey *service.APIKey, fallback string) string {
if apiKey != nil && apiKey.Group != nil && apiKey.Group.Platform != "" {
return apiKey.Group.Platform
}
return fallback
}
func guessPlatformFromPath(path string) string {
p := strings.ToLower(path)
switch {
case strings.HasPrefix(p, "/antigravity/"):
return service.PlatformAntigravity
case strings.HasPrefix(p, "/v1beta/"):
return service.PlatformGemini
case strings.Contains(p, "/responses"):
return service.PlatformOpenAI
default:
return ""
}
}
func normalizeOpsErrorType(errType string, code string) string {
if errType != "" {
return errType
}
switch strings.TrimSpace(code) {
case "INSUFFICIENT_BALANCE":
return "billing_error"
case "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID":
return "subscription_error"
default:
return "api_error"
}
}
func classifyOpsPhase(errType, message, code string) string {
msg := strings.ToLower(message)
switch strings.TrimSpace(code) {
case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID":
return "billing"
}
switch errType {
case "authentication_error":
return "auth"
case "billing_error", "subscription_error":
return "billing"
case "rate_limit_error":
if strings.Contains(msg, "concurrency") || strings.Contains(msg, "pending") || strings.Contains(msg, "queue") {
return "concurrency"
}
return "upstream"
case "invalid_request_error":
return "response"
case "upstream_error", "overloaded_error":
return "upstream"
case "api_error":
if strings.Contains(msg, "no available accounts") {
return "scheduling"
}
return "internal"
default:
return "internal"
}
}
func classifyOpsSeverity(errType string, status int) string {
switch errType {
case "invalid_request_error", "authentication_error", "billing_error", "subscription_error":
return "P3"
}
if status >= 500 {
return "P1"
}
if status == 429 {
return "P1"
}
if status >= 400 {
return "P2"
}
return "P3"
}
func classifyOpsIsRetryable(errType string, statusCode int) bool {
switch errType {
case "authentication_error", "invalid_request_error":
return false
case "timeout_error":
return true
case "rate_limit_error":
// May be transient (upstream or queue); retry can help.
return true
case "billing_error", "subscription_error":
return false
case "upstream_error", "overloaded_error":
return statusCode >= 500 || statusCode == 429 || statusCode == 529
default:
return statusCode >= 500
}
}
func classifyOpsIsBusinessLimited(errType, phase, code string, status int, message string) bool {
switch strings.TrimSpace(code) {
case "INSUFFICIENT_BALANCE", "USAGE_LIMIT_EXCEEDED", "SUBSCRIPTION_NOT_FOUND", "SUBSCRIPTION_INVALID":
return true
}
if phase == "billing" || phase == "concurrency" {
// SLA/错误率排除“用户级业务限制”
return true
}
// Avoid treating upstream rate limits as business-limited.
if errType == "rate_limit_error" && strings.Contains(strings.ToLower(message), "upstream") {
return false
}
_ = status
return false
}
func classifyOpsErrorOwner(phase string, message string) string {
switch phase {
case "upstream", "network":
return "provider"
case "billing", "concurrency", "auth", "response":
return "client"
default:
if strings.Contains(strings.ToLower(message), "upstream") {
return "provider"
}
return "sub2api"
}
}
func classifyOpsErrorSource(phase string, message string) string {
switch phase {
case "upstream":
return "upstream_http"
case "network":
return "upstream_network"
case "billing":
return "billing"
case "concurrency":
return "concurrency"
default:
if strings.Contains(strings.ToLower(message), "upstream") {
return "upstream_http"
}
return "internal"
}
}
func truncateString(s string, max int) string {
if max <= 0 {
return ""
}
if len(s) <= max {
return s
}
cut := s[:max]
// Ensure truncation does not split multi-byte characters.
for len(cut) > 0 && !utf8.ValidString(cut) {
cut = cut[:len(cut)-1]
}
return cut
}
func strconvItoa(v int) string {
return strconv.Itoa(v)
}

View File

@@ -20,6 +20,7 @@ func ProvideAdminHandlers(
proxyHandler *admin.ProxyHandler,
redeemHandler *admin.RedeemHandler,
settingHandler *admin.SettingHandler,
opsHandler *admin.OpsHandler,
systemHandler *admin.SystemHandler,
subscriptionHandler *admin.SubscriptionHandler,
usageHandler *admin.UsageHandler,
@@ -37,6 +38,7 @@ func ProvideAdminHandlers(
Proxy: proxyHandler,
Redeem: redeemHandler,
Setting: settingHandler,
Ops: opsHandler,
System: systemHandler,
Subscription: subscriptionHandler,
Usage: usageHandler,
@@ -106,6 +108,7 @@ var ProviderSet = wire.NewSet(
admin.NewProxyHandler,
admin.NewRedeemHandler,
admin.NewSettingHandler,
admin.NewOpsHandler,
ProvideSystemHandler,
admin.NewSubscriptionHandler,
admin.NewUsageHandler,

View File

@@ -30,6 +30,20 @@ func adminAuth(
settingService *service.SettingService,
) gin.HandlerFunc {
return func(c *gin.Context) {
// WebSocket upgrade requests cannot set Authorization headers in browsers.
// For admin WebSocket endpoints (e.g. Ops realtime), allow passing the JWT via
// Sec-WebSocket-Protocol (subprotocol list) using a prefixed token item:
// Sec-WebSocket-Protocol: sub2api-admin, jwt.<token>
if isWebSocketUpgradeRequest(c) {
if token := extractJWTFromWebSocketSubprotocol(c); token != "" {
if !validateJWTForAdmin(c, token, authService, userService) {
return
}
c.Next()
return
}
}
// 检查 x-api-key headerAdmin API Key 认证)
apiKey := c.GetHeader("x-api-key")
if apiKey != "" {
@@ -58,6 +72,44 @@ func adminAuth(
}
}
func isWebSocketUpgradeRequest(c *gin.Context) bool {
if c == nil || c.Request == nil {
return false
}
// RFC6455 handshake uses:
// Connection: Upgrade
// Upgrade: websocket
upgrade := strings.ToLower(strings.TrimSpace(c.GetHeader("Upgrade")))
if upgrade != "websocket" {
return false
}
connection := strings.ToLower(c.GetHeader("Connection"))
return strings.Contains(connection, "upgrade")
}
func extractJWTFromWebSocketSubprotocol(c *gin.Context) string {
if c == nil {
return ""
}
raw := strings.TrimSpace(c.GetHeader("Sec-WebSocket-Protocol"))
if raw == "" {
return ""
}
// The header is a comma-separated list of tokens. We reserve the prefix "jwt."
// for carrying the admin JWT.
for _, part := range strings.Split(raw, ",") {
p := strings.TrimSpace(part)
if strings.HasPrefix(p, "jwt.") {
token := strings.TrimSpace(strings.TrimPrefix(p, "jwt."))
if token != "" {
return token
}
}
}
return ""
}
// validateAdminAPIKey 验证管理员 API Key
func validateAdminAPIKey(
c *gin.Context,

View File

@@ -0,0 +1,31 @@
package middleware
import (
"context"
"github.com/Wei-Shaw/sub2api/internal/pkg/ctxkey"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
)
// ClientRequestID ensures every request has a unique client_request_id in request.Context().
//
// This is used by the Ops monitoring module for end-to-end request correlation.
func ClientRequestID() gin.HandlerFunc {
return func(c *gin.Context) {
if c.Request == nil {
c.Next()
return
}
if v := c.Request.Context().Value(ctxkey.ClientRequestID); v != nil {
c.Next()
return
}
id := uuid.New().String()
c.Request = c.Request.WithContext(context.WithValue(c.Request.Context(), ctxkey.ClientRequestID, id))
c.Next()
}
}

View File

@@ -0,0 +1,54 @@
package middleware
import (
"net/http"
"strings"
"github.com/gin-gonic/gin"
)
// InjectBearerTokenFromQueryForWebSocket copies `?token=` into the Authorization header
// for WebSocket handshake requests on a small allow-list of endpoints.
//
// Why: browsers can't set custom headers on WebSocket handshake, but our admin routes
// are protected by header-based auth. This keeps the token support scoped to WS only.
func InjectBearerTokenFromQueryForWebSocket() gin.HandlerFunc {
return func(c *gin.Context) {
if c == nil || c.Request == nil {
if c != nil {
c.Next()
}
return
}
// Only GET websocket upgrades.
if c.Request.Method != http.MethodGet {
c.Next()
return
}
if !strings.EqualFold(strings.TrimSpace(c.GetHeader("Upgrade")), "websocket") {
c.Next()
return
}
// If caller already supplied auth headers, don't override.
if strings.TrimSpace(c.GetHeader("Authorization")) != "" || strings.TrimSpace(c.GetHeader("x-api-key")) != "" {
c.Next()
return
}
// Allow-list ops websocket endpoints.
path := strings.TrimSpace(c.Request.URL.Path)
if !strings.HasPrefix(path, "/api/v1/admin/ops/ws/") {
c.Next()
return
}
token := strings.TrimSpace(c.Query("token"))
if token != "" {
c.Request.Header.Set("Authorization", "Bearer "+token)
}
c.Next()
}
}