# NOTE(review): stray commit-message text accidentally pasted into this file;
# kept (translated) as a comment so the module still parses — remove once
# confirmed it is not needed:
#   4-dimensional fingerprint collection: performance/language/capability/behavior.
#   models.py: added IdentityFingerprintModel (5th-dimension data model).
#   comparator.py: upgraded to 5-dimension scoring (incl. identity comparison).
#   reporter.py: added identity-verification report output.
#   main.py: integrated the identity collection flow.
#   identity collector: full code to follow in the next commit.
# (File metadata residue: 99 lines, 3.5 KiB, Python.)
"""Performance fingerprint collector — latency, TTFT, TPS, response length."""
|
|
|
|
import numpy as np
|
|
from typing import List
|
|
from core.client import AIClient
|
|
from core.models import PerformanceFingerprint, CollectionConfig
|
|
from utils.tokenizer import estimate_tokens
|
|
|
|
|
|
# 5 standardized prompts of varying complexity, used to sample latency/TTFT/TPS
# across short, medium, creative, technical, and reasoning-heavy workloads.
PERFORMANCE_PROMPTS = [
    # Short, simple
    "What is 2 + 2? Answer in one sentence.",
    # Medium factual
    "Explain the difference between TCP and UDP protocols in 3-4 sentences.",
    # Longer creative
    "Write a short poem (4-8 lines) about the beauty of mathematics.",
    # Technical
    "Write a Python function that checks if a string is a palindrome. Include a brief docstring.",
    # Complex reasoning
    "Compare and contrast merge sort and quicksort algorithms. Discuss time complexity, space complexity, and when to use each. Keep it under 200 words.",
]
async def collect_performance(client: AIClient, config: CollectionConfig,
                              progress_callback=None) -> PerformanceFingerprint:
    """
    Collect a performance fingerprint from an AI API channel.

    Sends each of the 5 standardized prompts ``config.repeat_count`` times
    through the streaming client (so TTFT/TPS can be measured), then
    aggregates the per-request samples into percentile and mean statistics.

    Args:
        client: API client; only ``send_message_streaming`` is used.
        config: Collection settings; ``repeat_count`` and ``max_tokens`` are read.
        progress_callback: Optional ``callable(str)`` receiving progress and
            warning messages.

    Returns:
        PerformanceFingerprint with the raw samples and aggregate statistics.
        Failed requests are reported via the callback and excluded from the
        aggregates (best-effort collection — a failure never aborts the run).
    """
    all_latencies: List[float] = []
    all_ttfts: List[float] = []
    all_tps: List[float] = []
    all_response_lengths: List[int] = []

    total_tasks = len(PERFORMANCE_PROMPTS) * config.repeat_count
    completed = 0

    for prompt_idx, prompt in enumerate(PERFORMANCE_PROMPTS):
        for repeat in range(config.repeat_count):
            try:
                # Use streaming to get TTFT and TPS metrics
                text, metrics, headers = await client.send_message_streaming(
                    prompt=prompt,
                    max_tokens=config.max_tokens,
                )

                # Total latency = timestamp of the last token; fall back to
                # TTFT when the backend reported no per-token timestamps.
                if metrics.token_timestamps:
                    total_latency = metrics.token_timestamps[-1] * 1000  # s -> ms
                else:
                    total_latency = metrics.ttft_ms

                all_latencies.append(total_latency)

                # Zero values mean "not measured"; keep them out of the means.
                if metrics.ttft_ms > 0:
                    all_ttfts.append(metrics.ttft_ms)
                if metrics.tps > 0:
                    all_tps.append(metrics.tps)

                # Estimate response length in tokens
                all_response_lengths.append(estimate_tokens(text))
            except Exception as e:
                # Broad catch is deliberate: network/client errors vary and a
                # single bad request must not abort the whole collection.
                if progress_callback:
                    progress_callback(f" ⚠ Prompt {prompt_idx+1} repeat {repeat+1} failed: {e}")
            finally:
                # BUGFIX: count the attempt even on failure — previously the
                # except branch `continue`d past the counter, so the progress
                # line never reached total_tasks once any request failed.
                completed += 1
                if progress_callback:
                    progress_callback(f" Performance: {completed}/{total_tasks}")

    # Calculate latency percentiles (0.0 when every request failed)
    if all_latencies:
        latency_arr = np.array(all_latencies)
        p50 = float(np.percentile(latency_arr, 50))
        p95 = float(np.percentile(latency_arr, 95))
        p99 = float(np.percentile(latency_arr, 99))
    else:
        p50 = p95 = p99 = 0.0

    avg_ttft = float(np.mean(all_ttfts)) if all_ttfts else 0.0
    avg_tps = float(np.mean(all_tps)) if all_tps else 0.0
    avg_resp_len = float(np.mean(all_response_lengths)) if all_response_lengths else 0.0

    return PerformanceFingerprint(
        latencies_ms=all_latencies,
        p50_latency_ms=p50,
        p95_latency_ms=p95,
        p99_latency_ms=p99,
        avg_ttft_ms=avg_ttft,
        avg_tps=avg_tps,
        response_lengths=all_response_lengths,
        avg_response_length=avg_resp_len,
    )