# litellm/scripts/adaptive_router_demo/eval.py

# ruff: noqa: T201
"""
Adaptive router evaluator — LLM-as-judge harness.

For each test case:
  1. Sends the prompt to the adaptive router.
  2. Reads which model was picked (x-litellm-adaptive-router-model header).
  3. Asks the judge model whether the response meets the ideal criteria.
  4. Prints PASS or FAIL with one line of reasoning.

Run:
  uv run python scripts/adaptive_router_demo/eval.py \
      --proxy-url   http://localhost:4000 \
      --api-key     sk-1234 \
      --router      smart-cheap-router \
      --judge-model smart
"""

from __future__ import annotations

import argparse
import asyncio
import sys
import uuid
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import httpx


# ---------------------------------------------------------------------------
# Test cases
# ---------------------------------------------------------------------------
@dataclass
class EvalCase:
    """One evaluation scenario: a prompt for the router plus the judge's criteria."""

    # Label for the kind of task; printed next to each case's progress line.
    category: str
    # User message sent to the adaptive router as a single-turn request.
    prompt: str
    # Criteria the judge checks the response against (pasted into the judge prompt).
    ideal: str


# Built-in test cases, grouped by category: three code-generation prompts,
# three factual lookups, and two writing tasks. Each `ideal` is written as a
# list of requirements because it is inserted verbatim into the judge prompt.
EVAL_CASES: List[EvalCase] = [
    # code_generation
    EvalCase(
        category="code_generation",
        prompt="Write a Python function that flattens a nested list of arbitrary depth.",
        ideal=(
            "A Python function (def flatten(...)) that accepts a list which may "
            "contain nested lists to arbitrary depth and returns a single flat list "
            "with all elements in order. Must handle at least two levels of nesting."
        ),
    ),
    EvalCase(
        category="code_generation",
        prompt="Write a Python decorator that retries a function up to 3 times on exception.",
        ideal=(
            "A Python decorator that wraps a callable, catches exceptions, and "
            "retries the call up to 3 times before re-raising. Should use functools.wraps "
            "or equivalent to preserve the wrapped function's metadata."
        ),
    ),
    EvalCase(
        category="code_generation",
        prompt="Write a SQL query that returns the top 5 customers by total order value.",
        ideal=(
            "A valid SQL SELECT query that JOINs an orders or order_items table with a "
            "customers table, groups by customer, sums order value, orders descending, "
            "and limits to 5 rows."
        ),
    ),
    # factual_lookup
    EvalCase(
        category="factual_lookup",
        prompt="What is the capital of New Zealand?",
        ideal="The answer must state Wellington as the capital of New Zealand.",
    ),
    EvalCase(
        category="factual_lookup",
        prompt="In what year did World War II end?",
        ideal="The answer must state 1945 as the year World War II ended.",
    ),
    EvalCase(
        category="factual_lookup",
        prompt="What is the chemical symbol for gold?",
        ideal="The answer must include 'Au' as the chemical symbol for gold.",
    ),
    # writing
    EvalCase(
        category="writing",
        prompt=(
            "Write a short, polite email declining a meeting request because of "
            "a scheduling conflict."
        ),
        ideal=(
            "A professional email that: (1) thanks the sender for the invitation, "
            "(2) clearly declines, (3) mentions a scheduling conflict as the reason, "
            "and (4) offers to reschedule or an alternative. Tone must be polite."
        ),
    ),
    EvalCase(
        category="writing",
        prompt="Write a one-paragraph product description for noise-cancelling headphones.",
        ideal=(
            "A marketing paragraph for noise-cancelling headphones that mentions "
            "noise cancellation as a feature, highlights at least one other benefit "
            "(comfort, audio quality, battery life, or similar), and ends with a "
            "persuasive call to action or closing statement."
        ),
    ),
]

# Final user turn of the follow-up conversation when the judge returned PASS.
# Matches the satisfaction regex in signals.py (_SATISFACTION_PATTERNS).
SATISFY_FOLLOWUP = "great, thanks!"
# Final user turn of the follow-up conversation when the judge returned FAIL;
# meant to carry no satisfaction signal.
NEUTRAL_FOLLOWUP = "ok, noted"
# Canned assistant turn used to pad the follow-up conversation to five messages.
FAB_ASSISTANT = "Got it. Working on that now."

# System prompt for the judge model. It pins the reply to two lines so the
# verdict can be read from the first non-empty line and the reason from the next.
JUDGE_SYSTEM = (
    "You are a strict but fair evaluator. Your job is to decide whether a model "
    "response meets the stated requirements. Reply with exactly two lines:\n"
    "Line 1: PASS or FAIL\n"
    "Line 2: One sentence of reasoning (≤ 25 words)."
)


def _judge_user(prompt: str, ideal: str, actual: str) -> str:
    return (
        f"Question sent to model:\n{prompt}\n\n"
        f"Requirements the response must meet:\n{ideal}\n\n"
        f"Actual model response:\n{actual}\n\n"
        "Does the response meet the requirements? Reply PASS or FAIL."
    )


# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
async def _chat(
    client: httpx.AsyncClient,
    proxy_url: str,
    api_key: str,
    model: str,
    messages: List[Dict[str, str]],
    session_id: Optional[str] = None,
) -> Tuple[str, str]:
    """
    Returns (response_text, chosen_model_header).
    chosen_model_header is empty for non-router calls.
    """
    body: Dict = {"model": model, "messages": messages}
    if session_id:
        body["metadata"] = {"litellm_session_id": session_id}

    resp = await client.post(
        f"{proxy_url}/v1/chat/completions",
        json=body,
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=60.0,
    )
    resp.raise_for_status()
    data = resp.json()
    text = data["choices"][0]["message"]["content"]
    chosen = resp.headers.get("x-litellm-adaptive-router-model", "")
    return text, chosen


# ---------------------------------------------------------------------------
# Evaluation loop
# ---------------------------------------------------------------------------
def _parse_verdict(verdict: Optional[str]) -> Tuple[bool, str]:
    """Split the judge's reply into (is_pass, reason).

    The first non-empty line carries the verdict and the second the reason.
    The verdict is matched on whole words, so decorations such as "**PASS**"
    or "Line 1: PASS" are accepted while words that merely contain the letters
    (e.g. "SURPASS") are not. Any FAIL word on that line wins over PASS, and
    an empty or missing reply counts as a failure.
    """
    lines = [ln.strip() for ln in (verdict or "").splitlines() if ln.strip()]
    if not lines:
        return False, ""

    # Replace every non-letter with a space, then split into upper-case words.
    words = "".join(
        ch if ch.isalpha() else " " for ch in lines[0].upper()
    ).split()
    said_pass = any(w in ("PASS", "PASSED", "PASSES") for w in words)
    said_fail = any(w in ("FAIL", "FAILED", "FAILS") for w in words)
    reason = lines[1] if len(lines) > 1 else ""
    return said_pass and not said_fail, reason


async def evaluate(
    proxy_url: str,
    api_key: str,
    router: str,
    judge_model: str,
) -> None:
    """Run every case in EVAL_CASES through the router and the judge.

    For each case the prompt is sent to ``router``, the reply is graded by
    ``judge_model``, the verdict is printed, and a five-message follow-up
    conversation is sent on the same session id. A router or judge error
    counts the case as failed and skips the follow-up; a follow-up error is
    only reported as a warning. A summary is printed at the end.

    Args:
        proxy_url: Base URL of the proxy.
        api_key: Proxy API key.
        router: Model name of the adaptive router.
        judge_model: Model name used for judging.
    """
    passed = 0
    failed = 0

    async with httpx.AsyncClient() as client:
        for i, case in enumerate(EVAL_CASES, 1):
            print(f"\n[{i}/{len(EVAL_CASES)}] category={case.category}")
            print(f"  prompt   : {case.prompt[:80]}{'…' if len(case.prompt) > 80 else ''}")

            session_id = f"eval-{uuid.uuid4()}"

            # Round 1: single-turn real request — get the actual LLM response to judge.
            try:
                response, chosen = await _chat(
                    client, proxy_url, api_key, router,
                    [{"role": "user", "content": case.prompt}],
                    session_id=session_id,
                )
            except Exception as exc:  # noqa: BLE001
                print(f"  ERROR calling router: {exc}", file=sys.stderr)
                failed += 1
                continue

            # Treat a missing body as empty text so the slicing below cannot fail.
            response = response or ""

            print(f"  model    : {chosen or router}")
            print(f"  response : {response[:120].replace(chr(10), ' ')}{'…' if len(response) > 120 else ''}")

            # Judge the real response.
            judge_msgs = [
                {"role": "system", "content": JUDGE_SYSTEM},
                {"role": "user", "content": _judge_user(case.prompt, case.ideal, response)},
            ]
            try:
                verdict, _ = await _chat(
                    client, proxy_url, api_key, judge_model, judge_msgs,
                )
            except Exception as exc:  # noqa: BLE001
                print(f"  ERROR calling judge: {exc}", file=sys.stderr)
                failed += 1
                continue

            # Whole-word verdict parsing; a plain substring test would score
            # "FAIL - does not surpass ..." as a PASS.
            is_pass, reason = _parse_verdict(verdict)

            if is_pass:
                passed += 1
                print(f"  verdict  : \033[32mPASS\033[0m  {reason}")
            else:
                failed += 1
                print(f"  verdict  : \033[31mFAIL\033[0m  {reason}")

            # Round 2: 5-message conversation on the same session_id so the bandit fires.
            # On PASS → satisfaction follow-up (+alpha). On FAIL → neutral (no signal).
            follow_up = SATISFY_FOLLOWUP if is_pass else NEUTRAL_FOLLOWUP
            bandit_msgs = [
                {"role": "user",      "content": case.prompt},
                {"role": "assistant", "content": response},
                {"role": "user",      "content": "ok continue"},
                {"role": "assistant", "content": FAB_ASSISTANT},
                {"role": "user",      "content": follow_up},
            ]
            try:
                await _chat(
                    client, proxy_url, api_key, router, bandit_msgs,
                    session_id=session_id,
                )
            except Exception as exc:  # noqa: BLE001
                # Best effort: the case has already been scored.
                print(f"  WARNING: bandit update failed: {exc}", file=sys.stderr)

    total = passed + failed
    print(f"\n{'='*60}")
    print(f"Results: {passed}/{total} passed  ({failed} failed)")
    if total == 0:
        # Without this guard 0 == 0 would report "All test cases passed".
        print("No test cases were run.")
    elif passed == total:
        print("All test cases passed — the adaptive router is working well!")
    elif passed >= total * 0.8:
        print("Most test cases passed — minor issues to investigate.")
    else:
        print("Significant failures — check router config and model availability.")
    print("=" * 60)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse the command-line options and run the evaluation to completion."""
    parser = argparse.ArgumentParser(
        description="Evaluate the adaptive router with LLM-as-judge.",
    )
    parser.add_argument("--proxy-url", default="http://localhost:4000")
    parser.add_argument("--api-key", required=True, help="proxy API key")
    parser.add_argument(
        "--router",
        default="smart-cheap-router",
        help="adaptive router model name",
    )
    parser.add_argument(
        "--judge-model",
        default="smart",
        help="model name for the judge (via proxy)",
    )
    opts = parser.parse_args()

    run = evaluate(
        proxy_url=opts.proxy_url,
        api_key=opts.api_key,
        router=opts.router,
        judge_model=opts.judge_model,
    )
    asyncio.run(run)


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()