litellm/scripts/adaptive_router_demo/eval.py
user 5bafa8b3a2
Drop dep bumps + black-26 reformat to clear fork CI policy
PR was blocked by .github/workflows/guard-fork-dependencies.yml: fork PRs
cannot modify uv.lock. Reverting:

- uv.lock + pyproject.toml black bump (24.10.0 -> 26.3.1) and the 295
  files of mechanical Black 26 reformat coupled to it
- pyproject.toml diskcache extra change (kept the runtime mitigation in
  litellm/caching/disk_cache.py via JSONDisk)

Kept:
- Dockerfile cache narrowing (drops ~660 MB of uv build cache that
  surfaced cached setuptools as CVE findings)
- litellm/caching/disk_cache.py: dc.JSONDisk to neutralize CVE-2025-69872
- ui/litellm-dashboard/package-lock.json + litellm-js/spend-logs/package-lock.json:
  next/postcss/hono/uuid CVE bumps (these are not blocked by the fork guard)
- tests/test_litellm/caching/test_disk_cache.py
- tests/code_coverage_tests/liccheck.ini: harmless black authorization

Black + gitpython + langchain dep upgrades will need a follow-up from a
maintainer pushing a branch in the canonical BerriAI/litellm repo.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-07 23:04:52 +00:00

272 lines
10 KiB
Python

# ruff: noqa: T201
"""
Adaptive router evaluator — LLM-as-judge harness.

For each test case:
  1. Sends the prompt to the adaptive router.
  2. Reads which model was picked (x-litellm-adaptive-router-model header).
  3. Asks the judge model whether the response meets the ideal criteria.
  4. Prints PASS or FAIL with one line of reasoning.
  5. Replays the exchange with a follow-up turn on the same session id
     (a satisfied reply on PASS, a neutral one on FAIL).

Run:
    uv run python scripts/adaptive_router_demo/eval.py \
        --proxy-url http://localhost:4000 \
        --api-key sk-1234 \
        --router smart-cheap-router \
        --judge-model smart
"""
from __future__ import annotations
import argparse
import asyncio
import sys
import uuid
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import httpx
# ---------------------------------------------------------------------------
# Test cases
# ---------------------------------------------------------------------------
@dataclass
class EvalCase:
    """One evaluation scenario: a prompt plus the rubric used to judge the reply.

    Attributes:
        category: Label for the kind of task (e.g. ``"code_generation"``);
            printed in the per-case header.
        prompt: User message sent to the router.
        ideal: Criteria the judge checks the response against.
    """

    category: str
    prompt: str
    ideal: str  # criteria the judge checks the response against
# Built-in evaluation set. Each row is (category, prompt, ideal) and is turned
# into an EvalCase in the order listed.
EVAL_CASES: List[EvalCase] = [
    EvalCase(category=kind, prompt=question, ideal=rubric)
    for kind, question, rubric in (
        # --- code_generation ---
        (
            "code_generation",
            "Write a Python function that flattens a nested list of arbitrary depth.",
            (
                "A Python function (def flatten(...)) that accepts a list which may "
                "contain nested lists to arbitrary depth and returns a single flat list "
                "with all elements in order. Must handle at least two levels of nesting."
            ),
        ),
        (
            "code_generation",
            "Write a Python decorator that retries a function up to 3 times on exception.",
            (
                "A Python decorator that wraps a callable, catches exceptions, and "
                "retries the call up to 3 times before re-raising. Should use functools.wraps "
                "or equivalent to preserve the wrapped function's metadata."
            ),
        ),
        (
            "code_generation",
            "Write a SQL query that returns the top 5 customers by total order value.",
            (
                "A valid SQL SELECT query that JOINs an orders or order_items table with a "
                "customers table, groups by customer, sums order value, orders descending, "
                "and limits to 5 rows."
            ),
        ),
        # --- factual_lookup ---
        (
            "factual_lookup",
            "What is the capital of New Zealand?",
            "The answer must state Wellington as the capital of New Zealand.",
        ),
        (
            "factual_lookup",
            "In what year did World War II end?",
            "The answer must state 1945 as the year World War II ended.",
        ),
        (
            "factual_lookup",
            "What is the chemical symbol for gold?",
            "The answer must include 'Au' as the chemical symbol for gold.",
        ),
        # --- writing ---
        (
            "writing",
            (
                "Write a short, polite email declining a meeting request because of "
                "a scheduling conflict."
            ),
            (
                "A professional email that: (1) thanks the sender for the invitation, "
                "(2) clearly declines, (3) mentions a scheduling conflict as the reason, "
                "and (4) offers to reschedule or an alternative. Tone must be polite."
            ),
        ),
        (
            "writing",
            "Write a one-paragraph product description for noise-cancelling headphones.",
            (
                "A marketing paragraph for noise-cancelling headphones that mentions "
                "noise cancellation as a feature, highlights at least one other benefit "
                "(comfort, audio quality, battery life, or similar), and ends with a "
                "persuasive call to action or closing statement."
            ),
        ),
    )
]
# Canned conversation turns used when a judged exchange is replayed to the router.
# SATISFY_FOLLOWUP is meant to match the satisfaction regex in signals.py
# (_SATISFACTION_PATTERNS).
FAB_ASSISTANT = "Got it. Working on that now."
NEUTRAL_FOLLOWUP = "ok, noted"
SATISFY_FOLLOWUP = "great, thanks!"

# System prompt for the judge model: asks for a two-line PASS/FAIL reply.
JUDGE_SYSTEM = "\n".join(
    (
        "You are a strict but fair evaluator. Your job is to decide whether a "
        "model response meets the stated requirements. Reply with exactly two lines:",
        "Line 1: PASS or FAIL",
        "Line 2: One sentence of reasoning (≤ 25 words).",
    )
)
def _judge_user(prompt: str, ideal: str, actual: str) -> str:
    """Build the user message that asks the judge to grade one response.

    Args:
        prompt: The question that was sent to the model under test.
        ideal: The requirements the response must meet.
        actual: The response text that came back.

    Returns:
        A single string with the three sections separated by blank lines,
        ending with the PASS/FAIL instruction.
    """
    return (
        f"Question sent to model:\n{prompt}\n\n"
        f"Requirements the response must meet:\n{ideal}\n\n"
        f"Actual model response:\n{actual}\n\n"
        "Does the response meet the requirements? Reply PASS or FAIL."
    )
# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
async def _chat(
    client: httpx.AsyncClient,
    proxy_url: str,
    api_key: str,
    model: str,
    messages: List[Dict[str, str]],
    session_id: Optional[str] = None,
) -> Tuple[str, str]:
    """Send one chat-completion request through the proxy.

    Args:
        client: HTTP client used to issue the POST.
        proxy_url: Base URL of the proxy; a trailing slash is tolerated.
        api_key: Sent as a ``Bearer`` token in the ``Authorization`` header.
        model: Model (or router) name placed in the request body.
        messages: Chat messages placed in the request body.
        session_id: When truthy, sent as ``metadata.litellm_session_id``.

    Returns:
        ``(response_text, chosen_model_header)``. ``response_text`` is the
        first choice's message content, or ``""`` when that content is null.
        ``chosen_model_header`` is the ``x-litellm-adaptive-router-model``
        response header, or ``""`` when the header is absent (empty for
        non-router calls).

    Raises:
        Whatever ``raise_for_status()`` raises for an error status, and
        ``KeyError``/``IndexError`` if the JSON body has no first choice.
    """
    body: Dict[str, object] = {"model": model, "messages": messages}
    if session_id:
        body["metadata"] = {"litellm_session_id": session_id}

    # Strip trailing slashes so "http://host:4000/" does not yield "//v1/...".
    endpoint = proxy_url.rstrip("/") + "/v1/chat/completions"
    resp = await client.post(
        endpoint,
        json=body,
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=60.0,
    )
    resp.raise_for_status()
    data = resp.json()

    # The content field may be null; normalise to "" so callers always get a
    # str, as the return annotation promises.
    text = data["choices"][0]["message"]["content"] or ""
    chosen = resp.headers.get("x-litellm-adaptive-router-model", "")
    return text, chosen
# ---------------------------------------------------------------------------
# Evaluation loop
# ---------------------------------------------------------------------------
def _preview(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters on one line, with "..." if it was cut."""
    snippet = text[:limit].replace("\n", " ")
    return f"{snippet}..." if len(text) > limit else snippet


def _parse_verdict(verdict: str) -> Tuple[bool, str]:
    """Split a judge reply into ``(is_pass, reason)``.

    The first non-empty line carries the verdict and the second non-empty line
    (if any) the reasoning. When both keywords occur on the verdict line, the
    one that appears first wins, so "FAIL: does not pass ..." is a failure.
    A reply with neither keyword counts as a failure.
    """
    lines = [ln.strip() for ln in verdict.splitlines() if ln.strip()]
    first = lines[0].upper() if lines else ""
    reason = lines[1] if len(lines) > 1 else ""
    pass_at = first.find("PASS")
    fail_at = first.find("FAIL")
    is_pass = pass_at != -1 and (fail_at == -1 or pass_at < fail_at)
    return is_pass, reason


async def evaluate(
    proxy_url: str,
    api_key: str,
    router: str,
    judge_model: str,
) -> None:
    """Run every case in ``EVAL_CASES`` through the router and print a report.

    For each case: send the prompt to ``router``, have ``judge_model`` grade
    the reply, print the verdict, then replay the exchange with a follow-up
    turn on the same session id. A failed router or judge call is reported on
    stderr and counted as a failure; a failed follow-up only prints a warning.

    Args:
        proxy_url: Base URL of the proxy.
        api_key: Proxy API key.
        router: Model name of the adaptive router under test.
        judge_model: Model name used for judging (also called via the proxy).
    """
    passed = 0
    failed = 0
    async with httpx.AsyncClient() as client:
        for i, case in enumerate(EVAL_CASES, 1):
            print(f"\n[{i}/{len(EVAL_CASES)}] category={case.category}")
            print(f" prompt : {_preview(case.prompt, 80)}")
            session_id = f"eval-{uuid.uuid4()}"

            # Round 1: single-turn real request — get the actual LLM response to judge.
            try:
                response, chosen = await _chat(
                    client,
                    proxy_url,
                    api_key,
                    router,
                    [{"role": "user", "content": case.prompt}],
                    session_id=session_id,
                )
            except Exception as exc:  # noqa: BLE001
                print(f" ERROR calling router: {exc}", file=sys.stderr)
                failed += 1
                continue
            print(f" model : {chosen or router}")
            print(f" response : {_preview(response, 120)}")

            # Judge the real response.
            judge_msgs = [
                {"role": "system", "content": JUDGE_SYSTEM},
                {"role": "user", "content": _judge_user(case.prompt, case.ideal, response)},
            ]
            try:
                verdict, _ = await _chat(
                    client,
                    proxy_url,
                    api_key,
                    judge_model,
                    judge_msgs,
                )
            except Exception as exc:  # noqa: BLE001
                print(f" ERROR calling judge: {exc}", file=sys.stderr)
                failed += 1
                continue

            is_pass, reason = _parse_verdict(verdict)
            if is_pass:
                passed += 1
                print(f" verdict : \033[32mPASS\033[0m {reason}")
            else:
                failed += 1
                print(f" verdict : \033[31mFAIL\033[0m {reason}")

            # Round 2: 5-message conversation on the same session_id so the bandit fires.
            # On PASS → satisfaction follow-up (+alpha). On FAIL → neutral (no signal).
            follow_up = SATISFY_FOLLOWUP if is_pass else NEUTRAL_FOLLOWUP
            bandit_msgs = [
                {"role": "user", "content": case.prompt},
                {"role": "assistant", "content": response},
                {"role": "user", "content": "ok continue"},
                {"role": "assistant", "content": FAB_ASSISTANT},
                {"role": "user", "content": follow_up},
            ]
            try:
                await _chat(
                    client,
                    proxy_url,
                    api_key,
                    router,
                    bandit_msgs,
                    session_id=session_id,
                )
            except Exception as exc:  # noqa: BLE001
                # Best effort: the case has already been scored.
                print(f" WARNING: bandit update failed: {exc}", file=sys.stderr)

    total = passed + failed
    print(f"\n{'='*60}")
    print(f"Results: {passed}/{total} passed ({failed} failed)")
    if passed == total:
        print("All test cases passed — the adaptive router is working well!")
    elif passed >= total * 0.8:
        print("Most test cases passed — minor issues to investigate.")
    else:
        print("Significant failures — check router config and model availability.")
    print("=" * 60)
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main(argv: Optional[List[str]] = None) -> None:
    """Parse command-line options and run the evaluation.

    Args:
        argv: Arguments to parse. ``None`` (the default) makes argparse read
            them from ``sys.argv[1:]``, which is what the script entry point
            relies on.
    """
    ap = argparse.ArgumentParser(description="Evaluate the adaptive router with LLM-as-judge.")
    ap.add_argument("--proxy-url", default="http://localhost:4000")
    ap.add_argument("--api-key", required=True, help="proxy API key")
    ap.add_argument("--router", default="smart-cheap-router", help="adaptive router model name")
    ap.add_argument("--judge-model", default="smart", help="model name for the judge (via proxy)")
    args = ap.parse_args(argv)
    asyncio.run(evaluate(args.proxy_url, args.api_key, args.router, args.judge_model))


if __name__ == "__main__":
    main()