litellm/cookbook/benchmark/benchmark.py
user 5bafa8b3a2
Drop dep bumps + black-26 reformat to clear fork CI policy
PR was blocked by .github/workflows/guard-fork-dependencies.yml: fork PRs
cannot modify uv.lock. Reverting:

- uv.lock + pyproject.toml black bump (24.10.0 -> 26.3.1) and the 295
  files of mechanical Black 26 reformat coupled to it
- pyproject.toml diskcache extra change (kept the runtime mitigation in
  litellm/caching/disk_cache.py via JSONDisk)

Kept:
- Dockerfile cache narrowing (drops ~660 MB of uv build cache that
  surfaced cached setuptools as CVE findings)
- litellm/caching/disk_cache.py: dc.JSONDisk to neutralize CVE-2025-69872
- ui/litellm-dashboard/package-lock.json + litellm-js/spend-logs/package-lock.json:
  next/postcss/hono/uuid CVE bumps (these are not blocked by the fork guard)
- tests/test_litellm/caching/test_disk_cache.py
- tests/code_coverage_tests/liccheck.ini: harmless black authorization

Black + gitpython + langchain dep upgrades will need a follow-up from a
maintainer pushing a branch in the canonical BerriAI/litellm repo.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-07 23:04:52 +00:00

91 lines
2.9 KiB
Python

from litellm import completion, completion_cost
import time
import click
from tqdm import tqdm
from tabulate import tabulate
from termcolor import colored
import os
# Define the list of models to benchmark
# select any LLM listed here: https://docs.litellm.ai/docs/providers
models = ["gpt-3.5-turbo", "claude-2"]
# Enter LLM API keys
# https://docs.litellm.ai/docs/providers
os.environ["OPENAI_API_KEY"] = ""
os.environ["ANTHROPIC_API_KEY"] = ""
# List of questions to benchmark (replace with your questions)
questions = ["When will BerriAI IPO?", "When will LiteLLM hit $100M ARR?"]
# Enter your system prompt here
system_prompt = """
You are LiteLLMs helpful assistant
"""
@click.command()
@click.option(
"--system-prompt",
default="You are a helpful assistant that can answer questions.",
help="System prompt for the conversation.",
)
def main(system_prompt):
for question in questions:
data = [] # Data for the current question
with tqdm(total=len(models)) as pbar:
for model in models:
colored_description = colored(
f"Running question: {question} for model: {model}", "green"
)
pbar.set_description(colored_description)
start_time = time.time()
response = completion(
model=model,
max_tokens=500,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": question},
],
)
end = time.time()
total_time = end - start_time
cost = completion_cost(completion_response=response)
raw_response = response["choices"][0]["message"]["content"]
data.append(
{
"Model": colored(model, "light_blue"),
"Response": raw_response, # Colorize the response
"ResponseTime": colored(f"{total_time:.2f} seconds", "red"),
"Cost": colored(f"${cost:.6f}", "green"), # Colorize the cost
}
)
pbar.update(1)
# Separate headers from the data
headers = ["Model", "Response", "Response Time (seconds)", "Cost ($)"]
colwidths = [15, 80, 15, 10]
# Create a nicely formatted table for the current question
table = tabulate(
[list(d.values()) for d in data],
headers,
tablefmt="grid",
maxcolwidths=colwidths,
)
# Print the table for the current question
colored_question = colored(question, "green")
click.echo(f"\nBenchmark Results for '{colored_question}':")
click.echo(table) # Display the formatted table
if __name__ == "__main__":
main()