litellm/proxy_server_config.yaml
Mateo Wang 84247d954d
test(ci): record/replay OpenAI image gen so the spend E2E isn't outage-bound (#29787)
* test(ci): record/replay OpenAI image gen so the spend E2E isn't outage-bound

The dockerized spend test test_key_info_spend_values_image_generation curls
the proxy for a gpt-image-1 image, which wildcard-routes to real api.openai.com
on every commit; an OpenAI outage then reddens unrelated PRs and each run pays
for an image.

Add an in-repo record/replay reverse proxy (tests/_openai_record_replay_proxy.py)
that sits between the proxy and OpenAI. The first run, and the first after the
recording lapses, records live; subsequent runs replay from the shared Redis
cassette store. The proxy keeps its real separate-process HTTP topology; only
the image model's api_base is pointed at the recorder in CI via
IMAGE_GEN_RECORDER_BASE_URL, which is unset elsewhere so it falls back to
api.openai.com.

Recordings lapse 24h after write and are never refreshed on read, matching the
VCR persister contract, so provider drift is still caught. Replayed responses
drop upstream framing/server headers (content-length, transfer-encoding,
content-encoding, date, server) so the re-serving layer recomputes them,
honoring the Bedrock content-length lesson.

* test(ci): close recorder http client on app shutdown

Add a Starlette lifespan that closes the self-created httpx.AsyncClient on
teardown, and leave caller-injected clients untouched so reuse across
create_app calls is not broken. Covers the unclosed-client ResourceWarning
raised in review.
2026-06-05 10:27:23 -07:00

240 lines
8.7 KiB
YAML

# Model deployments exposed by the proxy. Each entry maps a public
# `model_name` to the `litellm_params` used to call the underlying model.
#
# The `os.environ/` prefix tells litellm to read the value from the env. See
# https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
model_list:
  - model_name: gpt-5-mini-end-user-test
    litellm_params:
      model: gpt-5-mini
      region_name: "eu"
    model_info:
      id: "1"
  - model_name: gpt-5-mini-end-user-test
    litellm_params:
      model: openai/gpt-5-mini
      api_key: os.environ/OPENAI_API_KEY
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: openai/gpt-4.1-mini
      api_key: os.environ/OPENAI_API_KEY
  - model_name: gpt-3.5-turbo-large
    litellm_params:
      model: "gpt-4.1"
      api_key: os.environ/OPENAI_API_KEY
      rpm: 480
      timeout: 300
      stream_timeout: 60
  - model_name: gpt-4
    litellm_params:
      model: openai/gpt-4.1
      api_key: os.environ/OPENAI_API_KEY
      rpm: 480
      timeout: 300
      stream_timeout: 60
  - model_name: sagemaker-completion-model
    litellm_params:
      model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
      input_cost_per_second: 0.000420
  - model_name: text-embedding-ada-002
    litellm_params:
      model: openai/text-embedding-3-small
      api_key: os.environ/OPENAI_API_KEY
    model_info:
      mode: embedding
      base_model: text-embedding-3-small
  # dall-e-2 and dall-e-3 were deprecated 2026-05-12; alias to gpt-image-1
  - model_name: dall-e-2
    litellm_params:
      model: openai/gpt-image-1
  # dall-e-3 deprecated 2026-05-12; underlying now gpt-image-1
  - model_name: openai-dall-e-3
    litellm_params:
      model: gpt-image-1
  # In CI, IMAGE_GEN_RECORDER_BASE_URL points this at the record/replay proxy
  # (tests/_openai_record_replay_proxy.py) so the image spend E2E doesn't depend
  # on OpenAI's uptime every commit. Unset elsewhere, so it resolves to None and
  # falls back to api.openai.com.
  - model_name: gpt-image-1
    litellm_params:
      model: openai/gpt-image-1
      api_key: os.environ/OPENAI_API_KEY
      api_base: os.environ/IMAGE_GEN_RECORDER_BASE_URL
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/gpt-5-mini
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  - model_name: fake-openai-endpoint-2
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      stream_timeout: 0.001
      rpm: 1
  # `fake-openai-endpoint-3` is declared twice (here and further down) with
  # different underlying models.
  - model_name: fake-openai-endpoint-3
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      stream_timeout: 0.001
      rpm: 1000
  - model_name: fake-openai-endpoint-4
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      num_retries: 50
  - model_name: fake-openai-endpoint-3
    litellm_params:
      model: openai/my-fake-model-2
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      stream_timeout: 0.001
      rpm: 1000
  - model_name: bad-model
    litellm_params:
      model: openai/bad-model
      api_key: os.environ/OPENAI_API_KEY
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      mock_timeout: true
      timeout: 60
      rpm: 1000
    model_info:
      health_check_timeout: 1
  - model_name: good-model
    litellm_params:
      model: openai/bad-model
      api_key: os.environ/OPENAI_API_KEY
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      rpm: 1000
    model_info:
      health_check_timeout: 1
  - model_name: "*"
    litellm_params:
      model: openai/*
      api_key: os.environ/OPENAI_API_KEY
  - model_name: realtime-v1
    litellm_params:
      model: azure/gpt-realtime-20250828-standard
      api_version: "2025-08-28"
      realtime_protocol: GA  # Possible values: "GA"/"v1", "beta"
  - model_name: realtime-beta
    litellm_params:
      model: azure/gpt-realtime-20250828-standard
      # Quoted so it is always read as a string, like the entry above.
      api_version: "2025-04-01-preview"
  # provider specific wildcard routing
  - model_name: "anthropic/*"
    litellm_params:
      model: "anthropic/*"
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: "bedrock/*"
    litellm_params:
      model: "bedrock/*"
  - model_name: "groq/*"
    litellm_params:
      model: "groq/*"
      api_key: os.environ/GROQ_API_KEY
  - model_name: mistral-embed
    litellm_params:
      model: mistral/mistral-embed
  # [PROD TEST] - tests if `/health` automatically infers this to be a text
  # completion model
  - model_name: gpt-instruct
    litellm_params:
      model: text-completion-openai/gpt-3.5-turbo-instruct
  - model_name: fake-openai-endpoint-5
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      timeout: 1
  # NOTE(review): the `.appxxxx` host below looks deliberately invalid, in
  # keeping with the entry's name - confirm before "fixing" it.
  - model_name: badly-configured-openai-endpoint
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.appxxxx/
  - model_name: gemini-2.5-flash
    litellm_params:
      model: gemini/gemini-2.5-flash
      api_key: os.environ/GOOGLE_API_KEY
  - model_name: gpt-5.5
    litellm_params:
      model: gpt-5.5
      api_key: os.environ/OPENAI_API_KEY
# Library-level litellm settings: callbacks, retries, timeouts, fallbacks and
# per-team callback overrides.
litellm_settings:
  # set_verbose: True  # Uncomment this if you want to see verbose logs; not recommended in production
  drop_params: true
  success_callback: ["prometheus"]
  # max_budget: 100
  # budget_duration: 30d
  num_retries: 5
  request_timeout: 600
  telemetry: false
  # Maps a model name to the list of models named as its fallbacks.
  context_window_fallbacks:
    - gpt-3.5-turbo:
        - gpt-3.5-turbo-large
  # Per-team callback settings; each team has its own Langfuse project keys.
  default_team_settings:
    - team_id: team-1
      success_callback: ["langfuse"]
      failure_callback: ["langfuse"]
      langfuse_public_key: os.environ/LANGFUSE_PROJECT1_PUBLIC  # Project 1
      langfuse_secret: os.environ/LANGFUSE_PROJECT1_SECRET  # Project 1
    - team_id: team-2
      success_callback: ["langfuse"]
      failure_callback: ["langfuse"]
      langfuse_public_key: os.environ/LANGFUSE_PROJECT2_PUBLIC  # Project 2
      langfuse_secret: os.environ/LANGFUSE_PROJECT2_SECRET  # Project 2
      langfuse_host: https://us.cloud.langfuse.com
  # cache: true  # [OPTIONAL] use for caching responses
  # enable_caching_on_provider_specific_optional_params: True  # Include provider-specific params in cache keys
  # cache_params:  # And for shared health check
  #   type: redis
  #   host: localhost
  #   port: 6379
# Per-provider connection settings used by the /fine_tuning/jobs endpoints.
finetune_settings:
  - custom_llm_provider: azure
    api_base: os.environ/AZURE_API_BASE
    api_key: os.environ/AZURE_API_KEY
    api_version: "2023-03-15-preview"
  - custom_llm_provider: openai
    api_key: os.environ/OPENAI_API_KEY
# Per-provider connection settings used by the /files endpoints.
files_settings:
  - custom_llm_provider: azure
    api_base: os.environ/AZURE_API_BASE
    api_key: os.environ/AZURE_API_KEY
    api_version: "2023-03-15-preview"
  - custom_llm_provider: openai
    api_key: os.environ/OPENAI_API_KEY
# Router configuration: routing strategy, Redis connection (read from the
# environment), pre-call checks and model-group aliases.
router_settings:
  routing_strategy: usage-based-routing-v2
  redis_host: os.environ/REDIS_HOST
  redis_password: os.environ/REDIS_PASSWORD
  redis_port: os.environ/REDIS_PORT
  enable_pre_call_checks: true
  # alias name -> `model_name` declared under `model_list`
  model_group_alias:
    my-special-fake-model-alias-name: fake-openai-endpoint-3
# Proxy-server settings: auth, database behaviour and pass-through routes.
general_settings:
  # [OPTIONAL] Use to enforce auth on proxy.
  # See - https://docs.litellm.ai/docs/proxy/virtual_keys
  # NOTE(review): hard-coded key committed to the repo - presumably a
  # test-only placeholder; never reuse it in a real deployment.
  master_key: sk-1234
  store_model_in_db: true
  proxy_budget_rescheduler_min_time: 60
  proxy_budget_rescheduler_max_time: 64
  proxy_batch_write_at: 1
  database_connection_pool_limit: 10
  # background_health_checks: true
  # use_shared_health_check: true
  # health_check_interval: 30
  # [OPTIONAL] use for token-based auth to proxy
  # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>"
  pass_through_endpoints:
    # route you want to add to LiteLLM Proxy Server
    - path: "/v1/rerank"
      # URL this route should forward requests to
      target: "https://api.cohere.com/v1/rerank"
      # headers to forward to this URL
      headers:
        # (Optional) Extra Headers to pass to this endpoint
        content-type: application/json
        accept: application/json
      forward_headers: true
# environment_variables:
# settings for using redis caching
# REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com
# REDIS_PORT: "16337"
# REDIS_PASSWORD: