# litellm/proxy_server_config.yaml

model_list:
  # Convention used in this list: a value of the form `os.environ/<NAME>`
  # tells litellm to read the setting from the environment variable <NAME>.
  # See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault

  # Two deployments registered under the same model_name; only the first one
  # sets region_name and an explicit model_info.id.
  - model_name: gpt-5-mini-end-user-test
    litellm_params:
      model: gpt-5-mini
      region_name: eu
    model_info:
      # Quoted on purpose: the id is a string, not the integer 1.
      id: "1"
  - model_name: gpt-5-mini-end-user-test
    litellm_params:
      model: openai/gpt-5-mini
      api_key: os.environ/OPENAI_API_KEY

  # The public names below are served by gpt-4.1-family models.
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: openai/gpt-4.1-mini
      api_key: os.environ/OPENAI_API_KEY
  - model_name: gpt-3.5-turbo-large
    litellm_params:
      model: gpt-4.1
      api_key: os.environ/OPENAI_API_KEY
      rpm: 480
      timeout: 300
      stream_timeout: 60
  - model_name: gpt-4
    litellm_params:
      model: openai/gpt-4.1
      api_key: os.environ/OPENAI_API_KEY
      rpm: 480
      timeout: 300
      stream_timeout: 60
  # SageMaker deployment; sets input_cost_per_second directly on the deployment.
  - model_name: sagemaker-completion-model
    litellm_params:
      model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
      input_cost_per_second: 0.000420
  # Public name text-embedding-ada-002 is backed by text-embedding-3-small;
  # model_info marks the deployment with mode: embedding.
  - model_name: text-embedding-ada-002
    litellm_params:
      model: openai/text-embedding-3-small
      api_key: os.environ/OPENAI_API_KEY
    model_info:
      mode: embedding
      base_model: text-embedding-3-small
  - model_name: dall-e-2  # dall-e-2 and dall-e-3 were deprecated 2026-05-12; this name is an alias for gpt-image-1
    litellm_params:
      model: openai/gpt-image-1
  - model_name: openai-dall-e-3  # dall-e-3 was deprecated 2026-05-12; the underlying model is now gpt-image-1
    litellm_params:
      # NOTE(review): no `openai/` prefix here, unlike the dall-e-2 alias —
      # presumably the provider is inferred by litellm; confirm.
      model: gpt-image-1
  # In CI, IMAGE_GEN_RECORDER_BASE_URL points this deployment at the
  # record/replay proxy (tests/_openai_record_replay_proxy.py) so that the image
  # spend E2E test doesn't depend on OpenAI's uptime on every commit. The
  # variable is unset elsewhere, so api_base resolves to None and requests fall
  # back to api.openai.com.
  - model_name: gpt-image-1
    litellm_params:
      model: openai/gpt-image-1
      api_key: os.environ/OPENAI_API_KEY
      api_base: os.environ/IMAGE_GEN_RECORDER_BASE_URL
  # Fake deployments: every entry in this group points at the same example
  # OpenAI-compatible endpoint and uses a placeholder api_key.
  - model_name: fake-openai-endpoint
    litellm_params:
      model: openai/gpt-5-mini
      api_key: fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
  # Very small stream_timeout combined with rpm: 1 — presumably used to
  # exercise stream-timeout / rate-limit handling; confirm against the tests.
  - model_name: fake-openai-endpoint-2
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      stream_timeout: 0.001
      rpm: 1
  # fake-openai-endpoint-3 is defined twice in this group: this entry uses
  # openai/my-fake-model, the later one uses openai/my-fake-model-2.
  # router_settings.model_group_alias also refers to this model_name.
  - model_name: fake-openai-endpoint-3
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      stream_timeout: 0.001
      rpm: 1000
  # Overrides num_retries for this deployment only.
  - model_name: fake-openai-endpoint-4
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      num_retries: 50
  # Second deployment registered under fake-openai-endpoint-3.
  - model_name: fake-openai-endpoint-3
    litellm_params:
      model: openai/my-fake-model-2
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      stream_timeout: 0.001
      rpm: 1000
  # bad-model / good-model: deployments with health_check_timeout: 1 that talk
  # to the example OpenAI-compatible endpoint. They use a placeholder api_key,
  # like the fake-openai-endpoint-* deployments on the same api_base, so that
  # the real OPENAI_API_KEY is never sent to a non-OpenAI host.
  - model_name: bad-model
    litellm_params:
      model: openai/bad-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      # NOTE(review): presumably makes litellm simulate a timeout for this
      # deployment — confirm against the litellm docs.
      mock_timeout: true
      timeout: 60
      rpm: 1000
    model_info:
      health_check_timeout: 1
  # Same upstream model string as bad-model, but without mock_timeout/timeout.
  - model_name: good-model
    litellm_params:
      model: openai/bad-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      rpm: 1000
    model_info:
      health_check_timeout: 1
  # Catch-all entry: model_name "*" is mapped to openai/*. The model_name must
  # stay quoted — a bare leading * would be parsed as a YAML alias.
  - model_name: "*"
    litellm_params:
      model: openai/*
      api_key: os.environ/OPENAI_API_KEY
  # Azure realtime deployments. Both entries use the same Azure model string
  # and differ in api_version; only realtime-v1 sets realtime_protocol.
  - model_name: realtime-v1
    litellm_params:
      model: azure/gpt-realtime-20250828-standard
      # Must stay quoted: a bare 2025-08-28 is read as a date, not a string,
      # by YAML 1.1 loaders.
      api_version: "2025-08-28"
      # Possible values: "GA" / "v1", "beta"
      realtime_protocol: GA
  - model_name: realtime-beta
    litellm_params:
      model: azure/gpt-realtime-20250828-standard
      # Quoted for consistency with the other api_version values in this file.
      api_version: "2025-04-01-preview"


  # Provider-specific wildcard routing: each "<provider>/*" model_name maps to
  # the same "<provider>/*" pattern on the litellm side. The names are quoted
  # because they contain `*`.
  - model_name: "anthropic/*"
    litellm_params:
      model: "anthropic/*"
      api_key: os.environ/ANTHROPIC_API_KEY
  # No api_key is configured for bedrock — presumably credentials come from
  # the AWS environment; confirm.
  - model_name: "bedrock/*"
    litellm_params:
      model: "bedrock/*"
  - model_name: "groq/*"
    litellm_params:
      model: "groq/*"
      api_key: os.environ/GROQ_API_KEY
  # No api_key is configured here either — presumably picked up from the
  # provider's default environment variable; confirm.
  - model_name: mistral-embed
    litellm_params:
      model: mistral/mistral-embed
  - model_name: gpt-instruct  # [PROD TEST] - tests whether `/health` automatically infers this to be a text-completion model
    litellm_params:
      model: text-completion-openai/gpt-3.5-turbo-instruct
  # Same example endpoint and placeholder key as the other
  # fake-openai-endpoint-* entries, with timeout set to 1.
  - model_name: fake-openai-endpoint-5
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      timeout: 1
  # The api_base host below ends in ".appxxxx" — presumably deliberately
  # unreachable, in line with the model_name; confirm before "fixing" it.
  - model_name: badly-configured-openai-endpoint
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: https://exampleopenaiendpoint-production.up.railway.appxxxx/
  # Gemini deployment; the key is read from GOOGLE_API_KEY.
  - model_name: gemini-2.5-flash
    litellm_params:
      model: gemini/gemini-2.5-flash
      api_key: os.environ/GOOGLE_API_KEY
  # No provider prefix on the model string for this entry.
  - model_name: gpt-5.5
    litellm_params:
      model: gpt-5.5
      api_key: os.environ/OPENAI_API_KEY


litellm_settings:
  # Uncomment set_verbose to see verbose logs; not recommended in production.
  # set_verbose: true
  drop_params: true
  success_callback:
    - prometheus
  # max_budget: 100
  # budget_duration: 30d
  num_retries: 5
  request_timeout: 600
  telemetry: false
  # Maps a model group to the list of groups configured as its
  # context-window fallbacks.
  context_window_fallbacks:
    - gpt-3.5-turbo:
        - gpt-3.5-turbo-large
  # Per-team callback settings; each team has its own Langfuse credentials,
  # read from environment variables.
  default_team_settings:
    # Project 1
    - team_id: team-1
      success_callback:
        - langfuse
      failure_callback:
        - langfuse
      langfuse_public_key: os.environ/LANGFUSE_PROJECT1_PUBLIC
      langfuse_secret: os.environ/LANGFUSE_PROJECT1_SECRET
    # Project 2 (the only team that also sets langfuse_host)
    - team_id: team-2
      success_callback:
        - langfuse
      failure_callback:
        - langfuse
      langfuse_public_key: os.environ/LANGFUSE_PROJECT2_PUBLIC
      langfuse_secret: os.environ/LANGFUSE_PROJECT2_SECRET
      langfuse_host: https://us.cloud.langfuse.com
  # cache: true   # [OPTIONAL] use for caching responses
  # enable_caching_on_provider_specific_optional_params: true  # Include provider-specific params in cache keys
  # cache_params:  # Also used for the shared health check
  #   type: redis
  #   host: localhost
  #   port: 6379

# Provider settings for the /fine_tuning/jobs endpoints.
finetune_settings:
  # Azure: api_base and api_key are read from environment variables.
  - custom_llm_provider: azure
    api_base: os.environ/AZURE_API_BASE
    api_key: os.environ/AZURE_API_KEY
    api_version: "2023-03-15-preview"
  # OpenAI: only the api_key is configured.
  - custom_llm_provider: openai
    api_key: os.environ/OPENAI_API_KEY

# Provider settings for the /files endpoints.
files_settings:
  # Azure: api_base and api_key are read from environment variables.
  - custom_llm_provider: azure
    api_base: os.environ/AZURE_API_BASE
    api_key: os.environ/AZURE_API_KEY
    api_version: "2023-03-15-preview"
  # OpenAI: only the api_key is configured.
  - custom_llm_provider: openai
    api_key: os.environ/OPENAI_API_KEY

router_settings:
  routing_strategy: usage-based-routing-v2
  # Redis connection details are read from environment variables.
  redis_host: os.environ/REDIS_HOST
  redis_password: os.environ/REDIS_PASSWORD
  redis_port: os.environ/REDIS_PORT
  enable_pre_call_checks: true
  # Alias name -> model_name defined in model_list.
  model_group_alias:
    my-special-fake-model-alias-name: fake-openai-endpoint-3

general_settings:
  # [OPTIONAL] Use to enforce auth on proxy.
  # See - https://docs.litellm.ai/docs/proxy/virtual_keys
  # NOTE(review): this key is committed in plain text and looks like a
  # placeholder — confirm it is only ever used for local/CI runs.
  master_key: sk-1234
  store_model_in_db: true
  proxy_budget_rescheduler_min_time: 60
  proxy_budget_rescheduler_max_time: 64
  proxy_batch_write_at: 1
  database_connection_pool_limit: 10
  # background_health_checks: true
  # use_shared_health_check: true
  # health_check_interval: 30
  # database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # [OPTIONAL] use for token-based auth to proxy

  pass_through_endpoints:
    # Route you want to add to LiteLLM Proxy Server
    - path: '/v1/rerank'
      # URL this route should forward requests to
      target: 'https://api.cohere.com/v1/rerank'
      # Headers to forward to this URL
      headers:
        # (Optional) Extra headers to pass to this endpoint
        content-type: application/json
        accept: application/json
      forward_headers: true

# environment_variables:
  # settings for using redis caching
  # REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com
  # REDIS_PORT: "16337"
  # REDIS_PASSWORD: