Litellm OpenAI double prefix bug (#28661)
* Fix token cost lookup when deployment ids repeat the provider prefix.

  Router configs may expose models like openai/openai/<model>; normalize those strings before joining provider/model so model_cost resolves correctly.

  Co-authored-by: Cursor <cursoragent@cursor.com>

* scope duplicate-prefix cost fix to explicit providers; isolate test patch state

  Co-authored-by: Cursor <cursoragent@cursor.com>

* fix(mypy): narrow custom_llm_provider after resolution in cost_per_token

  Co-authored-by: Cursor <cursoragent@cursor.com>

* fix(cost_calculator): guard provider-prefix dedup against non-string model

  The provider-prefix dedup loop assumed `model` is always a string. When a non-string is passed (e.g. a MagicMock from a mocked transport in router tests), `model.startswith(...)` is always truthy and each slice returns a new object, so the loop never terminates — it spins and OOM-kills the test worker (observed as the litellm_router_testing CI regression, e.g. test_router_pattern_match_e2e).

  Only run the string-based dedup and prefix-join when `model` is actually a str, preserving the previous graceful behavior for non-string inputs.

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: mateo-berri <277851410+mateo-berri@users.noreply.github.com>
This commit is contained in:
parent
fbff60e9d9
commit
bd2d0ad519
@ -417,9 +417,36 @@ def cost_per_token( # noqa: PLR0915
|
||||
prompt_tokens_cost_usd_dollar: float = 0
|
||||
completion_tokens_cost_usd_dollar: float = 0
|
||||
model_cost_ref = litellm.model_cost
|
||||
# Only callers that explicitly pass `custom_llm_provider` get the
|
||||
# dedup/prefix-join treatment. When provider is omitted, preserve legacy
|
||||
# behavior: `model_with_provider` stays equal to the raw `model` string
|
||||
# (provider is detected below for downstream use only).
|
||||
caller_supplied_provider = custom_llm_provider is not None
|
||||
|
||||
# `model` is normally a string, but callers that mock the transport can pass
|
||||
# non-string objects. Only run the string-based dedup/prefix-join when it is
|
||||
# actually a string — e.g. a MagicMock's `.startswith()` is always truthy and
|
||||
# its slices return new mocks, which would spin the dedup loop forever.
|
||||
model_is_str = isinstance(model, str)
|
||||
|
||||
# Router/proxy deployments may repeat the provider segment (e.g. model_name
|
||||
# "openai/openai/gpt-5.5"). Strip duplicated `{provider}/` chains before joining.
|
||||
if caller_supplied_provider and model_is_str:
|
||||
_dup_prefix = f"{custom_llm_provider}/"
|
||||
while model.startswith(_dup_prefix):
|
||||
_remainder = model[len(_dup_prefix) :]
|
||||
if _remainder.startswith(_dup_prefix):
|
||||
model = _remainder
|
||||
else:
|
||||
break
|
||||
|
||||
model_with_provider = model
|
||||
if custom_llm_provider is not None:
|
||||
model_with_provider = custom_llm_provider + "/" + model
|
||||
if caller_supplied_provider:
|
||||
_prov_prefix = f"{custom_llm_provider}/"
|
||||
if model_is_str and model.startswith(_prov_prefix):
|
||||
model_with_provider = model
|
||||
else:
|
||||
model_with_provider = f"{custom_llm_provider}/{model}"
|
||||
if region_name is not None:
|
||||
model_with_provider_and_region = (
|
||||
f"{custom_llm_provider}/{region_name}/{model}"
|
||||
@ -430,6 +457,9 @@ def cost_per_token( # noqa: PLR0915
|
||||
model_with_provider = model_with_provider_and_region
|
||||
else:
|
||||
_, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
|
||||
|
||||
assert custom_llm_provider is not None # caller-supplied or get_llm_provider
|
||||
|
||||
model_without_prefix = model
|
||||
model_parts = model.split("/", 1)
|
||||
if len(model_parts) > 1:
|
||||
|
||||
@ -13,6 +13,7 @@ from pydantic import BaseModel
|
||||
import litellm
|
||||
from litellm.cost_calculator import (
|
||||
completion_cost,
|
||||
cost_per_token,
|
||||
handle_realtime_stream_cost_calculation,
|
||||
response_cost_calculator,
|
||||
)
|
||||
@ -21,6 +22,55 @@ from litellm.types.utils import ModelResponse, PromptTokensDetailsWrapper, Usage
|
||||
from litellm.utils import TranscriptionResponse
|
||||
|
||||
|
||||
def test_cost_per_token_duplicate_openai_prefix_matches_model_cost(monkeypatch):
    """
    Router/proxy configs may use deployment ids like openai/openai/<model>. Cost lookup must
    resolve to model_prices keys (e.g. gpt-5.5), not fail or multiply prefixes.

    A non-zero total alone would also pass if the lookup landed on some other
    priced entry, so the duplicated-prefix id is additionally required to be
    priced exactly like the canonical id for the same token counts.
    """
    # Patch through monkeypatch so the env var and `litellm.model_cost` are
    # restored after the test (presumably this loads the bundled local price
    # map rather than fetching one -- confirm against get_model_cost_map).
    monkeypatch.setenv("LITELLM_LOCAL_MODEL_COST_MAP", "True")
    monkeypatch.setattr(litellm, "model_cost", litellm.get_model_cost_map(url=""))

    token_counts = {"prompt_tokens": 100, "completion_tokens": 50}

    prompt_usd, completion_usd = cost_per_token(
        model="openai/openai/gpt-5.5",
        custom_llm_provider="openai",
        **token_counts,
    )

    # Reference price: the same request addressed by the un-prefixed model id.
    canonical_prompt_usd, canonical_completion_usd = cost_per_token(
        model="gpt-5.5",
        custom_llm_provider="openai",
        **token_counts,
    )

    assert prompt_usd + completion_usd > 0
    assert (prompt_usd, completion_usd) == (
        canonical_prompt_usd,
        canonical_completion_usd,
    ), "duplicated provider prefix must not change the resolved pricing"
|
||||
|
||||
|
||||
def test_cost_per_token_non_string_model_does_not_hang():
    """
    Guard against the provider-prefix dedup loop spinning forever.

    When `model` is not a string (e.g. a MagicMock standing in for a mocked
    transport), `.startswith()` is always truthy, so the call must either
    return or raise promptly rather than loop.
    """
    import threading
    from unittest.mock import MagicMock

    outcome: dict = {}

    def _invoke_with_mock_model():
        # Either finishing or raising is acceptable; only hanging is a failure.
        try:
            cost_per_token(
                model=MagicMock(),
                prompt_tokens=10,
                completion_tokens=5,
                custom_llm_provider="anthropic",
            )
        except Exception:
            outcome["status"] = "raised"
        else:
            outcome["status"] = "returned"

    # Run in a daemon thread so a regression shows up as a failed assertion
    # after the join timeout instead of blocking the test session.
    runner = threading.Thread(target=_invoke_with_mock_model, daemon=True)
    runner.start()
    runner.join(timeout=10)

    assert not runner.is_alive(), "cost_per_token hung on a non-string model"
    assert outcome.get("status") in ("returned", "raised")
|
||||
|
||||
|
||||
def test_completion_cost_uses_response_model_for_dynamic_routing():
|
||||
"""
|
||||
Test that completion_cost uses the model from the response object
|
||||
|
||||
Loading…
Reference in New Issue
Block a user