Litellm OpenAI double prefix bug (#28661)

* Fix token cost lookup when deployment ids repeat the provider prefix.

  Router configs may expose models like openai/openai/<model>; normalize those strings before joining provider/model so model_cost resolves correctly.

  Co-authored-by: Cursor <cursoragent@cursor.com>

* scope duplicate-prefix cost fix to explicit providers; isolate test patch state

  Co-authored-by: Cursor <cursoragent@cursor.com>

* fix(mypy): narrow custom_llm_provider after resolution in cost_per_token

  Co-authored-by: Cursor <cursoragent@cursor.com>

* fix(cost_calculator): guard provider-prefix dedup against non-string model

  The provider-prefix dedup loop assumed `model` is always a string. When a non-string is passed (e.g. a MagicMock from a mocked transport in router tests), `model.startswith(...)` is always truthy and each slice returns a new object, so the loop never terminates — it spins and OOM-kills the test worker (observed as the litellm_router_testing CI regression, e.g. test_router_pattern_match_e2e).

  Only run the string-based dedup and prefix-join when `model` is actually a str, preserving the previous graceful behavior for non-string inputs.

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: mateo-berri <277851410+mateo-berri@users.noreply.github.com>
2026-05-26 11:40:44 -07:00 · 2026-05-26 11:40:44 -07:00 · bd2d0ad519
commit bd2d0ad519
parent fbff60e9d9
2 changed files with 82 additions and 2 deletions
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@@ -417,9 +417,36 @@ def cost_per_token(  # noqa: PLR0915
    prompt_tokens_cost_usd_dollar: float = 0
    completion_tokens_cost_usd_dollar: float = 0
    model_cost_ref = litellm.model_cost
+    # Only callers that explicitly pass `custom_llm_provider` get the
+    # dedup/prefix-join treatment. When provider is omitted, preserve legacy
+    # behavior: `model_with_provider` stays equal to the raw `model` string
+    # (provider is detected below for downstream use only).
+    caller_supplied_provider = custom_llm_provider is not None
+
+    # `model` is normally a string, but callers that mock the transport can pass
+    # non-string objects. Only run the string-based dedup/prefix-join when it is
+    # actually a string — e.g. a MagicMock's `.startswith()` is always truthy and
+    # its slices return new mocks, which would spin the dedup loop forever.
+    model_is_str = isinstance(model, str)
+
+    # Router/proxy deployments may repeat the provider segment (e.g. model_name
+    # "openai/openai/gpt-5.5"). Strip duplicated `{provider}/` chains before joining.
+    if caller_supplied_provider and model_is_str:
+        _dup_prefix = f"{custom_llm_provider}/"
+        while model.startswith(_dup_prefix):
+            _remainder = model[len(_dup_prefix) :]
+            if _remainder.startswith(_dup_prefix):
+                model = _remainder
+            else:
+                break
+
    model_with_provider = model
-    if custom_llm_provider is not None:
-        model_with_provider = custom_llm_provider + "/" + model
+    if caller_supplied_provider:
+        _prov_prefix = f"{custom_llm_provider}/"
+        if model_is_str and model.startswith(_prov_prefix):
+            model_with_provider = model
+        else:
+            model_with_provider = f"{custom_llm_provider}/{model}"
        if region_name is not None:
            model_with_provider_and_region = (
                f"{custom_llm_provider}/{region_name}/{model}"
@@ -430,6 +457,9 @@ def cost_per_token(  # noqa: PLR0915
                model_with_provider = model_with_provider_and_region
    else:
        _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
+
+    assert custom_llm_provider is not None  # caller-supplied or get_llm_provider
+
    model_without_prefix = model
    model_parts = model.split("/", 1)
    if len(model_parts) > 1:
--- a/tests/test_litellm/test_cost_calculator.py
+++ b/tests/test_litellm/test_cost_calculator.py
@@ -13,6 +13,7 @@ from pydantic import BaseModel
 import litellm
 from litellm.cost_calculator import (
    completion_cost,
+    cost_per_token,
    handle_realtime_stream_cost_calculation,
    response_cost_calculator,
 )
@@ -21,6 +22,55 @@ from litellm.types.utils import ModelResponse, PromptTokensDetailsWrapper, Usage
 from litellm.utils import TranscriptionResponse


+def test_cost_per_token_duplicate_openai_prefix_matches_model_cost(monkeypatch):
+    """
+    Router/proxy configs may use deployment ids like openai/openai/<model>. Cost lookup must
+    resolve to model_prices keys (e.g. gpt-5.5), not fail or multiply prefixes.
+    """
+    monkeypatch.setenv("LITELLM_LOCAL_MODEL_COST_MAP", "True")
+    monkeypatch.setattr(litellm, "model_cost", litellm.get_model_cost_map(url=""))
+
+    prompt_usd, completion_usd = cost_per_token(
+        model="openai/openai/gpt-5.5",
+        prompt_tokens=100,
+        completion_tokens=50,
+        custom_llm_provider="openai",
+    )
+
+    assert prompt_usd + completion_usd > 0
+
+
+def test_cost_per_token_non_string_model_does_not_hang():
+    """
+    The provider-prefix dedup loop must not spin forever when `model` is a
+    non-string object (e.g. a MagicMock from a mocked transport). It should
+    return or raise promptly instead of looping on a truthy `.startswith()`.
+    """
+    import threading
+    from unittest.mock import MagicMock
+
+    result: dict = {}
+
+    def _run():
+        try:
+            cost_per_token(
+                model=MagicMock(),
+                prompt_tokens=10,
+                completion_tokens=5,
+                custom_llm_provider="anthropic",
+            )
+            result["status"] = "returned"
+        except Exception:
+            result["status"] = "raised"
+
+    worker = threading.Thread(target=_run, daemon=True)
+    worker.start()
+    worker.join(timeout=10)
+
+    assert not worker.is_alive(), "cost_per_token hung on a non-string model"
+    assert result.get("status") in ("returned", "raised")
+
+
 def test_completion_cost_uses_response_model_for_dynamic_routing():
    """
    Test that completion_cost uses the model from the response object