From bd2d0ad519cbd95e5ee60bd9b1c1ea60f46f5336 Mon Sep 17 00:00:00 2001
From: Shivam Rawat <shivam@berri.ai>
Date: Tue, 26 May 2026 11:40:44 -0700
Subject: [PATCH] Fix LiteLLM OpenAI double provider-prefix cost lookup bug (#28661)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix token cost lookup when deployment ids repeat the provider prefix.

Router configs may expose models like openai/openai/<model>; normalize those
strings before joining provider/model so model_cost resolves correctly.

Co-authored-by: Cursor <cursoragent@cursor.com>

* scope duplicate-prefix cost fix to callers that explicitly pass
  custom_llm_provider; isolate monkeypatched state in the test

Co-authored-by: Cursor <cursoragent@cursor.com>

* fix(mypy): narrow custom_llm_provider after resolution in cost_per_token

Co-authored-by: Cursor <cursoragent@cursor.com>

* fix(cost_calculator): guard provider-prefix dedup against non-string model

The provider-prefix dedup loop assumed `model` is always a string. When a
non-string is passed (e.g. a MagicMock from a mocked transport in router
tests), `model.startswith(...)` returns a truthy mock and each slice returns
yet another mock, so the loop never terminates — it spins until the test
worker is OOM-killed (observed as the litellm_router_testing CI regression,
e.g. test_router_pattern_match_e2e). Only run the string-based dedup and the
`startswith` provider-prefix check when `model` is actually a str; non-string
inputs fall through to the plain `{provider}/{model}` join, preserving the
previous graceful behavior for them.

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: mateo-berri <277851410+mateo-berri@users.noreply.github.com>
---
 litellm/cost_calculator.py                 | 34 ++++++++++++++-
 tests/test_litellm/test_cost_calculator.py | 50 ++++++++++++++++++++++
 2 files changed, 82 insertions(+), 2 deletions(-)
diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py
index ab882559d3..1d4c57df42 100644
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@@ -417,9 +417,36 @@ def cost_per_token(  # noqa: PLR0915
     prompt_tokens_cost_usd_dollar: float = 0
     completion_tokens_cost_usd_dollar: float = 0
     model_cost_ref = litellm.model_cost
+    # Only callers that explicitly pass `custom_llm_provider` get the
+    # dedup/prefix-join treatment. When provider is omitted, preserve legacy
+    # behavior: `model_with_provider` stays equal to the raw `model` string
+    # (provider is detected below for downstream use only).
+    caller_supplied_provider = custom_llm_provider is not None
+
+    # `model` is normally a string, but callers that mock the transport can pass
+    # non-string objects. Only run the string-based dedup/prefix check when it is
+    # actually a string — e.g. a MagicMock's `.startswith()` is always truthy and
+    # its slices return new mocks, which would spin the dedup loop forever.
+    model_is_str = isinstance(model, str)
+
+    # Router/proxy deployments may repeat the provider segment (e.g. model_name
+    # "openai/openai/gpt-5.5"). Strip duplicated `{provider}/` chains before joining.
+    if caller_supplied_provider and model_is_str:
+        _dup_prefix = f"{custom_llm_provider}/"
+        while model.startswith(_dup_prefix):
+            _remainder = model[len(_dup_prefix) :]
+            if _remainder.startswith(_dup_prefix):
+                model = _remainder
+            else:
+                break
+
     model_with_provider = model
-    if custom_llm_provider is not None:
-        model_with_provider = custom_llm_provider + "/" + model
+    if caller_supplied_provider:
+        _prov_prefix = f"{custom_llm_provider}/"
+        if model_is_str and model.startswith(_prov_prefix):
+            model_with_provider = model
+        else:
+            model_with_provider = f"{custom_llm_provider}/{model}"
         if region_name is not None:
             model_with_provider_and_region = (
                 f"{custom_llm_provider}/{region_name}/{model}"
@@ -430,6 +457,9 @@ def cost_per_token(  # noqa: PLR0915
                 model_with_provider = model_with_provider_and_region
     else:
         _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
+
+    assert custom_llm_provider is not None  # caller-supplied or get_llm_provider
+
     model_without_prefix = model
     model_parts = model.split("/", 1)
     if len(model_parts) > 1:
diff --git a/tests/test_litellm/test_cost_calculator.py b/tests/test_litellm/test_cost_calculator.py
index 00902890da..1a9bf5a942 100644
--- a/tests/test_litellm/test_cost_calculator.py
+++ b/tests/test_litellm/test_cost_calculator.py
@@ -13,6 +13,7 @@ from pydantic import BaseModel
 import litellm
 from litellm.cost_calculator import (
     completion_cost,
+    cost_per_token,
     handle_realtime_stream_cost_calculation,
     response_cost_calculator,
 )
@@ -21,6 +22,55 @@ from litellm.types.utils import ModelResponse, PromptTokensDetailsWrapper, Usage
 from litellm.utils import TranscriptionResponse
 
 
+def test_cost_per_token_duplicate_openai_prefix_matches_model_cost(monkeypatch):
+    """
+    Router/proxy configs may use deployment ids like openai/openai/<model>. Cost lookup must
+    resolve to model_prices keys (e.g. gpt-5.5), not fail or multiply prefixes.
+    """
+    monkeypatch.setenv("LITELLM_LOCAL_MODEL_COST_MAP", "True")
+    monkeypatch.setattr(litellm, "model_cost", litellm.get_model_cost_map(url=""))
+
+    prompt_usd, completion_usd = cost_per_token(
+        model="openai/openai/gpt-5.5",
+        prompt_tokens=100,
+        completion_tokens=50,
+        custom_llm_provider="openai",
+    )
+
+    assert prompt_usd + completion_usd > 0
+
+
+def test_cost_per_token_non_string_model_does_not_hang():
+    """
+    The provider-prefix dedup loop must not spin forever when `model` is a
+    non-string object (e.g. a MagicMock from a mocked transport). It should
+    return or raise promptly instead of looping on a truthy `.startswith()`.
+    """
+    import threading
+    from unittest.mock import MagicMock
+
+    result: dict = {}
+
+    def _run():
+        try:
+            cost_per_token(
+                model=MagicMock(),
+                prompt_tokens=10,
+                completion_tokens=5,
+                custom_llm_provider="anthropic",
+            )
+            result["status"] = "returned"
+        except Exception:
+            result["status"] = "raised"
+
+    worker = threading.Thread(target=_run, daemon=True)
+    worker.start()
+    worker.join(timeout=10)
+
+    assert not worker.is_alive(), "cost_per_token hung on a non-string model"
+    assert result.get("status") in ("returned", "raised")
+
+
 def test_completion_cost_uses_response_model_for_dynamic_routing():
     """
     Test that completion_cost uses the model from the response object