fix: completion_cost AttributeError on streaming Anthropic web_search responses (#26153) (#27346)

* fix: coerce server_tool_use dict to ServerToolUse in Usage.__init__ (#26153)
* fix: coerce server_tool_use to ServerToolUse in stream_chunk_builder (#26153)
* fix: dict/pydantic-tolerant access in tool_call_cost_tracking (#26153)
* fix: dict/pydantic-tolerant access in anthropic cost_calculation (#26153)
* test: assert ServerToolUse type in existing stream_chunk_builder anthropic web search test
* test: regression test for #26153 (stream_chunk_builder server_tool_use type)
* test: dict/pydantic safety for tool_call_cost_tracking helper
* test: dict/pydantic safety for anthropic web_search cost
* refactor: consolidate _get_web_search_requests into shared cost-calc utils
* test(realtime): use gpt-realtime; openai retired gpt-4o-realtime-preview

  OpenAI shut down the gpt-4o-realtime-preview family (incl. the undated alias) on 2026-05-07, causing the live realtime test to fail with a 4000 invalid_request_error.invalid_model close. gpt-realtime is the GA successor; switch the live-call tests to it, matching the base branch.
* refactor(types): drop redundant server_tool_use coercion in Usage.__init__

---------

Co-authored-by: mateo-berri <277851410+mateo-berri@users.noreply.github.com>
2026-06-10 21:20:11 -07:00 · 2026-06-10 21:20:11 -07:00 · 4a3860df1f
commit 4a3860df1f
parent 6068bb7781
8 changed files with 360 additions and 13 deletions
--- a/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py
+++ b/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py
@ -6,6 +6,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple
 import litellm
 from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
 from litellm.litellm_core_utils.llm_cost_calc.utils import _get_web_search_requests
 from litellm.types.llms.openai import (
    FileSearchTool,
    ResponsesAPIResponse,
@ -339,8 +340,7 @@ class StandardBuiltInToolCostTracking:
                # and _handle_web_search_cost() is never called.
                if (
                    hasattr(usage, "server_tool_use")
-                    and usage.server_tool_use is not None
+                    and _get_web_search_requests(usage.server_tool_use) is not None
-                    and usage.server_tool_use.web_search_requests is not None
                ):
                    return True
            return False
@ -352,8 +352,7 @@ class StandardBuiltInToolCostTracking:
        elif usage is not None:
            if (
                hasattr(usage, "server_tool_use")
-                and usage.server_tool_use is not None
+                and _get_web_search_requests(usage.server_tool_use) is not None
-                and usage.server_tool_use.web_search_requests is not None
            ):
                return True
            elif (
--- a/litellm/litellm_core_utils/llm_cost_calc/utils.py
+++ b/litellm/litellm_core_utils/llm_cost_calc/utils.py
@ -1,7 +1,7 @@
 # What is this?
 ## Helper utilities for cost_per_token()
-from typing import Literal, Optional, Tuple, TypedDict, cast
+from typing import Any, Literal, Optional, Tuple, TypedDict, cast
 import litellm
 from litellm._logging import verbose_logger
@ -42,6 +42,26 @@ def _get_token_detail_value(details: object, key: str) -> Optional[int]:
    return value if isinstance(value, int) else None
 def _get_web_search_requests(server_tool_use: Any) -> Optional[int]:
    """
    Tolerantly read ``web_search_requests`` from a ``server_tool_use`` value
    that may be ``None``, a ``dict``, a ``ServerToolUse`` pydantic instance,
    or any other object supporting attribute access.
    Returns ``None`` when the value cannot be resolved — callers can
    distinguish "absent" from "zero" using ``is None``.
    See https://github.com/BerriAI/litellm/issues/26153 — ``stream_chunk_builder``
    historically left this as a plain ``dict``, which broke direct attribute
    access in cost calculation.
    """
    if server_tool_use is None:
        return None
    if isinstance(server_tool_use, dict):
        return server_tool_use.get("web_search_requests")
    return getattr(server_tool_use, "web_search_requests", None)
 def _is_above_128k(tokens: float) -> bool:
    if tokens > 128000:
        return True
--- a/litellm/litellm_core_utils/streaming_chunk_builder_utils.py
+++ b/litellm/litellm_core_utils/streaming_chunk_builder_utils.py
@ -637,7 +637,18 @@ class ChunkProcessor:
                    hasattr(usage_chunk, "server_tool_use")
                    and usage_chunk.server_tool_use is not None
                ):
-                    server_tool_use = usage_chunk.server_tool_use
+                    # Coerce dict to ServerToolUse so downstream cost-calc code
                    # (which accesses .web_search_requests as an attribute)
                    # doesn't raise AttributeError. Some providers / streaming
                    # paths leave server_tool_use as a plain dict on the chunk.
                    if isinstance(usage_chunk.server_tool_use, dict):
                        server_tool_use = ServerToolUse(**usage_chunk.server_tool_use)
                    elif isinstance(usage_chunk.server_tool_use, ServerToolUse):
                        server_tool_use = usage_chunk.server_tool_use
                    else:
                        server_tool_use = ServerToolUse.model_validate(
                            usage_chunk.server_tool_use
                        )
                if (
                    usage_chunk_dict["prompt_tokens_details"] is not None
                    and getattr(
--- a/litellm/llms/anthropic/cost_calculation.py
+++ b/litellm/llms/anthropic/cost_calculation.py
@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Optional, Tuple
 from litellm.litellm_core_utils.llm_cost_calc.utils import (
    _get_token_base_cost,
    _get_web_search_requests,
    _parse_prompt_tokens_details,
    calculate_cache_writing_cost,
    generic_cost_per_token,
@ -110,11 +111,12 @@ def get_cost_for_anthropic_web_search(
    if model_info is None:
        return 0.0
-    if (
+    if usage is None:
-        usage is None
+        return 0.0
-        or usage.server_tool_use is None
+    web_search_requests = _get_web_search_requests(
-        or usage.server_tool_use.web_search_requests is None
+        getattr(usage, "server_tool_use", None)
-    ):
+    )
    if web_search_requests is None:
        return 0.0
    ## Get the cost per web search request
@ -128,5 +130,5 @@ def get_cost_for_anthropic_web_search(
        return 0.0
    ## Calculate the total cost
-    total_cost = cost_per_web_search_request * usage.server_tool_use.web_search_requests
+    total_cost = cost_per_web_search_request * web_search_requests
    return total_cost
--- a/tests/test_litellm/litellm_core_utils/llm_cost_calc/test_tool_call_cost_tracking_dict_safety.py
+++ b/tests/test_litellm/litellm_core_utils/llm_cost_calc/test_tool_call_cost_tracking_dict_safety.py
@ -0,0 +1,88 @@
 """
 Tests that the cost-tracking call sites tolerate ``server_tool_use`` being
 either a ``dict`` or a ``ServerToolUse`` pydantic instance.
 See https://github.com/BerriAI/litellm/issues/26153.
 """
 import os
 import sys
 import pytest
 sys.path.insert(0, os.path.abspath("../../../.."))
 from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
    StandardBuiltInToolCostTracking,
    _get_web_search_requests,
 )
 from litellm.types.utils import ModelResponse, ServerToolUse, Usage
 class _UsageWithDictServerToolUse:
    """
    Tiny stand-in that mimics the broken streaming-rebuild shape:
    ``server_tool_use`` is a plain dict.
    """
    def __init__(self, server_tool_use):
        self.server_tool_use = server_tool_use
        self.prompt_tokens_details = None
 def test_get_web_search_requests_handles_none():
    """A ``None`` input resolves to ``None``."""
    resolved = _get_web_search_requests(None)
    assert resolved is None
 def test_get_web_search_requests_handles_dict():
    """A dict carrying the key returns the stored count."""
    payload = {"web_search_requests": 5}
    assert _get_web_search_requests(payload) == 5
 def test_get_web_search_requests_handles_dict_missing_key():
    """An empty dict (no ``web_search_requests`` key) resolves to ``None``."""
    resolved = _get_web_search_requests({})
    assert resolved is None
 def test_get_web_search_requests_handles_pydantic():
    """A ``ServerToolUse`` built with a count of 7 reports 7."""
    assert _get_web_search_requests(ServerToolUse(web_search_requests=7)) == 7
 def test_get_web_search_requests_handles_pydantic_with_none_value():
    """A ``ServerToolUse`` constructed without arguments resolves to ``None``."""
    assert _get_web_search_requests(ServerToolUse()) is None
 def test_response_object_includes_web_search_call_with_dict_server_tool_use():
    """
    Regression for the reported bug: with ``usage.server_tool_use`` as a plain
    dict, ``response_object_includes_web_search_call`` used to crash with
    ``AttributeError``.
    """
    dict_usage = _UsageWithDictServerToolUse({"web_search_requests": 2})
    # Must not raise — and must correctly detect the web search call.
    detected = StandardBuiltInToolCostTracking.response_object_includes_web_search_call(
        response_object=ModelResponse(),
        usage=dict_usage,  # type: ignore[arg-type]
    )
    assert detected is True
 def test_response_object_includes_web_search_call_with_pydantic_server_tool_use():
    """A ``ServerToolUse`` instance with a count of 2 is detected as a web search call."""
    pydantic_usage = _UsageWithDictServerToolUse(ServerToolUse(web_search_requests=2))
    detected = StandardBuiltInToolCostTracking.response_object_includes_web_search_call(
        response_object=ModelResponse(),
        usage=pydantic_usage,  # type: ignore[arg-type]
    )
    assert detected is True
 def test_response_object_includes_web_search_call_with_none_server_tool_use():
    """With ``server_tool_use`` set to ``None`` no web search call is reported."""
    empty_usage = _UsageWithDictServerToolUse(None)
    detected = StandardBuiltInToolCostTracking.response_object_includes_web_search_call(
        response_object=ModelResponse(),
        usage=empty_usage,  # type: ignore[arg-type]
    )
    assert detected is False
--- a/tests/test_litellm/litellm_core_utils/test_streaming_chunk_builder_server_tool_use.py
+++ b/tests/test_litellm/litellm_core_utils/test_streaming_chunk_builder_server_tool_use.py
@ -0,0 +1,130 @@
 """
 Regression tests for https://github.com/BerriAI/litellm/issues/26153
 ``stream_chunk_builder`` used to leave ``usage.server_tool_use`` as a plain
 ``dict`` when reconstructing a streaming response. Downstream cost-calculation
 code (``StandardBuiltInToolCostTracking.response_object_includes_web_search_call``
 and ``get_cost_for_anthropic_web_search``) accesses
 ``usage.server_tool_use.web_search_requests`` as an attribute, which raised
 ``AttributeError: 'dict' object has no attribute 'web_search_requests'``.
 These tests reconstruct streaming chunks for an Anthropic-style web_search
 response and assert:
 1. ``stream_chunk_builder`` returns ``ServerToolUse`` (not ``dict``) for
   ``usage.server_tool_use``.
 2. ``completion_cost`` runs end-to-end on the rebuilt response without
   raising ``AttributeError``.
 """
 import os
 import sys
 import pytest
 sys.path.insert(0, os.path.abspath("../../.."))
 from litellm import completion_cost, stream_chunk_builder
 from litellm.types.utils import (
    Delta,
    ModelResponseStream,
    ServerToolUse,
    StreamingChoices,
    Usage,
 )
 def _make_text_chunk(text: str) -> ModelResponseStream:
    """Build a non-final streaming chunk whose assistant delta carries ``text``."""
    choice = StreamingChoices(
        finish_reason=None,
        index=0,
        delta=Delta(role="assistant", content=text),
    )
    return ModelResponseStream(
        id="chatcmpl-test-26153",
        created=1700000000,
        model="claude-3-haiku-20240307",
        object="chat.completion.chunk",
        choices=[choice],
    )
 def _make_finish_chunk_with_usage_dict_server_tool_use() -> ModelResponseStream:
    """Final chunk where server_tool_use is a *dict* — reproduces the bug shape."""
    final_choice = StreamingChoices(
        finish_reason="stop",
        index=0,
        delta=Delta(),
    )
    usage = Usage(
        prompt_tokens=42,
        completion_tokens=11,
        total_tokens=53,
        # NOTE: passed as a dict on purpose — this is the shape that
        # historically slipped through stream_chunk_builder unchanged.
        server_tool_use={"web_search_requests": 3},
    )
    return ModelResponseStream(
        id="chatcmpl-test-26153",
        created=1700000000,
        model="claude-3-haiku-20240307",
        object="chat.completion.chunk",
        choices=[final_choice],
        usage=usage,
    )
 def test_stream_chunk_builder_coerces_server_tool_use_to_pydantic():
    """
    Regression: the response rebuilt by stream_chunk_builder must expose
    ``usage.server_tool_use`` as a ServerToolUse, not a dict.
    """
    rebuilt = stream_chunk_builder(
        [
            _make_text_chunk("Otters "),
            _make_text_chunk("are great."),
            _make_finish_chunk_with_usage_dict_server_tool_use(),
        ]
    )
    assert rebuilt is not None
    assert rebuilt.usage is not None  # type: ignore[attr-defined]
    rebuilt_stu = rebuilt.usage.server_tool_use  # type: ignore[attr-defined]
    assert (
        rebuilt_stu is not None
    ), "server_tool_use should be carried through from the final chunk"
    assert isinstance(rebuilt_stu, ServerToolUse), (
        f"expected ServerToolUse, got {type(rebuilt_stu).__name__}: "
        f"{rebuilt_stu!r}"
    )
    # Attribute access must not raise (this is exactly what was broken).
    assert rebuilt_stu.web_search_requests == 3
 def test_completion_cost_does_not_raise_on_streaming_web_search_response():
    """
    Regression: completion_cost(...) must not raise AttributeError on a
    response rebuilt by stream_chunk_builder from chunks whose final usage
    carries ``server_tool_use`` as a dict.
    """
    rebuilt = stream_chunk_builder(
        [
            _make_text_chunk("hello"),
            _make_finish_chunk_with_usage_dict_server_tool_use(),
        ]
    )
    assert rebuilt is not None
    # The exact dollar amount depends on the model-pricing table; what matters
    # for this regression is that it does NOT raise AttributeError on
    # `dict has no attribute 'web_search_requests'`.
    try:
        cost = completion_cost(completion_response=rebuilt)
    except AttributeError as e:  # pragma: no cover - regression guard
        pytest.fail(
            "completion_cost raised AttributeError after stream_chunk_builder "
            f"(issue #26153 regression): {e}"
        )
    else:
        assert isinstance(cost, (int, float))
--- a/tests/test_litellm/litellm_core_utils/test_streaming_chunk_builder_utils.py
+++ b/tests/test_litellm/litellm_core_utils/test_streaming_chunk_builder_utils.py
@ -520,7 +520,10 @@ def test_stream_chunk_builder_anthropic_web_search():
    assert usage.prompt_tokens == 50
    assert usage.completion_tokens == 27
    assert usage.total_tokens == 77
-    assert usage.server_tool_use["web_search_requests"] == 2
+    # server_tool_use must be a ServerToolUse pydantic so downstream cost-calc
    # (which uses attribute access) works. See issue #26153.
    assert isinstance(usage.server_tool_use, ServerToolUse)
    assert usage.server_tool_use.web_search_requests == 2
 def test_sort_chunks_handles_dict_hidden_params_created_at():
--- a/tests/test_litellm/llms/anthropic/test_cost_calculation_dict_safety.py
+++ b/tests/test_litellm/llms/anthropic/test_cost_calculation_dict_safety.py
@ -0,0 +1,94 @@
 """
 Tests that ``get_cost_for_anthropic_web_search`` tolerates ``server_tool_use``
 being either a ``dict`` or a ``ServerToolUse`` pydantic instance.
 See https://github.com/BerriAI/litellm/issues/26153.
 """
 import os
 import sys
 import pytest
 sys.path.insert(0, os.path.abspath("../../../.."))
 from litellm.llms.anthropic.cost_calculation import (
    _get_web_search_requests,
    get_cost_for_anthropic_web_search,
 )
 from litellm.types.utils import ModelInfo, ServerToolUse
 class _UsageWithServerToolUse:
    """Bare stand-in for a usage object: stores ``server_tool_use`` exactly as given."""
    def __init__(self, server_tool_use):
        self.server_tool_use = server_tool_use
 def _make_model_info(cost_per_query: float = 0.01) -> ModelInfo:
    """
    Build a ``ModelInfo`` dict whose ``search_context_cost_per_query`` prices
    the low, medium and high search context sizes at ``cost_per_query``.
    """
    size_keys = (
        "search_context_size_low",
        "search_context_size_medium",
        "search_context_size_high",
    )
    per_query = dict.fromkeys(size_keys, cost_per_query)
    info: ModelInfo = {"search_context_cost_per_query": per_query}  # type: ignore[typeddict-item]
    return info
 def test_get_web_search_requests_handles_none():
    """A ``None`` input resolves to ``None``."""
    resolved = _get_web_search_requests(None)
    assert resolved is None
 def test_get_web_search_requests_handles_dict():
    """A dict carrying the key returns the stored count."""
    payload = {"web_search_requests": 4}
    assert _get_web_search_requests(payload) == 4
 def test_get_web_search_requests_handles_dict_missing_key():
    """An empty dict (no ``web_search_requests`` key) resolves to ``None``."""
    resolved = _get_web_search_requests({})
    assert resolved is None
 def test_get_web_search_requests_handles_pydantic():
    """A ``ServerToolUse`` built with a count of 2 reports 2."""
    stu = ServerToolUse(web_search_requests=2)
    assert _get_web_search_requests(stu) == 2
 def test_get_cost_for_anthropic_web_search_with_dict_server_tool_use():
    """
    Regression: ``server_tool_use`` was a dict from ``stream_chunk_builder`` and
    direct attribute access on it raised ``AttributeError``. Three requests at
    0.01 per query must cost approximately 0.03.
    """
    dict_usage = _UsageWithServerToolUse({"web_search_requests": 3})
    cost = get_cost_for_anthropic_web_search(
        model_info=_make_model_info(cost_per_query=0.01),
        usage=dict_usage,  # type: ignore[arg-type]
    )
    assert cost == pytest.approx(0.03)
 def test_get_cost_for_anthropic_web_search_with_pydantic_server_tool_use():
    """A ``ServerToolUse`` with three requests at 0.01 per query costs approximately 0.03."""
    pydantic_usage = _UsageWithServerToolUse(ServerToolUse(web_search_requests=3))
    cost = get_cost_for_anthropic_web_search(
        model_info=_make_model_info(cost_per_query=0.01),
        usage=pydantic_usage,  # type: ignore[arg-type]
    )
    assert cost == pytest.approx(0.03)
 def test_get_cost_for_anthropic_web_search_with_none_server_tool_use():
    """With ``server_tool_use`` set to ``None`` the web search cost is 0.0."""
    empty_usage = _UsageWithServerToolUse(None)
    cost = get_cost_for_anthropic_web_search(
        model_info=_make_model_info(cost_per_query=0.01),
        usage=empty_usage,  # type: ignore[arg-type]
    )
    assert cost == 0.0
 def test_get_cost_for_anthropic_web_search_with_no_usage():
    """With ``usage=None`` the web search cost is 0.0."""
    cost = get_cost_for_anthropic_web_search(
        model_info=_make_model_info(cost_per_query=0.01),
        usage=None,
    )
    assert cost == 0.0