Litellm ishaan april15 2 (#25828)

* [Test] Add Azure async chat completion timeout test. WIP * Capture TTFT for /v1/messages streaming responses The pass-through streaming path for /v1/messages (Anthropic, Bedrock, Vertex AI, Azure AI, Minimax) logged completion_start_time only after the entire stream finished. async_success_handler then fell back to end_time, making TTFT equal to total duration or null in the UI and Prometheus. Record the timestamp of the first chunk in async_sse_wrapper and propagate it to model_call_details before the logging handler runs, so gen_ai.response.time_to_first_token reflects the real first-chunk latency. Fixes #25598 * [Refactor] Implement timeout resolution logic in completion function, add fetching of ``request_timeout`` from litellm_settings * remove stale test case * remove extra print statement * default request timeout value in constants to 600s to match timeout defaults handled in the proxy * fix request timeout if using default value from constants.py * update code structure, test cases * only override if the global timeout sets timeout to 6000s * update code structure, move hard-coded values to constants and make the resolve function readable by moving fallback logic to a separate function * modify default timeout values, replacing hard-coded ones with the defined default values --------- Co-authored-by: harish876 <harishgokul01@gmail.com> Co-authored-by: Joaquin Hui Gomez <joaquinhuigomez@users.noreply.github.com>
2026-04-15 18:42:23 -07:00 · 2026-04-15 18:42:23 -07:00 · a588f76789
commit a588f76789
parent 10131374ee
11 changed files with 390 additions and 13 deletions
--- a/litellm/constants.py
+++ b/litellm/constants.py
@ -413,7 +413,20 @@ MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = int(
 )
 DEFAULT_MAX_TOKENS_FOR_TRITON = int(os.getenv("DEFAULT_MAX_TOKENS_FOR_TRITON", 2000))
 #### Networking settings ####
-request_timeout: float = float(os.getenv("REQUEST_TIMEOUT", 6000))  # time in seconds
+# Sentinel used when `REQUEST_TIMEOUT` is unset: `litellm.request_timeout` keeps this
+# value so longer-running surfaces (Router `timeout or litellm.request_timeout`,
+# speech/TTS, responses, vector stores, etc.) get a long HTTP deadline. Chat
+# `completion()` maps this sentinel down to 600s when the caller did not set a
+# per-request/model timeout—see ``CompletionTimeout.resolve`` in completion_timeout.py.
+# The sentinel is matched by value, so an explicit `REQUEST_TIMEOUT=6000` is mapped to
+# 600s for `completion()` as well. MCP uses dedicated timeouts (e.g.
+# `MCP_CLIENT_TIMEOUT`), not `request_timeout`.
+DEFAULT_REQUEST_TIMEOUT_SECONDS: float = 6000.0
+# Pair used for default httpx clients when no custom timeout is passed: read/write
+# deadline and connect handshake (see ``http_handler`` cached handler paths).
+COMPLETION_HTTP_FALLBACK_SECONDS: float = 600.0
+HTTP_HANDLER_CONNECT_TIMEOUT_SECONDS: float = 5.0
+request_timeout: float = float(
+    os.getenv("REQUEST_TIMEOUT", str(int(DEFAULT_REQUEST_TIMEOUT_SECONDS)))
+)
 DEFAULT_A2A_AGENT_TIMEOUT: float = float(
    os.getenv("DEFAULT_A2A_AGENT_TIMEOUT", 6000)
 )  # seconds; default 6000 (100 minutes)
--- a/litellm/litellm_core_utils/completion_timeout.py
+++ b/litellm/litellm_core_utils/completion_timeout.py
@ -0,0 +1,83 @@
+"""Completion HTTP timeout resolution (kept out of ``main.py`` to limit import cycles)."""
+
+from __future__ import annotations
+
+from typing import Callable, Optional, Union
+
+import httpx
+
+from litellm.constants import (
+    COMPLETION_HTTP_FALLBACK_SECONDS,
+    DEFAULT_REQUEST_TIMEOUT_SECONDS,
+)
+
+
+class CompletionTimeout:
+    """Resolves HTTP timeout for ``completion()`` from model, kwargs and global settings."""
+
+    @staticmethod
+    def _fallback_when_no_explicit_timeout(
+        global_timeout: Optional[Union[float, str]],
+    ) -> float:
+        """
+        Used when ``model_timeout`` and kwargs timeouts are all unset.
+
+        ``global_timeout`` is :attr:`litellm.request_timeout` (numeric / string), not
+        :class:`httpx.Timeout`.
+
+        If it is ``None`` or equals
+        :data:`~litellm.constants.DEFAULT_REQUEST_TIMEOUT_SECONDS` (6000), return
+        :data:`~litellm.constants.COMPLETION_HTTP_FALLBACK_SECONDS` (600). Otherwise
+        return ``float(global_timeout)``.
+
+        The sentinel is matched by value, so a global timeout that was deliberately
+        configured as 6000 is also mapped to 600.
+        """
+        if global_timeout is None:
+            return COMPLETION_HTTP_FALLBACK_SECONDS
+        if float(global_timeout) == float(DEFAULT_REQUEST_TIMEOUT_SECONDS):
+            return COMPLETION_HTTP_FALLBACK_SECONDS
+        return float(global_timeout)
+
+    @staticmethod
+    def resolve(
+        model_timeout: Optional[Union[float, str, httpx.Timeout]],
+        kwargs: dict,
+        custom_llm_provider: str,
+        *,
+        global_timeout: Optional[Union[float, str]],
+        supports_httpx_timeout: Callable[[str], bool],
+    ) -> Union[float, httpx.Timeout]:
+        """
+        Resolution order (first non-None wins):
+
+        1. ``model_timeout`` (call argument / merged ``litellm_params``)
+        2. ``kwargs["timeout"]``
+        3. ``kwargs["request_timeout"]``
+        4. Fallback from ``global_timeout`` (:attr:`litellm.request_timeout`) — if it is
+           ``None`` or the package default (6000), use 600 instead.
+
+        When the resolved value is an :class:`httpx.Timeout` and
+        ``supports_httpx_timeout(custom_llm_provider)`` is false, it is reduced to its
+        ``read`` value as a float (600 if ``read`` is ``None``). A supported
+        :class:`httpx.Timeout` is returned unchanged; any other value is returned as
+        ``float``. Explicit ``6000`` on the model or in kwargs is kept as ``6000``.
+        """
+        resolved: Union[float, str, httpx.Timeout]
+        if model_timeout is not None:
+            resolved = model_timeout
+        elif kwargs.get("timeout") is not None:
+            resolved = kwargs["timeout"]
+        elif kwargs.get("request_timeout") is not None:
+            resolved = kwargs["request_timeout"]
+        else:
+            resolved = CompletionTimeout._fallback_when_no_explicit_timeout(
+                global_timeout
+            )
+
+        if isinstance(resolved, httpx.Timeout) and not supports_httpx_timeout(
+            custom_llm_provider
+        ):
+            read_timeout = resolved.read
+            resolved = (
+                float(read_timeout)
+                if read_timeout is not None
+                else COMPLETION_HTTP_FALLBACK_SECONDS
+            )  # default 10 min timeout
+        elif not isinstance(resolved, httpx.Timeout):
+            resolved = float(resolved)  # type: ignore
+
+        return resolved
--- a/litellm/llms/anthropic/experimental_pass_through/messages/streaming_iterator.py
+++ b/litellm/llms/anthropic/experimental_pass_through/messages/streaming_iterator.py
@ -27,6 +27,7 @@ class BaseAnthropicMessagesStreamingIterator:
        self.litellm_logging_obj = litellm_logging_obj
        self.request_body = request_body
        self.start_time = datetime.now()
+        self.completion_start_time: datetime | None = None

    async def _handle_streaming_logging(self, collected_chunks: List[bytes]):
        """Handle the logging after all chunks have been collected."""
@ -35,6 +36,15 @@ class BaseAnthropicMessagesStreamingIterator:
        )

        end_time = datetime.now()
+        # Set completion_start_time so TTFT is calculated from the first
+        # chunk rather than falling back to end_time in async_success_handler.
+        if self.completion_start_time is not None:
+            self.litellm_logging_obj.completion_start_time = (
+                self.completion_start_time
+            )
+            self.litellm_logging_obj.model_call_details[
+                "completion_start_time"
+            ] = self.completion_start_time
        asyncio.create_task(
            PassThroughStreamingHandler._route_streaming_logging_to_handler(
                litellm_logging_obj=self.litellm_logging_obj,
@ -100,6 +110,8 @@ class BaseAnthropicMessagesStreamingIterator:
        collected_chunks = []

        async for chunk in completion_stream:
+            if self.completion_start_time is None:
+                self.completion_start_time = datetime.now()
            encoded_chunk = self._convert_chunk_to_sse_format(chunk)
            collected_chunks.append(encoded_chunk)
            yield encoded_chunk
--- a/litellm/llms/custom_httpx/http_handler.py
+++ b/litellm/llms/custom_httpx/http_handler.py
@ -30,7 +30,9 @@ from litellm.constants import (
    AIOHTTP_KEEPALIVE_TIMEOUT,
    AIOHTTP_NEEDS_CLEANUP_CLOSED,
    AIOHTTP_TTL_DNS_CACHE,
+    COMPLETION_HTTP_FALLBACK_SECONDS,
    DEFAULT_SSL_CIPHERS,
+    HTTP_HANDLER_CONNECT_TIMEOUT_SECONDS,
 )
 from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
 from litellm.types.llms.custom_http import *
@ -70,7 +72,10 @@ def get_default_headers() -> dict:
 headers = get_default_headers()

 # https://www.python-httpx.org/advanced/timeouts
-_DEFAULT_TIMEOUT = httpx.Timeout(timeout=5.0, connect=5.0)
+_DEFAULT_TIMEOUT = httpx.Timeout(
+    timeout=COMPLETION_HTTP_FALLBACK_SECONDS,
+    connect=HTTP_HANDLER_CONNECT_TIMEOUT_SECONDS,
+)


 def _prepare_request_data_and_content(
@ -1244,7 +1249,7 @@ def get_async_httpx_client(
        _new_client = AsyncHTTPHandler(**handler_params)
    else:
        _new_client = AsyncHTTPHandler(
-            timeout=httpx.Timeout(timeout=600.0, connect=5.0),
+            timeout=_DEFAULT_TIMEOUT,
            shared_session=shared_session,
        )

@ -1293,7 +1298,7 @@ def _get_httpx_client(params: Optional[dict] = None) -> HTTPHandler:
        }
        _new_client = HTTPHandler(**handler_params)
    else:
-        _new_client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
+        _new_client = HTTPHandler(timeout=_DEFAULT_TIMEOUT)

    cache.set_cache(
        key=_cache_key_name,
--- a/litellm/main.py
+++ b/litellm/main.py
@ -76,6 +76,7 @@ from litellm.litellm_core_utils.audio_utils.utils import (
    calculate_request_duration,
    get_audio_file_for_health_check,
 )
+from litellm.litellm_core_utils.completion_timeout import CompletionTimeout
 from litellm.litellm_core_utils.dd_tracing import tracer
 from litellm.litellm_core_utils.get_provider_specific_headers import (
    ProviderSpecificHeaderUtils,
@ -1400,14 +1401,13 @@ def completion(  # type: ignore # noqa: PLR0915
            )  # support region-based pricing for bedrock

        ### TIMEOUT LOGIC ###
-        timeout = timeout or kwargs.get("request_timeout", 600) or 600
-        # set timeout for 10 minutes by default
-        if isinstance(timeout, httpx.Timeout) and not supports_httpx_timeout(
-            custom_llm_provider
-        ):
-            timeout = timeout.read or 600  # default 10 min timeout
-        elif not isinstance(timeout, httpx.Timeout):
-            timeout = float(timeout)  # type: ignore
+        timeout = CompletionTimeout.resolve(
+            timeout,
+            kwargs,
+            custom_llm_provider,
+            global_timeout=getattr(litellm, "request_timeout", None),
+            supports_httpx_timeout=supports_httpx_timeout,
+        )

        ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
        if (
--- a/tests/llm_translation/test_azure_openai.py
+++ b/tests/llm_translation/test_azure_openai.py
@ -5,6 +5,7 @@ sys.path.insert(
    0, os.path.abspath("../../")
 )  # Adds the parent directory to the system path

+import httpx
 import pytest
 from litellm.llms.azure.common_utils import process_azure_headers
 from httpx import Headers
--- a/tests/local_testing/test_azure_anthropic_sync_post.py
+++ b/tests/local_testing/test_azure_anthropic_sync_post.py
@ -0,0 +1,46 @@
+"""
+``_get_httpx_client`` + ``HTTPHandler.post`` (same pattern as Azure Anthropic sync path:
+``_get_httpx_client(params={"timeout": ...})`` then ``post(..., timeout=...)``).
+
+Uses https://httpbin.org/delay/10 with ``timeout=5`` — the handler must raise :class:`~litellm.exceptions.Timeout`
+before the 10s delay completes. Skips if httpbin is unreachable.
+
+Lives under ``local_testing`` (not ``make test-unit``).
+"""
+
+import json
+import os
+import sys
+
+import httpx
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
+)
+
+from litellm.exceptions import Timeout as LitellmTimeout
+from litellm.llms.custom_httpx.http_handler import _get_httpx_client
+
+_HTTPBIN_DELAY_S = 10
+_PER_REQUEST_TIMEOUT_S = 5.0
+_CLIENT_DEFAULT_TIMEOUT_S = 60.0
+
+
+def test_post_delay_exceeds_per_request_timeout_raises():
+    """POST to a 10s-delay httpbin endpoint with a 5s per-request timeout; expect ``LitellmTimeout``."""
+    # Probe httpbin first so an unreachable network skips the test instead of failing it.
+    try:
+        httpx.get("https://httpbin.org/get", timeout=5.0)
+    except Exception as e:
+        pytest.skip(f"httpbin.org unreachable: {e}")
+
+    # The client-level default (60s) exceeds the 10s delay, so a timeout raised here
+    # should come from the 5s per-request value passed to ``post``.
+    handler = _get_httpx_client(params={"timeout": _CLIENT_DEFAULT_TIMEOUT_S})
+    try:
+        with pytest.raises(LitellmTimeout):
+            handler.post(
+                f"https://httpbin.org/delay/{_HTTPBIN_DELAY_S}",
+                headers={"content-type": "application/json"},
+                data=json.dumps({"model": "claude", "messages": []}),
+                timeout=_PER_REQUEST_TIMEOUT_S,
+            )
+    finally:
+        handler.close()
--- a/tests/test_litellm/llms/azure_ai/claude/test_azure_anthropic_handler.py
+++ b/tests/test_litellm/llms/azure_ai/claude/test_azure_anthropic_handler.py
@ -222,5 +222,7 @@ class TestAzureAnthropicChatCompletion:

        # Verify non-streaming was handled
        mock_client.post.assert_called_once()
+        mock_get_client.assert_called_once_with(params={"timeout": timeout})
+        assert mock_client.post.call_args.kwargs["timeout"] == timeout
        assert result is not None

--- a/tests/test_litellm/llms/azure_ai/claude/test_main_azure_anthropic_timeout.py
+++ b/tests/test_litellm/llms/azure_ai/claude/test_main_azure_anthropic_timeout.py
@ -0,0 +1,42 @@
+"""
+Ensure litellm.completion() forwards timeout to Azure Anthropic handler (main.py dispatch).
+"""
+
+import os
+import sys
+from unittest.mock import MagicMock, patch
+
+sys.path.insert(
+    0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../.."))
+)
+
+from litellm import completion
+from litellm.types.utils import ModelResponse
+
+
+def test_main_azure_ai_claude_completion_passes_timeout_to_azure_anthropic_handler():
+    """``completion(..., timeout=42.5)`` for an ``azure_ai/claude-*`` model reaches the patched handler with that timeout."""
+    # Keyword arguments the patched handler receives, recorded for the assertions below.
+    captured: dict = {}
+
+    def fake_azure_anthropic_completion(**kwargs):
+        captured.update(kwargs)
+        return ModelResponse()
+
+    with patch(
+        "litellm.main.azure_anthropic_chat_completions"
+    ) as mock_azure_anthropic:
+        mock_azure_anthropic.completion = MagicMock(
+            side_effect=fake_azure_anthropic_completion
+        )
+
+        completion(
+            model="azure_ai/claude-sonnet-4-5",
+            messages=[{"role": "user", "content": "hi"}],
+            api_base="https://example.services.ai.azure.com/anthropic",
+            api_key="test-key",
+            timeout=42.5,
+        )
+
+    mock_azure_anthropic.completion.assert_called_once()
+    assert captured["timeout"] == 42.5
+    # The handler is expected to get the model name without the ``azure_ai/`` prefix.
+    assert captured["model"] == "claude-sonnet-4-5"
+    assert captured["custom_llm_provider"] == "azure_ai"
--- a/tests/test_litellm/llms/custom_httpx/test_http_handler.py
+++ b/tests/test_litellm/llms/custom_httpx/test_http_handler.py
@ -15,7 +15,12 @@ sys.path.insert(
 )  # Adds the parent directory to the system path
 import litellm
 from litellm.llms.custom_httpx.aiohttp_transport import LiteLLMAiohttpTransport
-from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, get_ssl_configuration
+from litellm.llms.custom_httpx.http_handler import (
+    AsyncHTTPHandler,
+    HTTPHandler,
+    _get_httpx_client,
+    get_ssl_configuration,
+)


@pytest.mark.asyncio
@ -658,3 +663,26 @@ async def test_httpx_handler_uses_env_user_agent(monkeypatch):
        assert req.headers.get("User-Agent") == "Claude Code"
    finally:
        await handler.close()
+
+
+def test_get_httpx_client_applies_float_timeout_without_mocking_handler():
+    """
+    Exercise real _get_httpx_client + HTTPHandler: params={'timeout': x} must reach httpx.Client(timeout=...).
+    Uses an uncommon timeout value to avoid colliding with other cached clients in-process.
+    """
+    timeout = 3847.291
+    handler = _get_httpx_client(params={"timeout": timeout})
+    try:
+        assert isinstance(handler, HTTPHandler)
+        # A float timeout is compared against the equivalent ``httpx.Timeout`` object.
+        assert handler.client.timeout == httpx.Timeout(timeout)
+    finally:
+        handler.close()
+
+
+def test_get_httpx_client_applies_httpx_timeout_object_without_mocking_handler():
+    """An ``httpx.Timeout`` passed via ``params={'timeout': ...}`` must end up as the client's timeout."""
+    t = httpx.Timeout(40.0, connect=5.0)
+    handler = _get_httpx_client(params={"timeout": t})
+    try:
+        assert handler.client.timeout == t
+    finally:
+        handler.close()
--- a/tests/test_litellm/test_completion_timeout_resolution.py
+++ b/tests/test_litellm/test_completion_timeout_resolution.py
@ -0,0 +1,145 @@
+"""Unit tests for litellm.litellm_core_utils.completion_timeout.CompletionTimeout."""
+
+import os
+import sys
+
+import httpx
+
+sys.path.insert(
+    0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
+)
+
+from litellm.litellm_core_utils.completion_timeout import CompletionTimeout
+from litellm.utils import supports_httpx_timeout
+
+
+def test_explicit_timeout_wins():
+    """``model_timeout`` takes precedence over both kwargs timeouts."""
+    assert (
+        CompletionTimeout.resolve(
+            12.5,
+            {"timeout": 99.0, "request_timeout": 88.0},
+            "openai",
+            global_timeout=None,
+            supports_httpx_timeout=supports_httpx_timeout,
+        )
+        == 12.5
+    )
+
+
+def test_kwargs_timeout_when_param_none():
+    """``kwargs["timeout"]`` is used when ``model_timeout`` is ``None``."""
+    assert (
+        CompletionTimeout.resolve(
+            None,
+            {"timeout": 21.0},
+            "azure_ai",
+            global_timeout=None,
+            supports_httpx_timeout=supports_httpx_timeout,
+        )
+        == 21.0
+    )
+
+
+def test_request_timeout_alias_in_kwargs():
+    """``kwargs["request_timeout"]`` is used when no other explicit timeout is given."""
+    assert (
+        CompletionTimeout.resolve(
+            None,
+            {"request_timeout": 33.0},
+            "bedrock",
+            global_timeout=None,
+            supports_httpx_timeout=supports_httpx_timeout,
+        )
+        == 33.0
+    )
+
+
+def test_global_timeout_from_litellm_settings():
+    """A non-default ``global_timeout`` is returned when nothing explicit is set."""
+    assert (
+        CompletionTimeout.resolve(
+            None,
+            {},
+            "vertex_ai",
+            global_timeout=360.0,
+            supports_httpx_timeout=supports_httpx_timeout,
+        )
+        == 360.0
+    )
+
+
+def test_global_timeout_package_default_coerced_to_600_for_completion():
+    """With no explicit timeout, a global timeout equal to the 6000s package default resolves to 600s."""
+    assert (
+        CompletionTimeout.resolve(
+            None,
+            {},
+            "openai",
+            global_timeout=6000.0,
+            supports_httpx_timeout=supports_httpx_timeout,
+        )
+        == 600.0
+    )
+
+
+def test_explicit_request_timeout_6000_preserved():
+    """Explicit ``request_timeout=6000`` in kwargs is returned unchanged, not mapped to 600 by the sentinel rule."""
+    assert (
+        CompletionTimeout.resolve(
+            None,
+            {"request_timeout": 6000.0},
+            "openai",
+            global_timeout=None,
+            supports_httpx_timeout=supports_httpx_timeout,
+        )
+        == 6000.0
+    )
+
+
+def test_explicit_model_timeout_6000_preserved():
+    """Explicit ``model_timeout=6000`` is returned unchanged and wins over the kwargs values."""
+    assert (
+        CompletionTimeout.resolve(
+            6000.0,
+            {"timeout": 1.0, "request_timeout": 2.0},
+            "openai",
+            global_timeout=None,
+            supports_httpx_timeout=supports_httpx_timeout,
+        )
+        == 6000.0
+    )
+
+
+def test_fallback_600_when_no_global_timeout():
+    """With no explicit timeout and ``global_timeout=None`` the result is 600."""
+    assert (
+        CompletionTimeout.resolve(
+            None,
+            {},
+            "azure_ai",
+            global_timeout=None,
+            supports_httpx_timeout=supports_httpx_timeout,
+        )
+        == 600.0
+    )
+
+
+def test_httpx_timeout_coerced_for_provider_without_httpx_timeout_support():
+    """For ``azure_ai``, an ``httpx.Timeout`` is expected to come back as a plain number equal to its read value."""
+    t = httpx.Timeout(50.0, connect=2.0)
+    out = CompletionTimeout.resolve(
+        t,
+        {},
+        "azure_ai",
+        global_timeout=None,
+        supports_httpx_timeout=supports_httpx_timeout,
+    )
+    assert out == 50.0
+    assert not isinstance(out, httpx.Timeout)
+
+
+def test_httpx_timeout_preserved_for_openai():
+    """For ``openai``, the very same ``httpx.Timeout`` object is expected back."""
+    t = httpx.Timeout(40.0, connect=5.0)
+    out = CompletionTimeout.resolve(
+        t,
+        {},
+        "openai",
+        global_timeout=None,
+        supports_httpx_timeout=supports_httpx_timeout,
+    )
+    assert out is t
+    assert isinstance(out, httpx.Timeout)