Litellm ishaan april15 2 (#25828)

* [Test] Add Azure async chat completion timeout test. WIP

* Capture TTFT for /v1/messages streaming responses

The pass-through streaming path for /v1/messages (Anthropic, Bedrock,
Vertex AI, Azure AI, Minimax) logged completion_start_time only after
the entire stream finished. async_success_handler then fell back to
end_time, making TTFT equal to total duration or null in the UI and
Prometheus.

Record the timestamp of the first chunk in async_sse_wrapper and
propagate it to model_call_details before the logging handler runs,
so gen_ai.response.time_to_first_token reflects the real first-chunk
latency.

Fixes #25598

* [Refactor] Implement timeout resolution logic in completion function

Also fetch ``request_timeout`` from litellm_settings.

* remove stale test case

* remove extra print statement

* default request timeout value in constants to 600s to match timeout defaults handled in the proxy

* fix request timeout if using default value from constants.py

* update code structure, test cases

* only override if the global timeout sets timeout to 6000s

* update code structure, move hard-coded values to constants, and make the resolve function readable by moving the fallback logic to a separate function

* modify default timeout values, replacing hard coded ones with default values defined

---------

Co-authored-by: harish876 <harishgokul01@gmail.com>
Co-authored-by: Joaquin Hui Gomez <joaquinhuigomez@users.noreply.github.com>
This commit is contained in:
ishaan-berri 2026-04-15 18:42:23 -07:00 committed by GitHub
parent 10131374ee
commit a588f76789
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 390 additions and 13 deletions

View File

@ -413,7 +413,20 @@ MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = int(
)
DEFAULT_MAX_TOKENS_FOR_TRITON = int(os.getenv("DEFAULT_MAX_TOKENS_FOR_TRITON", 2000))
#### Networking settings ####
request_timeout: float = float(os.getenv("REQUEST_TIMEOUT", 6000)) # time in seconds
# Sentinel used when `REQUEST_TIMEOUT` is unset: `litellm.request_timeout` keeps this
# value so longer-running surfaces (Router `timeout or litellm.request_timeout`,
# speech/TTS, responses, vector stores, etc.) get a long HTTP deadline. Chat
# `completion()` maps this sentinel down to 600s when the caller did not set a
# per-request/model timeout—see ``CompletionTimeout.resolve`` in completion_timeout.py. MCP uses
# dedicated timeouts (e.g. `MCP_CLIENT_TIMEOUT`), not `request_timeout`.
DEFAULT_REQUEST_TIMEOUT_SECONDS: float = 6000.0
# Pair used for default httpx clients when no custom timeout is passed: read/write
# deadline and connect handshake (see ``http_handler`` cached handler paths).
COMPLETION_HTTP_FALLBACK_SECONDS: float = 600.0
HTTP_HANDLER_CONNECT_TIMEOUT_SECONDS: float = 5.0
request_timeout: float = float(
os.getenv("REQUEST_TIMEOUT", str(int(DEFAULT_REQUEST_TIMEOUT_SECONDS)))
)
DEFAULT_A2A_AGENT_TIMEOUT: float = float(
os.getenv("DEFAULT_A2A_AGENT_TIMEOUT", 6000)
) # 6000 seconds = 100 minutes

View File

@ -0,0 +1,83 @@
"""Completion HTTP timeout resolution (kept out of ``main.py`` to limit import cycles)."""
from __future__ import annotations
from typing import Callable, Optional, Union
import httpx
from litellm.constants import (
COMPLETION_HTTP_FALLBACK_SECONDS,
DEFAULT_REQUEST_TIMEOUT_SECONDS,
)
class CompletionTimeout:
    """Picks the HTTP timeout that ``completion()`` should use.

    An explicit timeout (call argument or kwargs) always takes precedence; the
    global setting is only consulted when nothing explicit was supplied.
    """

    @staticmethod
    def _fallback_when_no_explicit_timeout(
        global_timeout: Optional[Union[float, str]],
    ) -> float:
        """
        Derive the timeout when neither ``model_timeout`` nor kwargs carry one.

        ``global_timeout`` is expected to be :attr:`litellm.request_timeout`
        (a number or numeric string), never an :class:`httpx.Timeout`.

        Returns :data:`~litellm.constants.COMPLETION_HTTP_FALLBACK_SECONDS` when
        ``global_timeout`` is ``None`` or equal to the package sentinel
        :data:`~litellm.constants.DEFAULT_REQUEST_TIMEOUT_SECONDS`; otherwise
        returns ``float(global_timeout)``.
        """
        if global_timeout is None:
            return COMPLETION_HTTP_FALLBACK_SECONDS

        seconds = float(global_timeout)
        # The sentinel means "nobody configured a global timeout": use the
        # shorter completion-specific fallback instead of the long deadline.
        if seconds == float(DEFAULT_REQUEST_TIMEOUT_SECONDS):
            return COMPLETION_HTTP_FALLBACK_SECONDS
        return seconds

    @staticmethod
    def resolve(
        model_timeout: Optional[Union[float, str, httpx.Timeout]],
        kwargs: dict,
        custom_llm_provider: str,
        *,
        global_timeout: Optional[Union[float, str]],
        supports_httpx_timeout: Callable[[str], bool],
    ) -> Union[float, httpx.Timeout]:
        """
        Resolve the effective timeout; the first non-``None`` source wins:

        1. ``model_timeout`` (call argument / merged ``litellm_params``)
        2. ``kwargs["timeout"]``
        3. ``kwargs["request_timeout"]``
        4. The fallback derived from ``global_timeout``
           (:attr:`litellm.request_timeout`), where the package default is
           mapped down to the completion fallback.

        An :class:`httpx.Timeout` is returned untouched when
        ``supports_httpx_timeout(custom_llm_provider)`` is true; otherwise it is
        reduced to its ``read`` value (or the completion fallback if ``read`` is
        ``None``). Any other value is converted with ``float()``. An explicit
        value equal to the sentinel on the model or in kwargs is kept as-is.
        """
        explicit_sources = (
            model_timeout,
            kwargs.get("timeout"),
            kwargs.get("request_timeout"),
        )
        chosen = next(
            (value for value in explicit_sources if value is not None), None
        )
        if chosen is None:
            chosen = CompletionTimeout._fallback_when_no_explicit_timeout(
                global_timeout
            )

        # Plain numbers / numeric strings: normalise to float and finish.
        if not isinstance(chosen, httpx.Timeout):
            return float(chosen)  # type: ignore

        # Provider understands httpx.Timeout objects: hand it over unchanged.
        if supports_httpx_timeout(custom_llm_provider):
            return chosen

        # Provider needs a scalar: use the read deadline, else the 10 min default.
        read_seconds = chosen.read
        if read_seconds is None:
            return COMPLETION_HTTP_FALLBACK_SECONDS
        return float(read_seconds)

View File

@ -27,6 +27,7 @@ class BaseAnthropicMessagesStreamingIterator:
self.litellm_logging_obj = litellm_logging_obj
self.request_body = request_body
self.start_time = datetime.now()
self.completion_start_time: datetime | None = None
async def _handle_streaming_logging(self, collected_chunks: List[bytes]):
"""Handle the logging after all chunks have been collected."""
@ -35,6 +36,15 @@ class BaseAnthropicMessagesStreamingIterator:
)
end_time = datetime.now()
# Set completion_start_time so TTFT is calculated from the first
# chunk rather than falling back to end_time in async_success_handler.
if self.completion_start_time is not None:
self.litellm_logging_obj.completion_start_time = (
self.completion_start_time
)
self.litellm_logging_obj.model_call_details[
"completion_start_time"
] = self.completion_start_time
asyncio.create_task(
PassThroughStreamingHandler._route_streaming_logging_to_handler(
litellm_logging_obj=self.litellm_logging_obj,
@ -100,6 +110,8 @@ class BaseAnthropicMessagesStreamingIterator:
collected_chunks = []
async for chunk in completion_stream:
if self.completion_start_time is None:
self.completion_start_time = datetime.now()
encoded_chunk = self._convert_chunk_to_sse_format(chunk)
collected_chunks.append(encoded_chunk)
yield encoded_chunk

View File

@ -30,7 +30,9 @@ from litellm.constants import (
AIOHTTP_KEEPALIVE_TIMEOUT,
AIOHTTP_NEEDS_CLEANUP_CLOSED,
AIOHTTP_TTL_DNS_CACHE,
COMPLETION_HTTP_FALLBACK_SECONDS,
DEFAULT_SSL_CIPHERS,
HTTP_HANDLER_CONNECT_TIMEOUT_SECONDS,
)
from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
from litellm.types.llms.custom_http import *
@ -70,7 +72,10 @@ def get_default_headers() -> dict:
headers = get_default_headers()
# https://www.python-httpx.org/advanced/timeouts
_DEFAULT_TIMEOUT = httpx.Timeout(timeout=5.0, connect=5.0)
_DEFAULT_TIMEOUT = httpx.Timeout(
timeout=COMPLETION_HTTP_FALLBACK_SECONDS,
connect=HTTP_HANDLER_CONNECT_TIMEOUT_SECONDS,
)
def _prepare_request_data_and_content(
@ -1244,7 +1249,7 @@ def get_async_httpx_client(
_new_client = AsyncHTTPHandler(**handler_params)
else:
_new_client = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0),
timeout=_DEFAULT_TIMEOUT,
shared_session=shared_session,
)
@ -1293,7 +1298,7 @@ def _get_httpx_client(params: Optional[dict] = None) -> HTTPHandler:
}
_new_client = HTTPHandler(**handler_params)
else:
_new_client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
_new_client = HTTPHandler(timeout=_DEFAULT_TIMEOUT)
cache.set_cache(
key=_cache_key_name,

View File

@ -76,6 +76,7 @@ from litellm.litellm_core_utils.audio_utils.utils import (
calculate_request_duration,
get_audio_file_for_health_check,
)
from litellm.litellm_core_utils.completion_timeout import CompletionTimeout
from litellm.litellm_core_utils.dd_tracing import tracer
from litellm.litellm_core_utils.get_provider_specific_headers import (
ProviderSpecificHeaderUtils,
@ -1400,14 +1401,13 @@ def completion( # type: ignore # noqa: PLR0915
) # support region-based pricing for bedrock
### TIMEOUT LOGIC ###
timeout = timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if isinstance(timeout, httpx.Timeout) and not supports_httpx_timeout(
custom_llm_provider
):
timeout = timeout.read or 600 # default 10 min timeout
elif not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
timeout = CompletionTimeout.resolve(
timeout,
kwargs,
custom_llm_provider,
global_timeout=getattr(litellm, "request_timeout", None),
supports_httpx_timeout=supports_httpx_timeout,
)
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if (

View File

@ -5,6 +5,7 @@ sys.path.insert(
0, os.path.abspath("../../")
) # Adds the parent directory to the system path
import httpx
import pytest
from litellm.llms.azure.common_utils import process_azure_headers
from httpx import Headers

View File

@ -0,0 +1,46 @@
"""
``_get_httpx_client`` + ``HTTPHandler.post`` (same pattern as Azure Anthropic sync path:
``_get_httpx_client(params={"timeout": ...})`` then ``post(..., timeout=...)``).
Uses https://httpbin.org/delay/10 with ``timeout=5``; the handler must raise :class:`~litellm.exceptions.Timeout`
before the 10s delay completes. Skips if httpbin is unreachable.
Lives under ``local_testing`` (not ``make test-unit``).
"""
import json
import os
import sys
import httpx
import pytest
sys.path.insert(
0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
)
from litellm.exceptions import Timeout as LitellmTimeout
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
_HTTPBIN_DELAY_S = 10
_PER_REQUEST_TIMEOUT_S = 5.0
_CLIENT_DEFAULT_TIMEOUT_S = 60.0
def test_post_delay_exceeds_per_request_timeout_raises():
    """A per-request timeout shorter than the server delay must raise ``LitellmTimeout``.

    The test is skipped when a probe request to httpbin.org fails.
    """
    # Probe connectivity first so an offline environment skips instead of failing.
    try:
        httpx.get("https://httpbin.org/get", timeout=5.0)
    except Exception as probe_error:
        pytest.skip(f"httpbin.org unreachable: {probe_error}")

    slow_url = f"https://httpbin.org/delay/{_HTTPBIN_DELAY_S}"
    request_headers = {"content-type": "application/json"}
    request_body = json.dumps({"model": "claude", "messages": []})

    # The client-level timeout is larger than the per-request one passed to post().
    handler = _get_httpx_client(params={"timeout": _CLIENT_DEFAULT_TIMEOUT_S})
    try:
        with pytest.raises(LitellmTimeout):
            handler.post(
                slow_url,
                headers=request_headers,
                data=request_body,
                timeout=_PER_REQUEST_TIMEOUT_S,
            )
    finally:
        handler.close()

View File

@ -222,5 +222,7 @@ class TestAzureAnthropicChatCompletion:
# Verify non-streaming was handled
mock_client.post.assert_called_once()
mock_get_client.assert_called_once_with(params={"timeout": timeout})
assert mock_client.post.call_args.kwargs["timeout"] == timeout
assert result is not None

View File

@ -0,0 +1,42 @@
"""
Ensure litellm.completion() forwards timeout to Azure Anthropic handler (main.py dispatch).
"""
import os
import sys
from unittest.mock import MagicMock, patch
sys.path.insert(
0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../.."))
)
from litellm import completion
from litellm.types.utils import ModelResponse
def test_main_azure_ai_claude_completion_passes_timeout_to_azure_anthropic_handler():
    """``completion(timeout=...)`` must reach the patched Azure Anthropic handler unchanged."""
    seen_kwargs: dict = {}

    def _record_call(**kwargs):
        # Remember everything the dispatcher passed so it can be asserted below.
        seen_kwargs.update(kwargs)
        return ModelResponse()

    handler_target = "litellm.main.azure_anthropic_chat_completions"
    with patch(handler_target) as mock_azure_anthropic:
        mock_azure_anthropic.completion = MagicMock(side_effect=_record_call)

        completion(
            model="azure_ai/claude-sonnet-4-5",
            messages=[{"role": "user", "content": "hi"}],
            api_base="https://example.services.ai.azure.com/anthropic",
            api_key="test-key",
            timeout=42.5,
        )

        mock_azure_anthropic.completion.assert_called_once()

    assert seen_kwargs["timeout"] == 42.5
    assert seen_kwargs["model"] == "claude-sonnet-4-5"
    assert seen_kwargs["custom_llm_provider"] == "azure_ai"

View File

@ -15,7 +15,12 @@ sys.path.insert(
) # Adds the parent directory to the system path
import litellm
from litellm.llms.custom_httpx.aiohttp_transport import LiteLLMAiohttpTransport
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, get_ssl_configuration
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
_get_httpx_client,
get_ssl_configuration,
)
@pytest.mark.asyncio
@ -658,3 +663,26 @@ async def test_httpx_handler_uses_env_user_agent(monkeypatch):
assert req.headers.get("User-Agent") == "Claude Code"
finally:
await handler.close()
def test_get_httpx_client_applies_float_timeout_without_mocking_handler():
    """
    Run the real _get_httpx_client + HTTPHandler: a float passed as
    params={'timeout': x} must end up as the underlying httpx client's timeout.
    An uncommon value is used to avoid colliding with other cached clients in-process.
    """
    unusual_timeout = 3847.291
    expected = httpx.Timeout(unusual_timeout)

    handler = _get_httpx_client(params={"timeout": unusual_timeout})
    try:
        assert isinstance(handler, HTTPHandler)
        assert handler.client.timeout == expected
    finally:
        handler.close()
def test_get_httpx_client_applies_httpx_timeout_object_without_mocking_handler():
    """An ``httpx.Timeout`` passed via params must be the client's timeout."""
    custom_timeout = httpx.Timeout(40.0, connect=5.0)
    handler = _get_httpx_client(params={"timeout": custom_timeout})
    try:
        applied = handler.client.timeout
        assert applied == custom_timeout
    finally:
        handler.close()

View File

@ -0,0 +1,145 @@
"""Unit tests for litellm.litellm_core_utils.completion_timeout.CompletionTimeout."""
import os
import sys
import httpx
sys.path.insert(
0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
)
from litellm.litellm_core_utils.completion_timeout import CompletionTimeout
from litellm.utils import supports_httpx_timeout
def test_explicit_timeout_wins():
    """The positional ``model_timeout`` beats both kwargs entries."""
    competing_kwargs = {"timeout": 99.0, "request_timeout": 88.0}
    resolved = CompletionTimeout.resolve(
        12.5,
        competing_kwargs,
        "openai",
        global_timeout=None,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    assert resolved == 12.5
def test_kwargs_timeout_when_param_none():
    """With no ``model_timeout``, ``kwargs["timeout"]`` is used."""
    resolved = CompletionTimeout.resolve(
        None,
        {"timeout": 21.0},
        "azure_ai",
        global_timeout=None,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    assert resolved == 21.0
def test_request_timeout_alias_in_kwargs():
    """``kwargs["request_timeout"]`` is honoured when no other timeout is given."""
    resolved = CompletionTimeout.resolve(
        None,
        {"request_timeout": 33.0},
        "bedrock",
        global_timeout=None,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    assert resolved == 33.0
def test_global_timeout_from_litellm_settings():
    """A non-sentinel ``global_timeout`` is used when nothing explicit is set."""
    no_explicit_kwargs: dict = {}
    resolved = CompletionTimeout.resolve(
        None,
        no_explicit_kwargs,
        "vertex_ai",
        global_timeout=360.0,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    assert resolved == 360.0
def test_global_timeout_package_default_coerced_to_600_for_completion():
    """Package default 6000s is mapped to 600s on the completion-only path."""
    no_explicit_kwargs: dict = {}
    resolved = CompletionTimeout.resolve(
        None,
        no_explicit_kwargs,
        "openai",
        global_timeout=6000.0,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    assert resolved == 600.0
def test_explicit_request_timeout_6000_preserved():
    """An explicit deployment/request timeout of 6000 must not be reduced by the sentinel rule."""
    resolved = CompletionTimeout.resolve(
        None,
        {"request_timeout": 6000.0},
        "openai",
        global_timeout=None,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    assert resolved == 6000.0
def test_explicit_model_timeout_6000_preserved():
    """An explicit ``model_timeout`` of 6000 is kept and outranks the kwargs values."""
    lower_priority_kwargs = {"timeout": 1.0, "request_timeout": 2.0}
    resolved = CompletionTimeout.resolve(
        6000.0,
        lower_priority_kwargs,
        "openai",
        global_timeout=None,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    assert resolved == 6000.0
def test_fallback_600_when_no_global_timeout():
    """With no explicit and no global timeout, the result is 600 seconds."""
    no_explicit_kwargs: dict = {}
    resolved = CompletionTimeout.resolve(
        None,
        no_explicit_kwargs,
        "azure_ai",
        global_timeout=None,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    assert resolved == 600.0
def test_httpx_timeout_coerced_for_provider_without_httpx_timeout_support():
    """For ``azure_ai`` an ``httpx.Timeout`` is reduced to its scalar read value."""
    structured_timeout = httpx.Timeout(50.0, connect=2.0)
    resolved = CompletionTimeout.resolve(
        structured_timeout,
        {},
        "azure_ai",
        global_timeout=None,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    # The read deadline (50s) is what survives, as a plain number.
    assert resolved == 50.0
    assert not isinstance(resolved, httpx.Timeout)
def test_httpx_timeout_preserved_for_openai():
    """For ``openai`` the very same ``httpx.Timeout`` object is returned."""
    structured_timeout = httpx.Timeout(40.0, connect=5.0)
    resolved = CompletionTimeout.resolve(
        structured_timeout,
        {},
        "openai",
        global_timeout=None,
        supports_httpx_timeout=supports_httpx_timeout,
    )
    # Identity, not just equality: the object must be passed through untouched.
    assert resolved is structured_timeout
    assert isinstance(resolved, httpx.Timeout)