fix(otel): capture 401 error details in management endpoint spans (#29535)

Auth failures on management endpoints such as team/list and
organization/list (invalid or expired keys) were raised as
ProxyException, whose __str__ returned an empty string, so the OTEL
SERVER span recorded an error with no message. ProxyException now
stringifies to its message, get_error_information prefers the explicit
.message attribute, and the proxy exception handlers stamp a consistent
error.type, error.code and error.message on the span.

Resolves LIT-3515
This commit is contained in:
ryan-crabbe-berri 2026-06-02 16:40:30 -07:00 committed by GitHub
parent 9d9558e78f
commit f047b1571e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 128 additions and 7 deletions

View File

@ -3287,6 +3287,32 @@ class OpenTelemetry(OTELGenAISemconvMixin, CustomLogger):
value=int(status_code),
)
def record_error_attributes_on_span(
    self,
    span: Optional[Span],
    exception: Optional[Exception],
    status_code: int,
) -> None:
    """Record structured error details for ``exception`` on ``span``.

    The error information is derived from the exception via
    ``StandardLoggingPayloadSetup.get_error_information``; its
    ``error_code`` entry is then replaced with ``str(status_code)`` so the
    recorded code matches the status passed in by the caller. The result is
    handed to ``self._record_exception_on_span`` wrapped as a
    ``standard_logging_object``.

    Args:
        span: Span to annotate; nothing happens when it is ``None``.
        exception: Exception to describe; nothing happens when it is ``None``.
        status_code: HTTP status code stored (as a string) as the error code.

    NOTE(review): intended to be idempotent (attributes are overwritten) and
    to emit no exception event -- both properties depend on
    ``_record_exception_on_span``; confirm there.
    """
    if span is None:
        return
    if exception is None:
        return

    # Function-scope import, presumably to avoid an import cycle with the
    # logging module -- TODO confirm.
    from litellm.litellm_core_utils.litellm_logging import (
        StandardLoggingPayloadSetup,
    )

    details = StandardLoggingPayloadSetup.get_error_information(
        original_exception=exception
    )
    # The caller-supplied status wins over whatever code was derived from
    # the exception itself.
    details["error_code"] = str(status_code)

    logging_kwargs = {"standard_logging_object": {"error_information": details}}
    self._record_exception_on_span(span=span, kwargs=logging_kwargs)
def set_preprocessing_duration_attribute(
self, span: Optional[Span], container: Any
) -> None:

View File

@ -5300,8 +5300,12 @@ class StandardLoggingPayloadSetup:
tb_lines[:MAXIMUM_TRACEBACK_LINES_TO_LOG]
) # Limit to first 100 lines
# Get additional error details
error_message = str(original_exception)
explicit_message = getattr(original_exception, "message", None)
error_message = (
explicit_message
if isinstance(explicit_message, str) and explicit_message
else str(original_exception)
)
return StandardLoggingPayloadErrorInformation(
error_code=error_status,

View File

@ -3673,6 +3673,7 @@ class ProxyException(Exception):
provider_specific_fields: Optional[dict] = None,
):
self.message = str(message)
super().__init__(self.message)
self.type = type
self.param = param
self.openai_code = openai_code or code

View File

@ -1271,7 +1271,7 @@ async def openai_exception_handler(request: Request, exc: ProxyException):
headers = exc.headers
error_dict = exc.to_dict()
status_code = int(exc.code) if exc.code else status.HTTP_500_INTERNAL_SERVER_ERROR
_close_dangling_otel_server_span(request, status_code)
_close_dangling_otel_server_span(request, status_code, exc=exc)
return JSONResponse(
status_code=status_code,
content={"error": error_dict},
@ -1279,7 +1279,9 @@ async def openai_exception_handler(request: Request, exc: ProxyException):
)
def _close_dangling_otel_server_span(request: Request, status_code: int) -> None:
def _close_dangling_otel_server_span(
request: Request, status_code: int, exc: Optional[Exception] = None
) -> None:
parent_otel_span = getattr(request.state, "parent_otel_span", None)
if parent_otel_span is None:
return
@ -1302,6 +1304,10 @@ def _close_dangling_otel_server_span(request: Request, status_code: int) -> None
open_telemetry_logger.set_response_status_code_attribute(
parent_otel_span, status_code
)
if status_code >= 400:
open_telemetry_logger.record_error_attributes_on_span(
parent_otel_span, exc, status_code
)
parent_otel_span.set_status(
Status(StatusCode.ERROR if status_code >= 400 else StatusCode.OK)
)
@ -1318,7 +1324,7 @@ def _close_dangling_otel_server_span(request: Request, status_code: int) -> None
async def otel_request_validation_exception_handler(
request: Request, exc: RequestValidationError
):
_close_dangling_otel_server_span(request, 422)
_close_dangling_otel_server_span(request, 422, exc=exc)
return JSONResponse(
status_code=422,
content={"detail": jsonable_encoder(exc.errors())},
@ -1332,7 +1338,7 @@ async def otel_unhandled_exception_handler(request: Request, exc: Exception):
verbose_proxy_logger.exception(
"Unhandled exception in request: %s", type(exc).__name__
)
_close_dangling_otel_server_span(request, 500)
_close_dangling_otel_server_span(request, 500, exc=exc)
return JSONResponse(
status_code=500,
content={

View File

@ -18,7 +18,9 @@ from litellm.proxy.proxy_server import (
otel_unhandled_exception_handler,
)
from ._helpers import assert_server_span_attrs
from litellm.integrations._types.open_inference import ErrorAttributes
from ._helpers import assert_server_span_attrs, get_server_span
def _fake_request(parent_otel_span=None):
@ -92,6 +94,33 @@ def test_exception_handler_closes_span(
)
@pytest.mark.parametrize("path", ["/team/list", "/organization/list"])
def test_openai_exception_handler_stamps_structured_error_on_span(
    wired_otel, server_span_factory, path
):
    """A 401 ProxyException must leave full error details on the SERVER span.

    Simulates an invalid/expired key on a management endpoint and checks that
    error.type, error.code and error.message are all present on the span.
    Before the fix, ProxyException stringified to "" so error.message was
    dropped, and the span showed an error with no message.
    """
    msg = "Authentication Error, Invalid proxy server token passed."
    server_span = server_span_factory(path)
    request = _fake_request(parent_otel_span=server_span)
    auth_failure = ProxyException(
        message=msg, type="auth_error", param="key", code=401
    )

    response = asyncio.run(openai_exception_handler(request, auth_failure))
    assert response.status_code == 401

    # Generic SERVER-span checks (status and URL path) shared with the other
    # handler tests.
    assert_server_span_attrs(
        wired_otel,
        expected_status=401,
        expected_url_path=path,
        where=f"openai_exception_handler ({path})",
    )

    # Structured error attributes specific to this test.
    span_attributes = get_server_span(wired_otel).attributes
    expected_pairs = (
        (ErrorAttributes.ERROR_MESSAGE, msg),
        (ErrorAttributes.ERROR_TYPE, "ProxyException"),
        (ErrorAttributes.ERROR_CODE, "401"),
    )
    for attribute_key, expected_value in expected_pairs:
        assert span_attributes.get(attribute_key) == expected_value
def test_unhandled_handler_reraises_known_exceptions(wired_otel, server_span_factory):
"""ProxyException / HTTPException / RequestValidationError have dedicated handlers."""
request = _fake_request(parent_otel_span=server_span_factory("/key/generate"))

View File

@ -3078,3 +3078,39 @@ class TestFirstApiCallStartTimeSetOnce:
assert obj.model_call_details["api_call_start_time"] > first
assert obj.model_call_details["first_api_call_start_time"] == first
assert user_meta == {}
def test_get_error_information_proxy_exception_preserves_message():
    """get_error_information must surface a ProxyException's message and code.

    ProxyException keeps its text in ``.message`` (``str()`` was empty before
    the fix), so the resulting error information must still carry the
    message, the class name and the code as a string.
    """
    from litellm.litellm_core_utils.litellm_logging import StandardLoggingPayloadSetup
    from litellm.proxy._types import ProxyException

    msg = "Authentication Error, Invalid proxy server token passed."
    auth_failure = ProxyException(
        message=msg, type="auth_error", param="key", code=401
    )

    info = StandardLoggingPayloadSetup.get_error_information(
        original_exception=auth_failure
    )

    expected_fields = (
        ("error_message", msg),
        ("error_class", "ProxyException"),
        ("error_code", "401"),
    )
    for field_name, expected_value in expected_fields:
        assert info[field_name] == expected_value
def test_get_error_information_prefers_message_attribute_over_empty_str():
    """A populated ``.message`` must win over an empty ``__str__``.

    Guards exception classes that store their text on ``.message`` without
    forwarding it to ``Exception.__init__``: error_message has to come from
    the attribute, and the code must still be reported as a string.
    """
    from litellm.litellm_core_utils.litellm_logging import StandardLoggingPayloadSetup

    class _SilentExc(Exception):
        # Deliberately does not call ``Exception.__init__`` so that the
        # message lives only on the attribute.
        def __init__(self):
            self.message = "real failure detail"
            self.code = 401

        def __str__(self):
            return ""

    silent_failure = _SilentExc()
    info = StandardLoggingPayloadSetup.get_error_information(
        original_exception=silent_failure
    )

    expected_fields = (
        ("error_message", "real failure detail"),
        ("error_code", "401"),
    )
    for field_name, expected_value in expected_fields:
        assert info[field_name] == expected_value

View File

@ -87,3 +87,22 @@ def test_user_api_key_auth_hashes_authorization_header_form_of_key():
assert from_header.api_key == baseline.api_key
assert from_header.token == baseline.token
assert not from_header.api_key.lower().startswith("bearer")
def test_proxy_exception_str_returns_message():
    """``str(ProxyException)`` must equal its message.

    OTEL's ``span.record_exception`` and ``str(exc)``-based logging read the
    string form, which was empty before the fix. The OpenAI-mapped fields
    returned by ``to_dict()`` must stay intact.
    """
    from litellm.proxy._types import ProxyException

    msg = "Authentication Error, Invalid proxy server token passed."
    auth_failure = ProxyException(
        message=msg, type="auth_error", param="key", code=401
    )

    # String form and explicit attribute must agree.
    assert str(auth_failure) == msg
    assert auth_failure.message == msg

    # The serialized payload is unchanged; the code is reported as a string.
    expected_payload = {
        "message": msg,
        "type": "auth_error",
        "param": "key",
        "code": "401",
    }
    assert auth_failure.to_dict() == expected_payload