fix: resolve flaky test failures in health, spend logs, and CLI tests (#21769)

* fix: reset db_health_cache in source module to prevent stale cache hits

The test was reassigning db_health_cache via `global` in the test module,
which doesn't affect the _health_endpoints module's variable. When a prior
test set the cache to "connected" within 2 minutes, _db_health_readiness_check
returned early without calling health_check(), causing assert_called_once to fail.

Also use PrismaError with a connection message so it's properly recognized
as a connection error by PrismaDBExceptionHandler.is_database_connection_error.

* fix: replace asyncio.sleep with polling loop in spend logs tests

The GLOBAL_LOGGING_WORKER processes callbacks via an async queue, so
asyncio.sleep(1) is a race condition - under CI load the worker may not
have processed the queued task within 1 second. Replace with a polling
helper that waits up to 10 seconds for the mock to be called.

Also add metadata.attempted_retries and metadata.max_retries to
ignored_keys since these are new fields.

* fix: isolate test_skip_server_startup from CI environment

Remove mix_stderr=False (unsupported in some Click versions). Strip
DATABASE_URL/DIRECT_URL from environment during the test to prevent
real prisma operations when these are set in CI.
This commit is contained in:
Ishaan Jaff 2026-02-21 10:02:24 -08:00 committed by GitHub
parent 061a3cdc3e
commit 3e67cb5287
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 36 additions and 23 deletions

View File

@ -12,6 +12,7 @@ sys.path.insert(
import pytest
from prisma.errors import ClientNotConnectedError, HTTPClientClosedError, PrismaError
import litellm.proxy.health_endpoints._health_endpoints as _health_endpoints_module
from litellm.proxy.health_endpoints._health_endpoints import (
_db_health_readiness_check,
db_health_cache,
@ -31,7 +32,7 @@ from tests.test_litellm.proxy.conftest import create_proxy_test_client
@pytest.mark.parametrize(
"prisma_error",
[
PrismaError(),
PrismaError("Can't reach database server"),
ClientNotConnectedError(),
HTTPClientClosedError(),
],
@ -46,9 +47,9 @@ async def test_db_health_readiness_check_with_prisma_error(prisma_error):
mock_prisma_client = MagicMock()
mock_prisma_client.health_check.side_effect = prisma_error
# Reset the health cache to a known state
global db_health_cache
db_health_cache = {
# Reset the health cache in the source module so _db_health_readiness_check
# sees the updated value (assigning to a test-module global doesn't work).
_health_endpoints_module.db_health_cache = {
"status": "unknown",
"last_updated": datetime.now() - timedelta(minutes=5),
}
@ -73,7 +74,7 @@ async def test_db_health_readiness_check_with_prisma_error(prisma_error):
@pytest.mark.parametrize(
"prisma_error",
[
PrismaError(),
PrismaError("Can't reach database server"),
ClientNotConnectedError(),
HTTPClientClosedError(),
],
@ -87,9 +88,8 @@ async def test_db_health_readiness_check_with_error_and_flag_off(prisma_error):
mock_prisma_client = MagicMock()
mock_prisma_client.health_check.side_effect = prisma_error
# Reset the health cache
global db_health_cache
db_health_cache = {
# Reset the health cache in the source module
_health_endpoints_module.db_health_cache = {
"status": "unknown",
"last_updated": datetime.now() - timedelta(minutes=5),
}
@ -491,7 +491,7 @@ def test_get_callback_identifier_string_and_object_with_callback_name():
- Object with empty/None callback_name (should fall through to other checks)
"""
from litellm.proxy.health_endpoints._health_endpoints import get_callback_identifier
# Test 1: String callback should be returned as-is
assert get_callback_identifier("datadog") == "datadog"
assert get_callback_identifier("langfuse") == "langfuse"
@ -522,9 +522,9 @@ def test_get_callback_identifier_custom_logger_registry_and_fallback():
- Object with callback_name that matches registry entry
- Fallback to callback_name() helper function
"""
from litellm.proxy.health_endpoints._health_endpoints import get_callback_identifier
from litellm.litellm_core_utils.custom_logger_registry import CustomLoggerRegistry
from litellm.proxy.health_endpoints._health_endpoints import get_callback_identifier
# Test 1: Object registered in CustomLoggerRegistry (without callback_name attribute)
# Mock a class that's registered in the registry
class MockRegisteredLogger:

View File

@ -299,6 +299,8 @@ ignored_keys = [
"metadata.status",
"metadata.proxy_server_request",
"metadata.error_information",
"metadata.attempted_retries",
"metadata.max_retries",
]
MODEL_LIST = [
@ -1196,6 +1198,18 @@ async def test_ui_view_spend_logs_with_key_hash(client, monkeypatch):
assert data["data"][0]["api_key"] == "sk-test-key-1"
async def _wait_for_mock_call(mock, timeout=10, interval=0.1):
"""Poll until mock has been called at least once, or timeout."""
import time
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
if mock.call_count > 0:
return
await asyncio.sleep(interval)
mock.assert_called_once() # will raise with a clear message
class TestSpendLogsPayload:
@pytest.mark.asyncio
async def test_spend_logs_payload_e2e(self):
@ -1215,9 +1229,7 @@ class TestSpendLogsPayload:
assert response.choices[0].message.content == "Hello, world!"
await asyncio.sleep(1)
mock_client.assert_called_once()
await _wait_for_mock_call(mock_client)
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
@ -1310,9 +1322,7 @@ class TestSpendLogsPayload:
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
await _wait_for_mock_call(mock_client)
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]
@ -1402,9 +1412,7 @@ class TestSpendLogsPayload:
assert response.choices[0].message.content == "Hi! My name is Claude."
await asyncio.sleep(1)
mock_client.assert_called_once()
await _wait_for_mock_call(mock_client)
kwargs = mock_client.call_args.kwargs
payload: SpendLogsPayload = kwargs["payload"]

View File

@ -233,7 +233,7 @@ class TestProxyInitializationHelpers:
from litellm.proxy.proxy_cli import run_server
runner = CliRunner(mix_stderr=False)
runner = CliRunner()
mock_proxy_module = MagicMock(
app=MagicMock(),
@ -241,7 +241,12 @@ class TestProxyInitializationHelpers:
KeyManagementSettings=MagicMock(),
save_worker_config=MagicMock(),
)
# Remove DATABASE_URL/DIRECT_URL so the CLI doesn't attempt
# real prisma operations when these are set in CI.
clean_env = {k: v for k, v in os.environ.items() if k not in ("DATABASE_URL", "DIRECT_URL")}
with patch.dict(
os.environ, clean_env, clear=True,
), patch.dict(
"sys.modules",
{
"proxy_server": mock_proxy_module,
@ -262,7 +267,7 @@ class TestProxyInitializationHelpers:
# --- skip startup ---
result = runner.invoke(run_server, ["--local", "--skip_server_startup"])
assert result.exit_code == 0
assert result.exit_code == 0, f"exit_code={result.exit_code}, output={result.output}"
assert "Skipping server startup" in result.output
mock_uvicorn_run.assert_not_called()
@ -271,7 +276,7 @@ class TestProxyInitializationHelpers:
result = runner.invoke(run_server, ["--local"])
assert result.exit_code == 0
assert result.exit_code == 0, f"exit_code={result.exit_code}, output={result.output}"
mock_uvicorn_run.assert_called_once()
@patch("uvicorn.run")