fix: resolve flaky test failures in health, spend logs, and CLI tests (#21769)

* fix: reset db_health_cache in source module to prevent stale cache hits The test was reassigning db_health_cache via `global` in the test module, which doesn't affect the _health_endpoints module's variable. When a prior test set the cache to "connected" within 2 minutes, _db_health_readiness_check returned early without calling health_check(), causing assert_called_once to fail. Also use PrismaError with a connection message so it's properly recognized as a connection error by PrismaDBExceptionHandler.is_database_connection_error. * fix: replace asyncio.sleep with polling loop in spend logs tests The GLOBAL_LOGGING_WORKER processes callbacks via an async queue, so asyncio.sleep(1) is a race condition - under CI load the worker may not have processed the queued task within 1 second. Replace with a polling helper that waits up to 10 seconds for the mock to be called. Also add metadata.attempted_retries and metadata.max_retries to ignored_keys since these are new fields. * fix: isolate test_skip_server_startup from CI environment Remove mix_stderr=False (unsupported in some Click versions). Strip DATABASE_URL/DIRECT_URL from environment during the test to prevent real prisma operations when these are set in CI.
2026-02-21 10:02:24 -08:00 · 2026-02-21 10:02:24 -08:00 · 3e67cb5287
commit 3e67cb5287
parent 061a3cdc3e
3 changed files with 36 additions and 23 deletions
--- a/tests/test_litellm/proxy/health_endpoints/test_health_endpoints.py
+++ b/tests/test_litellm/proxy/health_endpoints/test_health_endpoints.py
@ -12,6 +12,7 @@ sys.path.insert(
 import pytest
 from prisma.errors import ClientNotConnectedError, HTTPClientClosedError, PrismaError

+import litellm.proxy.health_endpoints._health_endpoints as _health_endpoints_module
 from litellm.proxy.health_endpoints._health_endpoints import (
    _db_health_readiness_check,
    db_health_cache,
@ -31,7 +32,7 @@ from tests.test_litellm.proxy.conftest import create_proxy_test_client
@pytest.mark.parametrize(
    "prisma_error",
    [
-        PrismaError(),
+        PrismaError("Can't reach database server"),
        ClientNotConnectedError(),
        HTTPClientClosedError(),
    ],
@ -46,9 +47,9 @@ async def test_db_health_readiness_check_with_prisma_error(prisma_error):
    mock_prisma_client = MagicMock()
    mock_prisma_client.health_check.side_effect = prisma_error

-    # Reset the health cache to a known state
-    global db_health_cache
-    db_health_cache = {
+    # Reset the health cache in the source module so _db_health_readiness_check
+    # sees the updated value (assigning to a test-module global doesn't work).
+    _health_endpoints_module.db_health_cache = {
        "status": "unknown",
        "last_updated": datetime.now() - timedelta(minutes=5),
    }
@ -73,7 +74,7 @@ async def test_db_health_readiness_check_with_prisma_error(prisma_error):
@pytest.mark.parametrize(
    "prisma_error",
    [
-        PrismaError(),
+        PrismaError("Can't reach database server"),
        ClientNotConnectedError(),
        HTTPClientClosedError(),
    ],
@ -87,9 +88,8 @@ async def test_db_health_readiness_check_with_error_and_flag_off(prisma_error):
    mock_prisma_client = MagicMock()
    mock_prisma_client.health_check.side_effect = prisma_error

-    # Reset the health cache
-    global db_health_cache
-    db_health_cache = {
+    # Reset the health cache in the source module
+    _health_endpoints_module.db_health_cache = {
        "status": "unknown",
        "last_updated": datetime.now() - timedelta(minutes=5),
    }
@ -491,7 +491,7 @@ def test_get_callback_identifier_string_and_object_with_callback_name():
    - Object with empty/None callback_name (should fall through to other checks)
    """
    from litellm.proxy.health_endpoints._health_endpoints import get_callback_identifier
-    
+
    # Test 1: String callback should be returned as-is
    assert get_callback_identifier("datadog") == "datadog"
    assert get_callback_identifier("langfuse") == "langfuse"
@ -522,9 +522,9 @@ def test_get_callback_identifier_custom_logger_registry_and_fallback():
    - Object with callback_name that matches registry entry
    - Fallback to callback_name() helper function
    """
-    from litellm.proxy.health_endpoints._health_endpoints import get_callback_identifier
    from litellm.litellm_core_utils.custom_logger_registry import CustomLoggerRegistry
-    
+    from litellm.proxy.health_endpoints._health_endpoints import get_callback_identifier
+
    # Test 1: Object registered in CustomLoggerRegistry (without callback_name attribute)
    # Mock a class that's registered in the registry
    class MockRegisteredLogger:
--- a/tests/test_litellm/proxy/spend_tracking/test_spend_management_endpoints.py
+++ b/tests/test_litellm/proxy/spend_tracking/test_spend_management_endpoints.py
@ -299,6 +299,8 @@ ignored_keys = [
    "metadata.status",
    "metadata.proxy_server_request",
    "metadata.error_information",
+    "metadata.attempted_retries",
+    "metadata.max_retries",
 ]

 MODEL_LIST = [
@ -1196,6 +1198,18 @@ async def test_ui_view_spend_logs_with_key_hash(client, monkeypatch):
    assert data["data"][0]["api_key"] == "sk-test-key-1"


+async def _wait_for_mock_call(mock, timeout=10, interval=0.1):
+    """Poll until mock has been called at least once, or timeout."""
+    import time
+
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        if mock.call_count > 0:
+            return
+        await asyncio.sleep(interval)
+    mock.assert_called_once()  # will raise with a clear message
+
+
 class TestSpendLogsPayload:
    @pytest.mark.asyncio
    async def test_spend_logs_payload_e2e(self):
@ -1215,9 +1229,7 @@ class TestSpendLogsPayload:

            assert response.choices[0].message.content == "Hello, world!"

-            await asyncio.sleep(1)
-
-            mock_client.assert_called_once()
+            await _wait_for_mock_call(mock_client)

            kwargs = mock_client.call_args.kwargs
            payload: SpendLogsPayload = kwargs["payload"]
@ -1310,9 +1322,7 @@ class TestSpendLogsPayload:

            assert response.choices[0].message.content == "Hi! My name is Claude."

-            await asyncio.sleep(1)
-
-            mock_client.assert_called_once()
+            await _wait_for_mock_call(mock_client)

            kwargs = mock_client.call_args.kwargs
            payload: SpendLogsPayload = kwargs["payload"]
@ -1402,9 +1412,7 @@ class TestSpendLogsPayload:

            assert response.choices[0].message.content == "Hi! My name is Claude."

-            await asyncio.sleep(1)
-
-            mock_client.assert_called_once()
+            await _wait_for_mock_call(mock_client)

            kwargs = mock_client.call_args.kwargs
            payload: SpendLogsPayload = kwargs["payload"]
--- a/tests/test_litellm/proxy/test_proxy_cli.py
+++ b/tests/test_litellm/proxy/test_proxy_cli.py
@ -233,7 +233,7 @@ class TestProxyInitializationHelpers:

        from litellm.proxy.proxy_cli import run_server

-        runner = CliRunner(mix_stderr=False)
+        runner = CliRunner()

        mock_proxy_module = MagicMock(
            app=MagicMock(),
@ -241,7 +241,12 @@ class TestProxyInitializationHelpers:
            KeyManagementSettings=MagicMock(),
            save_worker_config=MagicMock(),
        )
+        # Remove DATABASE_URL/DIRECT_URL so the CLI doesn't attempt
+        # real prisma operations when these are set in CI.
+        clean_env = {k: v for k, v in os.environ.items() if k not in ("DATABASE_URL", "DIRECT_URL")}
        with patch.dict(
+            os.environ, clean_env, clear=True,
+        ), patch.dict(
            "sys.modules",
            {
                "proxy_server": mock_proxy_module,
@ -262,7 +267,7 @@ class TestProxyInitializationHelpers:
            # --- skip startup ---
            result = runner.invoke(run_server, ["--local", "--skip_server_startup"])

-            assert result.exit_code == 0
+            assert result.exit_code == 0, f"exit_code={result.exit_code}, output={result.output}"
            assert "Skipping server startup" in result.output
            mock_uvicorn_run.assert_not_called()

@ -271,7 +276,7 @@ class TestProxyInitializationHelpers:

            result = runner.invoke(run_server, ["--local"])

-            assert result.exit_code == 0
+            assert result.exit_code == 0, f"exit_code={result.exit_code}, output={result.output}"
            mock_uvicorn_run.assert_called_once()

    @patch("uvicorn.run")