feat(proxy): skip disable_background_health_check models on GET /health when flag set (#27716)

* feat(proxy): skip disable_background_health_check models on GET /health when flag set

  Co-authored-by: Cursor <cursoragent@cursor.com>

* fix comment
* fix greptile comments
* Fix health check fallback kwargs
* Format health endpoint
* Harden direct health check kwargs compatibility for monkeypatched perform_health_check

  Replace substring-based TypeError detection with unexpected-keyword checks and a short retry chain (full kwargs, instrumentation only, filter only, minimal) so partial stubs work regardless of which optional kwarg fails first. Add proxy unit tests for legacy three-arg stubs and single-kwarg variants.

  Co-authored-by: Sameer Kankute <Sameerlite@users.noreply.github.com>

* fix black

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Sameer Kankute <Sameerlite@users.noreply.github.com>
2026-05-13 22:19:05 +05:30 · 2026-05-13 22:19:05 +05:30 · 38709ba9bb
commit 38709ba9bb
parent 2e5ebf826f
9 changed files with 326 additions and 52 deletions
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@ -2401,6 +2401,13 @@ class ConfigGeneralSettings(LiteLLMPydanticObjectBase):
            "health checks run without a concurrency cap"
        ),
    )
+    health_check_skip_disabled_background_models: bool = Field(
+        False,
+        description=(
+            "When true, deployments with model_info.disable_background_health_check "
+            "are skipped for on-demand GET /health as well as the background health loop."
+        ),
+    )
    alerting: Optional[List] = Field(
        None,
        description="List of alerting integrations. Today, just slack - `alerting: ['slack']`",
--- a/litellm/proxy/health_check.py
+++ b/litellm/proxy/health_check.py
@ -86,6 +86,24 @@ def _clean_endpoint_data(endpoint_data: dict, details: Optional[bool] = True):
    )


+def health_check_filter_kwargs_from_general_settings(
+    general_settings: Optional[dict],
+) -> dict:
+    """
+    Build kwargs for ``perform_health_check`` from ``general_settings``.
+
+    When ``health_check_skip_disabled_background_models`` is true, deployments with
+    ``model_info.disable_background_health_check`` are omitted from health runs
+    (including on-demand ``GET /health``), matching the background loop behavior.
+    """
+    g = general_settings or {}
+    return {
+        "health_check_skip_disabled_background_models": bool(
+            g.get("health_check_skip_disabled_background_models", False)
+        ),
+    }
+
+
 def filter_deployments_by_id(
    model_list: List,
 ) -> List:
@ -438,6 +456,7 @@ async def perform_health_check(
    model_id: Optional[str] = None,
    max_concurrency: Optional[int] = None,
    instrumentation_context: Optional[dict] = None,
+    health_check_skip_disabled_background_models: bool = False,
 ):
    """
    Perform a health check on the system.
@ -446,6 +465,12 @@ async def perform_health_check(
    (so models that share the same name but have different ids are checked separately).
    When model (name) is provided, all deployments matching that name are checked.

+    When ``health_check_skip_disabled_background_models`` is True (via
+    ``general_settings.health_check_skip_disabled_background_models``), deployments
+    with ``model_info.disable_background_health_check: true`` are omitted from
+    this run (including targeted ``/health`` queries), consistent with the
+    background health loop.
+
    Returns:
        (bool): True if the health check passes, False otherwise.
    """
@ -486,6 +511,23 @@ async def perform_health_check(
            _new_model_list = [x for x in model_list if x["model_name"] == model]
        model_list = _new_model_list

+    if health_check_skip_disabled_background_models:
+        model_list = [
+            x
+            for x in model_list
+            if not (x.get("model_info") or {}).get(
+                "disable_background_health_check", False
+            )
+        ]
+    if not model_list:
+        if instrumentation_enabled:
+            logger.debug(
+                "health_check_cycle_skipped source=%s cycle_id=%s reason=no_models_after_filter",
+                source,
+                cycle_id,
+            )
+        return [], [], {}
+
    post_filter_model_count = len(model_list)
    model_list = filter_deployments_by_id(
        model_list=model_list
--- a/litellm/proxy/health_check_utils/shared_health_check_manager.py
+++ b/litellm/proxy/health_check_utils/shared_health_check_manager.py
@ -192,6 +192,7 @@ class SharedHealthCheckManager:
        model_list: List[Dict[str, Any]],
        details: bool = True,
        max_concurrency: Optional[int] = None,
+        health_check_skip_disabled_background_models: bool = False,
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], Dict[str, Any]]:
        """
        Perform health check with shared state coordination.
@ -207,6 +208,7 @@ class SharedHealthCheckManager:
            model_list: List of models to check
            details: Whether to include detailed information
            max_concurrency: Optional limit on concurrent health check requests
+            health_check_skip_disabled_background_models: Remove models with disable_background_health_check: true

        Returns:
            Tuple of (healthy_endpoints, unhealthy_endpoints)
@ -240,6 +242,7 @@ class SharedHealthCheckManager:
                    model_list=model_list,
                    details=details,
                    max_concurrency=max_concurrency,
+                    health_check_skip_disabled_background_models=health_check_skip_disabled_background_models,
                )

                # Cache the results
@ -260,6 +263,7 @@ class SharedHealthCheckManager:
                    model_list=model_list,
                    details=details,
                    max_concurrency=max_concurrency,
+                    health_check_skip_disabled_background_models=health_check_skip_disabled_background_models,
                )

            # Lock not acquired — poll for cached results until the lock
@ -316,6 +320,7 @@ class SharedHealthCheckManager:
                model_list=model_list,
                details=details,
                max_concurrency=max_concurrency,
+                health_check_skip_disabled_background_models=health_check_skip_disabled_background_models,
            )

    async def is_health_check_in_progress(self) -> bool:
--- a/litellm/proxy/health_endpoints/_health_endpoints.py
+++ b/litellm/proxy/health_endpoints/_health_endpoints.py
@ -32,6 +32,7 @@ from litellm.proxy.health_check import (
    ADMIN_ONLY_HEALTH_DISPLAY_PARAMS,
    _clean_endpoint_data,
    _update_litellm_params_for_health_check,
+    health_check_filter_kwargs_from_general_settings,
    perform_health_check,
    run_with_timeout,
 )
@ -858,6 +859,7 @@ async def _perform_health_check_and_save(
    user_id,
    model_id=None,
    max_concurrency=None,
+    **perform_health_check_extra,
 ):
    """Helper function to perform health check and save results to database"""
    healthy_endpoints, unhealthy_endpoints, _ = await perform_health_check(
@ -867,6 +869,7 @@ async def _perform_health_check_and_save(
        details=details,
        max_concurrency=max_concurrency,
        model_id=model_id,
+        **perform_health_check_extra,
    )

    # Optionally save health check result to database (non-blocking)
@ -894,6 +897,37 @@ async def _perform_health_check_and_save(
    }


+def _health_endpoint_resolve_target_model_name(
+    model: Optional[str],
+    model_id: Optional[str],
+    llm_router,
+) -> Optional[str]:
+    """Map ``model_id`` (without ``model``) to ``model_name`` for live health checks."""
+    if not model_id or model:
+        return model
+    if llm_router is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail={"error": f"Model with ID {model_id} not found"},
+        )
+    try:
+        deployment = llm_router.get_deployment(model_id=model_id)
+    except Exception as e:
+        verbose_proxy_logger.error(
+            f"Error getting deployment for model_id {model_id}: {e}"
+        )
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail={"error": f"Model with ID {model_id} not found"},
+        ) from e
+    if deployment is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail={"error": f"Model with ID {model_id} not found"},
+        )
+    return deployment.model_name
+
+
@router.get("/health", tags=["health"], dependencies=[Depends(user_api_key_auth)])
 async def health_endpoint(
    response: Response,
@ -920,10 +954,15 @@ async def health_endpoint(
        background_health_checks: True
    ```
    else, the health checks will be run on models when /health is called.
+
+    To skip deployments that set ``model_info.disable_background_health_check: true``
+    on ``GET /health`` as well as in the background loop, set
+    ``general_settings.health_check_skip_disabled_background_models: true``.
    """
    import time

    from litellm.proxy.proxy_server import (
+        general_settings,
        health_check_concurrency,
        health_check_details,
        health_check_results,
@ -934,35 +973,12 @@ async def health_endpoint(
        user_model,
    )

+    _hc_filter = health_check_filter_kwargs_from_general_settings(general_settings)
    start_time = time.time()

-    # Handle model_id parameter - convert to model name for health check
-    target_model = model
-    if model_id and not model:
-        # Use get_deployment from router to find the model name
-        if llm_router is not None:
-            try:
-                deployment = llm_router.get_deployment(model_id=model_id)
-                if deployment is not None:
-                    target_model = deployment.model_name
-                else:
-                    raise HTTPException(
-                        status_code=status.HTTP_404_NOT_FOUND,
-                        detail={"error": f"Model with ID {model_id} not found"},
-                    )
-            except Exception as e:
-                verbose_proxy_logger.error(
-                    f"Error getting deployment for model_id {model_id}: {e}"
-                )
-                raise HTTPException(
-                    status_code=status.HTTP_404_NOT_FOUND,
-                    detail={"error": f"Model with ID {model_id} not found"},
-                )
-        else:
-            raise HTTPException(
-                status_code=status.HTTP_404_NOT_FOUND,
-                detail={"error": f"Model with ID {model_id} not found"},
-            )
+    target_model = _health_endpoint_resolve_target_model_name(
+        model, model_id, llm_router
+    )

    is_admin = _is_proxy_admin(user_api_key_dict)
    model_specific_request = bool(model or model_id)
@ -1000,6 +1016,7 @@ async def health_endpoint(
                    user_id=user_api_key_dict.user_id,
                    model_id=None,  # CLI model doesn't have model_id
                    max_concurrency=health_check_concurrency,
+                    **_hc_filter,
                )
                return _post_process(cli_result)
            raise HTTPException(
@ -1085,6 +1102,7 @@ async def health_endpoint(
                user_id=user_api_key_dict.user_id,
                model_id=model_id,
                max_concurrency=health_check_concurrency,
+                **_hc_filter,
            )
            return _post_process(router_result)
    except Exception as e:
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -314,7 +314,10 @@ from litellm.proxy.guardrails.init_guardrails import (
    init_guardrails_v2,
    initialize_guardrails,
 )
-from litellm.proxy.health_check import perform_health_check
+from litellm.proxy.health_check import (
+    health_check_filter_kwargs_from_general_settings,
+    perform_health_check,
+)
 from litellm.proxy.health_endpoints._health_endpoints import router as health_router
 from litellm.proxy.hooks.model_max_budget_limiter import (
    _PROXY_VirtualKeyModelMaxBudgetLimiter,
@ -2733,29 +2736,44 @@ def _rss_mb_for_log() -> str:
    return f"{rss_mb:.2f}"


+def _is_unexpected_keyword_argument_type_error(exc: BaseException) -> bool:
+    """True when ``exc`` is a TypeError from passing a kwarg the callee does not accept."""
+    return isinstance(exc, TypeError) and (
+        "unexpected keyword argument" in str(exc).lower()
+    )
+
+
 async def _run_direct_health_check_with_instrumentation(
    model_list: list,
    details: Optional[bool],
    max_concurrency: Optional[int],
    instrumentation_context: dict,
 ):
-    try:
-        return await perform_health_check(
-            model_list=model_list,
-            details=details,
-            max_concurrency=max_concurrency,
-            instrumentation_context=instrumentation_context,
-        )
-    except TypeError as e:
-        if "instrumentation_context" not in str(e):
-            raise
-        # Backward compatibility for monkeypatched or wrapped callables
-        # that do not accept instrumentation_context.
-        return await perform_health_check(
-            model_list=model_list,
-            details=details,
-            max_concurrency=max_concurrency,
-        )
+    """Call ``perform_health_check``, retrying with fewer kwargs on unexpected-kw TypeErrors."""
+    _hc_filter = health_check_filter_kwargs_from_general_settings(general_settings)
+    last_type_error: Optional[TypeError] = None
+    for extra_kwargs in (
+        {
+            "instrumentation_context": instrumentation_context,
+            **_hc_filter,
+        },
+        {"instrumentation_context": instrumentation_context},
+        dict(_hc_filter),
+        {},
+    ):
+        try:
+            return await perform_health_check(
+                model_list=model_list,
+                details=details,
+                max_concurrency=max_concurrency,
+                **extra_kwargs,
+            )
+        except TypeError as e:
+            if not _is_unexpected_keyword_argument_type_error(e):
+                raise
+            last_type_error = e
+    assert last_type_error is not None
+    raise last_type_error


 def _schedule_background_health_check_db_save(
@ -3020,6 +3038,7 @@ async def _run_background_health_check():
        details_bool = (
            health_check_details if health_check_details is not None else True
        )
+        _hc_filter = health_check_filter_kwargs_from_general_settings(general_settings)

        if shared_health_manager is not None:
            try:
@ -3031,6 +3050,7 @@ async def _run_background_health_check():
                    model_list=_llm_model_list,
                    details=details_bool,
                    max_concurrency=health_check_concurrency,
+                    **_hc_filter,
                )
            except Exception as e:
                verbose_proxy_logger.error(
@ -3043,7 +3063,7 @@ async def _run_background_health_check():
                    _exceptions_by_model_id,
                ) = await _run_direct_health_check_with_instrumentation(
                    _llm_model_list,
-                    health_check_details,
+                    details_bool,
                    health_check_concurrency,
                    instrumentation_context,
                )
@ -3054,7 +3074,7 @@ async def _run_background_health_check():
                _exceptions_by_model_id,
            ) = await _run_direct_health_check_with_instrumentation(
                _llm_model_list,
-                health_check_details,
+                details_bool,
                health_check_concurrency,
                instrumentation_context,
            )
--- a/tests/litellm_utils_tests/test_health_check.py
+++ b/tests/litellm_utils_tests/test_health_check.py
@ -495,6 +495,45 @@ async def test_perform_health_check_filters_by_model_id():
    assert healthy_endpoints[0]["api_key"] == "fake-key-2"


+@pytest.mark.asyncio
+async def test_perform_health_check_skip_disabled_background_models():
+    from litellm.proxy.health_check import perform_health_check
+
+    model_list = [
+        {
+            "model_name": "a",
+            "model_info": {"id": "id-a"},
+            "litellm_params": {"model": "m-a", "api_key": "k1"},
+        },
+        {
+            "model_name": "b",
+            "model_info": {
+                "id": "id-b",
+                "disable_background_health_check": True,
+            },
+            "litellm_params": {"model": "m-b", "api_key": "k2"},
+        },
+    ]
+    captured = []
+
+    async def mock_inner(m_list, details=True, **kwargs):
+        captured.append(list(m_list))
+        return [], [], {}
+
+    with patch(
+        "litellm.proxy.health_check._perform_health_check",
+        side_effect=mock_inner,
+    ):
+        await perform_health_check(
+            model_list=model_list,
+            health_check_skip_disabled_background_models=True,
+        )
+
+    assert len(captured) == 1
+    assert len(captured[0]) == 1
+    assert captured[0][0]["model_name"] == "a"
+
+
@pytest.mark.asyncio
 async def test_perform_health_check_with_health_check_model():
    """
--- a/tests/proxy_unit_tests/test_proxy_server.py
+++ b/tests/proxy_unit_tests/test_proxy_server.py
@ -2485,7 +2485,9 @@ async def test_background_health_check_skip_disabled_models(monkeypatch):
    ]
    called_model_lists = []

-    async def fake_perform_health_check(model_list, details, max_concurrency=None):
+    async def fake_perform_health_check(
+        model_list, details, max_concurrency=None, **kwargs
+    ):
        called_model_lists.append(copy.deepcopy(model_list))
        return (["healthy"], [], {})

@ -2508,6 +2510,100 @@ async def test_background_health_check_skip_disabled_models(monkeypatch):
    assert called_model_lists == [[{"model_name": "model-a"}]]


+@pytest.mark.asyncio
+async def test_run_direct_health_check_with_instrumentation_legacy_three_arg_stub(
+    monkeypatch,
+):
+    """Monkeypatched perform_health_check with only base kwargs should still run."""
+    import litellm.proxy.proxy_server as proxy_server
+
+    async def fake_perform_health_check(model_list, details, max_concurrency=None):
+        return ([], [], {})
+
+    monkeypatch.setattr(proxy_server, "perform_health_check", fake_perform_health_check)
+    result = await proxy_server._run_direct_health_check_with_instrumentation(
+        [{"model_name": "m"}],
+        True,
+        1,
+        {"enabled": True, "source": "test", "cycle_id": "c1"},
+    )
+    assert result == ([], [], {})
+
+
+@pytest.mark.asyncio
+async def test_run_direct_health_check_with_instrumentation_accepts_instrumentation_only(
+    monkeypatch,
+):
+    """Stub that accepts instrumentation_context but not health_check filter kwargs."""
+    import litellm.proxy.proxy_server as proxy_server
+
+    seen: list = []
+
+    async def fake_perform_health_check(
+        model_list, details, max_concurrency=None, instrumentation_context=None
+    ):
+        seen.append(instrumentation_context)
+        return ([], [], {})
+
+    monkeypatch.setattr(proxy_server, "perform_health_check", fake_perform_health_check)
+    await proxy_server._run_direct_health_check_with_instrumentation(
+        [],
+        False,
+        2,
+        {"enabled": True, "source": "test", "cycle_id": "c2"},
+    )
+    assert len(seen) == 1
+    assert seen[0]["cycle_id"] == "c2"
+
+
+@pytest.mark.asyncio
+async def test_run_direct_health_check_with_instrumentation_accepts_filter_only(
+    monkeypatch,
+):
+    """Stub that accepts health_check_skip_disabled_background_models but not instrumentation."""
+    import litellm.proxy.proxy_server as proxy_server
+
+    seen: list = []
+
+    async def fake_perform_health_check(
+        model_list,
+        details,
+        max_concurrency=None,
+        health_check_skip_disabled_background_models=False,
+    ):
+        seen.append(health_check_skip_disabled_background_models)
+        return ([], [], {})
+
+    monkeypatch.setattr(proxy_server, "perform_health_check", fake_perform_health_check)
+    await proxy_server._run_direct_health_check_with_instrumentation(
+        [],
+        True,
+        None,
+        {"enabled": False},
+    )
+    assert len(seen) == 1
+    assert seen[0] is False
+
+
+@pytest.mark.asyncio
+async def test_run_direct_health_check_with_instrumentation_non_kw_typeerror_reraises(
+    monkeypatch,
+):
+    import litellm.proxy.proxy_server as proxy_server
+
+    async def fake_perform_health_check(**kwargs):
+        raise TypeError("unsupported operand type(s)")
+
+    monkeypatch.setattr(proxy_server, "perform_health_check", fake_perform_health_check)
+    with pytest.raises(TypeError, match="unsupported operand"):
+        await proxy_server._run_direct_health_check_with_instrumentation(
+            [],
+            True,
+            1,
+            {},
+        )
+
+
 def test_get_timeout_from_request():
    from litellm.proxy.litellm_pre_call_utils import LiteLLMProxyRequestSetup

--- a/tests/test_litellm/proxy/test_health_check_functions.py
+++ b/tests/test_litellm/proxy/test_health_check_functions.py
@ -566,6 +566,7 @@ async def test_perform_health_check_and_save_passes_model_id_to_perform_health_c
        details=True,
        model_id=None,
        max_concurrency=None,
+        **kwargs,
    ):
        return healthy, unhealthy, {}

@ -591,5 +592,39 @@ async def test_perform_health_check_and_save_passes_model_id_to_perform_health_c
    assert result["unhealthy_count"] == 0


+@pytest.mark.asyncio
+async def test_perform_health_check_and_save_forwards_skip_disabled_background_flag():
+    """health_check_skip_disabled_background_models should reach perform_health_check."""
+    model_list = [
+        {
+            "model_name": "gpt-4",
+            "model_info": {"id": "deployment-abc"},
+            "litellm_params": {"model": "gpt-4"},
+        },
+    ]
+
+    async def mock_perform_health_check(**kwargs):
+        return [], [], {}
+
+    with patch(
+        "litellm.proxy.health_endpoints._health_endpoints.perform_health_check",
+        side_effect=mock_perform_health_check,
+    ) as mock_perform:
+        await _perform_health_check_and_save(
+            model_list=model_list,
+            target_model=None,
+            cli_model=None,
+            details=True,
+            prisma_client=None,
+            start_time=0.0,
+            user_id="user-1",
+            model_id=None,
+            health_check_skip_disabled_background_models=True,
+        )
+
+    call_kwargs = mock_perform.call_args[1]
+    assert call_kwargs["health_check_skip_disabled_background_models"] is True
+
+
 if __name__ == "__main__":
    pytest.main([__file__])
--- a/tests/test_litellm/proxy/test_shared_health_check.py
+++ b/tests/test_litellm/proxy/test_shared_health_check.py
@ -310,7 +310,10 @@ class TestSharedHealthCheckManager:

        # Should call perform_health_check and cache results
        mock_perform.assert_called_once_with(
-            model_list=model_list, details=True, max_concurrency=None
+            model_list=model_list,
+            details=True,
+            max_concurrency=None,
+            health_check_skip_disabled_background_models=False,
        )
        assert healthy == expected_healthy
        assert unhealthy == expected_unhealthy
@ -397,7 +400,10 @@ class TestSharedHealthCheckManager:
        assert mock_sleep.call_count == 2
        mock_sleep.assert_called_with(5)
        mock_perform.assert_called_once_with(
-            model_list=model_list, details=True, max_concurrency=None
+            model_list=model_list,
+            details=True,
+            max_concurrency=None,
+            health_check_skip_disabled_background_models=False,
        )
        assert healthy == expected_healthy
        assert unhealthy == expected_unhealthy
@ -437,7 +443,10 @@ class TestSharedHealthCheckManager:
        # Should detect orphaned lock after 1 iteration and fall back immediately
        mock_sleep.assert_called_once_with(5)
        mock_perform.assert_called_once_with(
-            model_list=model_list, details=True, max_concurrency=None
+            model_list=model_list,
+            details=True,
+            max_concurrency=None,
+            health_check_skip_disabled_background_models=False,
        )
        assert healthy == expected_healthy
        assert unhealthy == expected_unhealthy
@ -506,7 +515,10 @@ class TestSharedHealthCheckManager:
        # Should NOT sleep at all — falls back to local health check immediately
        mock_sleep.assert_not_called()
        mock_perform.assert_called_once_with(
-            model_list=model_list, details=True, max_concurrency=None
+            model_list=model_list,
+            details=True,
+            max_concurrency=None,
+            health_check_skip_disabled_background_models=False,
        )
        assert healthy == expected_healthy
        assert unhealthy == expected_unhealthy