feat(proxy): skip disable_background_health_check models on GET /health when flag set (#27716)

* feat(proxy): skip disable_background_health_check models on GET /health when flag set

Co-authored-by: Cursor <cursoragent@cursor.com>

* fix comment

* fix greptile comments

* Fix health check fallback kwargs

* Format health endpoint

* Harden direct health check kwargs compatibility for monkeypatched perform_health_check

Replace substring-based TypeError detection with unexpected-keyword checks
and a short retry chain (full kwargs, instrumentation only, filter only,
minimal) so partial stubs work regardless of which optional kwarg fails first.
Add proxy unit tests for legacy three-arg stubs and single-kwarg variants.

Co-authored-by: Sameer Kankute <Sameerlite@users.noreply.github.com>

* fix black

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Sameer Kankute <Sameerlite@users.noreply.github.com>
This commit is contained in:
Sameer Kankute 2026-05-13 22:19:05 +05:30 committed by GitHub
parent 2e5ebf826f
commit 38709ba9bb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 326 additions and 52 deletions

View File

@ -2401,6 +2401,13 @@ class ConfigGeneralSettings(LiteLLMPydanticObjectBase):
"health checks run without a concurrency cap"
),
)
health_check_skip_disabled_background_models: bool = Field(
False,
description=(
"When true, deployments with model_info.disable_background_health_check "
"are skipped for on-demand GET /health as well as the background health loop."
),
)
alerting: Optional[List] = Field(
None,
description="List of alerting integrations. Today, just slack - `alerting: ['slack']`",

View File

@ -86,6 +86,24 @@ def _clean_endpoint_data(endpoint_data: dict, details: Optional[bool] = True):
)
def health_check_filter_kwargs_from_general_settings(
    general_settings: Optional[dict],
) -> dict:
    """
    Translate ``general_settings`` into keyword arguments for ``perform_health_check``.

    Only ``health_check_skip_disabled_background_models`` is read. When it is
    truthy, deployments that set ``model_info.disable_background_health_check``
    are left out of health runs (on-demand ``GET /health`` included), in line
    with what the background loop does.

    Args:
        general_settings: The proxy's ``general_settings`` mapping; ``None`` or
            an empty mapping is treated as "flag not set".

    Returns:
        A one-key dict whose value is always a real ``bool``.
    """
    settings = general_settings if general_settings else {}
    skip_flag = settings.get("health_check_skip_disabled_background_models", False)
    return {"health_check_skip_disabled_background_models": bool(skip_flag)}
def filter_deployments_by_id(
model_list: List,
) -> List:
@ -438,6 +456,7 @@ async def perform_health_check(
model_id: Optional[str] = None,
max_concurrency: Optional[int] = None,
instrumentation_context: Optional[dict] = None,
health_check_skip_disabled_background_models: bool = False,
):
"""
Perform a health check on the system.
@ -446,6 +465,12 @@ async def perform_health_check(
(so models that share the same name but have different ids are checked separately).
When model (name) is provided, all deployments matching that name are checked.
When ``health_check_skip_disabled_background_models`` is True (via
``general_settings.health_check_skip_disabled_background_models``), deployments
with ``model_info.disable_background_health_check: true`` are omitted from
this run (including targeted ``/health`` queries), consistent with the
background health loop.
Returns:
(tuple): (healthy_endpoints, unhealthy_endpoints, exceptions_by_model_id) — two lists and a dict.
"""
@ -486,6 +511,23 @@ async def perform_health_check(
_new_model_list = [x for x in model_list if x["model_name"] == model]
model_list = _new_model_list
if health_check_skip_disabled_background_models:
model_list = [
x
for x in model_list
if not (x.get("model_info") or {}).get(
"disable_background_health_check", False
)
]
if not model_list:
if instrumentation_enabled:
logger.debug(
"health_check_cycle_skipped source=%s cycle_id=%s reason=no_models_after_filter",
source,
cycle_id,
)
return [], [], {}
post_filter_model_count = len(model_list)
model_list = filter_deployments_by_id(
model_list=model_list

View File

@ -192,6 +192,7 @@ class SharedHealthCheckManager:
model_list: List[Dict[str, Any]],
details: bool = True,
max_concurrency: Optional[int] = None,
health_check_skip_disabled_background_models: bool = False,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], Dict[str, Any]]:
"""
Perform health check with shared state coordination.
@ -207,6 +208,7 @@ class SharedHealthCheckManager:
model_list: List of models to check
details: Whether to include detailed information
max_concurrency: Optional limit on concurrent health check requests
health_check_skip_disabled_background_models: Remove models with disable_background_health_check: true
Returns:
Tuple of (healthy_endpoints, unhealthy_endpoints, and a third dict as returned by perform_health_check)
@ -240,6 +242,7 @@ class SharedHealthCheckManager:
model_list=model_list,
details=details,
max_concurrency=max_concurrency,
health_check_skip_disabled_background_models=health_check_skip_disabled_background_models,
)
# Cache the results
@ -260,6 +263,7 @@ class SharedHealthCheckManager:
model_list=model_list,
details=details,
max_concurrency=max_concurrency,
health_check_skip_disabled_background_models=health_check_skip_disabled_background_models,
)
# Lock not acquired — poll for cached results until the lock
@ -316,6 +320,7 @@ class SharedHealthCheckManager:
model_list=model_list,
details=details,
max_concurrency=max_concurrency,
health_check_skip_disabled_background_models=health_check_skip_disabled_background_models,
)
async def is_health_check_in_progress(self) -> bool:

View File

@ -32,6 +32,7 @@ from litellm.proxy.health_check import (
ADMIN_ONLY_HEALTH_DISPLAY_PARAMS,
_clean_endpoint_data,
_update_litellm_params_for_health_check,
health_check_filter_kwargs_from_general_settings,
perform_health_check,
run_with_timeout,
)
@ -858,6 +859,7 @@ async def _perform_health_check_and_save(
user_id,
model_id=None,
max_concurrency=None,
**perform_health_check_extra,
):
"""Helper function to perform health check and save results to database"""
healthy_endpoints, unhealthy_endpoints, _ = await perform_health_check(
@ -867,6 +869,7 @@ async def _perform_health_check_and_save(
details=details,
max_concurrency=max_concurrency,
model_id=model_id,
**perform_health_check_extra,
)
# Optionally save health check result to database (non-blocking)
@ -894,6 +897,37 @@ async def _perform_health_check_and_save(
}
def _health_endpoint_resolve_target_model_name(
    model: Optional[str],
    model_id: Optional[str],
    llm_router,
) -> Optional[str]:
    """
    Resolve the model name a live ``/health`` check should target.

    ``model`` is returned unchanged when it is given or when there is no
    ``model_id`` to resolve. Otherwise the router is asked for the deployment
    with that id and its ``model_name`` is returned.

    Raises:
        HTTPException: 404 when no router is available, when the router lookup
            raises (the error is logged first), or when it returns ``None``.
    """
    # An explicit model name always wins; without a model_id there is nothing to map.
    if model or not model_id:
        return model

    def _not_found():
        # Single place that builds the 404 used by every failure path below.
        return HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail={"error": f"Model with ID {model_id} not found"},
        )

    if llm_router is None:
        raise _not_found()

    try:
        deployment = llm_router.get_deployment(model_id=model_id)
    except Exception as e:
        verbose_proxy_logger.error(
            f"Error getting deployment for model_id {model_id}: {e}"
        )
        raise _not_found() from e

    if deployment is None:
        raise _not_found()
    return deployment.model_name
@router.get("/health", tags=["health"], dependencies=[Depends(user_api_key_auth)])
async def health_endpoint(
response: Response,
@ -920,10 +954,15 @@ async def health_endpoint(
background_health_checks: True
```
else, the health checks will be run on models when /health is called.
To skip deployments that set ``model_info.disable_background_health_check: true``
on ``GET /health`` as well as in the background loop, set
``general_settings.health_check_skip_disabled_background_models: true``.
"""
import time
from litellm.proxy.proxy_server import (
general_settings,
health_check_concurrency,
health_check_details,
health_check_results,
@ -934,35 +973,12 @@ async def health_endpoint(
user_model,
)
_hc_filter = health_check_filter_kwargs_from_general_settings(general_settings)
start_time = time.time()
# Handle model_id parameter - convert to model name for health check
target_model = model
if model_id and not model:
# Use get_deployment from router to find the model name
if llm_router is not None:
try:
deployment = llm_router.get_deployment(model_id=model_id)
if deployment is not None:
target_model = deployment.model_name
else:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail={"error": f"Model with ID {model_id} not found"},
)
except Exception as e:
verbose_proxy_logger.error(
f"Error getting deployment for model_id {model_id}: {e}"
)
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail={"error": f"Model with ID {model_id} not found"},
)
else:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail={"error": f"Model with ID {model_id} not found"},
)
target_model = _health_endpoint_resolve_target_model_name(
model, model_id, llm_router
)
is_admin = _is_proxy_admin(user_api_key_dict)
model_specific_request = bool(model or model_id)
@ -1000,6 +1016,7 @@ async def health_endpoint(
user_id=user_api_key_dict.user_id,
model_id=None, # CLI model doesn't have model_id
max_concurrency=health_check_concurrency,
**_hc_filter,
)
return _post_process(cli_result)
raise HTTPException(
@ -1085,6 +1102,7 @@ async def health_endpoint(
user_id=user_api_key_dict.user_id,
model_id=model_id,
max_concurrency=health_check_concurrency,
**_hc_filter,
)
return _post_process(router_result)
except Exception as e:

View File

@ -314,7 +314,10 @@ from litellm.proxy.guardrails.init_guardrails import (
init_guardrails_v2,
initialize_guardrails,
)
from litellm.proxy.health_check import perform_health_check
from litellm.proxy.health_check import (
health_check_filter_kwargs_from_general_settings,
perform_health_check,
)
from litellm.proxy.health_endpoints._health_endpoints import router as health_router
from litellm.proxy.hooks.model_max_budget_limiter import (
_PROXY_VirtualKeyModelMaxBudgetLimiter,
@ -2733,29 +2736,44 @@ def _rss_mb_for_log() -> str:
return f"{rss_mb:.2f}"
def _is_unexpected_keyword_argument_type_error(exc: BaseException) -> bool:
    """
    Report whether ``exc`` looks like a rejected-keyword ``TypeError``.

    The check is a case-insensitive search for the phrase
    "unexpected keyword argument" in the exception text; any exception that is
    not a ``TypeError`` is never a match.
    """
    if not isinstance(exc, TypeError):
        return False
    message = str(exc).lower()
    return "unexpected keyword argument" in message
async def _run_direct_health_check_with_instrumentation(
model_list: list,
details: Optional[bool],
max_concurrency: Optional[int],
instrumentation_context: dict,
):
try:
return await perform_health_check(
model_list=model_list,
details=details,
max_concurrency=max_concurrency,
instrumentation_context=instrumentation_context,
)
except TypeError as e:
if "instrumentation_context" not in str(e):
raise
# Backward compatibility for monkeypatched or wrapped callables
# that do not accept instrumentation_context.
return await perform_health_check(
model_list=model_list,
details=details,
max_concurrency=max_concurrency,
)
"""Call ``perform_health_check``, retrying with fewer kwargs on unexpected-kw TypeErrors."""
_hc_filter = health_check_filter_kwargs_from_general_settings(general_settings)
last_type_error: Optional[TypeError] = None
for extra_kwargs in (
{
"instrumentation_context": instrumentation_context,
**_hc_filter,
},
{"instrumentation_context": instrumentation_context},
dict(_hc_filter),
{},
):
try:
return await perform_health_check(
model_list=model_list,
details=details,
max_concurrency=max_concurrency,
**extra_kwargs,
)
except TypeError as e:
if not _is_unexpected_keyword_argument_type_error(e):
raise
last_type_error = e
assert last_type_error is not None
raise last_type_error
def _schedule_background_health_check_db_save(
@ -3020,6 +3038,7 @@ async def _run_background_health_check():
details_bool = (
health_check_details if health_check_details is not None else True
)
_hc_filter = health_check_filter_kwargs_from_general_settings(general_settings)
if shared_health_manager is not None:
try:
@ -3031,6 +3050,7 @@ async def _run_background_health_check():
model_list=_llm_model_list,
details=details_bool,
max_concurrency=health_check_concurrency,
**_hc_filter,
)
except Exception as e:
verbose_proxy_logger.error(
@ -3043,7 +3063,7 @@ async def _run_background_health_check():
_exceptions_by_model_id,
) = await _run_direct_health_check_with_instrumentation(
_llm_model_list,
health_check_details,
details_bool,
health_check_concurrency,
instrumentation_context,
)
@ -3054,7 +3074,7 @@ async def _run_background_health_check():
_exceptions_by_model_id,
) = await _run_direct_health_check_with_instrumentation(
_llm_model_list,
health_check_details,
details_bool,
health_check_concurrency,
instrumentation_context,
)

View File

@ -495,6 +495,45 @@ async def test_perform_health_check_filters_by_model_id():
assert healthy_endpoints[0]["api_key"] == "fake-key-2"
@pytest.mark.asyncio
async def test_perform_health_check_skip_disabled_background_models():
    """With the skip flag on, only deployments without ``disable_background_health_check`` reach ``_perform_health_check``."""
    from litellm.proxy.health_check import perform_health_check

    enabled_deployment = {
        "model_name": "a",
        "model_info": {"id": "id-a"},
        "litellm_params": {"model": "m-a", "api_key": "k1"},
    }
    disabled_deployment = {
        "model_name": "b",
        "model_info": {
            "id": "id-b",
            "disable_background_health_check": True,
        },
        "litellm_params": {"model": "m-b", "api_key": "k2"},
    }
    forwarded_lists = []

    async def record_model_list(m_list, details=True, **kwargs):
        # Snapshot the list handed to the inner check so it can be inspected later.
        forwarded_lists.append(list(m_list))
        return [], [], {}

    with patch(
        "litellm.proxy.health_check._perform_health_check",
        side_effect=record_model_list,
    ):
        await perform_health_check(
            model_list=[enabled_deployment, disabled_deployment],
            health_check_skip_disabled_background_models=True,
        )

    # Exactly one inner call, carrying exactly the non-disabled deployment.
    assert len(forwarded_lists) == 1
    (only_call,) = forwarded_lists
    assert [deployment["model_name"] for deployment in only_call] == ["a"]
@pytest.mark.asyncio
async def test_perform_health_check_with_health_check_model():
"""

View File

@ -2485,7 +2485,9 @@ async def test_background_health_check_skip_disabled_models(monkeypatch):
]
called_model_lists = []
async def fake_perform_health_check(model_list, details, max_concurrency=None):
async def fake_perform_health_check(
model_list, details, max_concurrency=None, **kwargs
):
called_model_lists.append(copy.deepcopy(model_list))
return (["healthy"], [], {})
@ -2508,6 +2510,100 @@ async def test_background_health_check_skip_disabled_models(monkeypatch):
assert called_model_lists == [[{"model_name": "model-a"}]]
@pytest.mark.asyncio
async def test_run_direct_health_check_with_instrumentation_legacy_three_arg_stub(
    monkeypatch,
):
    """A replacement ``perform_health_check`` that takes only the base arguments must still be callable."""
    import litellm.proxy.proxy_server as proxy_server

    empty_result = ([], [], {})

    async def legacy_stub(model_list, details, max_concurrency=None):
        # Accepts neither instrumentation_context nor the filter kwarg.
        return ([], [], {})

    monkeypatch.setattr(proxy_server, "perform_health_check", legacy_stub)

    instrumentation = {"enabled": True, "source": "test", "cycle_id": "c1"}
    outcome = await proxy_server._run_direct_health_check_with_instrumentation(
        [{"model_name": "m"}],
        True,
        1,
        instrumentation,
    )

    assert outcome == empty_result
@pytest.mark.asyncio
async def test_run_direct_health_check_with_instrumentation_accepts_instrumentation_only(
    monkeypatch,
):
    """A stub that takes ``instrumentation_context`` but not the filter kwarg is called once with the context."""
    import litellm.proxy.proxy_server as proxy_server

    received_contexts: list = []

    async def instrumentation_only_stub(
        model_list, details, max_concurrency=None, instrumentation_context=None
    ):
        received_contexts.append(instrumentation_context)
        return ([], [], {})

    monkeypatch.setattr(
        proxy_server, "perform_health_check", instrumentation_only_stub
    )

    context = {"enabled": True, "source": "test", "cycle_id": "c2"}
    await proxy_server._run_direct_health_check_with_instrumentation(
        [],
        False,
        2,
        context,
    )

    # Only the successful attempt reaches the stub body.
    assert len(received_contexts) == 1
    assert received_contexts[0]["cycle_id"] == "c2"
@pytest.mark.asyncio
async def test_run_direct_health_check_with_instrumentation_accepts_filter_only(
    monkeypatch,
):
    """
    A stub that takes the filter kwarg but not ``instrumentation_context``
    must still receive the flag configured in ``general_settings``.

    ``general_settings`` is pinned with the flag switched on. The stub's own
    default is ``False``, so asserting ``False`` could not tell a forwarded
    value apart from the kwarg never being passed, and the outcome would also
    depend on whatever ``general_settings`` earlier tests left behind.
    """
    import litellm.proxy.proxy_server as proxy_server

    seen: list = []

    async def fake_perform_health_check(
        model_list,
        details,
        max_concurrency=None,
        health_check_skip_disabled_background_models=False,
    ):
        seen.append(health_check_skip_disabled_background_models)
        return ([], [], {})

    # Isolate the test from ambient proxy configuration and make the forwarded
    # value differ from the stub's default.
    monkeypatch.setattr(
        proxy_server,
        "general_settings",
        {"health_check_skip_disabled_background_models": True},
    )
    monkeypatch.setattr(proxy_server, "perform_health_check", fake_perform_health_check)

    await proxy_server._run_direct_health_check_with_instrumentation(
        [],
        True,
        None,
        {"enabled": False},
    )

    # One successful call, carrying the configured flag rather than the default.
    assert seen == [True]
@pytest.mark.asyncio
async def test_run_direct_health_check_with_instrumentation_non_kw_typeerror_reraises(
    monkeypatch,
):
    """A ``TypeError`` that is not about an unexpected keyword must propagate to the caller."""
    import litellm.proxy.proxy_server as proxy_server

    async def always_raises(**kwargs):
        # Accepts every kwarg, so the failure is unrelated to the call signature.
        raise TypeError("unsupported operand type(s)")

    monkeypatch.setattr(proxy_server, "perform_health_check", always_raises)

    call_args = ([], True, 1, {})
    with pytest.raises(TypeError, match="unsupported operand"):
        await proxy_server._run_direct_health_check_with_instrumentation(*call_args)
def test_get_timeout_from_request():
from litellm.proxy.litellm_pre_call_utils import LiteLLMProxyRequestSetup

View File

@ -566,6 +566,7 @@ async def test_perform_health_check_and_save_passes_model_id_to_perform_health_c
details=True,
model_id=None,
max_concurrency=None,
**kwargs,
):
return healthy, unhealthy, {}
@ -591,5 +592,39 @@ async def test_perform_health_check_and_save_passes_model_id_to_perform_health_c
assert result["unhealthy_count"] == 0
@pytest.mark.asyncio
async def test_perform_health_check_and_save_forwards_skip_disabled_background_flag():
    """Extra kwargs given to ``_perform_health_check_and_save`` must be passed on to ``perform_health_check``."""
    deployments = [
        {
            "model_name": "gpt-4",
            "model_info": {"id": "deployment-abc"},
            "litellm_params": {"model": "gpt-4"},
        },
    ]

    async def empty_health_check(**kwargs):
        return [], [], {}

    patch_target = "litellm.proxy.health_endpoints._health_endpoints.perform_health_check"
    with patch(patch_target, side_effect=empty_health_check) as mock_perform:
        await _perform_health_check_and_save(
            model_list=deployments,
            target_model=None,
            cli_model=None,
            details=True,
            prisma_client=None,
            start_time=0.0,
            user_id="user-1",
            model_id=None,
            health_check_skip_disabled_background_models=True,
        )

    # call_args unpacks as (positional args, keyword args).
    _, forwarded_kwargs = mock_perform.call_args
    assert forwarded_kwargs["health_check_skip_disabled_background_models"] is True
if __name__ == "__main__":
pytest.main([__file__])

View File

@ -310,7 +310,10 @@ class TestSharedHealthCheckManager:
# Should call perform_health_check and cache results
mock_perform.assert_called_once_with(
model_list=model_list, details=True, max_concurrency=None
model_list=model_list,
details=True,
max_concurrency=None,
health_check_skip_disabled_background_models=False,
)
assert healthy == expected_healthy
assert unhealthy == expected_unhealthy
@ -397,7 +400,10 @@ class TestSharedHealthCheckManager:
assert mock_sleep.call_count == 2
mock_sleep.assert_called_with(5)
mock_perform.assert_called_once_with(
model_list=model_list, details=True, max_concurrency=None
model_list=model_list,
details=True,
max_concurrency=None,
health_check_skip_disabled_background_models=False,
)
assert healthy == expected_healthy
assert unhealthy == expected_unhealthy
@ -437,7 +443,10 @@ class TestSharedHealthCheckManager:
# Should detect orphaned lock after 1 iteration and fall back immediately
mock_sleep.assert_called_once_with(5)
mock_perform.assert_called_once_with(
model_list=model_list, details=True, max_concurrency=None
model_list=model_list,
details=True,
max_concurrency=None,
health_check_skip_disabled_background_models=False,
)
assert healthy == expected_healthy
assert unhealthy == expected_unhealthy
@ -506,7 +515,10 @@ class TestSharedHealthCheckManager:
# Should NOT sleep at all — falls back to local health check immediately
mock_sleep.assert_not_called()
mock_perform.assert_called_once_with(
model_list=model_list, details=True, max_concurrency=None
model_list=model_list,
details=True,
max_concurrency=None,
health_check_skip_disabled_background_models=False,
)
assert healthy == expected_healthy
assert unhealthy == expected_unhealthy