feat(proxy): native /health/drain preStop hook for graceful shutdown (#29439)

2026-06-02 16:30:44 -07:00 · 2026-06-02 16:30:44 -07:00 · 3a1c6bba97
commit 3a1c6bba97
parent a5ccd96152
10 changed files with 669 additions and 6 deletions
--- a/.github/workflows/test-unit-proxy-endpoints.yml
+++ b/.github/workflows/test-unit-proxy-endpoints.yml
@ -36,6 +36,7 @@ jobs:
        tests/test_litellm/proxy/a2a
        tests/test_litellm/proxy/discovery_endpoints
        tests/test_litellm/proxy/health_endpoints
+        tests/test_litellm/proxy/shutdown
        tests/test_litellm/proxy/public_endpoints
        tests/test_litellm/proxy/prompts
        tests/test_litellm/proxy/rag_endpoints
--- a/2
+++ b/2
@ -146,7 +146,7 @@ test-unit-proxy-core: install-test-deps
 	$(UV_RUN) pytest tests/test_litellm/proxy/auth tests/test_litellm/proxy/client tests/test_litellm/proxy/db tests/test_litellm/proxy/hooks tests/test_litellm/proxy/policy_engine --tb=short -vv -n 4 --durations=20

 test-unit-proxy-misc: install-test-deps
-	$(UV_RUN) pytest tests/test_litellm/proxy/_experimental tests/test_litellm/proxy/agent_endpoints tests/test_litellm/proxy/anthropic_endpoints tests/test_litellm/proxy/common_utils tests/test_litellm/proxy/discovery_endpoints tests/test_litellm/proxy/experimental tests/test_litellm/proxy/google_endpoints tests/test_litellm/proxy/health_endpoints tests/test_litellm/proxy/image_endpoints tests/test_litellm/proxy/middleware tests/test_litellm/proxy/openai_files_endpoint tests/test_litellm/proxy/pass_through_endpoints tests/test_litellm/proxy/prompts tests/test_litellm/proxy/public_endpoints tests/test_litellm/proxy/response_api_endpoints tests/test_litellm/proxy/spend_tracking tests/test_litellm/proxy/ui_crud_endpoints tests/test_litellm/proxy/vector_store_endpoints tests/test_litellm/proxy/test_*.py --tb=short -vv -n 4 --durations=20
+	$(UV_RUN) pytest tests/test_litellm/proxy/_experimental tests/test_litellm/proxy/agent_endpoints tests/test_litellm/proxy/anthropic_endpoints tests/test_litellm/proxy/common_utils tests/test_litellm/proxy/discovery_endpoints tests/test_litellm/proxy/experimental tests/test_litellm/proxy/google_endpoints tests/test_litellm/proxy/health_endpoints tests/test_litellm/proxy/image_endpoints tests/test_litellm/proxy/middleware tests/test_litellm/proxy/openai_files_endpoint tests/test_litellm/proxy/pass_through_endpoints tests/test_litellm/proxy/prompts tests/test_litellm/proxy/public_endpoints tests/test_litellm/proxy/response_api_endpoints tests/test_litellm/proxy/shutdown tests/test_litellm/proxy/spend_tracking tests/test_litellm/proxy/ui_crud_endpoints tests/test_litellm/proxy/vector_store_endpoints tests/test_litellm/proxy/test_*.py --tb=short -vv -n 4 --durations=20

 test-unit-integrations: install-test-deps
 	$(UV_RUN) pytest tests/test_litellm/integrations --tb=short -vv -n 4 --durations=20
--- a/deploy/charts/litellm-helm/values.yaml
+++ b/deploy/charts/litellm-helm/values.yaml
@ -285,11 +285,31 @@ db:
  deployStandalone: true

 # Lifecycle hooks for the LiteLLM container
+#
+# Prefer the native /health/drain preStop hook over a fixed `sleep`: it marks
+# the pod NotReady and blocks only until in-flight requests actually finish
+# (bounded by GRACEFUL_SHUTDOWN_TIMEOUT, default 30s), instead of always
+# waiting the worst-case duration. The drain runs once (the preStop hook and
+# the SIGTERM handler share it), so set terminationGracePeriodSeconds a few
+# seconds above GRACEFUL_SHUTDOWN_TIMEOUT to leave room for teardown before
+# SIGKILL.
+#
+# /health/drain is off by default; enable it with
+# general_settings.enable_drain_endpoint: true. The kubelet calls preStop
+# hooks without proxy credentials, so when the health port is reachable from
+# other pods (the common case) also set
+# general_settings.drain_endpoint_token (or the DRAIN_ENDPOINT_TOKEN env
+# var) and send the same value on the X-Drain-Token header from the hook.
+# Calls missing/wrong the token get a 401 and have no side effect.
 # Example:
 # lifecycle:
 #   preStop:
-#     exec:
-#       command: ["/bin/sh", "-c", "sleep 10"]
+#     httpGet:
+#       path: /health/drain
+#       port: 4000
+#       httpHeaders:
+#         - name: X-Drain-Token
+#           value: <same value as drain_endpoint_token>
 lifecycle: {}

 # Settings for Bitnami postgresql chart (if db.deployStandalone is true, ignored
--- a/litellm/proxy/health_endpoints/_health_endpoints.py
+++ b/litellm/proxy/health_endpoints/_health_endpoints.py
@ -2,6 +2,7 @@ import asyncio
 import copy
 import logging
 import os
+import secrets
 import time
 import traceback
 from datetime import datetime, timedelta
@ -39,6 +40,7 @@ from litellm.proxy.health_check import (
 from litellm.proxy.middleware.in_flight_requests_middleware import (
    get_in_flight_requests,
 )
+from litellm.proxy.shutdown.graceful_shutdown_manager import GracefulShutdownManager

 #### Health ENDPOINTS ####

@ -1551,6 +1553,50 @@ def _allow_public_health_readiness_details() -> bool:
    return general_settings.get("allow_public_health_readiness_details") is True


+def _drain_endpoint_enabled() -> bool:
+    from litellm.proxy.proxy_server import general_settings
+
+    return general_settings.get("enable_drain_endpoint") is True
+
+
+def _drain_endpoint_token() -> Optional[str]:
+    """
+    Shared secret required on the X-Drain-Token header to call /health/drain.
+
+    Falls back to the ``DRAIN_ENDPOINT_TOKEN`` env var when unset in
+    general_settings so the kubelet preStop hook can supply it via
+    ``valueFrom.secretKeyRef`` without a config reload.
+    """
+    from litellm.proxy.proxy_server import general_settings
+
+    token = general_settings.get("drain_endpoint_token")
+    if isinstance(token, str) and token:
+        return token
+    env_token = os.getenv("DRAIN_ENDPOINT_TOKEN")
+    if env_token:
+        return env_token
+    return None
+
+
+def _authorize_drain_request(request: Request) -> None:
+    """
+    Reject /health/drain calls that don't carry the configured X-Drain-Token.
+
+    When no token is configured the endpoint is treated as already opted-in
+    (the ``enable_drain_endpoint`` flag is the only gate). Comparison uses
+    ``secrets.compare_digest`` to avoid timing leaks.
+    """
+    expected = _drain_endpoint_token()
+    if expected is None:
+        return
+    supplied = request.headers.get("x-drain-token") or ""
+    if not secrets.compare_digest(supplied, expected):
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Invalid or missing X-Drain-Token",
+        )
+
+
 async def _resolve_public_readiness_db(response: Response) -> str:
    """
    Return the db status string for the public probe and flip the response to
@ -1580,6 +1626,10 @@ async def health_readiness(response: Response):
    credential. Admins can opt into the legacy detailed payload with
    general_settings.allow_public_health_readiness_details.
    """
+    if GracefulShutdownManager.is_shutting_down():
+        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
+        return {"status": "shutting_down"}
+
    if _allow_public_health_readiness_details():
        return await _get_health_readiness_details(response=response)

@ -1616,6 +1666,54 @@ async def health_backlog():
    return {"in_flight_requests": get_in_flight_requests()}


+@router.get(
+    "/health/drain",
+    tags=["health"],
+)
+async def health_drain(request: Request):
+    """
+    Graceful-drain probe for Kubernetes ``preStop`` hooks.
+
+    Disabled by default and returns 404 unless ``general_settings`` sets
+    ``enable_drain_endpoint: true``. Calling it flips a process-wide
+    shutting-down flag, so a successful call permanently takes the worker out
+    of rotation until the pod restarts.
+
+    Because the kubelet calls preStop hooks without proxy credentials, the
+    endpoint does not require ``user_api_key_auth``. To prevent any
+    pod-reachable caller from triggering shutdown, set
+    ``general_settings.drain_endpoint_token`` (or the ``DRAIN_ENDPOINT_TOKEN``
+    env var) and supply the same value on the ``X-Drain-Token`` header from
+    the preStop hook. Calls without the header (or with a wrong value) get a
+    401 and have no side effect.
+
+    When enabled, it marks the worker as shutting down (so /health/readiness
+    and /health/liveliness immediately start returning 503, removing the pod
+    from service) and blocks until the in-flight request counter drains to
+    zero or ``GRACEFUL_SHUTDOWN_TIMEOUT`` elapses. Unlike a fixed ``sleep``,
+    this returns as soon as real in-flight work is done.
+
+    Wire it up as:
+
+    ```yaml
+    lifecycle:
+      preStop:
+        httpGet:
+          path: /health/drain
+          port: 4000
+          httpHeaders:
+            - name: X-Drain-Token
+              value: <same value as drain_endpoint_token>
+    ```
+    """
+    if not _drain_endpoint_enabled():
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Not Found")
+    _authorize_drain_request(request)
+    GracefulShutdownManager.start_shutdown()
+    drained = await GracefulShutdownManager.wait_for_drain(exclude_self=True)
+    return {"status": "drained", "drained_requests": drained}
+
+
@router.get(
    "/health/liveliness",  # Historical LiteLLM name; doesn't match k8s terminology but kept for backwards compatibility
    tags=["health"],
@ -1624,10 +1722,16 @@ async def health_backlog():
    "/health/liveness",  # Kubernetes has "liveness" probes (https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-command)
    tags=["health"],
 )
-async def health_liveliness():
+async def health_liveliness(response: Response):
    """
-    Unprotected endpoint for checking if worker is alive
+    Unprotected endpoint for checking if worker is alive.
+
+    Returns 503 once graceful shutdown has begun so Kubernetes stops counting
+    the draining pod as live and terminates it on schedule.
    """
+    if GracefulShutdownManager.is_shutting_down():
+        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
+        return {"status": "shutting_down"}
    return "I'm alive!"


--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -417,6 +417,7 @@ from litellm.proxy.middleware.prometheus_auth_middleware import PrometheusAuthMi
 from litellm.proxy.middleware.request_size_limit_middleware import (
    RequestSizeLimitMiddleware,
 )
+from litellm.proxy.shutdown.graceful_shutdown_manager import GracefulShutdownManager
 from litellm.proxy.ocr_endpoints.endpoints import router as ocr_router
 from litellm.proxy.openai_files_endpoints.files_endpoints import (
    router as openai_files_router,
@ -973,6 +974,11 @@ async def proxy_startup_event(app: FastAPI):  # noqa: PLR0915
    # End of startup event
    yield

+    # Shutdown event - drain in-flight requests before tearing down dependencies
+    # so SIGTERM (rolling update, scale-down, liveness kill) doesn't drop them.
+    GracefulShutdownManager.start_shutdown()
+    await GracefulShutdownManager.wait_for_drain()
+
    # Shutdown event - close shared aiohttp session
    if shared_aiohttp_session is not None:
        try:
--- a/litellm/proxy/shutdown/init.py
+++ b/litellm/proxy/shutdown/init.py
--- a/litellm/proxy/shutdown/graceful_shutdown_manager.py
+++ b/litellm/proxy/shutdown/graceful_shutdown_manager.py
@ -0,0 +1,174 @@
+"""
+Application-level graceful shutdown coordination for the LiteLLM proxy.
+
+Kubernetes terminates a pod by sending ``SIGTERM`` and, after
+``terminationGracePeriodSeconds``, ``SIGKILL``. By default LiteLLM delegates
+the signal to uvicorn and tears down immediately, dropping any in-flight
+requests (streaming, batch inference, long-lived calls).
+
+A fixed ``preStop`` sleep can not solve this: it has to be sized for the
+*worst-case* request, so it either wastes time on every routine shutdown or is
+too short for a long-running request. This manager instead drains based on the
+*actual* in-flight request counter (already tracked by
+``InFlightRequestsMiddleware``), so a pod terminates as soon as its real
+in-flight work is done — and never waits longer than ``GRACEFUL_SHUTDOWN_TIMEOUT``.
+
+The state is process-scoped (class-level), matching the per-uvicorn-worker
+granularity of ``InFlightRequestsMiddleware``.
+"""
+
+import asyncio
+import os
+import time
+from typing import Callable, Optional
+
+from litellm._logging import verbose_proxy_logger
+from litellm.proxy.middleware.in_flight_requests_middleware import (
+    get_in_flight_requests,
+)
+
+# Keep below terminationGracePeriodSeconds so the process exits before SIGKILL.
+DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT = 30.0
+_DRAIN_POLL_INTERVAL = 0.1
+_DRAIN_LOG_INTERVAL = 5.0
+
+
+class GracefulShutdownManager:
+    """
+    Process-scoped singleton that tracks whether the worker is draining and
+    blocks until in-flight requests reach zero (or a timeout elapses).
+    """
+
+    _is_shutting_down: bool = False
+    _shutdown_started_at: Optional[float] = None
+    _drain_performed: bool = False
+
+    @classmethod
+    def is_shutting_down(cls) -> bool:
+        """Whether this worker has begun graceful shutdown."""
+        return cls._is_shutting_down
+
+    @classmethod
+    def get_timeout(cls) -> float:
+        """
+        Read GRACEFUL_SHUTDOWN_TIMEOUT (seconds) from the environment on each
+        call so deployments can tune it without code changes. Falls back to the
+        default on an unset or malformed value.
+        """
+        raw = os.getenv("GRACEFUL_SHUTDOWN_TIMEOUT")
+        if raw is None:
+            return DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT
+        try:
+            return float(raw)
+        except (TypeError, ValueError):
+            verbose_proxy_logger.warning(
+                "GRACEFUL_SHUTDOWN_TIMEOUT=%r is not a number; using default %ss",
+                raw,
+                DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT,
+            )
+            return DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT
+
+    @classmethod
+    def start_shutdown(cls) -> None:
+        """
+        Mark the worker as draining. Idempotent — repeated calls (e.g. SIGTERM
+        followed by a preStop hit on /health/drain) do not reset the clock.
+        """
+        if cls._is_shutting_down:
+            return
+        cls._is_shutting_down = True
+        cls._shutdown_started_at = time.monotonic()
+        verbose_proxy_logger.info(
+            "graceful_shutdown_started in_flight_requests=%s",
+            get_in_flight_requests(),
+        )
+
+    @classmethod
+    async def wait_for_drain(
+        cls,
+        timeout: Optional[float] = None,
+        exclude_self: bool = False,
+        count_fn: Optional[Callable[[], int]] = None,
+        poll_interval: float = _DRAIN_POLL_INTERVAL,
+        log_interval: float = _DRAIN_LOG_INTERVAL,
+    ) -> int:
+        """
+        Poll the in-flight request counter until it reaches the drain target or
+        ``timeout`` seconds elapse.
+
+        Args:
+            timeout: Max seconds to wait. Defaults to ``get_timeout()``.
+            exclude_self: When the caller is itself an in-flight HTTP request
+                (the /health/drain endpoint), set this so the caller's own
+                request is not counted as outstanding work.
+            count_fn: Source of the current in-flight count. Defaults to the
+                live ``InFlightRequestsMiddleware`` counter; injectable for tests.
+            poll_interval: Seconds between counter polls.
+            log_interval: Minimum seconds between ``drain_waiting`` log lines.
+
+        Returns:
+            Number of requests that drained while waiting (>= 0).
+        """
+        # A preStop /health/drain hook and the lifespan SIGTERM handler both
+        # drain; once one has run, the other must not wait again, otherwise the
+        # effective window is 2x the timeout and terminationGracePeriodSeconds
+        # has to be doubled to avoid a mid-drain SIGKILL.
+        if cls._drain_performed:
+            return 0
+        cls._drain_performed = True
+
+        if timeout is None:
+            timeout = cls.get_timeout()
+        if count_fn is None:
+            count_fn = get_in_flight_requests
+
+        # The /health/drain HTTP request flows through InFlightRequestsMiddleware
+        # and so counts itself; treat <=1 as "drained" in that case.
+        target = 1 if exclude_self else 0
+
+        start = time.monotonic()
+        initial = count_fn()
+        last_log = start
+
+        if timeout <= 0:
+            return max(0, initial - target)
+
+        while True:
+            current = count_fn()
+            if current <= target:
+                drained = max(0, initial - current)
+                verbose_proxy_logger.info(
+                    "graceful_shutdown_complete drained_requests=%s elapsed_s=%.2f",
+                    drained,
+                    time.monotonic() - start,
+                )
+                return drained
+
+            elapsed = time.monotonic() - start
+            if elapsed >= timeout:
+                verbose_proxy_logger.warning(
+                    "graceful_shutdown_timeout in_flight_requests=%s elapsed_s=%.2f "
+                    "timeout_s=%s — proceeding with teardown",
+                    current,
+                    elapsed,
+                    timeout,
+                )
+                return max(0, initial - current)
+
+            now = time.monotonic()
+            if now - last_log >= log_interval:
+                verbose_proxy_logger.info(
+                    "drain_waiting in_flight_requests=%s elapsed_s=%.2f",
+                    current,
+                    elapsed,
+                )
+                last_log = now
+
+            await asyncio.sleep(poll_interval)
+
+    @classmethod
+    def reset(cls) -> None:
+        """Reset state. Intended for use in tests."""
+        cls._is_shutting_down = False
+        cls._shutdown_started_at = None
+        cls._drain_performed = False
--- a/tests/test_litellm/proxy/conftest.py
+++ b/tests/test_litellm/proxy/conftest.py
@ -14,7 +14,6 @@ import pytest
 import yaml
 from fastapi.testclient import TestClient

-
 _PROXY_MODULE_GLOBALS_TO_ISOLATE = (
    "master_key",
    "prisma_client",
@ -49,6 +48,18 @@ def _isolate_proxy_module_globals():
                setattr(proxy_server, name, value)


+@pytest.fixture(autouse=True)
+def _reset_graceful_shutdown_state():
+    """Graceful shutdown state is process-scoped; keep it from leaking between tests."""
+    from litellm.proxy.shutdown.graceful_shutdown_manager import (
+        GracefulShutdownManager,
+    )
+
+    GracefulShutdownManager.reset()
+    yield
+    GracefulShutdownManager.reset()
+
+
 def build_cache_config(enable_cache: bool = True) -> Optional[Dict]:
    """
    Build Redis cache configuration from environment variables.
--- a/tests/test_litellm/proxy/health_endpoints/test_graceful_shutdown_endpoints.py
+++ b/tests/test_litellm/proxy/health_endpoints/test_graceful_shutdown_endpoints.py
@ -0,0 +1,166 @@
+"""
+Behaviour tests for the graceful-shutdown health probes.
+
+Builds a minimal FastAPI app from the health router plus
+InFlightRequestsMiddleware so the probe responses can be asserted without
+standing up the full proxy.
+"""
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from litellm.proxy.health_endpoints._health_endpoints import router
+from litellm.proxy.middleware.in_flight_requests_middleware import (
+    InFlightRequestsMiddleware,
+)
+from litellm.proxy.shutdown.graceful_shutdown_manager import GracefulShutdownManager
+
+
+@pytest.fixture(autouse=True)
+def _reset():
+    GracefulShutdownManager.reset()
+    InFlightRequestsMiddleware._in_flight = 0
+    yield
+    GracefulShutdownManager.reset()
+    InFlightRequestsMiddleware._in_flight = 0
+
+
+@pytest.fixture
+def client():
+    app = FastAPI()
+    app.include_router(router)
+    app.add_middleware(InFlightRequestsMiddleware)
+    return TestClient(app)
+
+
+@pytest.fixture
+def enable_drain(monkeypatch):
+    from litellm.proxy import proxy_server
+
+    monkeypatch.setattr(
+        proxy_server, "general_settings", {"enable_drain_endpoint": True}
+    )
+
+
+@pytest.fixture
+def enable_drain_with_token(monkeypatch):
+    from litellm.proxy import proxy_server
+
+    monkeypatch.setattr(
+        proxy_server,
+        "general_settings",
+        {"enable_drain_endpoint": True, "drain_endpoint_token": "secret-123"},
+    )
+
+
+def test_drain_disabled_by_default_returns_404_with_no_side_effect(client, monkeypatch):
+    from litellm.proxy import proxy_server
+
+    monkeypatch.setattr(proxy_server, "general_settings", {})
+    resp = client.get("/health/drain")
+    assert resp.status_code == 404
+    assert GracefulShutdownManager.is_shutting_down() is False
+
+
+def test_drain_disabled_ignores_token_header(client, monkeypatch):
+    """A token alone must not bypass the enable flag; otherwise enabling the
+    token side-channel would silently enable the endpoint."""
+    from litellm.proxy import proxy_server
+
+    monkeypatch.setattr(
+        proxy_server, "general_settings", {"drain_endpoint_token": "secret-123"}
+    )
+    resp = client.get("/health/drain", headers={"X-Drain-Token": "secret-123"})
+    assert resp.status_code == 404
+    assert GracefulShutdownManager.is_shutting_down() is False
+
+
+def test_drain_when_enabled_without_token_sets_shutting_down_and_returns_drained(
+    client, enable_drain
+):
+    resp = client.get("/health/drain")
+    assert resp.status_code == 200
+    body = resp.json()
+    assert body["status"] == "drained"
+    assert body["drained_requests"] == 0
+    assert GracefulShutdownManager.is_shutting_down() is True
+
+
+def test_drain_with_token_configured_rejects_missing_header(
+    client, enable_drain_with_token
+):
+    resp = client.get("/health/drain")
+    assert resp.status_code == 401
+    assert GracefulShutdownManager.is_shutting_down() is False
+
+
+def test_drain_with_token_configured_rejects_wrong_header(
+    client, enable_drain_with_token
+):
+    resp = client.get("/health/drain", headers={"X-Drain-Token": "wrong-value"})
+    assert resp.status_code == 401
+    assert GracefulShutdownManager.is_shutting_down() is False
+
+
+def test_drain_with_token_configured_accepts_correct_header(
+    client, enable_drain_with_token
+):
+    resp = client.get("/health/drain", headers={"X-Drain-Token": "secret-123"})
+    assert resp.status_code == 200
+    assert resp.json()["status"] == "drained"
+    assert GracefulShutdownManager.is_shutting_down() is True
+
+
+def test_drain_with_token_from_env_var(client, enable_drain, monkeypatch):
+    monkeypatch.setenv("DRAIN_ENDPOINT_TOKEN", "env-token")
+    resp = client.get("/health/drain")
+    assert resp.status_code == 401
+    resp = client.get("/health/drain", headers={"X-Drain-Token": "env-token"})
+    assert resp.status_code == 200
+
+
+def test_drain_general_settings_token_overrides_env_var(client, monkeypatch):
+    from litellm.proxy import proxy_server
+
+    monkeypatch.setattr(
+        proxy_server,
+        "general_settings",
+        {"enable_drain_endpoint": True, "drain_endpoint_token": "config-token"},
+    )
+    monkeypatch.setenv("DRAIN_ENDPOINT_TOKEN", "env-token")
+    resp = client.get("/health/drain", headers={"X-Drain-Token": "env-token"})
+    assert resp.status_code == 401
+    resp = client.get("/health/drain", headers={"X-Drain-Token": "config-token"})
+    assert resp.status_code == 200
+
+
+def test_readiness_returns_503_shutting_down_during_drain(client):
+    GracefulShutdownManager.start_shutdown()
+    resp = client.get("/health/readiness")
+    assert resp.status_code == 503
+    assert resp.json() == {"status": "shutting_down"}
+
+
+def test_readiness_does_not_report_shutting_down_normally(client):
+    resp = client.get("/health/readiness")
+    assert resp.json().get("status") != "shutting_down"
+
+
+def test_liveliness_returns_503_during_drain(client):
+    GracefulShutdownManager.start_shutdown()
+    resp = client.get("/health/liveliness")
+    assert resp.status_code == 503
+    assert resp.json() == {"status": "shutting_down"}
+
+
+def test_liveness_alias_returns_503_during_drain(client):
+    GracefulShutdownManager.start_shutdown()
+    resp = client.get("/health/liveness")
+    assert resp.status_code == 503
+
+
+def test_liveliness_returns_alive_when_not_shutting_down(client):
+    resp = client.get("/health/liveliness")
+    assert resp.status_code == 200
+    assert resp.json() == "I'm alive!"
--- a/tests/test_litellm/proxy/shutdown/test_graceful_shutdown_manager.py
+++ b/tests/test_litellm/proxy/shutdown/test_graceful_shutdown_manager.py
@ -0,0 +1,181 @@
+"""
+Tests for GracefulShutdownManager.
+
+These verify the drain logic that lets a pod terminate as soon as its real
+in-flight work is done (bounded by GRACEFUL_SHUTDOWN_TIMEOUT) rather than
+sleeping for a fixed worst-case duration.
+"""
+
+import time
+
+import pytest
+
+from litellm.proxy.shutdown.graceful_shutdown_manager import (
+    DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT,
+    GracefulShutdownManager,
+)
+
+
+@pytest.fixture(autouse=True)
+def _reset():
+    GracefulShutdownManager.reset()
+    yield
+    GracefulShutdownManager.reset()
+
+
+def _counter_that_drains_after(calls_before_zero: int):
+    """Return a count_fn that reports N in-flight until it has been polled
+    `calls_before_zero` times, then reports 0."""
+    state = {"polls": 0}
+
+    def count_fn() -> int:
+        state["polls"] += 1
+        return 0 if state["polls"] > calls_before_zero else 3
+
+    return count_fn
+
+
+# ── shutdown flag ───────────────────────────────────────────────────────────
+
+
+def test_not_shutting_down_by_default():
+    assert GracefulShutdownManager.is_shutting_down() is False
+
+
+def test_start_shutdown_sets_flag():
+    GracefulShutdownManager.start_shutdown()
+    assert GracefulShutdownManager.is_shutting_down() is True
+
+
+def test_start_shutdown_is_idempotent_and_does_not_reset_clock():
+    GracefulShutdownManager.start_shutdown()
+    first = GracefulShutdownManager._shutdown_started_at
+    time.sleep(0.01)
+    GracefulShutdownManager.start_shutdown()
+    assert GracefulShutdownManager._shutdown_started_at == first
+
+
+def test_reset_clears_flag():
+    GracefulShutdownManager.start_shutdown()
+    GracefulShutdownManager.reset()
+    assert GracefulShutdownManager.is_shutting_down() is False
+
+
+# ── timeout config ────────────────────────────────────────────────────────────
+
+
+def test_timeout_defaults_when_unset(monkeypatch):
+    monkeypatch.delenv("GRACEFUL_SHUTDOWN_TIMEOUT", raising=False)
+    assert GracefulShutdownManager.get_timeout() == DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT
+
+
+def test_timeout_reads_env(monkeypatch):
+    monkeypatch.setenv("GRACEFUL_SHUTDOWN_TIMEOUT", "5")
+    assert GracefulShutdownManager.get_timeout() == 5.0
+
+
+def test_timeout_falls_back_on_garbage(monkeypatch):
+    monkeypatch.setenv("GRACEFUL_SHUTDOWN_TIMEOUT", "not-a-number")
+    assert GracefulShutdownManager.get_timeout() == DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT
+
+
+# ── wait_for_drain ────────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_returns_immediately_when_already_drained():
+    start = time.monotonic()
+    drained = await GracefulShutdownManager.wait_for_drain(
+        timeout=10, count_fn=lambda: 0
+    )
+    assert drained == 0
+    assert time.monotonic() - start < 0.5
+
+
+@pytest.mark.asyncio
+async def test_waits_until_counter_reaches_zero_then_returns_drained_count():
+    count_fn = _counter_that_drains_after(calls_before_zero=3)
+    drained = await GracefulShutdownManager.wait_for_drain(
+        timeout=10, count_fn=count_fn
+    )
+    assert drained == 3
+
+
+@pytest.mark.asyncio
+async def test_times_out_when_counter_never_drains():
+    start = time.monotonic()
+    drained = await GracefulShutdownManager.wait_for_drain(
+        timeout=0.3, count_fn=lambda: 2
+    )
+    elapsed = time.monotonic() - start
+    assert 0.3 <= elapsed < 2.0
+    assert drained == 0
+
+
+@pytest.mark.asyncio
+async def test_zero_timeout_does_not_block():
+    start = time.monotonic()
+    drained = await GracefulShutdownManager.wait_for_drain(
+        timeout=0, count_fn=lambda: 5
+    )
+    assert time.monotonic() - start < 0.2
+    assert drained == 5
+
+
+@pytest.mark.asyncio
+async def test_exclude_self_treats_one_inflight_as_drained():
+    """The /health/drain request counts itself, so a steady count of 1 must be
+    treated as fully drained rather than timing out."""
+    start = time.monotonic()
+    drained = await GracefulShutdownManager.wait_for_drain(
+        timeout=5, exclude_self=True, count_fn=lambda: 1
+    )
+    assert time.monotonic() - start < 0.5
+    assert drained == 0
+
+
+@pytest.mark.asyncio
+async def test_without_exclude_self_one_inflight_blocks_until_timeout():
+    start = time.monotonic()
+    await GracefulShutdownManager.wait_for_drain(timeout=0.3, count_fn=lambda: 1)
+    assert time.monotonic() - start >= 0.3
+
+
+@pytest.mark.asyncio
+async def test_defaults_to_get_timeout_and_live_counter(monkeypatch):
+    """With no timeout/count_fn passed, it falls back to get_timeout() and the
+    live InFlightRequestsMiddleware counter."""
+    from litellm.proxy.middleware.in_flight_requests_middleware import (
+        InFlightRequestsMiddleware,
+    )
+
+    monkeypatch.delenv("GRACEFUL_SHUTDOWN_TIMEOUT", raising=False)
+    InFlightRequestsMiddleware._in_flight = 0
+    drained = await GracefulShutdownManager.wait_for_drain()
+    assert drained == 0
+
+
+@pytest.mark.asyncio
+async def test_second_drain_is_a_noop_so_window_is_not_doubled():
+    """preStop /health/drain and the lifespan SIGTERM handler both drain; the
+    second call must return immediately rather than waiting another full
+    timeout (which would require doubling terminationGracePeriodSeconds)."""
+    await GracefulShutdownManager.wait_for_drain(timeout=0.2, count_fn=lambda: 1)
+
+    start = time.monotonic()
+    drained = await GracefulShutdownManager.wait_for_drain(
+        timeout=5, count_fn=lambda: 1
+    )
+    assert time.monotonic() - start < 0.1
+    assert drained == 0
+
+
+@pytest.mark.asyncio
+async def test_emits_periodic_drain_waiting_log_while_waiting():
+    """With a zero log interval, the periodic drain_waiting branch runs on each
+    poll until the counter finally drains."""
+    count_fn = _counter_that_drains_after(calls_before_zero=2)
+    drained = await GracefulShutdownManager.wait_for_drain(
+        timeout=10, count_fn=count_fn, poll_interval=0, log_interval=0
+    )
+    assert drained == 3