Add health check scripts and parallel execution support (#19295)

- Add health_check_client.py for monitoring model availability - Add health_check_client_README.md with usage documentation - Add health_check_requirements.txt for dependencies - Add run_parallel_health_checks.ps1 (PowerShell version) - Add run_parallel_health_checks.sh (Bash version) - Organize all scripts under scripts/health_check/ directory
2026-01-19 08:38:38 -08:00 · 2026-01-19 08:38:38 -08:00 · 0cd7763d5f
commit 0cd7763d5f
parent 0862373b38
6 changed files with 818 additions and 0 deletions
--- a/docker/Dockerfile.health_check
+++ b/docker/Dockerfile.health_check
@ -0,0 +1,16 @@
+# Image for the LiteLLM health check client: the container runs the script
+# once (configured through LITELLM_* environment variables) and then exits.
+# The COPY sources below are relative to the build context, so build from the
+# repository root, e.g. `docker build -f docker/Dockerfile.health_check .`.
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Copy health check script and requirements
+COPY scripts/health_check/health_check_client.py /app/health_check_client.py
+COPY scripts/health_check/health_check_requirements.txt /app/requirements.txt
+
+# Install dependencies (requirements.txt resolves against WORKDIR /app)
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Make script executable
+RUN chmod +x /app/health_check_client.py
+
+# Set entrypoint (exec form; the script is started through the interpreter)
+ENTRYPOINT ["python", "/app/health_check_client.py"]
--- a/scripts/health_check/health_check_client.py
+++ b/scripts/health_check/health_check_client.py
@ -0,0 +1,406 @@
+#!/usr/bin/env python3
+"""
+LiteLLM Health Check Client
+
+A sentinel health check tool that tests all configured models on a LiteLLM proxy.
+Similar to HRT's health check system, this script:
+- Can read models from YAML config file (like HRT) or fetch from proxy API
+- Sends a simple test request to each model concurrently
+- Reports health status for each model
+- Supports both chat/completion and embedding models
+"""
+
+import asyncio
+import json
+import os
+import sys
+import time
+from typing import Dict, List, Optional, Tuple
+
+import httpx
+import yaml
+
+
+class LiteLLMHealthCheckClient:
+    """Client for health checking LiteLLM proxy models."""
+
+    def __init__(
+        self,
+        base_url: str,
+        api_key: str,
+        timeout: int = 120,  # Match Go implementation's 120s timeout
+        completion_prompt: str = "Say this is a test",  # Match Go implementation
+        embedding_text: str = "This is a test for vectorization.",  # Match Go implementation
+    ):
+        """
+        Initialize the health check client.
+
+        Args:
+            base_url: Base URL of the LiteLLM proxy (e.g., https://litellm.example.com)
+            api_key: API key for authentication
+            timeout: Request timeout in seconds (default: 120, matching Go implementation)
+            completion_prompt: Test prompt for chat/completion models
+            embedding_text: Test text for embedding models
+        """
+        self.base_url = base_url.rstrip("/")
+        self.api_key = api_key
+        self.timeout = timeout
+        self.completion_prompt = completion_prompt
+        self.embedding_text = embedding_text
+        self.headers = {
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json",
+        }
+
+    def load_models_from_yaml(self, yaml_path: str) -> List[Dict]:
+        """
+        Load models from a YAML config file (similar to Go implementation).
+
+        Args:
+            yaml_path: Path to the YAML config file
+
+        Returns:
+            List of model dictionaries with 'id' and 'mode' keys
+        """
+        try:
+            with open(yaml_path, "r") as f:
+                config = yaml.safe_load(f)
+
+            model_list = config.get("model_list", [])
+            models = []
+
+            for entry in model_list:
+                model_name = entry.get("model_name", "")
+                litellm_params = entry.get("litellm_params", {})
+                model_info = litellm_params.get("model_info", {})
+                mode = model_info.get("mode", "")
+
+                # Use model_name as the ID (this is what gets sent to the API)
+                models.append(
+                    {
+                        "id": model_name,
+                        "mode": mode.lower() if mode else "",
+                        "provider": model_info.get("provider", ""),
+                    }
+                )
+
+            return models
+        except Exception as e:
+            print(f"Error loading models from YAML file {yaml_path}: {e}", file=sys.stderr)
+            return []
+
+    async def fetch_models(self, client: httpx.AsyncClient) -> List[Dict]:
+        """
+        Fetch all available models from the proxy API.
+
+        Returns:
+            List of model dictionaries with 'id' and 'mode' keys
+        """
+        try:
+            # Try /v1/models first (OpenAI-compatible endpoint)
+            response = await client.get(
+                f"{self.base_url}/v1/models",
+                headers=self.headers,
+                timeout=self.timeout,
+            )
+            response.raise_for_status()
+            data = response.json()
+            models_data = data.get("data", [])
+            models = []
+            for m in models_data:
+                models.append({"id": m["id"], "mode": "", "provider": ""})
+            return models
+        except Exception as e:
+            print(f"Error fetching models from /v1/models: {e}", file=sys.stderr)
+            # Fallback to /model/info endpoint which has more details
+            try:
+                response = await client.get(
+                    f"{self.base_url}/model/info",
+                    headers=self.headers,
+                    timeout=self.timeout,
+                )
+                response.raise_for_status()
+                data = response.json()
+                if isinstance(data, dict) and "data" in data:
+                    models_data = data["data"]
+                elif isinstance(data, list):
+                    models_data = data
+                else:
+                    models_data = []
+
+                models = []
+                for m in models_data:
+                    model_info = m.get("model_info", {})
+                    mode = model_info.get("mode", "")
+                    models.append(
+                        {
+                            "id": m.get("model_name", m.get("id", "unknown")),
+                            "mode": mode.lower() if mode else "",
+                            "provider": model_info.get("provider", ""),
+                        }
+                    )
+                return models
+            except Exception as e2:
+                print(f"Error fetching models from /model/info: {e2}", file=sys.stderr)
+                return []
+
+    async def check_model_health(
+        self, client: httpx.AsyncClient, model: Dict
+    ) -> Tuple[str, Dict]:
+        """
+        Check health of a single model by sending a test request.
+
+        Args:
+            client: HTTP client
+            model: Model dictionary with 'id' and 'mode' keys
+
+        Returns:
+            Tuple of (model_id, result_dict)
+        """
+        model_id = model["id"]
+        mode = model.get("mode", "")
+
+        start_time = time.time()
+        result = {
+            "model": model_id,
+            "healthy": False,
+            "error": None,
+            "response_time_ms": None,
+            "mode": mode,
+        }
+
+        try:
+            # Determine if this is an embedding model
+            # Check mode first (from config), then fall back to name-based detection
+            is_embedding = (
+                mode == "embedding"
+                or any(
+                    keyword in model_id.lower()
+                    for keyword in ["embedding", "embed", "text-embedding"]
+                )
+            )
+
+            if is_embedding:
+                # Test embedding endpoint (matching Go implementation)
+                embedding_response = await client.post(
+                    f"{self.base_url}/v1/embeddings",
+                    headers=self.headers,
+                    json={
+                        "model": model_id,
+                        "input": self.embedding_text,
+                    },
+                    timeout=self.timeout,
+                )
+                embedding_response.raise_for_status()
+                embedding_data = embedding_response.json()
+                dimensions = 0
+                if "data" in embedding_data and len(embedding_data["data"]) > 0:
+                    dimensions = len(embedding_data["data"][0].get("embedding", []))
+
+                result["healthy"] = True
+                result["mode"] = "embedding"
+                result["dimensions"] = dimensions
+            else:
+                # Test chat completion endpoint (matching Go implementation)
+                completion_response = await client.post(
+                    f"{self.base_url}/v1/chat/completions",
+                    headers=self.headers,
+                    json={
+                        "model": model_id,
+                        "messages": [
+                            {"role": "user", "content": self.completion_prompt}
+                        ],
+                        "max_tokens": 10,  # Minimal tokens for health check
+                    },
+                    timeout=self.timeout,
+                )
+                completion_response.raise_for_status()
+                completion_data = completion_response.json()
+                response_text = ""
+                if "choices" in completion_data and len(completion_data["choices"]) > 0:
+                    response_text = (
+                        completion_data["choices"][0]
+                        .get("message", {})
+                        .get("content", "")
+                    )
+
+                result["healthy"] = True
+                result["mode"] = "chat"
+                result["response_text"] = response_text[:100]  # Truncate for display
+
+            elapsed_ms = (time.time() - start_time) * 1000
+            result["response_time_ms"] = round(elapsed_ms, 2)
+
+        except httpx.HTTPStatusError as e:
+            result["error"] = f"HTTP {e.response.status_code}: {e.response.text[:200]}"
+        except httpx.TimeoutException:
+            result["error"] = f"Request timeout after {self.timeout}s"
+        except Exception as e:
+            result["error"] = str(e)[:200]
+
+        return model_id, result
+
+    async def run_health_checks(
+        self,
+        models: Optional[List[Dict]] = None,
+        models_only: Optional[List[str]] = None,
+    ) -> Dict[str, Dict]:
+        """
+        Run health checks on all models concurrently.
+
+        Args:
+            models: Optional list of models to check. If None, fetches from proxy.
+            models_only: Optional list of model IDs to check. If set, only these
+                models are health-checked (must exist in the models list).
+
+        Returns:
+            Dictionary mapping model_id to health check result
+        """
+        async with httpx.AsyncClient() as client:
+            if models is None:
+                models = await self.fetch_models(client)
+
+            if not models:
+                print("No models found to health check", file=sys.stderr)
+                return {}
+
+            if models_only:
+                allowlist = {m.strip() for m in models_only if m and m.strip()}
+                models = [m for m in models if m.get("id") in allowlist]
+                print(
+                    f"Filtering to only check {len(models)} models: {', '.join(sorted(allowlist))}",
+                    file=sys.stderr,
+                )
+                if not models:
+                    print(
+                        "No models matched LITELLM_MODELS_ONLY filter",
+                        file=sys.stderr,
+                    )
+                    return {}
+
+            print(f"Running health checks on {len(models)} models...", file=sys.stderr)
+
+            # Run all health checks concurrently
+            tasks = [self.check_model_health(client, model) for model in models]
+            results_list = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Convert to dictionary format
+            results = {}
+            for result in results_list:
+                if isinstance(result, Exception):
+                    print(
+                        f"Exception in health check task: {result}", file=sys.stderr
+                    )
+                    continue
+                # Type narrowing: after checking it's not an Exception, it's a Tuple
+                if isinstance(result, tuple) and len(result) == 2:
+                    model_id, result_dict = result
+                    results[model_id] = result_dict
+
+            return results
+
+    def print_results(self, results: Dict[str, Dict], json_output: bool = False):
+        """
+        Print health check results.
+
+        Args:
+            results: Dictionary of health check results
+            json_output: If True, output as JSON
+        """
+        if json_output:
+            print(json.dumps(results, indent=2))
+            return
+
+        healthy_count = sum(1 for r in results.values() if r.get("healthy"))
+        unhealthy_count = len(results) - healthy_count
+
+        # Print detailed results for each model (matching Go output format)
+        print(f"\n{'='*60}", file=sys.stderr)
+        print(f"Starting health check queries\n", file=sys.stderr)
+
+        for model_id, result in results.items():
+            if result.get("healthy"):
+                if result.get("mode") == "embedding":
+                    dimensions = result.get("dimensions", 0)
+                    print(
+                        f"---- {model_id} ----\n✅ Success. "
+                        f"Generated embedding vector with {dimensions} dimensions.\n\n",
+                        file=sys.stderr,
+                    )
+                else:
+                    response_text = result.get("response_text", "")
+                    print(
+                        f"---- {model_id} ----\n✅ Success. "
+                        f"Response:\n{response_text}\n\n",
+                        file=sys.stderr,
+                    )
+            else:
+                error = result.get("error", "Unknown error")
+                print(f"---- {model_id} ----\n❌ ERROR: {error}\n\n", file=sys.stderr)
+
+        print(f"{'='*60}", file=sys.stderr)
+        print(f"Health Check Summary", file=sys.stderr)
+        print(f"{'='*60}", file=sys.stderr)
+        print(f"Total models: {len(results)}", file=sys.stderr)
+        print(f"Healthy: {healthy_count}", file=sys.stderr)
+        print(f"Unhealthy: {unhealthy_count}", file=sys.stderr)
+        print(f"{'='*60}\n", file=sys.stderr)
+
+        # Exit with non-zero code if any models are unhealthy
+        if unhealthy_count > 0:
+            sys.exit(1)
+        else:
+            sys.exit(0)
+
+
+async def main():
+    """Main entry point."""
+    base_url = os.environ.get("LITELLM_BASE_URL", "http://localhost:4000")
+    api_key = os.environ.get("LITELLM_API_KEY", "sk-1234")
+    yaml_path = os.environ.get("LITELLM_MODELS_YAML")
+
+    if not base_url:
+        print("Error: LITELLM_BASE_URL environment variable not set", file=sys.stderr)
+        sys.exit(1)
+
+    if not api_key:
+        print("Error: LITELLM_API_KEY environment variable not set", file=sys.stderr)
+        sys.exit(1)
+
+    timeout = int(os.environ.get("LITELLM_TIMEOUT", "120"))  # Match Go's 120s default
+    completion_prompt = os.environ.get(
+        "LITELLM_COMPLETION_PROMPT", "Say this is a test"
+    )
+    embedding_text = os.environ.get(
+        "LITELLM_EMBEDDING_TEXT", "This is a test for vectorization."
+    )
+    json_output = os.environ.get("LITELLM_JSON_OUTPUT", "").lower() == "true"
+    # Optional: only health-check these model IDs (comma-separated). E.g.:
+    # LITELLM_MODELS_ONLY=claude-3.7-sonnet,claude-3.5-sonnet,claude-4.5-haiku
+    models_only_raw = os.environ.get("LITELLM_MODELS_ONLY", "")
+    models_only = [m.strip() for m in models_only_raw.split(",") if m.strip()] or None
+
+    client = LiteLLMHealthCheckClient(
+        base_url=base_url,
+        api_key=api_key,
+        timeout=timeout,
+        completion_prompt=completion_prompt,
+        embedding_text=embedding_text,
+    )
+
+    # Load models from YAML if provided, otherwise fetch from API
+    models = None
+    if yaml_path:
+        models = client.load_models_from_yaml(yaml_path)
+        if models:
+            print(
+                f"Successfully loaded {len(models)} models from {yaml_path}",
+                file=sys.stderr,
+            )
+
+    results = await client.run_health_checks(models=models, models_only=models_only)
+    client.print_results(results, json_output=json_output)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/scripts/health_check/health_check_client_README.md
+++ b/scripts/health_check/health_check_client_README.md
@ -0,0 +1,246 @@
+# LiteLLM Health Check Client
+
+A health check tool for testing all configured models on a LiteLLM proxy. Tests each model with completion/embedding requests and reports health status, errors, and response times.
+
+## Features
+
+- **YAML Config Support**: Reads models from YAML config file OR fetches from proxy API
+- **Smart Mode Detection**: Detects embedding vs chat models from config or model name
+- **Concurrent Testing**: Tests all models concurrently using asyncio
+- **Containerized**: Docker image for easy deployment
+- **Parallel Execution**: Supports parallel execution for stress testing
+- **Configurable**: Customizable timeouts (default 120s) and test prompts
+
+## Quick Start
+
+### As a Python Script
+
+**Option 1: Fetch models from proxy API**
+```bash
+export LITELLM_BASE_URL="https://litellm.example.com"
+export LITELLM_API_KEY="your-api-key"
+python scripts/health_check/health_check_client.py
+```
+
+**Option 2: Use YAML config file**
+```bash
+export LITELLM_BASE_URL="https://litellm.example.com"
+export LITELLM_API_KEY="your-api-key"
+export LITELLM_MODELS_YAML="/path/to/config.yaml"
+python scripts/health_check/health_check_client.py
+```
+
+### As a Docker Container
+
+1. Build the Docker image:
+
+```bash
+docker build -f docker/Dockerfile.health_check -t litellm/litellm-health-check:latest .
+```
+
+2. Run a single health check:
+
+```bash
+docker run --rm \
+  -e LITELLM_BASE_URL="https://litellm.example.com" \
+  -e LITELLM_API_KEY="your-api-key" \
+  litellm/litellm-health-check:latest
+```
+
+### Parallel Execution (Stress Testing)
+
+Run multiple health check containers in parallel:
+
+**PowerShell:**
+```powershell
+$env:LITELLM_BASE_URL="https://litellm.example.com"
+$env:LITELLM_API_KEY="your-api-key"
+.\scripts\health_check\run_parallel_health_checks.ps1 16
+```
+
+**Bash/Shell:**
+```bash
+export LITELLM_BASE_URL="https://litellm.example.com"
+export LITELLM_API_KEY="your-api-key"
+./scripts/health_check/run_parallel_health_checks.sh 16
+```
+
+
+## Configuration
+
+### Environment Variables
+
+- `LITELLM_BASE_URL` (required): Base URL of the LiteLLM proxy
+  - Example: `https://litellm.example.com`
+- `LITELLM_API_KEY` (required): API key for authentication
+- `LITELLM_MODELS_YAML` (optional): Path to YAML config file with model_list
+  - If provided, reads models from YAML instead of fetching from API
+  - Example: `/path/to/config.yaml`
+- `LITELLM_TIMEOUT` (optional): Request timeout in seconds (default: 120)
+- `LITELLM_COMPLETION_PROMPT` (optional): Test prompt for chat/completion models (default: "Say this is a test")
+- `LITELLM_EMBEDDING_TEXT` (optional): Test text for embedding models (default: "This is a test for vectorization.")
+- `LITELLM_JSON_OUTPUT` (optional): Output results as JSON (default: false)
+
+## Output
+
+### Standard Output (Human-Readable)
+
+Example output format:
+
+```
+============================================================
+Starting health check queries
+
+---- gpt-4o ----
+✅ Success. Response:
+This is a test
+
+---- text-embedding-3-small ----
+✅ Success. Generated embedding vector with 1536 dimensions.
+
+---- gpt-5-codex ----
+❌ ERROR: HTTP 503: Service unavailable
+
+============================================================
+Health Check Summary
+============================================================
+Total models: 47
+Healthy: 45
+Unhealthy: 2
+============================================================
+```
+
+Exit code: `0` if all models are healthy, `1` if any models are unhealthy.
+
+### JSON Output
+
+When `LITELLM_JSON_OUTPUT=true`, outputs JSON:
+
+```json
+{
+  "gpt-4o": {
+    "model": "gpt-4o",
+    "healthy": true,
+    "error": null,
+    "response_time_ms": 245.67,
+    "mode": "chat",
+    "response_text": "This is a test"
+  },
+  "text-embedding-3-small": {
+    "model": "text-embedding-3-small",
+    "healthy": true,
+    "error": null,
+    "response_time_ms": 123.45,
+    "mode": "embedding",
+    "dimensions": 1536
+  }
+}
+```
+
+## How It Works
+
+1. **Model Discovery**: 
+   - If `LITELLM_MODELS_YAML` is set: Reads models from YAML config file
+   - Otherwise: Queries `/v1/models` (OpenAI-compatible) or `/model/info` to get all configured models
+2. **Mode Detection**: 
+   - Checks `mode` field from YAML config, or falls back to model name patterns (embedding, embed, text-embedding)
+3. **Concurrent Testing**: 
+   - Chat models: `POST /v1/chat/completions` with configurable prompt (default: "Say this is a test")
+   - Embedding models: `POST /v1/embeddings` with configurable text (default: "This is a test for vectorization.")
+4. **Reporting**: Health status, errors, response times, and response details are reported
+
+## Use Cases
+
+### 1. Regular Health Monitoring
+
+Run as a cron job or scheduled task:
+
+```bash
+# Cron job: Run every 5 minutes
+*/5 * * * * /path/to/health_check.sh
+```
+
+### 2. Load/Stress Testing
+
+Run multiple health checks in parallel:
+
+**PowerShell:**
+```powershell
+.\scripts\health_check\run_parallel_health_checks.ps1 16
+```
+
+### 3. CI/CD Integration
+
+Add to your deployment pipeline:
+
+```yaml
+# GitHub Actions example
+- name: Health Check
+  run: |
+    docker run --rm \
+      -e LITELLM_BASE_URL="${{ secrets.LITELLM_BASE_URL }}" \
+      -e LITELLM_API_KEY="${{ secrets.LITELLM_API_KEY }}" \
+      litellm/litellm-health-check:latest
+```
+
+### 4. Kubernetes Deployment
+
+Deploy as a CronJob:
+
+```yaml
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: litellm-health-check
+spec:
+  schedule: "*/5 * * * *"  # Every 5 minutes
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+          - name: health-check
+            image: litellm/litellm-health-check:latest
+            env:
+            - name: LITELLM_BASE_URL
+              value: "https://litellm.example.com"
+            - name: LITELLM_API_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: litellm-secrets
+                  key: api-key
+          restartPolicy: OnFailure
+```
+
+## Troubleshooting
+
+### No Models Found
+
+- Verify `LITELLM_BASE_URL` is correct
+- Check that the API key has permissions to list models
+- Ensure the proxy is running and accessible
+- If using YAML, verify `LITELLM_MODELS_YAML` path is correct
+
+### Timeout Errors
+
+- Increase `LITELLM_TIMEOUT` for slower models (default is 120s)
+- Check network connectivity to the proxy
+- Verify proxy isn't overloaded
+
+### Authentication Errors
+
+- Verify `LITELLM_API_KEY` is correct
+- Check API key has not expired
+- Ensure the key has necessary permissions
+
+## Dependencies
+
+- Python 3.11+
+- httpx (for async HTTP requests)
+- pyyaml (for YAML config file support)
+- Docker or Podman (for containerized execution)
+- PowerShell (for parallel execution script on Windows)
+
+## License
+
+Same as LiteLLM project.
--- a/scripts/health_check/health_check_requirements.txt
+++ b/scripts/health_check/health_check_requirements.txt
@ -0,0 +1,2 @@
+httpx>=0.24.0
+pyyaml>=6.0
--- a/scripts/health_check/run_parallel_health_checks.ps1
+++ b/scripts/health_check/run_parallel_health_checks.ps1
@ -0,0 +1,69 @@
+# Parallel LiteLLM Health Check Runner (PowerShell version)
+#
+# This script runs multiple health check containers in parallel and keeps
+# starting new batches until it is interrupted.
+# It needs PowerShell 7 or newer because it uses ForEach-Object -Parallel.
+#
+# Usage:
+#   $env:LITELLM_BASE_URL="https://litellm.example.com"
+#   $env:LITELLM_API_KEY="your-api-key"
+#   .\run_parallel_health_checks.ps1 [num_parallel_jobs] [image_name] [container_runtime]
+#
+# Defaults:
+#   - num_parallel_jobs: 16
+#   - image_name: litellm/litellm-health-check:latest
+#   - container_runtime: docker
+
+#Requires -Version 7.0
+
+param(
+    # Must be at least 1: the range 1..N counts downward for N < 1.
+    [ValidateRange(1, [int]::MaxValue)]
+    [int]$NumParallelJobs = 16,
+    [string]$ImageName = "litellm/litellm-health-check:latest",
+    [string]$ContainerRuntime = "docker"
+)
+
+# Set defaults for environment variables if not provided
+if (-not $env:LITELLM_BASE_URL) {
+    $env:LITELLM_BASE_URL = "https://litellm-perf-cache-and-router.onrender.com"
+    Write-Warning "LITELLM_BASE_URL not set, using default: $env:LITELLM_BASE_URL"
+}
+
+if (-not $env:LITELLM_API_KEY) {
+    $env:LITELLM_API_KEY = "sk-1234"
+    Write-Warning "LITELLM_API_KEY not set, using default: $env:LITELLM_API_KEY"
+}
+
+# Check if container runtime is available
+$runtimeExists = Get-Command $ContainerRuntime -ErrorAction SilentlyContinue
+if (-not $runtimeExists) {
+    Write-Error "Error: $ContainerRuntime is not installed"
+    exit 1
+}
+
+Write-Host "Running $NumParallelJobs parallel health check containers..." -ForegroundColor Yellow
+Write-Host "Using image: $ImageName" -ForegroundColor Yellow
+Write-Host "Container runtime: $ContainerRuntime" -ForegroundColor Yellow
+Write-Host "LiteLLM Base URL: $env:LITELLM_BASE_URL" -ForegroundColor Cyan
+Write-Host ""
+Write-Host "NOTE: This will run continuously. Press Ctrl+C to stop." -ForegroundColor Red
+Write-Host ""
+Write-Host "Troubleshooting:" -ForegroundColor Yellow
+Write-Host "  - If you see 'All connection attempts failed', check:" -ForegroundColor Yellow
+Write-Host "    1. Is the LiteLLM proxy running on the expected port?" -ForegroundColor Yellow
+Write-Host "    2. Set LITELLM_BASE_URL to the correct URL (e.g., http://host.docker.internal:PORT)" -ForegroundColor Yellow
+Write-Host "    3. On Linux, you may need to use the host IP instead of host.docker.internal" -ForegroundColor Yellow
+Write-Host ""
+
+# Run parallel health checks
+# This creates an infinite loop that keeps spawning containers
+# Each container tests all models, then exits, and a new batch starts
+while ($true) {
+    # Start NumParallelJobs containers in parallel and wait for the batch
+    1..$NumParallelJobs | ForEach-Object -Parallel {
+        # Script variables must be imported with $using: inside -Parallel;
+        # environment variables are process-wide and can be read directly.
+        $runtime = $using:ContainerRuntime
+        $imageName = $using:ImageName
+        $baseUrl = $env:LITELLM_BASE_URL
+        $apiKey = $env:LITELLM_API_KEY
+
+        & $runtime run --rm `
+            -e "LITELLM_BASE_URL=$baseUrl" `
+            -e "LITELLM_API_KEY=$apiKey" `
+            -e "LITELLM_JSON_OUTPUT=true" `
+            $imageName
+    } -ThrottleLimit $NumParallelJobs
+}
--- a/scripts/health_check/run_parallel_health_checks.sh
+++ b/scripts/health_check/run_parallel_health_checks.sh
@ -0,0 +1,79 @@
+#!/bin/bash
+# Parallel LiteLLM Health Check Runner (Bash version)
+#
+# This script runs multiple health check containers in parallel and keeps
+# starting new batches until it is interrupted.
+#
+# Usage:
+#   export LITELLM_BASE_URL="https://litellm.example.com"
+#   export LITELLM_API_KEY="your-api-key"
+#   ./run_parallel_health_checks.sh [num_parallel_jobs] [image_name] [container_runtime]
+#
+# Defaults:
+#   - num_parallel_jobs: 16
+#   - image_name: litellm/litellm-health-check:latest
+#   - container_runtime: docker
+
+set -e
+
+# Default values
+NUM_PARALLEL_JOBS="${1:-16}"
+IMAGE_NAME="${2:-litellm/litellm-health-check:latest}"
+CONTAINER_RUNTIME="${3:-docker}"
+
+# Reject zero or non-numeric job counts: the batch loop below would otherwise
+# start no containers and spin forever without doing any work.
+if ! [[ "$NUM_PARALLEL_JOBS" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: num_parallel_jobs must be a positive integer, got '$NUM_PARALLEL_JOBS'" >&2
+    exit 1
+fi
+
+# Set defaults for environment variables if not provided
+if [ -z "$LITELLM_BASE_URL" ]; then
+    export LITELLM_BASE_URL="https://litellm-perf-cache-and-router.onrender.com"
+    echo "Warning: LITELLM_BASE_URL not set, using default: $LITELLM_BASE_URL" >&2
+fi
+
+if [ -z "$LITELLM_API_KEY" ]; then
+    export LITELLM_API_KEY="sk-1234"
+    echo "Warning: LITELLM_API_KEY not set, using default: $LITELLM_API_KEY" >&2
+fi
+
+# Check if container runtime is available
+if ! command -v "$CONTAINER_RUNTIME" &> /dev/null; then
+    echo "Error: $CONTAINER_RUNTIME is not installed" >&2
+    exit 1
+fi
+
+# Print configuration
+echo "Running $NUM_PARALLEL_JOBS parallel health check containers..."
+echo "Using image: $IMAGE_NAME"
+echo "Container runtime: $CONTAINER_RUNTIME"
+echo "LiteLLM Base URL: $LITELLM_BASE_URL"
+echo ""
+echo "NOTE: This will run continuously. Press Ctrl+C to stop."
+echo ""
+echo "Troubleshooting:"
+echo "  - If you see 'All connection attempts failed', check:"
+echo "    1. Is the LiteLLM proxy running on the expected port?"
+echo "    2. Set LITELLM_BASE_URL to the correct URL (e.g., http://host.docker.internal:PORT)"
+echo "    3. On Linux, you may need to use the host IP instead of host.docker.internal"
+echo ""
+
+# Function to run a single health check container
+run_health_check() {
+    "$CONTAINER_RUNTIME" run --rm \
+        -e LITELLM_BASE_URL="$LITELLM_BASE_URL" \
+        -e LITELLM_API_KEY="$LITELLM_API_KEY" \
+        -e LITELLM_JSON_OUTPUT="true" \
+        "$IMAGE_NAME"
+}
+
+# Run parallel health checks
+# This creates an infinite loop that keeps spawning containers
+# Each container tests all models, then exits, and a new batch starts
+while true; do
+    # Start containers in parallel using background jobs
+    pids=()
+    for ((i=1; i<=NUM_PARALLEL_JOBS; i++)); do
+        run_health_check &
+        pids+=("$!")
+    done
+
+    # Wait for all background jobs to complete. `|| true` keeps a failing
+    # container from aborting the whole runner under `set -e`.
+    for pid in "${pids[@]}"; do
+        wait "$pid" 2>/dev/null || true
+    done
+done