Add health check scripts and parallel execution support (#19295)
- Add health_check_client.py for monitoring model availability
- Add health_check_client_README.md with usage documentation
- Add health_check_requirements.txt for dependencies
- Add run_parallel_health_checks.ps1 (PowerShell version)
- Add run_parallel_health_checks.sh (Bash version)
- Organize all scripts under scripts/health_check/ directory
This commit is contained in:
parent
0862373b38
commit
0cd7763d5f
16
docker/Dockerfile.health_check
Normal file
16
docker/Dockerfile.health_check
Normal file
@ -0,0 +1,16 @@
|
||||
# Image that runs the LiteLLM health check client once and exits.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies first: this layer is only rebuilt when the
# requirements file changes, not on every edit to the script below.
COPY scripts/health_check/health_check_requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the health check script last so script-only changes reuse the
# cached dependency layer above.
COPY scripts/health_check/health_check_client.py /app/health_check_client.py

# Make script executable
RUN chmod +x /app/health_check_client.py

# Set entrypoint; configuration is supplied through LITELLM_* env variables.
ENTRYPOINT ["python", "/app/health_check_client.py"]
|
||||
406
scripts/health_check/health_check_client.py
Normal file
406
scripts/health_check/health_check_client.py
Normal file
@ -0,0 +1,406 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
LiteLLM Health Check Client
|
||||
|
||||
A sentinel health check tool that tests all configured models on a LiteLLM proxy.
|
||||
Similar to HRT's health check system, this script:
|
||||
- Can read models from YAML config file (like HRT) or fetch from proxy API
|
||||
- Sends a simple test request to each model concurrently
|
||||
- Reports health status for each model
|
||||
- Supports both chat/completion and embedding models
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import httpx
|
||||
import yaml
|
||||
|
||||
|
||||
class LiteLLMHealthCheckClient:
    """Client for health checking LiteLLM proxy models.

    The client discovers models (from a YAML config or from the proxy API),
    sends one small chat-completion or embedding request per model
    concurrently, and reports a per-model result dictionary.
    """

    # Substrings of a lower-cased model ID that mark it as an embedding model
    # when no explicit mode is configured. "embed" also covers "embedding"
    # and "text-embedding".
    _EMBEDDING_NAME_HINTS = ("embed",)

    def __init__(
        self,
        base_url: str,
        api_key: str,
        timeout: int = 120,  # Match Go implementation's 120s timeout
        completion_prompt: str = "Say this is a test",  # Match Go implementation
        embedding_text: str = "This is a test for vectorization.",  # Match Go implementation
    ):
        """
        Initialize the health check client.

        Args:
            base_url: Base URL of the LiteLLM proxy (e.g., https://litellm.example.com)
            api_key: API key for authentication
            timeout: Request timeout in seconds (default: 120, matching Go implementation)
            completion_prompt: Test prompt for chat/completion models
            embedding_text: Test text for embedding models
        """
        # Trailing slashes are dropped so endpoint paths can be appended as-is.
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.timeout = timeout
        self.completion_prompt = completion_prompt
        self.embedding_text = embedding_text
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    @staticmethod
    def _models_from_config(config: Optional[Dict]) -> List[Dict]:
        """Convert a parsed config mapping into model dictionaries.

        ``model_info`` is read from the model_list entry itself (the same
        place the /model/info parsing in ``fetch_models`` looks); a
        ``model_info`` nested under ``litellm_params`` is still honoured as a
        fallback. Entries without a ``model_name`` are skipped because there
        is nothing to send to the API for them.

        Args:
            config: Parsed YAML document, or None for an empty file

        Returns:
            List of dictionaries with 'id', 'mode' and 'provider' keys
        """
        if not isinstance(config, dict):
            return []

        models: List[Dict] = []
        for entry in config.get("model_list") or []:
            if not isinstance(entry, dict):
                continue
            model_name = entry.get("model_name") or ""
            if not model_name:
                continue
            litellm_params = entry.get("litellm_params") or {}
            model_info = (
                entry.get("model_info") or litellm_params.get("model_info") or {}
            )
            mode = model_info.get("mode") or ""
            # Use model_name as the ID (this is what gets sent to the API)
            models.append(
                {
                    "id": model_name,
                    "mode": str(mode).lower(),
                    "provider": model_info.get("provider", ""),
                }
            )
        return models

    @classmethod
    def _is_embedding_model(cls, model_id: str, mode: str) -> bool:
        """Decide whether a model should be tested via the embeddings endpoint.

        An explicit mode always wins; the model name is only inspected when
        no mode is known.
        """
        if mode:
            return mode.lower() == "embedding"
        lowered = model_id.lower()
        return any(hint in lowered for hint in cls._EMBEDDING_NAME_HINTS)

    def load_models_from_yaml(self, yaml_path: str) -> List[Dict]:
        """
        Load models from a YAML config file (similar to Go implementation).

        Args:
            yaml_path: Path to the YAML config file

        Returns:
            List of model dictionaries with 'id' and 'mode' keys. An empty
            list is returned (and the problem reported on stderr) when the
            file cannot be read or parsed.
        """
        try:
            with open(yaml_path, "r", encoding="utf-8") as f:
                config = yaml.safe_load(f)
            if not isinstance(config, dict):
                print(
                    f"Error loading models from YAML file {yaml_path}: "
                    "top-level document is not a mapping",
                    file=sys.stderr,
                )
                return []
            return self._models_from_config(config)
        except Exception as e:
            # Best-effort boundary: any read/parse problem means "no models".
            print(f"Error loading models from YAML file {yaml_path}: {e}", file=sys.stderr)
            return []

    async def fetch_models(self, client: "httpx.AsyncClient") -> List[Dict]:
        """
        Fetch all available models from the proxy API.

        /v1/models is tried first; if that request or its parsing fails,
        /model/info is used as a fallback.

        Args:
            client: HTTP client used for the requests

        Returns:
            List of model dictionaries with 'id' and 'mode' keys, or an empty
            list when both endpoints fail
        """
        try:
            # Try /v1/models first (OpenAI-compatible endpoint)
            response = await client.get(
                f"{self.base_url}/v1/models",
                headers=self.headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
            # This endpoint carries no mode/provider information.
            return [
                {"id": m["id"], "mode": "", "provider": ""}
                for m in data.get("data") or []
            ]
        except Exception as e:
            print(f"Error fetching models from /v1/models: {e}", file=sys.stderr)

        # Fallback to /model/info endpoint which has more details
        try:
            response = await client.get(
                f"{self.base_url}/model/info",
                headers=self.headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
            if isinstance(data, dict) and "data" in data:
                models_data = data["data"] or []
            elif isinstance(data, list):
                models_data = data
            else:
                models_data = []

            models = []
            for m in models_data:
                model_info = m.get("model_info") or {}
                mode = model_info.get("mode") or ""
                models.append(
                    {
                        "id": m.get("model_name", m.get("id", "unknown")),
                        "mode": str(mode).lower(),
                        "provider": model_info.get("provider", ""),
                    }
                )
            return models
        except Exception as e2:
            print(f"Error fetching models from /model/info: {e2}", file=sys.stderr)
            return []

    async def check_model_health(
        self, client: "httpx.AsyncClient", model: Dict
    ) -> Tuple[str, Dict]:
        """
        Check health of a single model by sending a test request.

        Args:
            client: HTTP client
            model: Model dictionary with 'id' and 'mode' keys

        Returns:
            Tuple of (model_id, result_dict). The result always contains
            'model', 'healthy', 'error', 'response_time_ms' and 'mode';
            successful checks add 'dimensions' (embedding) or
            'response_text' (chat).
        """
        model_id = model["id"]
        mode = model.get("mode") or ""

        result = {
            "model": model_id,
            "healthy": False,
            "error": None,
            "response_time_ms": None,
            "mode": mode,
        }
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted mid-request.
        start_time = time.perf_counter()

        try:
            if self._is_embedding_model(model_id, mode):
                # Test embedding endpoint (matching Go implementation)
                embedding_response = await client.post(
                    f"{self.base_url}/v1/embeddings",
                    headers=self.headers,
                    json={
                        "model": model_id,
                        "input": self.embedding_text,
                    },
                    timeout=self.timeout,
                )
                embedding_response.raise_for_status()
                embedding_data = embedding_response.json()
                items = embedding_data.get("data") or []
                dimensions = len(items[0].get("embedding") or []) if items else 0

                # Mark healthy only after the response was parsed, so a
                # parsing error can never leave healthy=True with an error.
                result["healthy"] = True
                result["mode"] = "embedding"
                result["dimensions"] = dimensions
            else:
                # Test chat completion endpoint (matching Go implementation)
                completion_response = await client.post(
                    f"{self.base_url}/v1/chat/completions",
                    headers=self.headers,
                    json={
                        "model": model_id,
                        "messages": [
                            {"role": "user", "content": self.completion_prompt}
                        ],
                        "max_tokens": 10,  # Minimal tokens for health check
                    },
                    timeout=self.timeout,
                )
                completion_response.raise_for_status()
                completion_data = completion_response.json()
                choices = completion_data.get("choices") or []
                response_text = ""
                if choices:
                    message = choices[0].get("message") or {}
                    # "content" may be present but null; treat that as empty
                    # text instead of failing the check.
                    response_text = str(message.get("content") or "")

                result["healthy"] = True
                result["mode"] = "chat"
                result["response_text"] = response_text[:100]  # Truncate for display
        except httpx.HTTPStatusError as e:
            result["error"] = f"HTTP {e.response.status_code}: {e.response.text[:200]}"
        except httpx.TimeoutException:
            result["error"] = f"Request timeout after {self.timeout}s"
        except Exception as e:
            # Some exceptions have an empty message; fall back to the type name.
            result["error"] = str(e)[:200] or type(e).__name__
        finally:
            # Recorded for failures too, so slow failures are visible.
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            result["response_time_ms"] = round(elapsed_ms, 2)

        return model_id, result

    async def run_health_checks(
        self,
        models: Optional[List[Dict]] = None,
        models_only: Optional[List[str]] = None,
    ) -> Dict[str, Dict]:
        """
        Run health checks on all models concurrently.

        Args:
            models: Optional list of models to check. If None, fetches from proxy.
            models_only: Optional list of model IDs to check. If set, only these
                models are health-checked (must exist in the models list).

        Returns:
            Dictionary mapping model_id to health check result. Empty when no
            model could be checked.
        """
        async with httpx.AsyncClient() as client:
            if models is None:
                models = await self.fetch_models(client)

            # Results are keyed by model ID, so checking the same ID twice
            # would only waste a request; entries without an ID cannot be
            # sent to the API at all.
            unique_models: Dict[str, Dict] = {}
            for m in models or []:
                model_id = m.get("id")
                if model_id:
                    unique_models.setdefault(model_id, m)
            models = list(unique_models.values())

            if not models:
                print("No models found to health check", file=sys.stderr)
                return {}

            if models_only:
                allowlist = {m.strip() for m in models_only if m and m.strip()}
                models = [m for m in models if m.get("id") in allowlist]
                print(
                    f"Filtering to only check {len(models)} models: {', '.join(sorted(allowlist))}",
                    file=sys.stderr,
                )
                missing = allowlist - {m["id"] for m in models}
                if missing:
                    print(
                        f"Requested models not found: {', '.join(sorted(missing))}",
                        file=sys.stderr,
                    )
                if not models:
                    print(
                        "No models matched LITELLM_MODELS_ONLY filter",
                        file=sys.stderr,
                    )
                    return {}

            print(f"Running health checks on {len(models)} models...", file=sys.stderr)

            # Run all health checks concurrently
            tasks = [self.check_model_health(client, model) for model in models]
            outcomes = await asyncio.gather(*tasks, return_exceptions=True)

            # gather() preserves input order, so each outcome can be paired
            # with the model it belongs to.
            results: Dict[str, Dict] = {}
            for model, outcome in zip(models, outcomes):
                if isinstance(outcome, BaseException):
                    print(
                        f"Exception in health check task: {outcome}", file=sys.stderr
                    )
                    # Count the model as unhealthy instead of dropping it.
                    results[model["id"]] = {
                        "model": model["id"],
                        "healthy": False,
                        "error": f"{type(outcome).__name__}: {outcome}"[:200],
                        "response_time_ms": None,
                        "mode": model.get("mode") or "",
                    }
                    continue
                model_id, result_dict = outcome
                results[model_id] = result_dict

            return results

    def print_results(self, results: Dict[str, Dict], json_output: bool = False):
        """
        Print health check results and exit the process.

        JSON goes to stdout; the human-readable report goes to stderr. In
        both modes the process exits with status 1 if any model is unhealthy
        and 0 otherwise, so this method does not return.

        Args:
            results: Dictionary of health check results
            json_output: If True, output as JSON
        """
        healthy_count = sum(1 for r in results.values() if r.get("healthy"))
        unhealthy_count = len(results) - healthy_count

        if json_output:
            print(json.dumps(results, indent=2))
        else:
            separator = "=" * 60

            # Print detailed results for each model (matching Go output format)
            print(f"\n{separator}", file=sys.stderr)
            print("Starting health check queries\n", file=sys.stderr)

            for model_id, result in results.items():
                if not result.get("healthy"):
                    error = result.get("error") or "Unknown error"
                    print(f"---- {model_id} ----\n❌ ERROR: {error}\n\n", file=sys.stderr)
                elif result.get("mode") == "embedding":
                    dimensions = result.get("dimensions", 0)
                    print(
                        f"---- {model_id} ----\n✅ Success. "
                        f"Generated embedding vector with {dimensions} dimensions.\n\n",
                        file=sys.stderr,
                    )
                else:
                    response_text = result.get("response_text", "")
                    print(
                        f"---- {model_id} ----\n✅ Success. "
                        f"Response:\n{response_text}\n\n",
                        file=sys.stderr,
                    )

            print(separator, file=sys.stderr)
            print("Health Check Summary", file=sys.stderr)
            print(separator, file=sys.stderr)
            print(f"Total models: {len(results)}", file=sys.stderr)
            print(f"Healthy: {healthy_count}", file=sys.stderr)
            print(f"Unhealthy: {unhealthy_count}", file=sys.stderr)
            print(f"{separator}\n", file=sys.stderr)

        # Exit with non-zero code if any models are unhealthy
        sys.exit(1 if unhealthy_count > 0 else 0)
|
||||
|
||||
|
||||
async def main():
    """Main entry point.

    Reads configuration from LITELLM_* environment variables, runs the
    health checks and exits with a status reflecting the outcome: 1 when the
    configuration is invalid, when no model could be checked, or when any
    checked model is unhealthy.
    """
    base_url = os.environ.get("LITELLM_BASE_URL", "http://localhost:4000")
    api_key = os.environ.get("LITELLM_API_KEY", "sk-1234")
    yaml_path = os.environ.get("LITELLM_MODELS_YAML")

    # Both variables have defaults, so these only trigger when a variable is
    # explicitly set to an empty string.
    if not base_url:
        print("Error: LITELLM_BASE_URL environment variable not set", file=sys.stderr)
        sys.exit(1)

    if not api_key:
        print("Error: LITELLM_API_KEY environment variable not set", file=sys.stderr)
        sys.exit(1)

    timeout_raw = os.environ.get("LITELLM_TIMEOUT", "120")  # Match Go's 120s default
    try:
        timeout = int(timeout_raw)
    except ValueError:
        timeout = 0
    if timeout <= 0:
        # Report a clear configuration error instead of a traceback.
        print(
            f"Error: LITELLM_TIMEOUT must be a positive integer number of seconds, got {timeout_raw!r}",
            file=sys.stderr,
        )
        sys.exit(1)

    completion_prompt = os.environ.get(
        "LITELLM_COMPLETION_PROMPT", "Say this is a test"
    )
    embedding_text = os.environ.get(
        "LITELLM_EMBEDDING_TEXT", "This is a test for vectorization."
    )
    json_output = os.environ.get("LITELLM_JSON_OUTPUT", "").lower() == "true"
    # Optional: only health-check these model IDs (comma-separated). E.g.:
    # LITELLM_MODELS_ONLY=claude-3.7-sonnet,claude-3.5-sonnet,claude-4.5-haiku
    models_only_raw = os.environ.get("LITELLM_MODELS_ONLY", "")
    models_only = [m.strip() for m in models_only_raw.split(",") if m.strip()] or None

    client = LiteLLMHealthCheckClient(
        base_url=base_url,
        api_key=api_key,
        timeout=timeout,
        completion_prompt=completion_prompt,
        embedding_text=embedding_text,
    )

    # Load models from YAML if provided, otherwise fetch from API
    models = None
    if yaml_path:
        models = client.load_models_from_yaml(yaml_path)
        if models:
            print(
                f"Successfully loaded {len(models)} models from {yaml_path}",
                file=sys.stderr,
            )

    results = await client.run_health_checks(models=models, models_only=models_only)

    if not results:
        # Nothing was checked (proxy unreachable, unreadable YAML, or the
        # filter matched nothing). Reporting success here would hide an
        # outage, so fail explicitly.
        if json_output:
            print(json.dumps(results, indent=2))
        print("Error: no models were health-checked", file=sys.stderr)
        sys.exit(1)

    client.print_results(results, json_output=json_output)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
246
scripts/health_check/health_check_client_README.md
Normal file
246
scripts/health_check/health_check_client_README.md
Normal file
@ -0,0 +1,246 @@
|
||||
# LiteLLM Health Check Client
|
||||
|
||||
A health check tool for testing all configured models on a LiteLLM proxy. Tests each model with completion/embedding requests and reports health status, errors, and response times.
|
||||
|
||||
## Features
|
||||
|
||||
- **YAML Config Support**: Reads models from YAML config file OR fetches from proxy API
|
||||
- **Smart Mode Detection**: Detects embedding vs chat models from config or model name
|
||||
- **Concurrent Testing**: Tests all models concurrently using asyncio
|
||||
- **Containerized**: Docker image for easy deployment
|
||||
- **Parallel Execution**: Supports parallel execution for stress testing
|
||||
- **Configurable**: Customizable timeouts (default 120s) and test prompts
|
||||
|
||||
## Quick Start
|
||||
|
||||
### As a Python Script
|
||||
|
||||
**Option 1: Fetch models from proxy API**
|
||||
```bash
|
||||
export LITELLM_BASE_URL="https://litellm.example.com"
|
||||
export LITELLM_API_KEY="your-api-key"
|
||||
python scripts/health_check/health_check_client.py
|
||||
```
|
||||
|
||||
**Option 2: Use YAML config file**
|
||||
```bash
|
||||
export LITELLM_BASE_URL="https://litellm.example.com"
|
||||
export LITELLM_API_KEY="your-api-key"
|
||||
export LITELLM_MODELS_YAML="/path/to/config.yaml"
|
||||
python scripts/health_check/health_check_client.py
|
||||
```
|
||||
|
||||
### As a Docker Container
|
||||
|
||||
1. Build the Docker image:
|
||||
|
||||
```bash
|
||||
docker build -f docker/Dockerfile.health_check -t litellm/litellm-health-check:latest .
|
||||
```
|
||||
|
||||
2. Run a single health check:
|
||||
|
||||
```bash
|
||||
docker run --rm \
|
||||
-e LITELLM_BASE_URL="https://litellm.example.com" \
|
||||
-e LITELLM_API_KEY="your-api-key" \
|
||||
litellm/litellm-health-check:latest
|
||||
```
|
||||
|
||||
### Parallel Execution (Stress Testing)
|
||||
|
||||
Run multiple health check containers in parallel:
|
||||
|
||||
**PowerShell:**
|
||||
```powershell
|
||||
$env:LITELLM_BASE_URL="https://litellm.example.com"
|
||||
$env:LITELLM_API_KEY="your-api-key"
|
||||
.\scripts\health_check\run_parallel_health_checks.ps1 16
|
||||
```
|
||||
|
||||
**Bash/Shell:**
|
||||
```bash
|
||||
export LITELLM_BASE_URL="https://litellm.example.com"
|
||||
export LITELLM_API_KEY="your-api-key"
|
||||
./scripts/health_check/run_parallel_health_checks.sh 16
|
||||
```
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
### Environment Variables
|
||||
|
||||
- `LITELLM_BASE_URL` (recommended; defaults to `http://localhost:4000` when unset): Base URL of the LiteLLM proxy
|
||||
- Example: `https://litellm.example.com`
|
||||
- `LITELLM_API_KEY` (recommended; defaults to `sk-1234` when unset): API key for authentication
|
||||
- `LITELLM_MODELS_YAML` (optional): Path to YAML config file with model_list
|
||||
- If provided, reads models from YAML instead of fetching from API
|
||||
- Example: `/path/to/config.yaml`
|
||||
- `LITELLM_TIMEOUT` (optional): Request timeout in seconds (default: 120)
|
||||
- `LITELLM_COMPLETION_PROMPT` (optional): Test prompt for chat/completion models (default: "Say this is a test")
|
||||
- `LITELLM_EMBEDDING_TEXT` (optional): Test text for embedding models (default: "This is a test for vectorization.")
|
||||
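- `LITELLM_JSON_OUTPUT` (optional): Set to `true` to output results as JSON (default: false)
- `LITELLM_MODELS_ONLY` (optional): Comma-separated list of model IDs to check; all other models are skipped (default: check every model)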
|
||||
|
||||
## Output
|
||||
|
||||
### Standard Output (Human-Readable)
|
||||
|
||||
Example output format:
|
||||
|
||||
```
|
||||
============================================================
|
||||
Starting health check queries
|
||||
|
||||
---- gpt-4o ----
|
||||
✅ Success. Response:
|
||||
This is a test
|
||||
|
||||
---- text-embedding-3-small ----
|
||||
✅ Success. Generated embedding vector with 1536 dimensions.
|
||||
|
||||
---- gpt-5-codex ----
|
||||
❌ ERROR: HTTP 503: Service unavailable
|
||||
|
||||
============================================================
|
||||
Health Check Summary
|
||||
============================================================
|
||||
Total models: 47
|
||||
Healthy: 45
|
||||
Unhealthy: 2
|
||||
============================================================
|
||||
```
|
||||
|
||||
Exit code: `0` if all models are healthy, `1` if any models are unhealthy.
|
||||
|
||||
### JSON Output
|
||||
|
||||
When `LITELLM_JSON_OUTPUT=true`, outputs JSON:
|
||||
|
||||
```json
|
||||
{
|
||||
"gpt-4o": {
|
||||
"model": "gpt-4o",
|
||||
"healthy": true,
|
||||
"error": null,
|
||||
"response_time_ms": 245.67,
|
||||
"mode": "chat",
|
||||
"response_text": "This is a test"
|
||||
},
|
||||
"text-embedding-3-small": {
|
||||
"model": "text-embedding-3-small",
|
||||
"healthy": true,
|
||||
"error": null,
|
||||
"response_time_ms": 123.45,
|
||||
"mode": "embedding",
|
||||
"dimensions": 1536
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Model Discovery**:
|
||||
- If `LITELLM_MODELS_YAML` is set: Reads models from YAML config file
|
||||
- Otherwise: Queries `/v1/models` (OpenAI-compatible) or `/model/info` to get all configured models
|
||||
2. **Mode Detection**:
|
||||
- Checks `mode` field from YAML config, or falls back to model name patterns (embedding, embed, text-embedding)
|
||||
3. **Concurrent Testing**:
|
||||
- Chat models: `POST /v1/chat/completions` with configurable prompt (default: "Say this is a test")
|
||||
- Embedding models: `POST /v1/embeddings` with configurable text (default: "This is a test for vectorization.")
|
||||
4. **Reporting**: Health status, errors, response times, and response details are reported
|
||||
|
||||
## Use Cases
|
||||
|
||||
### 1. Regular Health Monitoring
|
||||
|
||||
Run as a cron job or scheduled task:
|
||||
|
||||
```bash
|
||||
# Cron job: Run every 5 minutes
|
||||
*/5 * * * * /path/to/health_check.sh
|
||||
```
|
||||
|
||||
### 2. Load/Stress Testing
|
||||
|
||||
Run multiple health checks in parallel:
|
||||
|
||||
**PowerShell:**
|
||||
```powershell
|
||||
.\scripts\health_check\run_parallel_health_checks.ps1 16
|
||||
```
|
||||
|
||||
### 3. CI/CD Integration
|
||||
|
||||
Add to your deployment pipeline:
|
||||
|
||||
```yaml
|
||||
# GitHub Actions example
|
||||
- name: Health Check
|
||||
run: |
|
||||
docker run --rm \
|
||||
-e LITELLM_BASE_URL="${{ secrets.LITELLM_BASE_URL }}" \
|
||||
-e LITELLM_API_KEY="${{ secrets.LITELLM_API_KEY }}" \
|
||||
litellm/litellm-health-check:latest
|
||||
```
|
||||
|
||||
### 4. Kubernetes Deployment
|
||||
|
||||
Deploy as a CronJob:
|
||||
|
||||
```yaml
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: litellm-health-check
|
||||
spec:
|
||||
schedule: "*/5 * * * *" # Every 5 minutes
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: health-check
|
||||
image: litellm/litellm-health-check:latest
|
||||
env:
|
||||
- name: LITELLM_BASE_URL
|
||||
value: "https://litellm.example.com"
|
||||
- name: LITELLM_API_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: litellm-secrets
|
||||
key: api-key
|
||||
restartPolicy: OnFailure
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### No Models Found
|
||||
|
||||
- Verify `LITELLM_BASE_URL` is correct
|
||||
- Check that the API key has permissions to list models
|
||||
- Ensure the proxy is running and accessible
|
||||
- If using YAML, verify `LITELLM_MODELS_YAML` path is correct
|
||||
|
||||
### Timeout Errors
|
||||
|
||||
- Increase `LITELLM_TIMEOUT` for slower models (default is 120s)
|
||||
- Check network connectivity to the proxy
|
||||
- Verify proxy isn't overloaded
|
||||
|
||||
### Authentication Errors
|
||||
|
||||
- Verify `LITELLM_API_KEY` is correct
|
||||
- Check API key has not expired
|
||||
- Ensure the key has necessary permissions
|
||||
|
||||
## Dependencies
|
||||
|
||||
- Python 3.11+
|
||||
- httpx (for async HTTP requests)
|
||||
- pyyaml (for YAML config file support)
|
||||
- Docker or Podman (for containerized execution)
|
||||
- PowerShell 7+ (the Windows parallel execution script uses `ForEach-Object -Parallel`)
- Bash (for the parallel execution script on Linux/macOS)
|
||||
|
||||
## License
|
||||
|
||||
Same as LiteLLM project.
|
||||
2
scripts/health_check/health_check_requirements.txt
Normal file
2
scripts/health_check/health_check_requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
httpx>=0.24.0
|
||||
pyyaml>=6.0
|
||||
69
scripts/health_check/run_parallel_health_checks.ps1
Normal file
69
scripts/health_check/run_parallel_health_checks.ps1
Normal file
@ -0,0 +1,69 @@
|
||||
# Parallel LiteLLM Health Check Runner (PowerShell version)
#
# This script runs multiple health check containers in parallel.
# It requires PowerShell 7+ because it uses ForEach-Object -Parallel.
#
# Usage:
#   $env:LITELLM_BASE_URL="https://litellm.example.com"
#   $env:LITELLM_API_KEY="your-api-key"
#   .\run_parallel_health_checks.ps1 [num_parallel_jobs] [image_name] [container_runtime]
#
# Defaults:
#   - num_parallel_jobs: 16
#   - image_name: litellm/litellm-health-check:latest
#   - container_runtime: docker

#Requires -Version 7.0

param(
    # Must be at least 1: "1..0" would expand to (1, 0) and still start
    # containers, and -ThrottleLimit rejects 0.
    [ValidateRange(1, [int]::MaxValue)]
    [int]$NumParallelJobs = 16,
    [string]$ImageName = "litellm/litellm-health-check:latest",
    [string]$ContainerRuntime = "docker"
)

# Set defaults for environment variables if not provided
if (-not $env:LITELLM_BASE_URL) {
    $env:LITELLM_BASE_URL = "https://litellm-perf-cache-and-router.onrender.com"
    Write-Warning "LITELLM_BASE_URL not set, using default: $env:LITELLM_BASE_URL"
}

if (-not $env:LITELLM_API_KEY) {
    $env:LITELLM_API_KEY = "sk-1234"
    Write-Warning "LITELLM_API_KEY not set, using default: $env:LITELLM_API_KEY"
}

# Check if container runtime is available
$runtimeExists = Get-Command $ContainerRuntime -ErrorAction SilentlyContinue
if (-not $runtimeExists) {
    Write-Error "Error: $ContainerRuntime is not installed"
    exit 1
}

Write-Host "Running $NumParallelJobs parallel health check containers..." -ForegroundColor Yellow
Write-Host "Using image: $ImageName" -ForegroundColor Yellow
Write-Host "Container runtime: $ContainerRuntime" -ForegroundColor Yellow
Write-Host "LiteLLM Base URL: $env:LITELLM_BASE_URL" -ForegroundColor Cyan
Write-Host ""
Write-Host "NOTE: This will run continuously. Press Ctrl+C to stop." -ForegroundColor Red
Write-Host ""
Write-Host "Troubleshooting:" -ForegroundColor Yellow
Write-Host "  - If you see 'All connection attempts failed', check:" -ForegroundColor Yellow
Write-Host "    1. Is the LiteLLM proxy running on the expected port?" -ForegroundColor Yellow
Write-Host "    2. Set LITELLM_BASE_URL to the correct URL (e.g., http://host.docker.internal:PORT)" -ForegroundColor Yellow
Write-Host "    3. On Linux, you may need to use the host IP instead of host.docker.internal" -ForegroundColor Yellow
Write-Host ""

# Run parallel health checks
# This creates an infinite loop that keeps spawning containers
# Each container tests all models, then exits, and a new one starts
while ($true) {
    # Start up to NumParallelJobs containers in parallel
    1..$NumParallelJobs | ForEach-Object -Parallel {
        # Script-scope variables must be imported with $using: inside a
        # parallel script block; environment variables are process-wide.
        $runtime = $using:ContainerRuntime
        $imageName = $using:ImageName
        $baseUrl = $env:LITELLM_BASE_URL
        $apiKey = $env:LITELLM_API_KEY

        & $runtime run --rm `
            -e "LITELLM_BASE_URL=$baseUrl" `
            -e "LITELLM_API_KEY=$apiKey" `
            -e "LITELLM_JSON_OUTPUT=true" `
            $imageName
    } -ThrottleLimit $NumParallelJobs
}
|
||||
79
scripts/health_check/run_parallel_health_checks.sh
Normal file
79
scripts/health_check/run_parallel_health_checks.sh
Normal file
@ -0,0 +1,79 @@
|
||||
#!/bin/bash
# Parallel LiteLLM Health Check Runner (Bash version)
#
# This script runs multiple health check containers in parallel.
#
# Usage:
#   export LITELLM_BASE_URL="https://litellm.example.com"
#   export LITELLM_API_KEY="your-api-key"
#   ./run_parallel_health_checks.sh [num_parallel_jobs] [image_name] [container_runtime]
#
# Defaults:
#   - num_parallel_jobs: 16
#   - image_name: litellm/litellm-health-check:latest
#   - container_runtime: docker

set -e

# Default values
NUM_PARALLEL_JOBS="${1:-16}"
IMAGE_NAME="${2:-litellm/litellm-health-check:latest}"
CONTAINER_RUNTIME="${3:-docker}"

# Reject anything that is not a positive integer: with 0 the loop below
# would spin forever without starting a container, and a non-numeric value
# would break the arithmetic for-loop.
if ! [[ "$NUM_PARALLEL_JOBS" =~ ^[1-9][0-9]*$ ]]; then
    echo "Error: num_parallel_jobs must be a positive integer, got '$NUM_PARALLEL_JOBS'" >&2
    exit 1
fi

# Set defaults for environment variables if not provided
if [ -z "$LITELLM_BASE_URL" ]; then
    export LITELLM_BASE_URL="https://litellm-perf-cache-and-router.onrender.com"
    echo "Warning: LITELLM_BASE_URL not set, using default: $LITELLM_BASE_URL" >&2
fi

if [ -z "$LITELLM_API_KEY" ]; then
    export LITELLM_API_KEY="sk-1234"
    echo "Warning: LITELLM_API_KEY not set, using default: $LITELLM_API_KEY" >&2
fi

# Check if container runtime is available
if ! command -v "$CONTAINER_RUNTIME" &> /dev/null; then
    echo "Error: $CONTAINER_RUNTIME is not installed" >&2
    exit 1
fi

# Print configuration
echo "Running $NUM_PARALLEL_JOBS parallel health check containers..."
echo "Using image: $IMAGE_NAME"
echo "Container runtime: $CONTAINER_RUNTIME"
echo "LiteLLM Base URL: $LITELLM_BASE_URL"
echo ""
echo "NOTE: This will run continuously. Press Ctrl+C to stop."
echo ""
echo "Troubleshooting:"
echo "  - If you see 'All connection attempts failed', check:"
echo "    1. Is the LiteLLM proxy running on the expected port?"
echo "    2. Set LITELLM_BASE_URL to the correct URL (e.g., http://host.docker.internal:PORT)"
echo "    3. On Linux, you may need to use the host IP instead of host.docker.internal"
echo ""

# Function to run a single health check container
run_health_check() {
    "$CONTAINER_RUNTIME" run --rm \
        -e LITELLM_BASE_URL="$LITELLM_BASE_URL" \
        -e LITELLM_API_KEY="$LITELLM_API_KEY" \
        -e LITELLM_JSON_OUTPUT="true" \
        "$IMAGE_NAME"
}

# Run parallel health checks
# This creates an infinite loop that keeps spawning containers
# Each container tests all models, then exits, and a new one starts
while true; do
    # Start containers in parallel using background jobs
    pids=()
    for ((i=1; i<=NUM_PARALLEL_JOBS; i++)); do
        run_health_check &
        pids+=("$!")
    done

    # Wait for all background jobs to complete. A failing container must not
    # abort the runner under "set -e", hence the "|| true".
    for pid in "${pids[@]}"; do
        wait "$pid" 2>/dev/null || true
    done
done
|
||||
Loading…
Reference in New Issue
Block a user