feat(proxy): cleanup spend logs cron verification, fix, and docs (#19085)

This commit is contained in:
Harshit Jain 2026-01-14 22:14:48 +05:30 committed by GitHub
parent e7cc53f217
commit e8c4cad885
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 199 additions and 36 deletions

View File

@ -30,6 +30,9 @@ general_settings:
# Optional: set how frequently cleanup should run - default is daily
maximum_spend_logs_retention_interval: "1d" # Run cleanup daily
# Optional: set exact time for cleanup (Cron syntax)
maximum_spend_logs_cleanup_cron: "0 4 * * *" # Run at 04:00 AM daily
litellm_settings:
cache: true
cache_params:
@ -51,6 +54,15 @@ How long logs should be kept before deletion. Supported formats:
How often the cleanup job should run. Uses the same format as above. If not set, cleanup runs every 24 hours. In either case the job is only scheduled when `maximum_spend_logs_retention_period` is set.
#### `maximum_spend_logs_cleanup_cron` (optional)
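Schedule the cleanup using standard five-field cron syntax. When set, this takes precedence over `maximum_spend_logs_retention_interval`, which is then ignored. Like the interval, it only takes effect when `maximum_spend_logs_retention_period` is set. If the expression is invalid, an error is logged and no cleanup job is scheduled.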
Examples:
- `"0 4 * * *"`: run at 04:00 every day
- `"0 0 * * 0"`: run at midnight every Sunday
- `"*/30 * * * *"`: run every 30 minutes
## How it works
### Step 1. Lock Acquisition (Optional with Redis)

View File

@ -3240,20 +3240,22 @@ class ProxyConfig:
) -> Optional[dict]:
"""
Get router_settings in priority order: Key > Team > Global
Returns:
dict: Combined router_settings, or None if no settings found
"""
if prisma_client is None:
return None
import json
import yaml
# 1. Try key-level router_settings
if user_api_key_dict is not None:
# Check if router_settings is available on the key object
key_router_settings_value = getattr(user_api_key_dict, "router_settings", None)
key_router_settings_value = getattr(
user_api_key_dict, "router_settings", None
)
if key_router_settings_value is not None:
key_router_settings = None
if isinstance(key_router_settings_value, str):
@ -3266,11 +3268,15 @@ class ProxyConfig:
pass
elif isinstance(key_router_settings_value, dict):
key_router_settings = key_router_settings_value
# If key has router_settings (non-empty dict), use it
if key_router_settings is not None and isinstance(key_router_settings, dict) and key_router_settings:
if (
key_router_settings is not None
and isinstance(key_router_settings, dict)
and key_router_settings
):
return key_router_settings
# 2. Try team-level router_settings
if user_api_key_dict is not None and user_api_key_dict.team_id is not None:
try:
@ -3278,37 +3284,51 @@ class ProxyConfig:
where={"team_id": user_api_key_dict.team_id}
)
if team_obj is not None:
team_router_settings_value = getattr(team_obj, "router_settings", None)
team_router_settings_value = getattr(
team_obj, "router_settings", None
)
if team_router_settings_value is not None:
team_router_settings = None
if isinstance(team_router_settings_value, str):
try:
team_router_settings = yaml.safe_load(team_router_settings_value)
team_router_settings = yaml.safe_load(
team_router_settings_value
)
except (yaml.YAMLError, json.JSONDecodeError):
try:
team_router_settings = json.loads(team_router_settings_value)
team_router_settings = json.loads(
team_router_settings_value
)
except json.JSONDecodeError:
pass
elif isinstance(team_router_settings_value, dict):
team_router_settings = team_router_settings_value
# If team has router_settings (non-empty dict), use it
if team_router_settings is not None and isinstance(team_router_settings, dict) and team_router_settings:
if (
team_router_settings is not None
and isinstance(team_router_settings, dict)
and team_router_settings
):
return team_router_settings
except Exception:
# If team lookup fails, continue to global settings
pass
# 3. Try global router_settings
try:
db_router_settings = await prisma_client.db.litellm_config.find_first(
where={"param_name": "router_settings"}
)
if db_router_settings is not None and isinstance(db_router_settings.param_value, dict) and db_router_settings.param_value:
if (
db_router_settings is not None
and isinstance(db_router_settings.param_value, dict)
and db_router_settings.param_value
):
return db_router_settings.param_value
except Exception:
pass
return None
async def _add_router_settings_from_db_config(
@ -4675,27 +4695,48 @@ class ProxyStartupEvent:
### SPEND LOG CLEANUP ###
if general_settings.get("maximum_spend_logs_retention_period") is not None:
spend_log_cleanup = SpendLogCleanup()
# Get the interval from config or default to 1 day
retention_interval = general_settings.get(
"maximum_spend_logs_retention_interval", "1d"
)
try:
interval_seconds = duration_in_seconds(retention_interval)
scheduler.add_job(
spend_log_cleanup.cleanup_old_spend_logs,
"interval",
seconds=interval_seconds
+ random.randint(0, 60), # Add small random offset
# REMOVED jitter parameter - major cause of memory leak
args=[prisma_client],
id="spend_log_cleanup_job",
replace_existing=True,
misfire_grace_time=APSCHEDULER_MISFIRE_GRACE_TIME,
)
except ValueError:
verbose_proxy_logger.error(
"Invalid maximum_spend_logs_retention_interval value"
cleanup_cron = general_settings.get("maximum_spend_logs_cleanup_cron")
if cleanup_cron:
from apscheduler.triggers.cron import CronTrigger
try:
cron_trigger = CronTrigger.from_crontab(cleanup_cron)
scheduler.add_job(
spend_log_cleanup.cleanup_old_spend_logs,
cron_trigger,
args=[prisma_client],
id="spend_log_cleanup_job",
replace_existing=True,
misfire_grace_time=APSCHEDULER_MISFIRE_GRACE_TIME,
)
verbose_proxy_logger.info(
f"Spend log cleanup scheduled with cron: {cleanup_cron}"
)
except ValueError:
verbose_proxy_logger.error(
f"Invalid maximum_spend_logs_cleanup_cron value: {cleanup_cron}"
)
else:
# Interval-based scheduling (existing behavior)
retention_interval = general_settings.get(
"maximum_spend_logs_retention_interval", "1d"
)
try:
interval_seconds = duration_in_seconds(retention_interval)
scheduler.add_job(
spend_log_cleanup.cleanup_old_spend_logs,
"interval",
seconds=interval_seconds + random.randint(0, 60),
args=[prisma_client],
id="spend_log_cleanup_job",
replace_existing=True,
misfire_grace_time=APSCHEDULER_MISFIRE_GRACE_TIME,
)
except ValueError:
verbose_proxy_logger.error(
"Invalid maximum_spend_logs_retention_interval value"
)
### CHECK BATCH COST ###
if llm_router is not None:
try:
@ -9885,7 +9926,9 @@ async def get_config(): # noqa: PLR0915
_success_callbacks = normalize_callback(_success_callbacks)
_failure_callbacks = normalize_callback(_failure_callbacks)
_success_and_failure_callbacks = normalize_callback(_success_and_failure_callbacks)
_success_and_failure_callbacks = normalize_callback(
_success_and_failure_callbacks
)
_data_to_return = []
"""

View File

@ -10,6 +10,114 @@ import pytest
from litellm.proxy.db.db_transaction_queue.spend_log_cleanup import SpendLogCleanup
def test_spend_log_cleanup_cron_scheduling():
    """Check that crontab strings for spend log cleanup parse (or fail) as expected.

    Well-formed five-field expressions must produce a ``CronTrigger``;
    malformed or out-of-range expressions must raise ``ValueError``.
    """
    from apscheduler.triggers.cron import CronTrigger

    # Expressions that must be accepted.
    accepted = (
        "0 4 * * *",  # 4:00 AM daily
        "*/1 * * * *",  # every minute (useful for testing)
        "0 3 * * 0",  # 3 AM every Sunday
    )
    for expression in accepted:
        parsed = CronTrigger.from_crontab(expression)
        assert parsed is not None

    # Expressions that must be rejected with ValueError.
    rejected = (
        "invalid cron",  # wrong number of fields
        "60 25 * * *",  # invalid minute and hour
    )
    for expression in rejected:
        with pytest.raises(ValueError):
            CronTrigger.from_crontab(expression)
def test_spend_log_cleanup_cron_scheduler_integration():
    """
    Check the shape of the ``add_job`` call for cron-based and interval-based
    spend log cleanup, using a mocked scheduler.

    The scheduler, Prisma client and cleanup object are all ``MagicMock``
    instances, so no real database connection is required.

    NOTE(review): the ``add_job`` calls are replayed by this test itself to
    mirror the proxy startup scheduling logic; the startup code is not
    invoked here, so this guards the expected call shape only.
    """
    from unittest.mock import MagicMock

    from apscheduler.triggers.cron import CronTrigger

    from litellm.litellm_core_utils.duration_parser import duration_in_seconds

    scheduler = MagicMock()
    prisma = MagicMock()
    cleanup = MagicMock()

    # ---- Case 1: a cron expression is configured ----
    settings_with_cron = {
        "maximum_spend_logs_retention_period": "7d",
        "maximum_spend_logs_cleanup_cron": "0 4 * * *",  # 4 AM daily
    }
    cron_expression = settings_with_cron.get("maximum_spend_logs_cleanup_cron")
    assert cron_expression is not None

    # Replay the cron branch of the scheduling logic.
    scheduler.add_job(
        cleanup.cleanup_old_spend_logs,
        CronTrigger.from_crontab(cron_expression),
        args=[prisma],
        id="spend_log_cleanup_job",
        replace_existing=True,
        misfire_grace_time=3600,
    )

    scheduler.add_job.assert_called_once()
    cron_positional, cron_keywords = scheduler.add_job.call_args

    # Second positional argument is the trigger.
    assert isinstance(cron_positional[1], CronTrigger)
    assert cron_keywords["id"] == "spend_log_cleanup_job"
    assert cron_keywords["replace_existing"] is True

    # ---- Case 2: no cron configured, fall back to interval ----
    scheduler.reset_mock()

    settings_without_cron = {
        "maximum_spend_logs_retention_period": "7d",
        # No cron key, so the interval path applies.
    }
    missing_cron = settings_without_cron.get("maximum_spend_logs_cleanup_cron")
    assert missing_cron is None  # No cron configured

    # Interval defaults to one day when not set explicitly.
    interval_setting = settings_without_cron.get(
        "maximum_spend_logs_retention_interval", "1d"
    )
    seconds_between_runs = duration_in_seconds(interval_setting)

    scheduler.add_job(
        cleanup.cleanup_old_spend_logs,
        "interval",
        seconds=seconds_between_runs,
        args=[prisma],
        id="spend_log_cleanup_job",
        replace_existing=True,
    )

    scheduler.add_job.assert_called_once()
    interval_positional, interval_keywords = scheduler.add_job.call_args
    assert interval_positional[1] == "interval"
    assert interval_keywords["seconds"] == 86400  # 1 day in seconds
@pytest.mark.asyncio
async def test_should_delete_spend_logs():
# Test case 1: No retention set