High-volume deployments see LiteLLM_SpendLogs grow unbounded because retention via DELETE leaves dead tuples that autovacuum cannot reclaim fast enough. With a range-partitioned table, retention drops whole partitions instead: an instant metadata operation that returns disk to the OS immediately. The feature is gated behind general_settings.use_spend_logs_partitioning (default false). With the flag off, the cleanup job never queries the catalog and behaves exactly as today. With it on, the job verifies the table is partitioned, pre-creates upcoming partitions, and drops expired ones; expired rows the drops cannot reach (DEFAULT partition, partitions spanning the cutoff) are still deleted row-wise so retention is never bypassed. If the table is not partitioned it falls back to batched DELETE only. Converting an existing table is a manual, documented operation in db_scripts/partition_spend_logs.sql; db_scripts/unpartition_spend_logs.sql rolls it back. Both scripts rename the old table's indexes aside before recreating them, since a table rename keeps the schema-unique index names and would otherwise silently skip the CREATE INDEX IF NOT EXISTS block. Granularity and pre-create lookahead are tunable via SPEND_LOG_PARTITION_INTERVAL (day/week/month, invalid values fall back to day) and SPEND_LOG_PARTITION_PRECREATE_AHEAD.
631 lines
23 KiB
Python
631 lines
23 KiB
Python
"""
|
|
Test cases for spend log cleanup functionality
|
|
"""
|
|
|
|
from datetime import datetime, timedelta, timezone
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
|
|
import pytest
|
|
|
|
from litellm.proxy.db.db_transaction_queue.spend_log_cleanup import SpendLogCleanup
|
|
|
|
|
|
def test_spend_log_cleanup_cron_scheduling():
    """Check that CronTrigger accepts well-formed crontab strings and rejects malformed ones."""
    from apscheduler.triggers.cron import CronTrigger

    # Well-formed expressions: each must produce a trigger object.
    accepted_expressions = (
        "0 4 * * *",  # 4:00 AM daily
        "*/1 * * * *",  # every minute (useful for testing)
        "0 3 * * 0",  # 3 AM every Sunday
    )
    for expression in accepted_expressions:
        assert CronTrigger.from_crontab(expression) is not None

    # Malformed expressions: each must be rejected with ValueError.
    rejected_expressions = (
        "invalid cron",
        "60 25 * * *",  # invalid minute and hour
    )
    for expression in rejected_expressions:
        with pytest.raises(ValueError):
            CronTrigger.from_crontab(expression)
|
|
|
|
|
|
def test_spend_log_cleanup_cron_scheduler_integration():
    """
    Replay the spend-log cleanup job registration against a mock scheduler.

    Two registrations are exercised: a cron trigger built from
    maximum_spend_logs_cleanup_cron, and the interval fallback used when no
    cron expression is configured.

    NOTE: the add_job calls are issued by this test itself, mirroring the
    scheduler logic in proxy_server.py; no real database connection is needed.
    """
    from apscheduler.triggers.cron import CronTrigger

    scheduler = MagicMock()
    prisma_client = MagicMock()
    cleanup_instance = MagicMock()

    # --- Case 1: cron-based scheduling ------------------------------------
    cron_settings = {
        "maximum_spend_logs_retention_period": "7d",
        "maximum_spend_logs_cleanup_cron": "0 4 * * *",  # 4 AM daily
    }
    cron_expression = cron_settings.get("maximum_spend_logs_cleanup_cron")
    assert cron_expression is not None

    # Register the job the same way the proxy scheduler logic does.
    scheduler.add_job(
        cleanup_instance.cleanup_old_spend_logs,
        CronTrigger.from_crontab(cron_expression),
        args=[prisma_client],
        id="spend_log_cleanup_job",
        replace_existing=True,
        misfire_grace_time=3600,
    )

    scheduler.add_job.assert_called_once()
    positional, keyword = scheduler.add_job.call_args
    # The trigger is the second positional argument.
    assert isinstance(positional[1], CronTrigger)
    assert keyword["id"] == "spend_log_cleanup_job"
    assert keyword["replace_existing"] is True

    # --- Case 2: interval-based scheduling (fallback) ---------------------
    scheduler.reset_mock()
    interval_settings = {
        "maximum_spend_logs_retention_period": "7d",
        # No cron key, so scheduling should fall back to an interval
    }
    assert interval_settings.get("maximum_spend_logs_cleanup_cron") is None

    interval_spec = interval_settings.get(
        "maximum_spend_logs_retention_interval", "1d"
    )
    from litellm.litellm_core_utils.duration_parser import duration_in_seconds

    scheduler.add_job(
        cleanup_instance.cleanup_old_spend_logs,
        "interval",
        seconds=duration_in_seconds(interval_spec),
        args=[prisma_client],
        id="spend_log_cleanup_job",
        replace_existing=True,
    )

    scheduler.add_job.assert_called_once()
    positional, keyword = scheduler.add_job.call_args
    assert positional[1] == "interval"
    assert keyword["seconds"] == 86400  # 1 day in seconds
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_should_delete_spend_logs():
    """Check _should_delete_spend_logs for missing, valid and invalid retention settings."""
    # Case 1: no retention configured at all.
    unconfigured = SpendLogCleanup(general_settings={})
    assert unconfigured._should_delete_spend_logs() is False

    # Cases 2-5: (retention setting, expected decision)
    expectations = (
        ("3600s", True),  # valid seconds string
        ("30d", True),  # valid days string
        ("24h", True),  # valid hours string
        ("invalid", False),  # invalid format
    )
    for retention, expected in expectations:
        cleaner = SpendLogCleanup(
            general_settings={"maximum_spend_logs_retention_period": retention}
        )
        assert cleaner._should_delete_spend_logs() is expected
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_old_spend_logs_batch_deletion():
    """Expect three raw DELETE batches when execute_raw reports 1000, 500, then 0 rows."""
    # Prisma client whose execute_raw yields the per-batch deleted counts.
    db = MagicMock()
    db.execute_raw = AsyncMock(side_effect=[1000, 500, 0])
    prisma_client = MagicMock()
    prisma_client.db = db

    # Pod lock manager with a mock Redis cache; lock acquisition succeeds.
    lock_manager = MagicMock()
    lock_manager.redis_cache = MagicMock()
    lock_manager.acquire_lock = AsyncMock(return_value=True)
    lock_manager.release_lock = AsyncMock()

    cleaner = SpendLogCleanup(
        general_settings={"maximum_spend_logs_retention_period": "7d"}
    )
    cleaner.pod_lock_manager = lock_manager
    assert cleaner._should_delete_spend_logs() is True

    await cleaner.cleanup_old_spend_logs(prisma_client)

    # Batching and deletion go through raw SQL.
    assert db.execute_raw.call_count == 3

    first_sql = db.execute_raw.call_args_list[0][0][0]
    assert 'DELETE FROM "LiteLLM_SpendLogs"' in first_sql
    # The delete must match on the full composite identity: on a partitioned
    # table request_id alone is not unique, and deleting by it would let a
    # client reusing x-litellm-call-id take out a fresh row alongside the
    # expired one.
    assert 'WHERE ("request_id", "startTime") IN' in first_sql
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_old_spend_logs_retention_period_cutoff():
    """
    A "24h" retention must yield a cutoff of roughly now minus 86400 seconds.
    """
    db = MagicMock()
    db.execute_raw = AsyncMock(return_value=0)
    prisma_client = MagicMock()
    prisma_client.db = db

    # Pod lock manager with a mock Redis cache; lock acquisition succeeds.
    lock_manager = MagicMock()
    lock_manager.redis_cache = MagicMock()
    lock_manager.acquire_lock = AsyncMock(return_value=True)
    lock_manager.release_lock = AsyncMock()

    cleaner = SpendLogCleanup(
        general_settings={"maximum_spend_logs_retention_period": "24h"}
    )
    cleaner.pod_lock_manager = lock_manager
    assert cleaner._should_delete_spend_logs() is True

    await cleaner.cleanup_old_spend_logs(prisma_client)

    # The cutoff is the second positional argument of the execute_raw call.
    actual_cutoff = db.execute_raw.call_args[0][1]
    expected_cutoff = datetime.now(timezone.utc) - timedelta(seconds=86400)
    drift_seconds = abs((actual_cutoff - expected_cutoff).total_seconds())
    # Allow 1 second difference for test execution time
    assert drift_seconds < 1
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_drops_partitions_when_enabled_and_partitioned():
    """
    Flag on + partitioned table: cleanup must pre-create and drop partitions,
    and must still issue the row-wise DELETE for expired rows the drops cannot
    reach (DEFAULT partition, cutoff-spanning partitions), so retention is
    never bypassed.
    """
    execute_raw = AsyncMock(return_value=0)
    prisma_client = MagicMock()
    prisma_client.db.execute_raw = execute_raw

    manager = MagicMock()
    manager.is_partitioned = AsyncMock(return_value=True)
    manager.ensure_partitions = AsyncMock(return_value=["p1"])
    manager.drop_partitions_older_than = AsyncMock(
        return_value=["LiteLLM_SpendLogs_p20260601"]
    )

    settings = {
        "maximum_spend_logs_retention_period": "7d",
        "use_spend_logs_partitioning": True,
    }
    cleaner = SpendLogCleanup(general_settings=settings, partition_manager=manager)
    # Lock manager without a Redis cache.
    cleaner.pod_lock_manager = MagicMock()
    cleaner.pod_lock_manager.redis_cache = None

    await cleaner.cleanup_old_spend_logs(prisma_client)

    manager.ensure_partitions.assert_awaited_once()
    manager.drop_partitions_older_than.assert_awaited_once()
    first_sql = execute_raw.call_args_list[0][0][0]
    assert 'DELETE FROM "LiteLLM_SpendLogs"' in first_sql
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_uses_delete_when_partitioning_not_enabled():
    """
    Even against a partitioned table, the partition path must stay off until
    use_spend_logs_partitioning is explicitly enabled, so existing deployments
    see zero behavior change. The catalog must not even be queried: none of
    is_partitioned, ensure_partitions or drop_partitions_older_than may be
    awaited, and the batched DELETE must still run.
    """
    from unittest.mock import AsyncMock, MagicMock

    mock_prisma_client = MagicMock()
    # One non-empty batch, then an empty batch that ends the delete loop.
    mock_prisma_client.db.execute_raw = AsyncMock(side_effect=[10, 0])

    # The manager reports "partitioned" so that only the feature flag can be
    # what keeps the partition path disabled.
    partition_manager = MagicMock()
    partition_manager.is_partitioned = AsyncMock(return_value=True)
    partition_manager.ensure_partitions = AsyncMock()
    partition_manager.drop_partitions_older_than = AsyncMock()

    cleaner = SpendLogCleanup(
        general_settings={"maximum_spend_logs_retention_period": "7d"},
        partition_manager=partition_manager,
    )
    cleaner.pod_lock_manager = MagicMock()
    cleaner.pod_lock_manager.redis_cache = None

    await cleaner.cleanup_old_spend_logs(mock_prisma_client)

    partition_manager.is_partitioned.assert_not_awaited()
    # Pre-creating partitions is part of the gated path too; without this
    # check a regression that creates partitions with the flag off would
    # go unnoticed.
    partition_manager.ensure_partitions.assert_not_awaited()
    partition_manager.drop_partitions_older_than.assert_not_awaited()
    delete_sql = mock_prisma_client.db.execute_raw.call_args_list[0][0][0]
    assert 'DELETE FROM "LiteLLM_SpendLogs"' in delete_sql
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_uses_delete_when_not_partitioned():
    """
    Flag on, but is_partitioned reports False (conversion script not run yet):
    cleanup must keep using the batched DELETE path and drop nothing.
    """
    # One non-empty batch followed by an empty one.
    execute_raw = AsyncMock(side_effect=[10, 0])
    prisma_client = MagicMock()
    prisma_client.db.execute_raw = execute_raw

    manager = MagicMock()
    manager.is_partitioned = AsyncMock(return_value=False)
    manager.drop_partitions_older_than = AsyncMock()

    settings = {
        "maximum_spend_logs_retention_period": "7d",
        "use_spend_logs_partitioning": True,
    }
    cleaner = SpendLogCleanup(general_settings=settings, partition_manager=manager)
    # Lock manager without a Redis cache.
    cleaner.pod_lock_manager = MagicMock()
    cleaner.pod_lock_manager.redis_cache = None

    await cleaner.cleanup_old_spend_logs(prisma_client)

    manager.drop_partitions_older_than.assert_not_awaited()
    assert execute_raw.await_count == 2
    first_sql = execute_raw.call_args_list[0][0][0]
    assert 'DELETE FROM "LiteLLM_SpendLogs"' in first_sql
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_old_spend_logs_no_retention_period():
    """
    With no retention period configured, cleanup must not call execute_raw.
    """
    execute_raw = AsyncMock()
    prisma_client = MagicMock()
    prisma_client.db.execute_raw = execute_raw

    # Empty settings: no retention period.
    cleaner = SpendLogCleanup(general_settings={})
    await cleaner.cleanup_old_spend_logs(prisma_client)

    execute_raw.assert_not_called()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_lock_not_released_when_not_acquired():
    """
    When there is no retention setting, cleanup must neither acquire nor
    release the pod lock.
    """
    prisma_client = MagicMock()
    prisma_client.db.execute_raw = AsyncMock()

    acquire_lock = AsyncMock(return_value=True)
    release_lock = AsyncMock()
    lock_manager = MagicMock()
    lock_manager.redis_cache = MagicMock()
    lock_manager.acquire_lock = acquire_lock
    lock_manager.release_lock = release_lock

    # No retention setting, so the run is expected to stop before locking.
    cleaner = SpendLogCleanup(general_settings={})
    cleaner.pod_lock_manager = lock_manager

    await cleaner.cleanup_old_spend_logs(prisma_client)

    acquire_lock.assert_not_called()
    release_lock.assert_not_called()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integer_retention_treated_as_days():
    """
    An integer maximum_spend_logs_retention_period is interpreted as a number
    of days (e.g., 3 -> '3d' -> 259200 seconds).
    """
    seconds_per_day = 86400
    settings = {"maximum_spend_logs_retention_period": 3}
    cleaner = SpendLogCleanup(general_settings=settings)

    assert cleaner._should_delete_spend_logs() is True
    assert cleaner.retention_seconds == 3 * seconds_per_day  # 3 days in seconds
|
|
|
|
|
|
def test_string_retention_still_works():
    """
    Duration strings such as '3d', '24h', '3600s' and '2w' must keep parsing
    to the expected number of seconds.
    """
    # retention setting -> expected retention_seconds
    expected_by_setting = {
        "3d": 3 * 86400,
        "24h": 24 * 3600,
        "3600s": 3600,
        "2w": 2 * 604800,
    }
    for setting, expected_seconds in expected_by_setting.items():
        cleaner = SpendLogCleanup(
            general_settings={"maximum_spend_logs_retention_period": setting}
        )
        should_delete = cleaner._should_delete_spend_logs()
        assert should_delete is True, f"Failed for {setting}"
        assert (
            cleaner.retention_seconds == expected_seconds
        ), f"Expected {expected_seconds} for {setting}, got {cleaner.retention_seconds}"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_delete_old_logs_aborts_on_non_int_execute_raw_return():
    """A non-int execute_raw result (None here) must end the deletion loop
    after a single call with nothing counted, preventing an infinite loop."""
    db = MagicMock()
    db.execute_raw = AsyncMock(return_value=None)
    prisma_client = MagicMock()
    prisma_client.db = db

    settings = {"maximum_spend_logs_retention_period": "7d"}
    cleaner = SpendLogCleanup(general_settings=settings)

    seven_days_ago = datetime.now(timezone.utc) - timedelta(days=7)
    deleted = await cleaner._delete_old_logs(prisma_client, seven_days_ago)

    assert db.execute_raw.call_count == 1
    assert deleted == 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_delete_old_logs_continues_on_valid_int_return():
    """Integer batch counts keep the deletion loop running until a batch returns 0."""
    batch_counts = [500, 300, 0]
    db = MagicMock()
    db.execute_raw = AsyncMock(side_effect=batch_counts)
    prisma_client = MagicMock()
    prisma_client.db = db

    settings = {"maximum_spend_logs_retention_period": "7d"}
    cleaner = SpendLogCleanup(general_settings=settings)

    seven_days_ago = datetime.now(timezone.utc) - timedelta(days=7)
    deleted = await cleaner._delete_old_logs(prisma_client, seven_days_ago)

    # Every batch was attempted and the counts were summed (500 + 300).
    assert db.execute_raw.call_count == 3
    assert deleted == 800
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_delete_old_logs_continues_after_single_batch_failure(monkeypatch):
    """One failing batch (e.g. a DB timeout) must not abort the run: later
    batches still execute and their counts still accumulate."""
    import litellm.proxy.db.db_transaction_queue.spend_log_cleanup as cleanup_module

    # Remove the failure backoff so the test spends no real time sleeping.
    monkeypatch.setattr(
        cleanup_module, "SPEND_LOG_CLEANUP_BATCH_FAILURE_BACKOFF_SECONDS", 0.0
    )

    # Batch 1 succeeds, batch 2 raises (one-off DB timeout), batches 3-4
    # succeed, batch 5 returns 0 so the loop exits naturally.
    batch_outcomes = [100, TimeoutError("simulated DB timeout"), 200, 50, 0]
    db = MagicMock()
    db.execute_raw = AsyncMock(side_effect=batch_outcomes)
    prisma_client = MagicMock()
    prisma_client.db = db

    settings = {"maximum_spend_logs_retention_period": "7d"}
    cleaner = cleanup_module.SpendLogCleanup(general_settings=settings)

    seven_days_ago = datetime.now(timezone.utc) - timedelta(days=7)
    deleted = await cleaner._delete_old_logs(prisma_client, seven_days_ago)

    # All 5 batches attempted; 100 + 200 + 50 = 350 rows deleted.
    assert db.execute_raw.call_count == 5
    assert deleted == 350
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_delete_old_logs_aborts_after_consecutive_failures(monkeypatch):
    """When every batch fails (e.g. the DB is down), the loop must give up after
    SPEND_LOG_CLEANUP_MAX_CONSECUTIVE_BATCH_FAILURES attempts instead of hot-looping."""
    import litellm.proxy.db.db_transaction_queue.spend_log_cleanup as cleanup_module

    # Small threshold and zero backoff keep the test fast and deterministic.
    overrides = (
        ("SPEND_LOG_CLEANUP_MAX_CONSECUTIVE_BATCH_FAILURES", 3),
        ("SPEND_LOG_CLEANUP_BATCH_FAILURE_BACKOFF_SECONDS", 0.0),
    )
    for constant_name, value in overrides:
        monkeypatch.setattr(cleanup_module, constant_name, value)

    # Every call raises, so the run must stop after exactly 3 attempts.
    db = MagicMock()
    db.execute_raw = AsyncMock(
        side_effect=ConnectionError("simulated persistent DB outage")
    )
    prisma_client = MagicMock()
    prisma_client.db = db

    settings = {"maximum_spend_logs_retention_period": "7d"}
    cleaner = cleanup_module.SpendLogCleanup(general_settings=settings)

    seven_days_ago = datetime.now(timezone.utc) - timedelta(days=7)
    deleted = await cleaner._delete_old_logs(prisma_client, seven_days_ago)

    assert db.execute_raw.call_count == 3
    assert deleted == 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_delete_old_logs_resets_consecutive_failures_on_success(monkeypatch):
    """A successful batch between failures must reset the consecutive-failure
    counter, so intermittent timeouts never reach the abort threshold."""
    import litellm.proxy.db.db_transaction_queue.spend_log_cleanup as cleanup_module

    overrides = (
        ("SPEND_LOG_CLEANUP_MAX_CONSECUTIVE_BATCH_FAILURES", 3),
        ("SPEND_LOG_CLEANUP_BATCH_FAILURE_BACKOFF_SECONDS", 0.0),
    )
    for constant_name, value in overrides:
        monkeypatch.setattr(cleanup_module, constant_name, value)

    # Pattern: fail, fail, success (resets counter), fail, fail, success, done.
    # Four failures in total exceed the threshold of 3, but never three in a
    # row, so the run only completes if successes reset the counter.
    batch_outcomes = [
        TimeoutError("t1"),
        TimeoutError("t2"),
        100,
        TimeoutError("t3"),
        TimeoutError("t4"),
        50,
        0,
    ]
    db = MagicMock()
    db.execute_raw = AsyncMock(side_effect=batch_outcomes)
    prisma_client = MagicMock()
    prisma_client.db = db

    settings = {"maximum_spend_logs_retention_period": "7d"}
    cleaner = cleanup_module.SpendLogCleanup(general_settings=settings)

    seven_days_ago = datetime.now(timezone.utc) - timedelta(days=7)
    deleted = await cleaner._delete_old_logs(prisma_client, seven_days_ago)

    assert db.execute_raw.call_count == 7
    assert deleted == 150
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_uses_logger_exception_for_full_traceback(monkeypatch):
    """The outer error handler must log through logger.exception() (not
    .error(str(e))) so Prisma/DB timeouts surface a full traceback and the
    exception type."""
    import litellm.proxy.db.db_transaction_queue.spend_log_cleanup as cleanup_module

    # Swap the module logger for a mock so the logging call can be inspected.
    logger = MagicMock()
    monkeypatch.setattr(cleanup_module, "verbose_proxy_logger", logger)

    prisma_client = MagicMock()
    settings = {"maximum_spend_logs_retention_period": "7d"}
    cleaner = cleanup_module.SpendLogCleanup(general_settings=settings)
    cleaner.pod_lock_manager = None

    # Make _should_delete_spend_logs raise so the outer try/except fires.
    def raise_simulated_timeout():
        raise RuntimeError("simulated prisma timeout")

    cleaner._should_delete_spend_logs = raise_simulated_timeout  # type: ignore[assignment]

    await cleaner.cleanup_old_spend_logs(prisma_client)

    assert logger.exception.called, "expected logger.exception() to be called"

    # Rebuild the %-formatted message. It must carry the exception type name
    # and text so operators can tell *what* failed, not just
    # "Error during cleanup:".
    positional = logger.exception.call_args[0]
    template = positional[0]
    template_args = positional[1:]
    formatted = template % template_args
    assert "RuntimeError" in formatted
    assert "simulated prisma timeout" in formatted
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_releases_lock_after_persistent_batch_failures(monkeypatch):
    """Even when batch deletion aborts after consecutive failures, the pod lock
    must be released so the next scheduled run is not permanently blocked."""
    import litellm.proxy.db.db_transaction_queue.spend_log_cleanup as cleanup_module

    overrides = (
        ("SPEND_LOG_CLEANUP_MAX_CONSECUTIVE_BATCH_FAILURES", 2),
        ("SPEND_LOG_CLEANUP_BATCH_FAILURE_BACKOFF_SECONDS", 0.0),
    )
    for constant_name, value in overrides:
        monkeypatch.setattr(cleanup_module, constant_name, value)

    # Every execute_raw call fails.
    db = MagicMock()
    db.execute_raw = AsyncMock(side_effect=TimeoutError("DB down"))
    prisma_client = MagicMock()
    prisma_client.db = db

    release_lock = AsyncMock()
    lock_manager = MagicMock()
    lock_manager.redis_cache = MagicMock()
    lock_manager.acquire_lock = AsyncMock(return_value=True)
    lock_manager.release_lock = release_lock

    settings = {"maximum_spend_logs_retention_period": "7d"}
    cleaner = cleanup_module.SpendLogCleanup(general_settings=settings)
    cleaner.pod_lock_manager = lock_manager

    # Must return cleanly rather than propagate the batch failures.
    await cleaner.cleanup_old_spend_logs(prisma_client)

    release_lock.assert_awaited_once()
|
|
|
|
|
|
def test_cleanup_batch_size_env_var(monkeypatch):
    """Ensure batch size is configurable via the SPEND_LOG_CLEANUP_BATCH_SIZE env var.

    The constants and cleanup modules are reloaded so they pick up the patched
    environment. The restoring reload runs in a ``finally`` block, so a failed
    assertion cannot leave the reloaded modules (with the patched batch size)
    in place for tests that run afterwards.
    """
    import importlib

    import litellm.constants as constants_module
    import litellm.proxy.db.db_transaction_queue.spend_log_cleanup as cleanup_module

    # Set env var and reload modules to pick up the new value.
    monkeypatch.setenv("SPEND_LOG_CLEANUP_BATCH_SIZE", "25")
    try:
        importlib.reload(constants_module)
        importlib.reload(cleanup_module)

        cleaner = cleanup_module.SpendLogCleanup(general_settings={})
        assert cleaner.batch_size == 25
    finally:
        # Remove the env var and reload to restore defaults for other tests,
        # whether or not the assertion above passed.
        monkeypatch.delenv("SPEND_LOG_CLEANUP_BATCH_SIZE", raising=False)
        importlib.reload(constants_module)
        importlib.reload(cleanup_module)
|