chore(caching): remove allow_legacy_unscoped_cache_hits opt-in

The flag was an opt-in escape hatch that reopens the cross-tenant leak
the rest of the patch closes: flipping it on (via env var or constructor
param) re-enables exactly the VERIA-54 primitive on either backend.
There is no operational need that the secure path doesn't already meet:

- Qdrant: legacy points without a ``litellm_cache_key`` payload field
  are excluded by the must-clause filter and treated as misses; new sets
  populate the cache key, so the cold-start period lasts only as long as
  the natural cache rebuild.
- Redis: an existing unscoped index can't carry the new schema; the init
  path falls back to ``{name}_isolated`` (recreating that index if its
  schema is stale), leaving the legacy index untouched.

Drop the constructor param, env-var fallback, ``_using_legacy_unscoped_index``
flag, the legacy-reuse branch in ``_init_semantic_cache``, and the
matching guards in set/get paths. Update tests to drop the legacy-mode
cases and assert the secure-only behaviour.
This commit is contained in:
user 2026-05-04 22:16:30 +00:00
parent 7d7244986e
commit a2473ef0c2
No known key found for this signature in database
4 changed files with 19 additions and 198 deletions

View File

@ -12,7 +12,7 @@ import ast
import asyncio
import json
import os
from typing import Any, Dict, Optional, cast
from typing import Any, Dict, cast
import litellm
from litellm._logging import print_verbose
@ -27,9 +27,6 @@ from .base_cache import BaseCache
class QdrantSemanticCache(BaseCache):
CACHE_KEY_FIELD_NAME = "litellm_cache_key"
ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR = (
"LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS"
)
def __init__( # noqa: PLR0915
self,
@ -41,7 +38,6 @@ class QdrantSemanticCache(BaseCache):
embedding_model="text-embedding-ada-002",
host_type=None,
vector_size=None,
allow_legacy_unscoped_cache_hits: Optional[bool] = None,
):
from litellm.llms.custom_httpx.http_handler import (
_get_httpx_client,
@ -62,16 +58,6 @@ class QdrantSemanticCache(BaseCache):
raise Exception("similarity_threshold must be provided, passed None")
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model
self.allow_legacy_unscoped_cache_hits = (
self._get_allow_legacy_unscoped_cache_hits(allow_legacy_unscoped_cache_hits)
)
if self.allow_legacy_unscoped_cache_hits:
print_verbose(
"Qdrant semantic-cache legacy unscoped hits are enabled via "
f"{self.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR}; searches may return "
"pre-isolation cache entries without cache-key payloads. Disable "
"this after warming the key-scoped semantic cache."
)
self.vector_size = (
vector_size if vector_size is not None else QDRANT_VECTOR_SIZE
)
@ -180,14 +166,6 @@ class QdrantSemanticCache(BaseCache):
else:
raise Exception("Error while creating new collection")
@classmethod
def _get_allow_legacy_unscoped_cache_hits(
cls, allow_legacy_unscoped_cache_hits: Optional[bool]
) -> bool:
if allow_legacy_unscoped_cache_hits is not None:
return allow_legacy_unscoped_cache_hits
return os.getenv(cls.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "").lower() == "true"
def _get_cache_logic(self, cached_response: Any):
if cached_response is None:
return cached_response
@ -210,8 +188,6 @@ class QdrantSemanticCache(BaseCache):
}
def _add_cache_key_filter_to_search_data(self, data: dict, key: str) -> None:
if getattr(self, "allow_legacy_unscoped_cache_hits", False):
return
data["filter"] = self._get_qdrant_cache_key_filter(key)
def _ensure_cache_key_payload_index(self) -> None:
@ -236,14 +212,11 @@ class QdrantSemanticCache(BaseCache):
)
def _payload_matches_cache_key(self, payload: dict, key: str) -> bool:
# Legacy Qdrant semantic-cache points stored only prompt text and
# response. They cannot be reassigned to the generated LiteLLM cache key
# without risking cross-scope hits, so secure mode treats them as misses.
# Pre-isolation points stored only prompt + response with no cache-key
# payload field. Reassigning them to a caller's key would risk
# cross-scope hits, so they're treated as misses and re-populated on
# the next set_cache.
cached_key = payload.get(self.CACHE_KEY_FIELD_NAME)
if cached_key is None and getattr(
self, "allow_legacy_unscoped_cache_hits", False
):
return True
return cached_key is not None and str(cached_key) == str(key)
async def _get_async_embedding(self, prompt: str, **kwargs) -> Any:

View File

@ -36,9 +36,6 @@ class RedisSemanticCache(BaseCache):
DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index"
CACHE_KEY_FIELD_NAME: str = "litellm_cache_key"
ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR: str = (
"LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS"
)
def __init__(
self,
@ -49,7 +46,6 @@ class RedisSemanticCache(BaseCache):
similarity_threshold: Optional[float] = None,
embedding_model: str = "text-embedding-ada-002",
index_name: Optional[str] = None,
allow_legacy_unscoped_cache_hits: Optional[bool] = None,
**kwargs,
):
"""
@ -91,10 +87,6 @@ class RedisSemanticCache(BaseCache):
# While similarity: 1 = most similar, 0 = least similar
self.distance_threshold = 1 - similarity_threshold
self.embedding_model = embedding_model
self.allow_legacy_unscoped_cache_hits = (
self._get_allow_legacy_unscoped_cache_hits(allow_legacy_unscoped_cache_hits)
)
self._using_legacy_unscoped_index = False
# Set up Redis connection
if redis_url is None:
@ -125,14 +117,6 @@ class RedisSemanticCache(BaseCache):
cache_vectorizer=cache_vectorizer,
)
@classmethod
def _get_allow_legacy_unscoped_cache_hits(
cls, allow_legacy_unscoped_cache_hits: Optional[bool]
) -> bool:
if allow_legacy_unscoped_cache_hits is not None:
return allow_legacy_unscoped_cache_hits
return os.getenv(cls.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "").lower() == "true"
@classmethod
def _cache_key_filterable_field(cls) -> Dict[str, str]:
return {
@ -167,22 +151,6 @@ class RedisSemanticCache(BaseCache):
if not _is_schema_mismatch(exc):
raise
if self.allow_legacy_unscoped_cache_hits:
self._using_legacy_unscoped_index = True
print_verbose(
"Redis semantic-cache legacy unscoped hits are enabled via "
f"{self.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR}; reusing existing "
"index without cache-key isolation. Disable this after warming "
"the isolated semantic cache."
)
return semantic_cache_cls(
name=index_name,
redis_url=redis_url,
vectorizer=cache_vectorizer,
distance_threshold=self.distance_threshold,
overwrite=False,
)
isolated_index_name = f"{index_name}_isolated"
print_verbose(
"Redis semantic-cache existing index schema is not isolated; "
@ -223,11 +191,11 @@ class RedisSemanticCache(BaseCache):
return Tag(self.CACHE_KEY_FIELD_NAME) == str(key)
def _cache_hit_matches_key(self, cache_hit: Dict[str, Any], key: str) -> bool:
# Pre-isolation entries with no ``litellm_cache_key`` field cannot be
# safely reassigned to a caller's scope and are treated as misses.
cached_key = cache_hit.get(self.CACHE_KEY_FIELD_NAME)
if isinstance(cached_key, bytes):
cached_key = cached_key.decode("utf-8")
if cached_key is None and getattr(self, "_using_legacy_unscoped_index", False):
return True
return cached_key is not None and str(cached_key) == str(key)
def _get_ttl(self, **kwargs) -> Optional[int]:
@ -319,9 +287,9 @@ class RedisSemanticCache(BaseCache):
prompt = get_str_from_messages(messages)
value_str = str(value)
store_kwargs: Dict[str, Any] = {}
if not getattr(self, "_using_legacy_unscoped_index", False):
store_kwargs["filters"] = self._get_cache_filters(key)
store_kwargs: Dict[str, Any] = {
"filters": self._get_cache_filters(key),
}
# Get TTL and store in Redis semantic cache
ttl = self._get_ttl(**kwargs)
@ -357,11 +325,10 @@ class RedisSemanticCache(BaseCache):
prompt = get_str_from_messages(messages)
# Check the cache for semantically similar prompts in this exact
# LiteLLM cache-key scope.
check_kwargs: Dict[str, Any] = {"prompt": prompt}
if not getattr(self, "_using_legacy_unscoped_index", False):
check_kwargs["filter_expression"] = (
self._get_cache_key_filter_expression(key)
)
check_kwargs: Dict[str, Any] = {
"prompt": prompt,
"filter_expression": self._get_cache_key_filter_expression(key),
}
results = self.llmcache.check(**check_kwargs)
# Return None if no similar prompts found
@ -475,9 +442,8 @@ class RedisSemanticCache(BaseCache):
store_kwargs: Dict[str, Any] = {
"vector": prompt_embedding,
"filters": self._get_cache_filters(key),
}
if not getattr(self, "_using_legacy_unscoped_index", False):
store_kwargs["filters"] = self._get_cache_filters(key)
# Get TTL and store in Redis semantic cache
ttl = self._get_ttl(**kwargs)
@ -522,11 +488,8 @@ class RedisSemanticCache(BaseCache):
check_kwargs: Dict[str, Any] = {
"prompt": prompt,
"vector": prompt_embedding,
"filter_expression": self._get_cache_key_filter_expression(key),
}
if not getattr(self, "_using_legacy_unscoped_index", False):
check_kwargs["filter_expression"] = (
self._get_cache_key_filter_expression(key)
)
results = await self.llmcache.acheck(**check_kwargs)
# handle results / cache hit

View File

@ -208,76 +208,6 @@ def test_qdrant_semantic_cache_rejects_unscoped_cache_hit():
assert metadata["semantic-similarity"] == 0.0
def test_qdrant_semantic_cache_allows_legacy_unscoped_hit_with_flag(monkeypatch):
monkeypatch.setenv("LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS", "true")
with (
patch(
"litellm.llms.custom_httpx.http_handler._get_httpx_client"
) as mock_sync_client,
patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client"),
):
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.json.return_value = {"result": {"exists": True}}
mock_sync_client_instance = MagicMock()
mock_sync_client_instance.get.return_value = mock_response
mock_sync_client.return_value = mock_sync_client_instance
from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache
qdrant_cache = QdrantSemanticCache(
collection_name="test_collection",
qdrant_api_base="http://test.qdrant.local",
qdrant_api_key="test_key",
similarity_threshold=0.8,
)
mock_search_response = MagicMock()
mock_search_response.status_code = 200
mock_search_response.json.return_value = {
"result": [
{
"payload": {
"text": "What is the capital of France?",
"response": '{"id": "test-123"}',
},
"score": 0.9,
}
]
}
qdrant_cache.sync_client.post = MagicMock(return_value=mock_search_response)
with patch(
"litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2, 0.3]}]}
):
metadata = {}
result = qdrant_cache.get_cache(
key="test_key",
messages=[{"content": "What is the capital of France?"}],
metadata=metadata,
)
assert result == {"id": "test-123"}
assert metadata["semantic-similarity"] == 0.9
assert "filter" not in qdrant_cache.sync_client.post.call_args.kwargs["json"]
def test_qdrant_semantic_cache_legacy_mode_rejects_wrong_key_hit():
from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache
qdrant_cache = QdrantSemanticCache.__new__(QdrantSemanticCache)
qdrant_cache.allow_legacy_unscoped_cache_hits = True
assert qdrant_cache._payload_matches_cache_key(payload={}, key="test_key")
assert not qdrant_cache._payload_matches_cache_key(
payload={QdrantSemanticCache.CACHE_KEY_FIELD_NAME: "other_key"},
key="test_key",
)
def test_qdrant_semantic_cache_payload_index_failure_is_non_blocking():
from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache

View File

@ -232,46 +232,6 @@ def test_redis_semantic_cache_uses_isolated_index_for_old_schema(monkeypatch):
]
def test_redis_semantic_cache_can_reuse_legacy_unscoped_index(monkeypatch):
fallback_cache_mock = MagicMock()
semantic_cache_mock = MagicMock(
side_effect=[
ValueError("Existing index schema does not match"),
fallback_cache_mock,
]
)
custom_vectorizer_mock = MagicMock()
with patch.dict(
"sys.modules",
{
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
"redisvl.utils.vectorize": MagicMock(
CustomTextVectorizer=custom_vectorizer_mock
),
},
):
from litellm.caching.redis_semantic_cache import RedisSemanticCache
monkeypatch.setenv("REDIS_HOST", "localhost")
monkeypatch.setenv("REDIS_PORT", "6379")
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
monkeypatch.setenv(
RedisSemanticCache.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "true"
)
redis_semantic_cache = RedisSemanticCache(
similarity_threshold=0.8,
index_name="existing_index",
)
assert redis_semantic_cache.llmcache is fallback_cache_mock
assert redis_semantic_cache._using_legacy_unscoped_index is True
assert semantic_cache_mock.call_count == 2
assert semantic_cache_mock.call_args_list[1].kwargs["name"] == "existing_index"
assert "filterable_fields" not in semantic_cache_mock.call_args_list[1].kwargs
def test_redis_semantic_cache_overwrites_stale_isolated_index(monkeypatch):
fallback_cache_mock = MagicMock()
semantic_cache_mock = MagicMock(
@ -372,11 +332,12 @@ def test_redis_semantic_cache_matches_bytes_cache_key():
)
def test_redis_semantic_cache_allows_unscoped_hit_only_in_legacy_mode():
def test_redis_semantic_cache_rejects_pre_isolation_unscoped_hit():
"""Pre-isolation entries with no cache-key field cannot be safely
reassigned to a caller's scope and are treated as misses."""
from litellm.caching.redis_semantic_cache import RedisSemanticCache
redis_semantic_cache = RedisSemanticCache.__new__(RedisSemanticCache)
redis_semantic_cache._using_legacy_unscoped_index = False
cache_hit = {
"prompt": "What is the capital of France?",
@ -388,12 +349,6 @@ def test_redis_semantic_cache_allows_unscoped_hit_only_in_legacy_mode():
key="test_key",
)
redis_semantic_cache._using_legacy_unscoped_index = True
assert redis_semantic_cache._cache_hit_matches_key(
cache_hit=cache_hit,
key="test_key",
)
def test_redis_semantic_cache_builds_filter_expression(monkeypatch):
class FakeTag: