chore(caching): remove allow_legacy_unscoped_cache_hits opt-in
The flag was an opt-in escape hatch that reopened the cross-tenant leak
the rest of the patch closes: flipping it on (via env var or constructor
param) re-enables exactly the VERIA-54 primitive on either backend.
There is no operational need that the secure path doesn't already meet:
- Qdrant: legacy points without a ``litellm_cache_key`` payload field are
excluded by the must-clause filter and treated as misses; new sets
populate the cache key, so the cold start lasts only as long as the
natural cache rebuild.
- Redis: an existing unscoped index can't carry the new schema; the init
path falls back to ``{name}_isolated`` (and recreates it if its schema
is stale), leaving the legacy index untouched.
Drop the constructor param, the env-var fallback, the
``_using_legacy_unscoped_index`` flag, the legacy-reuse branch in
``_init_semantic_cache``, and the matching guards in the set/get paths.
Update the tests to drop the legacy-mode cases and assert the secure-only
behaviour.
This commit is contained in:
parent
7d7244986e
commit
a2473ef0c2
@ -12,7 +12,7 @@ import ast
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict, Optional, cast
|
||||
from typing import Any, Dict, cast
|
||||
|
||||
import litellm
|
||||
from litellm._logging import print_verbose
|
||||
@ -27,9 +27,6 @@ from .base_cache import BaseCache
|
||||
|
||||
class QdrantSemanticCache(BaseCache):
|
||||
CACHE_KEY_FIELD_NAME = "litellm_cache_key"
|
||||
ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR = (
|
||||
"LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS"
|
||||
)
|
||||
|
||||
def __init__( # noqa: PLR0915
|
||||
self,
|
||||
@ -41,7 +38,6 @@ class QdrantSemanticCache(BaseCache):
|
||||
embedding_model="text-embedding-ada-002",
|
||||
host_type=None,
|
||||
vector_size=None,
|
||||
allow_legacy_unscoped_cache_hits: Optional[bool] = None,
|
||||
):
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
_get_httpx_client,
|
||||
@ -62,16 +58,6 @@ class QdrantSemanticCache(BaseCache):
|
||||
raise Exception("similarity_threshold must be provided, passed None")
|
||||
self.similarity_threshold = similarity_threshold
|
||||
self.embedding_model = embedding_model
|
||||
self.allow_legacy_unscoped_cache_hits = (
|
||||
self._get_allow_legacy_unscoped_cache_hits(allow_legacy_unscoped_cache_hits)
|
||||
)
|
||||
if self.allow_legacy_unscoped_cache_hits:
|
||||
print_verbose(
|
||||
"Qdrant semantic-cache legacy unscoped hits are enabled via "
|
||||
f"{self.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR}; searches may return "
|
||||
"pre-isolation cache entries without cache-key payloads. Disable "
|
||||
"this after warming the key-scoped semantic cache."
|
||||
)
|
||||
self.vector_size = (
|
||||
vector_size if vector_size is not None else QDRANT_VECTOR_SIZE
|
||||
)
|
||||
@ -180,14 +166,6 @@ class QdrantSemanticCache(BaseCache):
|
||||
else:
|
||||
raise Exception("Error while creating new collection")
|
||||
|
||||
@classmethod
|
||||
def _get_allow_legacy_unscoped_cache_hits(
|
||||
cls, allow_legacy_unscoped_cache_hits: Optional[bool]
|
||||
) -> bool:
|
||||
if allow_legacy_unscoped_cache_hits is not None:
|
||||
return allow_legacy_unscoped_cache_hits
|
||||
return os.getenv(cls.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "").lower() == "true"
|
||||
|
||||
def _get_cache_logic(self, cached_response: Any):
|
||||
if cached_response is None:
|
||||
return cached_response
|
||||
@ -210,8 +188,6 @@ class QdrantSemanticCache(BaseCache):
|
||||
}
|
||||
|
||||
def _add_cache_key_filter_to_search_data(self, data: dict, key: str) -> None:
|
||||
if getattr(self, "allow_legacy_unscoped_cache_hits", False):
|
||||
return
|
||||
data["filter"] = self._get_qdrant_cache_key_filter(key)
|
||||
|
||||
def _ensure_cache_key_payload_index(self) -> None:
|
||||
@ -236,14 +212,11 @@ class QdrantSemanticCache(BaseCache):
|
||||
)
|
||||
|
||||
def _payload_matches_cache_key(self, payload: dict, key: str) -> bool:
|
||||
# Legacy Qdrant semantic-cache points stored only prompt text and
|
||||
# response. They cannot be reassigned to the generated LiteLLM cache key
|
||||
# without risking cross-scope hits, so secure mode treats them as misses.
|
||||
# Pre-isolation points stored only prompt + response with no cache-key
|
||||
# payload field. Reassigning them to a caller's key would risk
|
||||
# cross-scope hits, so they're treated as misses and re-populated on
|
||||
# the next set_cache.
|
||||
cached_key = payload.get(self.CACHE_KEY_FIELD_NAME)
|
||||
if cached_key is None and getattr(
|
||||
self, "allow_legacy_unscoped_cache_hits", False
|
||||
):
|
||||
return True
|
||||
return cached_key is not None and str(cached_key) == str(key)
|
||||
|
||||
async def _get_async_embedding(self, prompt: str, **kwargs) -> Any:
|
||||
|
||||
@ -36,9 +36,6 @@ class RedisSemanticCache(BaseCache):
|
||||
|
||||
DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index"
|
||||
CACHE_KEY_FIELD_NAME: str = "litellm_cache_key"
|
||||
ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR: str = (
|
||||
"LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS"
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -49,7 +46,6 @@ class RedisSemanticCache(BaseCache):
|
||||
similarity_threshold: Optional[float] = None,
|
||||
embedding_model: str = "text-embedding-ada-002",
|
||||
index_name: Optional[str] = None,
|
||||
allow_legacy_unscoped_cache_hits: Optional[bool] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
@ -91,10 +87,6 @@ class RedisSemanticCache(BaseCache):
|
||||
# While similarity: 1 = most similar, 0 = least similar
|
||||
self.distance_threshold = 1 - similarity_threshold
|
||||
self.embedding_model = embedding_model
|
||||
self.allow_legacy_unscoped_cache_hits = (
|
||||
self._get_allow_legacy_unscoped_cache_hits(allow_legacy_unscoped_cache_hits)
|
||||
)
|
||||
self._using_legacy_unscoped_index = False
|
||||
|
||||
# Set up Redis connection
|
||||
if redis_url is None:
|
||||
@ -125,14 +117,6 @@ class RedisSemanticCache(BaseCache):
|
||||
cache_vectorizer=cache_vectorizer,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _get_allow_legacy_unscoped_cache_hits(
|
||||
cls, allow_legacy_unscoped_cache_hits: Optional[bool]
|
||||
) -> bool:
|
||||
if allow_legacy_unscoped_cache_hits is not None:
|
||||
return allow_legacy_unscoped_cache_hits
|
||||
return os.getenv(cls.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "").lower() == "true"
|
||||
|
||||
@classmethod
|
||||
def _cache_key_filterable_field(cls) -> Dict[str, str]:
|
||||
return {
|
||||
@ -167,22 +151,6 @@ class RedisSemanticCache(BaseCache):
|
||||
if not _is_schema_mismatch(exc):
|
||||
raise
|
||||
|
||||
if self.allow_legacy_unscoped_cache_hits:
|
||||
self._using_legacy_unscoped_index = True
|
||||
print_verbose(
|
||||
"Redis semantic-cache legacy unscoped hits are enabled via "
|
||||
f"{self.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR}; reusing existing "
|
||||
"index without cache-key isolation. Disable this after warming "
|
||||
"the isolated semantic cache."
|
||||
)
|
||||
return semantic_cache_cls(
|
||||
name=index_name,
|
||||
redis_url=redis_url,
|
||||
vectorizer=cache_vectorizer,
|
||||
distance_threshold=self.distance_threshold,
|
||||
overwrite=False,
|
||||
)
|
||||
|
||||
isolated_index_name = f"{index_name}_isolated"
|
||||
print_verbose(
|
||||
"Redis semantic-cache existing index schema is not isolated; "
|
||||
@ -223,11 +191,11 @@ class RedisSemanticCache(BaseCache):
|
||||
return Tag(self.CACHE_KEY_FIELD_NAME) == str(key)
|
||||
|
||||
def _cache_hit_matches_key(self, cache_hit: Dict[str, Any], key: str) -> bool:
|
||||
# Pre-isolation entries with no ``litellm_cache_key`` field cannot be
|
||||
# safely reassigned to a caller's scope and are treated as misses.
|
||||
cached_key = cache_hit.get(self.CACHE_KEY_FIELD_NAME)
|
||||
if isinstance(cached_key, bytes):
|
||||
cached_key = cached_key.decode("utf-8")
|
||||
if cached_key is None and getattr(self, "_using_legacy_unscoped_index", False):
|
||||
return True
|
||||
return cached_key is not None and str(cached_key) == str(key)
|
||||
|
||||
def _get_ttl(self, **kwargs) -> Optional[int]:
|
||||
@ -319,9 +287,9 @@ class RedisSemanticCache(BaseCache):
|
||||
prompt = get_str_from_messages(messages)
|
||||
value_str = str(value)
|
||||
|
||||
store_kwargs: Dict[str, Any] = {}
|
||||
if not getattr(self, "_using_legacy_unscoped_index", False):
|
||||
store_kwargs["filters"] = self._get_cache_filters(key)
|
||||
store_kwargs: Dict[str, Any] = {
|
||||
"filters": self._get_cache_filters(key),
|
||||
}
|
||||
|
||||
# Get TTL and store in Redis semantic cache
|
||||
ttl = self._get_ttl(**kwargs)
|
||||
@ -357,11 +325,10 @@ class RedisSemanticCache(BaseCache):
|
||||
prompt = get_str_from_messages(messages)
|
||||
# Check the cache for semantically similar prompts in this exact
|
||||
# LiteLLM cache-key scope.
|
||||
check_kwargs: Dict[str, Any] = {"prompt": prompt}
|
||||
if not getattr(self, "_using_legacy_unscoped_index", False):
|
||||
check_kwargs["filter_expression"] = (
|
||||
self._get_cache_key_filter_expression(key)
|
||||
)
|
||||
check_kwargs: Dict[str, Any] = {
|
||||
"prompt": prompt,
|
||||
"filter_expression": self._get_cache_key_filter_expression(key),
|
||||
}
|
||||
results = self.llmcache.check(**check_kwargs)
|
||||
|
||||
# Return None if no similar prompts found
|
||||
@ -475,9 +442,8 @@ class RedisSemanticCache(BaseCache):
|
||||
|
||||
store_kwargs: Dict[str, Any] = {
|
||||
"vector": prompt_embedding,
|
||||
"filters": self._get_cache_filters(key),
|
||||
}
|
||||
if not getattr(self, "_using_legacy_unscoped_index", False):
|
||||
store_kwargs["filters"] = self._get_cache_filters(key)
|
||||
|
||||
# Get TTL and store in Redis semantic cache
|
||||
ttl = self._get_ttl(**kwargs)
|
||||
@ -522,11 +488,8 @@ class RedisSemanticCache(BaseCache):
|
||||
check_kwargs: Dict[str, Any] = {
|
||||
"prompt": prompt,
|
||||
"vector": prompt_embedding,
|
||||
"filter_expression": self._get_cache_key_filter_expression(key),
|
||||
}
|
||||
if not getattr(self, "_using_legacy_unscoped_index", False):
|
||||
check_kwargs["filter_expression"] = (
|
||||
self._get_cache_key_filter_expression(key)
|
||||
)
|
||||
results = await self.llmcache.acheck(**check_kwargs)
|
||||
|
||||
# handle results / cache hit
|
||||
|
||||
@ -208,76 +208,6 @@ def test_qdrant_semantic_cache_rejects_unscoped_cache_hit():
|
||||
assert metadata["semantic-similarity"] == 0.0
|
||||
|
||||
|
||||
def test_qdrant_semantic_cache_allows_legacy_unscoped_hit_with_flag(monkeypatch):
|
||||
monkeypatch.setenv("LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS", "true")
|
||||
|
||||
with (
|
||||
patch(
|
||||
"litellm.llms.custom_httpx.http_handler._get_httpx_client"
|
||||
) as mock_sync_client,
|
||||
patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client"),
|
||||
):
|
||||
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.json.return_value = {"result": {"exists": True}}
|
||||
|
||||
mock_sync_client_instance = MagicMock()
|
||||
mock_sync_client_instance.get.return_value = mock_response
|
||||
mock_sync_client.return_value = mock_sync_client_instance
|
||||
|
||||
from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache
|
||||
|
||||
qdrant_cache = QdrantSemanticCache(
|
||||
collection_name="test_collection",
|
||||
qdrant_api_base="http://test.qdrant.local",
|
||||
qdrant_api_key="test_key",
|
||||
similarity_threshold=0.8,
|
||||
)
|
||||
|
||||
mock_search_response = MagicMock()
|
||||
mock_search_response.status_code = 200
|
||||
mock_search_response.json.return_value = {
|
||||
"result": [
|
||||
{
|
||||
"payload": {
|
||||
"text": "What is the capital of France?",
|
||||
"response": '{"id": "test-123"}',
|
||||
},
|
||||
"score": 0.9,
|
||||
}
|
||||
]
|
||||
}
|
||||
qdrant_cache.sync_client.post = MagicMock(return_value=mock_search_response)
|
||||
|
||||
with patch(
|
||||
"litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2, 0.3]}]}
|
||||
):
|
||||
metadata = {}
|
||||
result = qdrant_cache.get_cache(
|
||||
key="test_key",
|
||||
messages=[{"content": "What is the capital of France?"}],
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
assert result == {"id": "test-123"}
|
||||
assert metadata["semantic-similarity"] == 0.9
|
||||
assert "filter" not in qdrant_cache.sync_client.post.call_args.kwargs["json"]
|
||||
|
||||
|
||||
def test_qdrant_semantic_cache_legacy_mode_rejects_wrong_key_hit():
|
||||
from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache
|
||||
|
||||
qdrant_cache = QdrantSemanticCache.__new__(QdrantSemanticCache)
|
||||
qdrant_cache.allow_legacy_unscoped_cache_hits = True
|
||||
|
||||
assert qdrant_cache._payload_matches_cache_key(payload={}, key="test_key")
|
||||
assert not qdrant_cache._payload_matches_cache_key(
|
||||
payload={QdrantSemanticCache.CACHE_KEY_FIELD_NAME: "other_key"},
|
||||
key="test_key",
|
||||
)
|
||||
|
||||
|
||||
def test_qdrant_semantic_cache_payload_index_failure_is_non_blocking():
|
||||
from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache
|
||||
|
||||
|
||||
@ -232,46 +232,6 @@ def test_redis_semantic_cache_uses_isolated_index_for_old_schema(monkeypatch):
|
||||
]
|
||||
|
||||
|
||||
def test_redis_semantic_cache_can_reuse_legacy_unscoped_index(monkeypatch):
|
||||
fallback_cache_mock = MagicMock()
|
||||
semantic_cache_mock = MagicMock(
|
||||
side_effect=[
|
||||
ValueError("Existing index schema does not match"),
|
||||
fallback_cache_mock,
|
||||
]
|
||||
)
|
||||
custom_vectorizer_mock = MagicMock()
|
||||
|
||||
with patch.dict(
|
||||
"sys.modules",
|
||||
{
|
||||
"redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
|
||||
"redisvl.utils.vectorize": MagicMock(
|
||||
CustomTextVectorizer=custom_vectorizer_mock
|
||||
),
|
||||
},
|
||||
):
|
||||
from litellm.caching.redis_semantic_cache import RedisSemanticCache
|
||||
|
||||
monkeypatch.setenv("REDIS_HOST", "localhost")
|
||||
monkeypatch.setenv("REDIS_PORT", "6379")
|
||||
monkeypatch.setenv("REDIS_PASSWORD", "test_password")
|
||||
monkeypatch.setenv(
|
||||
RedisSemanticCache.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "true"
|
||||
)
|
||||
|
||||
redis_semantic_cache = RedisSemanticCache(
|
||||
similarity_threshold=0.8,
|
||||
index_name="existing_index",
|
||||
)
|
||||
|
||||
assert redis_semantic_cache.llmcache is fallback_cache_mock
|
||||
assert redis_semantic_cache._using_legacy_unscoped_index is True
|
||||
assert semantic_cache_mock.call_count == 2
|
||||
assert semantic_cache_mock.call_args_list[1].kwargs["name"] == "existing_index"
|
||||
assert "filterable_fields" not in semantic_cache_mock.call_args_list[1].kwargs
|
||||
|
||||
|
||||
def test_redis_semantic_cache_overwrites_stale_isolated_index(monkeypatch):
|
||||
fallback_cache_mock = MagicMock()
|
||||
semantic_cache_mock = MagicMock(
|
||||
@ -372,11 +332,12 @@ def test_redis_semantic_cache_matches_bytes_cache_key():
|
||||
)
|
||||
|
||||
|
||||
def test_redis_semantic_cache_allows_unscoped_hit_only_in_legacy_mode():
|
||||
def test_redis_semantic_cache_rejects_pre_isolation_unscoped_hit():
|
||||
"""Pre-isolation entries with no cache-key field cannot be safely
|
||||
reassigned to a caller's scope and are treated as misses."""
|
||||
from litellm.caching.redis_semantic_cache import RedisSemanticCache
|
||||
|
||||
redis_semantic_cache = RedisSemanticCache.__new__(RedisSemanticCache)
|
||||
redis_semantic_cache._using_legacy_unscoped_index = False
|
||||
|
||||
cache_hit = {
|
||||
"prompt": "What is the capital of France?",
|
||||
@ -388,12 +349,6 @@ def test_redis_semantic_cache_allows_unscoped_hit_only_in_legacy_mode():
|
||||
key="test_key",
|
||||
)
|
||||
|
||||
redis_semantic_cache._using_legacy_unscoped_index = True
|
||||
assert redis_semantic_cache._cache_hit_matches_key(
|
||||
cache_hit=cache_hit,
|
||||
key="test_key",
|
||||
)
|
||||
|
||||
|
||||
def test_redis_semantic_cache_builds_filter_expression(monkeypatch):
|
||||
class FakeTag:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user