diff --git a/litellm/caching/qdrant_semantic_cache.py b/litellm/caching/qdrant_semantic_cache.py index 9206f326ec..cb521efca0 100644 --- a/litellm/caching/qdrant_semantic_cache.py +++ b/litellm/caching/qdrant_semantic_cache.py @@ -12,7 +12,7 @@ import ast import asyncio import json import os -from typing import Any, Dict, Optional, cast +from typing import Any, Dict, cast import litellm from litellm._logging import print_verbose @@ -27,9 +27,6 @@ from .base_cache import BaseCache class QdrantSemanticCache(BaseCache): CACHE_KEY_FIELD_NAME = "litellm_cache_key" - ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR = ( - "LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS" - ) def __init__( # noqa: PLR0915 self, @@ -41,7 +38,6 @@ class QdrantSemanticCache(BaseCache): embedding_model="text-embedding-ada-002", host_type=None, vector_size=None, - allow_legacy_unscoped_cache_hits: Optional[bool] = None, ): from litellm.llms.custom_httpx.http_handler import ( _get_httpx_client, @@ -62,16 +58,6 @@ class QdrantSemanticCache(BaseCache): raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold self.embedding_model = embedding_model - self.allow_legacy_unscoped_cache_hits = ( - self._get_allow_legacy_unscoped_cache_hits(allow_legacy_unscoped_cache_hits) - ) - if self.allow_legacy_unscoped_cache_hits: - print_verbose( - "Qdrant semantic-cache legacy unscoped hits are enabled via " - f"{self.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR}; searches may return " - "pre-isolation cache entries without cache-key payloads. Disable " - "this after warming the key-scoped semantic cache." - ) self.vector_size = ( vector_size if vector_size is not None else QDRANT_VECTOR_SIZE ) @@ -180,14 +166,6 @@ class QdrantSemanticCache(BaseCache): else: raise Exception("Error while creating new collection") - @classmethod - def _get_allow_legacy_unscoped_cache_hits( - cls, allow_legacy_unscoped_cache_hits: Optional[bool] - ) -> bool: - if allow_legacy_unscoped_cache_hits is not None: - return allow_legacy_unscoped_cache_hits - return os.getenv(cls.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "").lower() == "true" - def _get_cache_logic(self, cached_response: Any): if cached_response is None: return cached_response @@ -210,8 +188,6 @@ class QdrantSemanticCache(BaseCache): } def _add_cache_key_filter_to_search_data(self, data: dict, key: str) -> None: - if getattr(self, "allow_legacy_unscoped_cache_hits", False): - return data["filter"] = self._get_qdrant_cache_key_filter(key) def _ensure_cache_key_payload_index(self) -> None: @@ -236,14 +212,11 @@ class QdrantSemanticCache(BaseCache): ) def _payload_matches_cache_key(self, payload: dict, key: str) -> bool: - # Legacy Qdrant semantic-cache points stored only prompt text and - # response. They cannot be reassigned to the generated LiteLLM cache key - # without risking cross-scope hits, so secure mode treats them as misses. + # Pre-isolation points stored only prompt + response with no cache-key + # payload field. Reassigning them to a caller's key would risk + # cross-scope hits, so they're treated as misses and re-populated on + # the next set_cache. cached_key = payload.get(self.CACHE_KEY_FIELD_NAME) - if cached_key is None and getattr( - self, "allow_legacy_unscoped_cache_hits", False - ): - return True return cached_key is not None and str(cached_key) == str(key) async def _get_async_embedding(self, prompt: str, **kwargs) -> Any: diff --git a/litellm/caching/redis_semantic_cache.py b/litellm/caching/redis_semantic_cache.py index 35fed977cc..da9e7b1e58 100644 --- a/litellm/caching/redis_semantic_cache.py +++ b/litellm/caching/redis_semantic_cache.py @@ -36,9 +36,6 @@ class RedisSemanticCache(BaseCache): DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index" CACHE_KEY_FIELD_NAME: str = "litellm_cache_key" - ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR: str = ( - "LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS" - ) def __init__( self, @@ -49,7 +46,6 @@ class RedisSemanticCache(BaseCache): similarity_threshold: Optional[float] = None, embedding_model: str = "text-embedding-ada-002", index_name: Optional[str] = None, - allow_legacy_unscoped_cache_hits: Optional[bool] = None, **kwargs, ): """ @@ -91,10 +87,6 @@ class RedisSemanticCache(BaseCache): # While similarity: 1 = most similar, 0 = least similar self.distance_threshold = 1 - similarity_threshold self.embedding_model = embedding_model - self.allow_legacy_unscoped_cache_hits = ( - self._get_allow_legacy_unscoped_cache_hits(allow_legacy_unscoped_cache_hits) - ) - self._using_legacy_unscoped_index = False # Set up Redis connection if redis_url is None: @@ -125,14 +117,6 @@ class RedisSemanticCache(BaseCache): cache_vectorizer=cache_vectorizer, ) - @classmethod - def _get_allow_legacy_unscoped_cache_hits( - cls, allow_legacy_unscoped_cache_hits: Optional[bool] - ) -> bool: - if allow_legacy_unscoped_cache_hits is not None: - return allow_legacy_unscoped_cache_hits - return os.getenv(cls.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "").lower() == "true" - @classmethod def _cache_key_filterable_field(cls) -> Dict[str, str]: return { @@ -167,22 +151,6 @@ class RedisSemanticCache(BaseCache): if not _is_schema_mismatch(exc): raise - if self.allow_legacy_unscoped_cache_hits: - self._using_legacy_unscoped_index = True - print_verbose( - "Redis semantic-cache legacy unscoped hits are enabled via " - f"{self.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR}; reusing existing " - "index without cache-key isolation. Disable this after warming " - "the isolated semantic cache." - ) - return semantic_cache_cls( - name=index_name, - redis_url=redis_url, - vectorizer=cache_vectorizer, - distance_threshold=self.distance_threshold, - overwrite=False, - ) - isolated_index_name = f"{index_name}_isolated" print_verbose( "Redis semantic-cache existing index schema is not isolated; " @@ -223,11 +191,11 @@ class RedisSemanticCache(BaseCache): return Tag(self.CACHE_KEY_FIELD_NAME) == str(key) def _cache_hit_matches_key(self, cache_hit: Dict[str, Any], key: str) -> bool: + # Pre-isolation entries with no ``litellm_cache_key`` field cannot be + # safely reassigned to a caller's scope and are treated as misses. cached_key = cache_hit.get(self.CACHE_KEY_FIELD_NAME) if isinstance(cached_key, bytes): cached_key = cached_key.decode("utf-8") - if cached_key is None and getattr(self, "_using_legacy_unscoped_index", False): - return True return cached_key is not None and str(cached_key) == str(key) def _get_ttl(self, **kwargs) -> Optional[int]: @@ -319,9 +287,9 @@ class RedisSemanticCache(BaseCache): prompt = get_str_from_messages(messages) value_str = str(value) - store_kwargs: Dict[str, Any] = {} - if not getattr(self, "_using_legacy_unscoped_index", False): - store_kwargs["filters"] = self._get_cache_filters(key) + store_kwargs: Dict[str, Any] = { + "filters": self._get_cache_filters(key), + } # Get TTL and store in Redis semantic cache ttl = self._get_ttl(**kwargs) @@ -357,11 +325,10 @@ class RedisSemanticCache(BaseCache): prompt = get_str_from_messages(messages) # Check the cache for semantically similar prompts in this exact # LiteLLM cache-key scope. - check_kwargs: Dict[str, Any] = {"prompt": prompt} - if not getattr(self, "_using_legacy_unscoped_index", False): - check_kwargs["filter_expression"] = ( - self._get_cache_key_filter_expression(key) - ) + check_kwargs: Dict[str, Any] = { + "prompt": prompt, + "filter_expression": self._get_cache_key_filter_expression(key), + } results = self.llmcache.check(**check_kwargs) # Return None if no similar prompts found @@ -475,9 +442,8 @@ class RedisSemanticCache(BaseCache): store_kwargs: Dict[str, Any] = { "vector": prompt_embedding, + "filters": self._get_cache_filters(key), } - if not getattr(self, "_using_legacy_unscoped_index", False): - store_kwargs["filters"] = self._get_cache_filters(key) # Get TTL and store in Redis semantic cache ttl = self._get_ttl(**kwargs) @@ -522,11 +488,8 @@ class RedisSemanticCache(BaseCache): check_kwargs: Dict[str, Any] = { "prompt": prompt, "vector": prompt_embedding, + "filter_expression": self._get_cache_key_filter_expression(key), } - if not getattr(self, "_using_legacy_unscoped_index", False): - check_kwargs["filter_expression"] = ( - self._get_cache_key_filter_expression(key) - ) results = await self.llmcache.acheck(**check_kwargs) # handle results / cache hit diff --git a/tests/test_litellm/caching/test_qdrant_semantic_cache.py b/tests/test_litellm/caching/test_qdrant_semantic_cache.py index 9b987a6d4f..949e6ccc29 100644 --- a/tests/test_litellm/caching/test_qdrant_semantic_cache.py +++ b/tests/test_litellm/caching/test_qdrant_semantic_cache.py @@ -208,76 +208,6 @@ def test_qdrant_semantic_cache_rejects_unscoped_cache_hit(): assert metadata["semantic-similarity"] == 0.0 -def test_qdrant_semantic_cache_allows_legacy_unscoped_hit_with_flag(monkeypatch): - monkeypatch.setenv("LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS", "true") - - with ( - patch( - "litellm.llms.custom_httpx.http_handler._get_httpx_client" - ) as mock_sync_client, - patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client"), - ): - - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = {"result": {"exists": True}} - - mock_sync_client_instance = MagicMock() - mock_sync_client_instance.get.return_value = mock_response - mock_sync_client.return_value = mock_sync_client_instance - - from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache - - qdrant_cache = QdrantSemanticCache( - collection_name="test_collection", - qdrant_api_base="http://test.qdrant.local", - qdrant_api_key="test_key", - similarity_threshold=0.8, - ) - - mock_search_response = MagicMock() - mock_search_response.status_code = 200 - mock_search_response.json.return_value = { - "result": [ - { - "payload": { - "text": "What is the capital of France?", - "response": '{"id": "test-123"}', - }, - "score": 0.9, - } - ] - } - qdrant_cache.sync_client.post = MagicMock(return_value=mock_search_response) - - with patch( - "litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2, 0.3]}]} - ): - metadata = {} - result = qdrant_cache.get_cache( - key="test_key", - messages=[{"content": "What is the capital of France?"}], - metadata=metadata, - ) - - assert result == {"id": "test-123"} - assert metadata["semantic-similarity"] == 0.9 - assert "filter" not in qdrant_cache.sync_client.post.call_args.kwargs["json"] - - -def test_qdrant_semantic_cache_legacy_mode_rejects_wrong_key_hit(): - from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache - - qdrant_cache = QdrantSemanticCache.__new__(QdrantSemanticCache) - qdrant_cache.allow_legacy_unscoped_cache_hits = True - - assert qdrant_cache._payload_matches_cache_key(payload={}, key="test_key") - assert not qdrant_cache._payload_matches_cache_key( - payload={QdrantSemanticCache.CACHE_KEY_FIELD_NAME: "other_key"}, - key="test_key", - ) - - def test_qdrant_semantic_cache_payload_index_failure_is_non_blocking(): from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache diff --git a/tests/test_litellm/caching/test_redis_semantic_cache.py b/tests/test_litellm/caching/test_redis_semantic_cache.py index bebe1f757b..b50a35ef50 100644 --- a/tests/test_litellm/caching/test_redis_semantic_cache.py +++ b/tests/test_litellm/caching/test_redis_semantic_cache.py @@ -232,46 +232,6 @@ def test_redis_semantic_cache_uses_isolated_index_for_old_schema(monkeypatch): ] -def test_redis_semantic_cache_can_reuse_legacy_unscoped_index(monkeypatch): - fallback_cache_mock = MagicMock() - semantic_cache_mock = MagicMock( - side_effect=[ - ValueError("Existing index schema does not match"), - fallback_cache_mock, - ] - ) - custom_vectorizer_mock = MagicMock() - - with patch.dict( - "sys.modules", - { - "redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock), - "redisvl.utils.vectorize": MagicMock( - CustomTextVectorizer=custom_vectorizer_mock - ), - }, - ): - from litellm.caching.redis_semantic_cache import RedisSemanticCache - - monkeypatch.setenv("REDIS_HOST", "localhost") - monkeypatch.setenv("REDIS_PORT", "6379") - monkeypatch.setenv("REDIS_PASSWORD", "test_password") - monkeypatch.setenv( - RedisSemanticCache.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "true" - ) - - redis_semantic_cache = RedisSemanticCache( - similarity_threshold=0.8, - index_name="existing_index", - ) - - assert redis_semantic_cache.llmcache is fallback_cache_mock - assert redis_semantic_cache._using_legacy_unscoped_index is True - assert semantic_cache_mock.call_count == 2 - assert semantic_cache_mock.call_args_list[1].kwargs["name"] == "existing_index" - assert "filterable_fields" not in semantic_cache_mock.call_args_list[1].kwargs - - def test_redis_semantic_cache_overwrites_stale_isolated_index(monkeypatch): fallback_cache_mock = MagicMock() semantic_cache_mock = MagicMock( @@ -372,11 +332,12 @@ def test_redis_semantic_cache_matches_bytes_cache_key(): ) -def test_redis_semantic_cache_allows_unscoped_hit_only_in_legacy_mode(): +def test_redis_semantic_cache_rejects_pre_isolation_unscoped_hit(): + """Pre-isolation entries with no cache-key field cannot be safely + reassigned to a caller's scope and are treated as misses.""" from litellm.caching.redis_semantic_cache import RedisSemanticCache redis_semantic_cache = RedisSemanticCache.__new__(RedisSemanticCache) - redis_semantic_cache._using_legacy_unscoped_index = False cache_hit = { "prompt": "What is the capital of France?", @@ -388,12 +349,6 @@ def test_redis_semantic_cache_allows_unscoped_hit_only_in_legacy_mode(): key="test_key", ) - redis_semantic_cache._using_legacy_unscoped_index = True - assert redis_semantic_cache._cache_hit_matches_key( - cache_hit=cache_hit, - key="test_key", - ) - def test_redis_semantic_cache_builds_filter_expression(monkeypatch): class FakeTag: