chore(caching): remove allow_legacy_unscoped_cache_hits opt-in

The flag was an opt-in escape hatch for the cross-tenant leak the rest of the patch closes — flipping it on (env var or constructor param) re-enables exactly the VERIA-54 primitive on either backend. There is no operational need that the secure path doesn't already meet:

- Qdrant: legacy points without a ``litellm_cache_key`` payload are excluded by the must-clause filter and treated as misses; new sets populate the cache key, so cold-start lasts only as long as the natural cache rebuild.
- Redis: the existing unscoped index can't carry the new schema; the init path falls back to ``{name}_isolated`` (and recreates it on stale schema), leaving the legacy index untouched.

Drop the constructor param, env-var fallback, ``_using_legacy_unscoped_index`` flag, the legacy-reuse branch in ``_init_semantic_cache``, and the matching guards in set/get paths. Update tests to drop the legacy-mode cases and assert the secure-only behaviour.
2026-05-04 22:16:30 +00:00 · 2026-05-04 22:16:30 +00:00 · a2473ef0c2
commit a2473ef0c2
parent 7d7244986e
4 changed files with 19 additions and 198 deletions
--- a/litellm/caching/qdrant_semantic_cache.py
+++ b/litellm/caching/qdrant_semantic_cache.py
@ -12,7 +12,7 @@ import ast
 import asyncio
 import json
 import os
-from typing import Any, Dict, Optional, cast
+from typing import Any, Dict, cast

 import litellm
 from litellm._logging import print_verbose
@ -27,9 +27,6 @@ from .base_cache import BaseCache

 class QdrantSemanticCache(BaseCache):
    CACHE_KEY_FIELD_NAME = "litellm_cache_key"
-    ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR = (
-        "LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS"
-    )

    def __init__(  # noqa: PLR0915
        self,
@ -41,7 +38,6 @@ class QdrantSemanticCache(BaseCache):
        embedding_model="text-embedding-ada-002",
        host_type=None,
        vector_size=None,
-        allow_legacy_unscoped_cache_hits: Optional[bool] = None,
    ):
        from litellm.llms.custom_httpx.http_handler import (
            _get_httpx_client,
@ -62,16 +58,6 @@ class QdrantSemanticCache(BaseCache):
            raise Exception("similarity_threshold must be provided, passed None")
        self.similarity_threshold = similarity_threshold
        self.embedding_model = embedding_model
-        self.allow_legacy_unscoped_cache_hits = (
-            self._get_allow_legacy_unscoped_cache_hits(allow_legacy_unscoped_cache_hits)
-        )
-        if self.allow_legacy_unscoped_cache_hits:
-            print_verbose(
-                "Qdrant semantic-cache legacy unscoped hits are enabled via "
-                f"{self.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR}; searches may return "
-                "pre-isolation cache entries without cache-key payloads. Disable "
-                "this after warming the key-scoped semantic cache."
-            )
        self.vector_size = (
            vector_size if vector_size is not None else QDRANT_VECTOR_SIZE
        )
@ -180,14 +166,6 @@ class QdrantSemanticCache(BaseCache):
            else:
                raise Exception("Error while creating new collection")

-    @classmethod
-    def _get_allow_legacy_unscoped_cache_hits(
-        cls, allow_legacy_unscoped_cache_hits: Optional[bool]
-    ) -> bool:
-        if allow_legacy_unscoped_cache_hits is not None:
-            return allow_legacy_unscoped_cache_hits
-        return os.getenv(cls.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "").lower() == "true"
-
    def _get_cache_logic(self, cached_response: Any):
        if cached_response is None:
            return cached_response
@ -210,8 +188,6 @@ class QdrantSemanticCache(BaseCache):
        }

    def _add_cache_key_filter_to_search_data(self, data: dict, key: str) -> None:
-        if getattr(self, "allow_legacy_unscoped_cache_hits", False):
-            return
        data["filter"] = self._get_qdrant_cache_key_filter(key)

    def _ensure_cache_key_payload_index(self) -> None:
@ -236,14 +212,11 @@ class QdrantSemanticCache(BaseCache):
            )

    def _payload_matches_cache_key(self, payload: dict, key: str) -> bool:
-        # Legacy Qdrant semantic-cache points stored only prompt text and
-        # response. They cannot be reassigned to the generated LiteLLM cache key
-        # without risking cross-scope hits, so secure mode treats them as misses.
+        # Pre-isolation points stored only prompt + response with no cache-key
+        # payload field. Reassigning them to a caller's key would risk
+        # cross-scope hits, so they're treated as misses and re-populated on
+        # the next set_cache.
        cached_key = payload.get(self.CACHE_KEY_FIELD_NAME)
-        if cached_key is None and getattr(
-            self, "allow_legacy_unscoped_cache_hits", False
-        ):
-            return True
        return cached_key is not None and str(cached_key) == str(key)

    async def _get_async_embedding(self, prompt: str, **kwargs) -> Any:
--- a/litellm/caching/redis_semantic_cache.py
+++ b/litellm/caching/redis_semantic_cache.py
@ -36,9 +36,6 @@ class RedisSemanticCache(BaseCache):

    DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index"
    CACHE_KEY_FIELD_NAME: str = "litellm_cache_key"
-    ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR: str = (
-        "LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS"
-    )

    def __init__(
        self,
@ -49,7 +46,6 @@ class RedisSemanticCache(BaseCache):
        similarity_threshold: Optional[float] = None,
        embedding_model: str = "text-embedding-ada-002",
        index_name: Optional[str] = None,
-        allow_legacy_unscoped_cache_hits: Optional[bool] = None,
        **kwargs,
    ):
        """
@ -91,10 +87,6 @@ class RedisSemanticCache(BaseCache):
        # While similarity: 1 = most similar, 0 = least similar
        self.distance_threshold = 1 - similarity_threshold
        self.embedding_model = embedding_model
-        self.allow_legacy_unscoped_cache_hits = (
-            self._get_allow_legacy_unscoped_cache_hits(allow_legacy_unscoped_cache_hits)
-        )
-        self._using_legacy_unscoped_index = False

        # Set up Redis connection
        if redis_url is None:
@ -125,14 +117,6 @@ class RedisSemanticCache(BaseCache):
            cache_vectorizer=cache_vectorizer,
        )

-    @classmethod
-    def _get_allow_legacy_unscoped_cache_hits(
-        cls, allow_legacy_unscoped_cache_hits: Optional[bool]
-    ) -> bool:
-        if allow_legacy_unscoped_cache_hits is not None:
-            return allow_legacy_unscoped_cache_hits
-        return os.getenv(cls.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "").lower() == "true"
-
    @classmethod
    def _cache_key_filterable_field(cls) -> Dict[str, str]:
        return {
@ -167,22 +151,6 @@ class RedisSemanticCache(BaseCache):
            if not _is_schema_mismatch(exc):
                raise

-            if self.allow_legacy_unscoped_cache_hits:
-                self._using_legacy_unscoped_index = True
-                print_verbose(
-                    "Redis semantic-cache legacy unscoped hits are enabled via "
-                    f"{self.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR}; reusing existing "
-                    "index without cache-key isolation. Disable this after warming "
-                    "the isolated semantic cache."
-                )
-                return semantic_cache_cls(
-                    name=index_name,
-                    redis_url=redis_url,
-                    vectorizer=cache_vectorizer,
-                    distance_threshold=self.distance_threshold,
-                    overwrite=False,
-                )
-
            isolated_index_name = f"{index_name}_isolated"
            print_verbose(
                "Redis semantic-cache existing index schema is not isolated; "
@ -223,11 +191,11 @@ class RedisSemanticCache(BaseCache):
        return Tag(self.CACHE_KEY_FIELD_NAME) == str(key)

    def _cache_hit_matches_key(self, cache_hit: Dict[str, Any], key: str) -> bool:
+        # Pre-isolation entries with no ``litellm_cache_key`` field cannot be
+        # safely reassigned to a caller's scope and are treated as misses.
        cached_key = cache_hit.get(self.CACHE_KEY_FIELD_NAME)
        if isinstance(cached_key, bytes):
            cached_key = cached_key.decode("utf-8")
-        if cached_key is None and getattr(self, "_using_legacy_unscoped_index", False):
-            return True
        return cached_key is not None and str(cached_key) == str(key)

    def _get_ttl(self, **kwargs) -> Optional[int]:
@ -319,9 +287,9 @@ class RedisSemanticCache(BaseCache):
            prompt = get_str_from_messages(messages)
            value_str = str(value)

-            store_kwargs: Dict[str, Any] = {}
-            if not getattr(self, "_using_legacy_unscoped_index", False):
-                store_kwargs["filters"] = self._get_cache_filters(key)
+            store_kwargs: Dict[str, Any] = {
+                "filters": self._get_cache_filters(key),
+            }

            # Get TTL and store in Redis semantic cache
            ttl = self._get_ttl(**kwargs)
@ -357,11 +325,10 @@ class RedisSemanticCache(BaseCache):
            prompt = get_str_from_messages(messages)
            # Check the cache for semantically similar prompts in this exact
            # LiteLLM cache-key scope.
-            check_kwargs: Dict[str, Any] = {"prompt": prompt}
-            if not getattr(self, "_using_legacy_unscoped_index", False):
-                check_kwargs["filter_expression"] = (
-                    self._get_cache_key_filter_expression(key)
-                )
+            check_kwargs: Dict[str, Any] = {
+                "prompt": prompt,
+                "filter_expression": self._get_cache_key_filter_expression(key),
+            }
            results = self.llmcache.check(**check_kwargs)

            # Return None if no similar prompts found
@ -475,9 +442,8 @@ class RedisSemanticCache(BaseCache):

            store_kwargs: Dict[str, Any] = {
                "vector": prompt_embedding,
+                "filters": self._get_cache_filters(key),
            }
-            if not getattr(self, "_using_legacy_unscoped_index", False):
-                store_kwargs["filters"] = self._get_cache_filters(key)

            # Get TTL and store in Redis semantic cache
            ttl = self._get_ttl(**kwargs)
@ -522,11 +488,8 @@ class RedisSemanticCache(BaseCache):
            check_kwargs: Dict[str, Any] = {
                "prompt": prompt,
                "vector": prompt_embedding,
+                "filter_expression": self._get_cache_key_filter_expression(key),
            }
-            if not getattr(self, "_using_legacy_unscoped_index", False):
-                check_kwargs["filter_expression"] = (
-                    self._get_cache_key_filter_expression(key)
-                )
            results = await self.llmcache.acheck(**check_kwargs)

            # handle results / cache hit
--- a/tests/test_litellm/caching/test_qdrant_semantic_cache.py
+++ b/tests/test_litellm/caching/test_qdrant_semantic_cache.py
@ -208,76 +208,6 @@ def test_qdrant_semantic_cache_rejects_unscoped_cache_hit():
        assert metadata["semantic-similarity"] == 0.0


-def test_qdrant_semantic_cache_allows_legacy_unscoped_hit_with_flag(monkeypatch):
-    monkeypatch.setenv("LITELLM_SEMANTIC_CACHE_ALLOW_LEGACY_UNSCOPED_HITS", "true")
-
-    with (
-        patch(
-            "litellm.llms.custom_httpx.http_handler._get_httpx_client"
-        ) as mock_sync_client,
-        patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client"),
-    ):
-
-        mock_response = MagicMock()
-        mock_response.status_code = 200
-        mock_response.json.return_value = {"result": {"exists": True}}
-
-        mock_sync_client_instance = MagicMock()
-        mock_sync_client_instance.get.return_value = mock_response
-        mock_sync_client.return_value = mock_sync_client_instance
-
-        from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache
-
-        qdrant_cache = QdrantSemanticCache(
-            collection_name="test_collection",
-            qdrant_api_base="http://test.qdrant.local",
-            qdrant_api_key="test_key",
-            similarity_threshold=0.8,
-        )
-
-        mock_search_response = MagicMock()
-        mock_search_response.status_code = 200
-        mock_search_response.json.return_value = {
-            "result": [
-                {
-                    "payload": {
-                        "text": "What is the capital of France?",
-                        "response": '{"id": "test-123"}',
-                    },
-                    "score": 0.9,
-                }
-            ]
-        }
-        qdrant_cache.sync_client.post = MagicMock(return_value=mock_search_response)
-
-        with patch(
-            "litellm.embedding", return_value={"data": [{"embedding": [0.1, 0.2, 0.3]}]}
-        ):
-            metadata = {}
-            result = qdrant_cache.get_cache(
-                key="test_key",
-                messages=[{"content": "What is the capital of France?"}],
-                metadata=metadata,
-            )
-
-        assert result == {"id": "test-123"}
-        assert metadata["semantic-similarity"] == 0.9
-        assert "filter" not in qdrant_cache.sync_client.post.call_args.kwargs["json"]
-
-
-def test_qdrant_semantic_cache_legacy_mode_rejects_wrong_key_hit():
-    from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache
-
-    qdrant_cache = QdrantSemanticCache.__new__(QdrantSemanticCache)
-    qdrant_cache.allow_legacy_unscoped_cache_hits = True
-
-    assert qdrant_cache._payload_matches_cache_key(payload={}, key="test_key")
-    assert not qdrant_cache._payload_matches_cache_key(
-        payload={QdrantSemanticCache.CACHE_KEY_FIELD_NAME: "other_key"},
-        key="test_key",
-    )
-
-
 def test_qdrant_semantic_cache_payload_index_failure_is_non_blocking():
    from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache

--- a/tests/test_litellm/caching/test_redis_semantic_cache.py
+++ b/tests/test_litellm/caching/test_redis_semantic_cache.py
@ -232,46 +232,6 @@ def test_redis_semantic_cache_uses_isolated_index_for_old_schema(monkeypatch):
        ]


-def test_redis_semantic_cache_can_reuse_legacy_unscoped_index(monkeypatch):
-    fallback_cache_mock = MagicMock()
-    semantic_cache_mock = MagicMock(
-        side_effect=[
-            ValueError("Existing index schema does not match"),
-            fallback_cache_mock,
-        ]
-    )
-    custom_vectorizer_mock = MagicMock()
-
-    with patch.dict(
-        "sys.modules",
-        {
-            "redisvl.extensions.llmcache": MagicMock(SemanticCache=semantic_cache_mock),
-            "redisvl.utils.vectorize": MagicMock(
-                CustomTextVectorizer=custom_vectorizer_mock
-            ),
-        },
-    ):
-        from litellm.caching.redis_semantic_cache import RedisSemanticCache
-
-        monkeypatch.setenv("REDIS_HOST", "localhost")
-        monkeypatch.setenv("REDIS_PORT", "6379")
-        monkeypatch.setenv("REDIS_PASSWORD", "test_password")
-        monkeypatch.setenv(
-            RedisSemanticCache.ALLOW_LEGACY_UNSCOPED_HITS_ENV_VAR, "true"
-        )
-
-        redis_semantic_cache = RedisSemanticCache(
-            similarity_threshold=0.8,
-            index_name="existing_index",
-        )
-
-        assert redis_semantic_cache.llmcache is fallback_cache_mock
-        assert redis_semantic_cache._using_legacy_unscoped_index is True
-        assert semantic_cache_mock.call_count == 2
-        assert semantic_cache_mock.call_args_list[1].kwargs["name"] == "existing_index"
-        assert "filterable_fields" not in semantic_cache_mock.call_args_list[1].kwargs
-
-
 def test_redis_semantic_cache_overwrites_stale_isolated_index(monkeypatch):
    fallback_cache_mock = MagicMock()
    semantic_cache_mock = MagicMock(
@ -372,11 +332,12 @@ def test_redis_semantic_cache_matches_bytes_cache_key():
    )


-def test_redis_semantic_cache_allows_unscoped_hit_only_in_legacy_mode():
+def test_redis_semantic_cache_rejects_pre_isolation_unscoped_hit():
+    """Pre-isolation entries with no cache-key field cannot be safely
+    reassigned to a caller's scope and are treated as misses."""
    from litellm.caching.redis_semantic_cache import RedisSemanticCache

    redis_semantic_cache = RedisSemanticCache.__new__(RedisSemanticCache)
-    redis_semantic_cache._using_legacy_unscoped_index = False

    cache_hit = {
        "prompt": "What is the capital of France?",
@ -388,12 +349,6 @@ def test_redis_semantic_cache_allows_unscoped_hit_only_in_legacy_mode():
        key="test_key",
    )

-    redis_semantic_cache._using_legacy_unscoped_index = True
-    assert redis_semantic_cache._cache_hit_matches_key(
-        cache_hit=cache_hit,
-        key="test_key",
-    )
-

 def test_redis_semantic_cache_builds_filter_expression(monkeypatch):
    class FakeTag: