From 76f56c3283bbeb3ac9448a71331c4e407ca0beed Mon Sep 17 00:00:00 2001
From: yuneng-jiang <yuneng@berri.ai>
Date: Thu, 28 May 2026 17:12:02 -0700
Subject: [PATCH] fix(tests/vcr): mint Google OAuth tokens live to prevent
 stale-token replay (#29229)

The Redis-backed VCR layer was recording and replaying the Google
OAuth2/STS token-mint call. The replayed ya29.* access token is
long-expired, but its recorded expires_in keeps credentials.expired
False, so litellm never refreshes it and sends the stale token to a live
Vertex/Gemini endpoint, which returns 401 ACCESS_TOKEN_EXPIRED. This
broke live partner-model tests whose completion call is not itself
cassette-backed (e.g. test_vertex_ai_llama_tool_calling).

Force credential-exchange hosts to pass through live (never recorded,
never replayed) by returning None from before_record_request, mirroring
the existing telemetry passthrough, so a fresh token is minted each run.

Regression from #28826, which added OAuth-token matcher tolerance plus
TTL-refresh-on-read, so a stale token episode still matched and its
cache entry never expired.
---
 tests/_vcr_conftest_common.py             | 19 ++++++++
 tests/llm_translation/test_vcr_filters.py | 56 ++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/tests/_vcr_conftest_common.py b/tests/_vcr_conftest_common.py
index 6d3d34d4ac..d08b87bd58 100644
--- a/tests/_vcr_conftest_common.py
+++ b/tests/_vcr_conftest_common.py
@@ -644,6 +644,23 @@ def _should_drop_telemetry_record(request) -> bool:
     return not _current_test_records_telemetry()
 
 
+def _should_passthrough_credential_exchange(request) -> bool:
+    """Force the Google OAuth2/STS token mint to run live, never from cassette.
+
+    The mint returns a short-lived ``ya29.*`` access token. Recording it lets a
+    *stale* token replay on a later run; litellm caches it (the recorded
+    ``expires_in`` keeps ``credentials.expired`` False, so it is never
+    refreshed) and sends it to a live Vertex/Gemini endpoint, which rejects it
+    with ``ACCESS_TOKEN_EXPIRED``. The token body carries nothing a test asserts
+    on, so always mint it live: returning ``None`` from ``before_record_request``
+    makes vcrpy neither store nor replay the call. Inert during
+    ``Cassette._load`` for the same reason as ``_should_drop_telemetry_record``.
+    """
+    if _vcr_load_in_progress():
+        return False
+    return _is_credential_exchange_request(request)
+
+
 # Google APIs (Vertex AI, Gemini, OAuth2/STS). Auth is a ``ya29.*`` OAuth2
 # access token minted fresh on every run, so the per-request key fingerprint
 # rotates and never matches a recording. The logical credential — the GCP
@@ -931,6 +948,8 @@ def _before_record_request(request):
     # store the interaction; the request passes through live (fire-and-forget).
     if _should_drop_telemetry_record(request):
         return None
+    if _should_passthrough_credential_exchange(request):
+        return None
     headers = getattr(request, "headers", None)
     if headers is None:
         return request
diff --git a/tests/llm_translation/test_vcr_filters.py b/tests/llm_translation/test_vcr_filters.py
index 0389168278..2b5a6b32a7 100644
--- a/tests/llm_translation/test_vcr_filters.py
+++ b/tests/llm_translation/test_vcr_filters.py
@@ -21,11 +21,13 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..",
 from tests._vcr_conftest_common import (  # noqa: E402
     VCR_FIXED_MULTIPART_BOUNDARY,
     VCR_IMAGE_B64_PLACEHOLDER,
+    _before_record_request,
     _normalize_multipart_boundary,
+    _should_passthrough_credential_exchange,
     _strip_image_b64_payloads,
+    _vcr_load_guard,
 )
 
-
 # ---------------------------------------------------------------------------
 # Image b64 stripper
 # ---------------------------------------------------------------------------
@@ -218,3 +220,55 @@ def test_normalize_multipart_handles_quoted_boundary():
     _normalize_multipart_boundary(req)
     assert b"quoted-boundary" not in req.body
     assert VCR_FIXED_MULTIPART_BOUNDARY.encode("utf-8") in req.body
+
+
+# ---------------------------------------------------------------------------
+# Credential-exchange passthrough (Google OAuth2/STS token mint must run live)
+# ---------------------------------------------------------------------------
+
+
+def _oauth_token_request() -> Request:
+    return Request(
+        method="POST",
+        uri="https://oauth2.googleapis.com/token",
+        body=b"assertion=eyJhbGciOiJSUzI1NiJ9.signed-jwt&grant_type=urn",
+        headers={"content-type": "application/x-www-form-urlencoded"},
+    )
+
+
+def test_before_record_request_drops_oauth_token_mint():
+    # The token mint must never be stored or replayed, else a stale ya29.* token
+    # gets sent to a live Vertex/Gemini endpoint -> ACCESS_TOKEN_EXPIRED.
+    assert _before_record_request(_oauth_token_request()) is None
+
+
+def test_before_record_request_keeps_normal_request():
+    req = Request(
+        method="POST",
+        uri="https://api.openai.com/v1/chat/completions",
+        body=b'{"model":"gpt-4o"}',
+        headers={"content-type": "application/json"},
+    )
+    assert _before_record_request(req) is req
+
+
+def test_credential_exchange_passthrough_inert_during_cassette_load():
+    # During Cassette._load, stored episodes are replayed through this hook;
+    # dropping there would mutate the cassette on read. The guard makes it inert.
+    _vcr_load_guard.active = True
+    try:
+        assert _should_passthrough_credential_exchange(_oauth_token_request()) is False
+        assert _before_record_request(_oauth_token_request()) is not None
+    finally:
+        _vcr_load_guard.active = False
+
+
+def test_credential_exchange_passthrough_covers_sts_and_metadata_hosts():
+    for host in ("sts.googleapis.com", "metadata.google.internal", "169.254.169.254"):
+        req = Request(
+            method="POST",
+            uri=f"https://{host}/token",
+            body=b"grant_type=urn",
+            headers={},
+        )
+        assert _should_passthrough_credential_exchange(req) is True