Merge pull request #27409 from BerriAI/litellm_/inspiring-allen-ec64a4

[Fix] Tests: Reduce VCR cassette bloat and fix multipart caching
2026-05-07 12:39:58 -07:00 · 2026-05-07 12:39:58 -07:00 · b9b315157b
commit b9b315157b
parent db8198faba 2f9519d286
5 changed files with 563 additions and 112 deletions
--- a/tests/_vcr_conftest_common.py
+++ b/tests/_vcr_conftest_common.py
@@ -7,7 +7,9 @@ from __future__ import annotations

 import atexit
 import hashlib
+import json
 import os
+import re
 import sys
 from typing import Iterable

@@ -74,6 +76,17 @@ FILTERED_RESPONSE_HEADERS = (
    "date",
 )

+# Tiny placeholder used to replace base64 image payloads in cassettes.
+# Decodes to b"test" — short, valid base64 so test code that decodes
+# the field still succeeds.
+VCR_IMAGE_B64_PLACEHOLDER = "dGVzdA=="
+
+# Fixed boundary substituted into multipart request bodies so the
+# ``safe_body`` matcher sees the same bytes across record and replay.
+# httpx generates a fresh random boundary per request via os.urandom,
+# which otherwise turns every multipart cassette into a permanent miss.
+VCR_FIXED_MULTIPART_BOUNDARY = "vcr-static-boundary"
+

 def _scrub_response(response):
    if not isinstance(response, dict):
@@ -86,8 +99,88 @@ def _scrub_response(response):
    return response


+def _replace_b64_json_in_place(obj) -> bool:
+    """Recursively replace ``b64_json`` string values in a JSON tree.
+
+    Returns ``True`` if any value was rewritten. The check on the
+    existing value's length keeps the function idempotent — once a
+    value has been swapped to the placeholder, subsequent invocations
+    are no-ops.
+    """
+    changed = False
+    if isinstance(obj, dict):
+        for key, value in obj.items():
+            if (
+                key == "b64_json"
+                and isinstance(value, str)
+                and len(value) > len(VCR_IMAGE_B64_PLACEHOLDER)
+            ):
+                obj[key] = VCR_IMAGE_B64_PLACEHOLDER
+                changed = True
+            elif _replace_b64_json_in_place(value):
+                changed = True
+    elif isinstance(obj, list):
+        for item in obj:
+            if _replace_b64_json_in_place(item):
+                changed = True
+    return changed
+
+
+def _strip_image_b64_payloads(response):
+    """Replace ``b64_json`` payloads in image-gen responses before save.
+
+    Image-edit and image-generation responses carry the full base64
+    PNG/JPEG (1-10+ MB) in ``data[*].b64_json``. The image_gen tests
+    only assert response shape — the field decodes, schema validates —
+    they never inspect pixel content. Swapping to a short placeholder
+    (decodes to 4 bytes) keeps those checks valid and shrinks cassettes by ~99%.
+    """
+    if not isinstance(response, dict):
+        return response
+    body = response.get("body")
+    if not isinstance(body, dict):
+        return response
+    raw = body.get("string")
+    if raw is None:
+        return response
+
+    if isinstance(raw, (bytes, bytearray)):
+        try:
+            text = bytes(raw).decode("utf-8")
+        except UnicodeDecodeError:
+            return response
+        was_bytes = True
+    elif isinstance(raw, str):
+        text = raw
+        was_bytes = False
+    else:
+        return response
+
+    try:
+        payload = json.loads(text)
+    except (ValueError, TypeError):
+        return response
+
+    if not _replace_b64_json_in_place(payload):
+        return response
+
+    new_text = json.dumps(payload, separators=(",", ":"))
+    body["string"] = new_text.encode("utf-8") if was_bytes else new_text
+
+    headers = response.get("headers")
+    if isinstance(headers, dict):
+        new_len_value = str(len(new_text.encode("utf-8")))
+        for key in list(headers):
+            if str(key).lower() == "content-length":
+                value = headers[key]
+                headers[key] = (
+                    [new_len_value] if isinstance(value, list) else new_len_value
+                )
+    return response
+
+
 def _before_record_response(response):
-    return filter_non_2xx_response(_scrub_response(response))
+    return filter_non_2xx_response(_scrub_response(_strip_image_b64_payloads(response)))


 def _safe_body_matcher(r1, r2) -> None:
@@ -172,8 +265,84 @@ def _strip_headers(headers, names: Iterable[str]) -> None:
                pass


+def _normalize_multipart_boundary(request) -> None:
+    """Rewrite random multipart boundaries to a fixed string in-place.
+
+    httpx generates a fresh ``boundary=<random hex>`` for every
+    multipart request via ``os.urandom``. Without normalization, the
+    request body bytes differ across runs even when everything else is
+    identical, the ``safe_body`` matcher misses, and the persister
+    keeps appending new episodes until ``MAX_EPISODES_PER_CASSETTE``
+    refuses the save — leaving audio-transcription tests effectively
+    unmocked. Replacing the boundary in both the Content-Type header
+    and the body bytes makes the request deterministic.
+
+    Idempotent — vcrpy invokes this hook multiple times per request,
+    so the second invocation sees ``boundary=vcr-static-boundary``
+    already and short-circuits.
+    """
+    headers = getattr(request, "headers", None)
+    if headers is None:
+        return
+
+    content_type_key = None
+    content_type_value = None
+    try:
+        for key in list(headers.keys()):
+            if str(key).lower() == "content-type":
+                content_type_key = key
+                value = headers[key]
+                content_type_value = value if isinstance(value, str) else str(value)
+                break
+    except AttributeError:
+        return
+
+    if not content_type_value or "multipart/" not in content_type_value.lower():
+        return
+
+    fixed_param = f"boundary={VCR_FIXED_MULTIPART_BOUNDARY}"
+    if fixed_param in content_type_value:
+        return
+
+    match = re.search(r"boundary=([^\s;]+)", content_type_value)
+    if not match:
+        return
+    current_boundary = match.group(1).strip('"')
+    if current_boundary == VCR_FIXED_MULTIPART_BOUNDARY:
+        return
+
+    try:
+        headers[content_type_key] = content_type_value.replace(
+            match.group(0), fixed_param
+        )
+    except (TypeError, AttributeError):
+        return
+
+    body = getattr(request, "body", None)
+    if body is None:
+        return
+
+    if isinstance(body, (bytes, bytearray)):
+        try:
+            new_body = bytes(body).replace(
+                current_boundary.encode("utf-8"),
+                VCR_FIXED_MULTIPART_BOUNDARY.encode("utf-8"),
+            )
+        except (TypeError, ValueError):
+            return
+    elif isinstance(body, str):
+        new_body = body.replace(current_boundary, VCR_FIXED_MULTIPART_BOUNDARY)
+    else:
+        return
+
+    try:
+        request.body = new_body
+    except (AttributeError, TypeError):
+        pass
+
+
 def _before_record_request(request):
-    """Fingerprint API keys, then scrub them.
+    """Fingerprint API keys, scrub them, and normalize multipart boundaries.

    Order matters in two ways:

@@ -187,7 +356,8 @@ def _before_record_request(request):
       auth headers we already stripped, so re-hashing would yield
       ``"no-key"`` and the stored vs. incoming fingerprints would
       diverge. Skip the recompute when the header is already set so
-       this hook is idempotent.
+       this hook is idempotent. The boundary normalizer is idempotent
+       too: it returns early once the fixed boundary is already set.
    """
    headers = getattr(request, "headers", None)
    if headers is None:
@@ -199,6 +369,7 @@ def _before_record_request(request):
        except (TypeError, AttributeError):
            pass
    _strip_headers(headers, FILTERED_REQUEST_HEADERS)
+    _normalize_multipart_boundary(request)
    return request


--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@@ -853,7 +853,11 @@ class BaseLLMChatTest(ABC):
    @pytest.mark.parametrize(
        "image_url",
        [
-            "http://img1.etsystatic.com/260/0/7813604/il_fullxfull.4226713999_q86e.jpg",
+            # In-repo logo served via jsdelivr (sha-pinned, immutable).
+            # Bedrock fetches the URL and base64-embeds it in the
+            # Converse request body; using a multi-MB hosted product
+            # photo here previously bloated cassettes to ~60 MB each.
+            "https://cdn.jsdelivr.net/gh/BerriAI/litellm@d769e81c90d453240c61fc572cdb27fae06a89d0/ui/litellm-dashboard/public/assets/logos/litellm_logo.jpg",
            "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png",
        ],
    )
--- a/tests/llm_translation/test_evals_api.py
+++ b/tests/llm_translation/test_evals_api.py
@@ -2,6 +2,7 @@
 Tests for Evals API operations across providers
 """

+import hashlib
 import os
 import sys
 from abc import ABC, abstractmethod
@@ -20,6 +21,46 @@ from litellm.types.llms.openai_evals import (
 )


+def _stable_eval_name(test_node_name: str, suffix: str = "") -> str:
+    """Deterministic eval name keyed off the test's node name.
+
+    The previous ``f"Test Eval {int(time.time())}"`` pattern embedded a
+    fresh value into the request body every run, defeating VCR's
+    ``safe_body`` matcher and forcing a real OpenAI ``create`` call on
+    every CI run. With a stable per-test name the cassette matches on
+    replay, and provider-side resources stay bounded because each test
+    deletes the eval it owns on teardown.
+    """
+    nonce = hashlib.sha1(test_node_name.encode()).hexdigest()[:12]
+    return f"vcr-managed-{nonce}{suffix}"
+
+
+_TESTING_CRITERIA = [
+    {
+        "type": "label_model",
+        "model": "gpt-4o",
+        "input": [
+            {
+                "role": "developer",
+                "content": "Classify the sentiment as 'positive' or 'negative'",
+            },
+            {"role": "user", "content": "Statement: {{item.input}}"},
+        ],
+        "passing_labels": ["positive"],
+        "labels": ["positive", "negative"],
+        "name": "Sentiment grader",
+    }
+]
+
+
+_PROVIDER_FLAKINESS = (
+    litellm.InternalServerError,
+    litellm.APIConnectionError,
+    litellm.Timeout,
+    litellm.ServiceUnavailableError,
+)
+
+
 class BaseEvalsAPITest(ABC):
    """
    Base test class for Evals API operations.
@@ -41,13 +82,64 @@ class BaseEvalsAPITest(ABC):
        """Return the API base URL for the provider"""
        pass

+    @pytest.fixture
+    def managed_eval(self, request):
+        """Create a stable-named eval for this test; delete on teardown.
+
+        Function-scoped so each cassette captures the full
+        create→test→delete cycle. A class-scoped fixture would push
+        the create into whichever test ran first and the delete into
+        whichever ran last, which is fragile under reordering.
+
+        Replaces the prior ``list_evals().data[0].id`` pattern, which
+        made the URL of ``get_eval`` / ``update_eval`` vary across
+        runs (the "first" eval depends on what other runs left
+        behind).
+        """
+        custom_llm_provider = self.get_custom_llm_provider()
+        api_key = self.get_api_key()
+        api_base = self.get_api_base()
+
+        if not api_key:
+            pytest.skip(f"No API key provided for {custom_llm_provider}")
+
+        try:
+            created = litellm.create_eval(
+                name=_stable_eval_name(request.node.name),
+                data_source_config={
+                    "type": "stored_completions",
+                    "metadata": {"usecase": "chatbot", "vcr": "managed"},
+                },
+                testing_criteria=_TESTING_CRITERIA,
+                custom_llm_provider=custom_llm_provider,
+                api_key=api_key,
+                api_base=api_base,
+            )
+        except _PROVIDER_FLAKINESS:
+            pytest.skip("Provider service unavailable")
+        except litellm.RateLimitError:
+            pytest.skip("Rate limit exceeded")
+
+        yield created
+
+        # Best-effort cleanup. OpenAI eval names are not unique-keyed
+        # (only IDs are), so a failed delete doesn't block the next
+        # run's create.
+        try:
+            litellm.delete_eval(
+                eval_id=created.id,
+                custom_llm_provider=custom_llm_provider,
+                api_key=api_key,
+                api_base=api_base,
+            )
+        except Exception:
+            pass
+
    @pytest.mark.flaky(retries=3, delay=2)
-    def test_create_eval(self):
+    def test_create_eval(self, request):
        """
        Test creating an evaluation.
        """
-        import time
-
        custom_llm_provider = self.get_custom_llm_provider()
        api_key = self.get_api_key()
        api_base = self.get_api_base()
@@ -56,53 +148,45 @@ class BaseEvalsAPITest(ABC):
            pytest.skip(f"No API key provided for {custom_llm_provider}")

        litellm.set_verbose = True
+        unique_name = _stable_eval_name(request.node.name)

-        # Create eval with stored_completions data source
-        unique_name = f"Test Eval {int(time.time())}"
-
+        created_id = None
        try:
-            response = litellm.create_eval(
-                name=unique_name,
-                data_source_config={
-                    "type": "stored_completions",
-                    "metadata": {"usecase": "chatbot"},
-                },
-                testing_criteria=[
-                    {
-                        "type": "label_model",
-                        "model": "gpt-4o",
-                        "input": [
-                            {
-                                "role": "developer",
-                                "content": "Classify the sentiment as 'positive' or 'negative'",
-                            },
-                            {"role": "user", "content": "Statement: {{item.input}}"},
-                        ],
-                        "passing_labels": ["positive"],
-                        "labels": ["positive", "negative"],
-                        "name": "Sentiment grader",
-                    }
-                ],
-                custom_llm_provider=custom_llm_provider,
-                api_key=api_key,
-                api_base=api_base,
-            )
-        except (
-            litellm.InternalServerError,
-            litellm.APIConnectionError,
-            litellm.Timeout,
-            litellm.ServiceUnavailableError,
-        ):
-            pytest.skip("Provider service unavailable")
-        except litellm.RateLimitError:
-            pytest.skip("Rate limit exceeded")
+            try:
+                response = litellm.create_eval(
+                    name=unique_name,
+                    data_source_config={
+                        "type": "stored_completions",
+                        "metadata": {"usecase": "chatbot"},
+                    },
+                    testing_criteria=_TESTING_CRITERIA,
+                    custom_llm_provider=custom_llm_provider,
+                    api_key=api_key,
+                    api_base=api_base,
+                )
+            except _PROVIDER_FLAKINESS:
+                pytest.skip("Provider service unavailable")
+            except litellm.RateLimitError:
+                pytest.skip("Rate limit exceeded")

-        assert response is not None
-        assert isinstance(response, Eval)
-        assert response.id is not None
-        assert response.name == unique_name
-        print(f"Created eval: {response}")
-        print(f"Eval ID: {response.id}")
+            assert response is not None
+            assert isinstance(response, Eval)
+            assert response.id is not None
+            assert response.name == unique_name
+            created_id = response.id
+            print(f"Created eval: {response}")
+            print(f"Eval ID: {response.id}")
+        finally:
+            if created_id is not None:
+                try:
+                    litellm.delete_eval(
+                        eval_id=created_id,
+                        custom_llm_provider=custom_llm_provider,
+                        api_key=api_key,
+                        api_base=api_base,
+                    )
+                except Exception:
+                    pass

    def test_list_evals(self):
        """
@@ -130,7 +214,7 @@ class BaseEvalsAPITest(ABC):
        assert hasattr(response, "has_more")
        print(f"Listed evals: {len(response.data)} evaluations")

-    def test_get_eval(self):
+    def test_get_eval(self, managed_eval):
        """
        Test getting a specific evaluation by ID.
        """
@@ -138,89 +222,54 @@ class BaseEvalsAPITest(ABC):
        api_key = self.get_api_key()
        api_base = self.get_api_base()

-        if not api_key:
-            pytest.skip(f"No API key provided for {custom_llm_provider}")
-
        litellm.set_verbose = True

-        # First list existing evals to get an ID
-        list_response = litellm.list_evals(
-            limit=1,
+        response = litellm.get_eval(
+            eval_id=managed_eval.id,
            custom_llm_provider=custom_llm_provider,
            api_key=api_key,
            api_base=api_base,
        )

-        assert isinstance(list_response, ListEvalsResponse)
+        assert response is not None
+        assert isinstance(response, Eval)
+        assert response.id == managed_eval.id
+        print(f"Retrieved eval: {response}")

-        if list_response.data and len(list_response.data) > 0:
-            eval_id = list_response.data[0].id
-            print(f"Testing with eval ID: {eval_id}")
-
-            # Get the eval
-            response = litellm.get_eval(
-                eval_id=eval_id,
-                custom_llm_provider=custom_llm_provider,
-                api_key=api_key,
-                api_base=api_base,
-            )
-
-            assert response is not None
-            assert isinstance(response, Eval)
-            assert response.id == eval_id
-            print(f"Retrieved eval: {response}")
-        else:
-            pytest.skip("No existing evals to test with")
-
-    def test_update_eval(self):
+    @pytest.mark.flaky(retries=3, delay=2)
+    def test_update_eval(self, request, managed_eval):
        """
        Test updating an evaluation.
        """
-        import time
-
        custom_llm_provider = self.get_custom_llm_provider()
        api_key = self.get_api_key()
        api_base = self.get_api_base()

-        if not api_key:
-            pytest.skip(f"No API key provided for {custom_llm_provider}")
-
        litellm.set_verbose = True
+        updated_name = _stable_eval_name(request.node.name, suffix="-updated")

-        # First list existing evals
-        list_response = litellm.list_evals(
-            limit=1,
+        response = litellm.update_eval(
+            eval_id=managed_eval.id,
+            name=updated_name,
            custom_llm_provider=custom_llm_provider,
            api_key=api_key,
            api_base=api_base,
        )

-        assert isinstance(list_response, ListEvalsResponse)
-
-        if list_response.data and len(list_response.data) > 0:
-            eval_id = list_response.data[0].id
-            updated_name = f"Updated Eval {int(time.time())}"
-
-            # Update the eval
-            response = litellm.update_eval(
-                eval_id=eval_id,
-                name=updated_name,
-                custom_llm_provider=custom_llm_provider,
-                api_key=api_key,
-                api_base=api_base,
-            )
-
-            assert response is not None
-            assert isinstance(response, Eval)
-            assert response.id == eval_id
-            assert response.name == updated_name
-            print(f"Updated eval: {response}")
-        else:
-            pytest.skip("No existing evals to test with")
+        assert response is not None
+        assert isinstance(response, Eval)
+        assert response.id == managed_eval.id
+        assert response.name == updated_name
+        print(f"Updated eval: {response}")

    def test_delete_eval(self):
        """
        Test deleting an evaluation.
+
+        Deletes are now issued (best-effort, errors swallowed) by the
+        ``managed_eval`` fixture teardown and ``test_create_eval``'s
+        ``finally`` block, so this stays a no-op skip rather than
+        creating a fresh resource just to delete it.
        """
        custom_llm_provider = self.get_custom_llm_provider()
        api_key = self.get_api_key()
@@ -229,8 +278,7 @@ class BaseEvalsAPITest(ABC):
        if not api_key:
            pytest.skip(f"No API key provided for {custom_llm_provider}")

-        # Skip this test to avoid deleting production evals
-        pytest.skip("Skipping delete test to preserve existing evals")
+        pytest.skip("Delete is exercised via managed_eval fixture teardown.")


 class TestOpenAIEvalsAPI(BaseEvalsAPITest):
--- a/tests/llm_translation/test_vcr_filters.py
+++ b/tests/llm_translation/test_vcr_filters.py
@@ -0,0 +1,220 @@
+"""Unit tests for the VCR record-time filters that keep cassettes small.
+
+Covers:
+- ``_strip_image_b64_payloads`` — replaces base64 image bodies in
+  image-gen responses so cassettes don't carry MB-class PNG payloads.
+- ``_normalize_multipart_boundary`` — rewrites random multipart
+  boundaries to a fixed string so audio-transcription request bodies
+  match across record and replay.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+
+from vcr.request import Request
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+
+from tests._vcr_conftest_common import (  # noqa: E402
+    VCR_FIXED_MULTIPART_BOUNDARY,
+    VCR_IMAGE_B64_PLACEHOLDER,
+    _normalize_multipart_boundary,
+    _strip_image_b64_payloads,
+)
+
+
+# ---------------------------------------------------------------------------
+# Image b64 stripper
+# ---------------------------------------------------------------------------
+
+
+def _image_response(b64_payload: str, body_type: str = "bytes") -> dict:
+    body_text = json.dumps({"data": [{"b64_json": b64_payload}]})
+    body_string = body_text.encode("utf-8") if body_type == "bytes" else body_text
+    return {
+        "status": {"code": 200, "message": "OK"},
+        "headers": {
+            "content-type": ["application/json"],
+            "content-length": [str(len(body_text.encode("utf-8")))],
+        },
+        "body": {"string": body_string},
+    }
+
+
+def test_strip_image_b64_replaces_payload_when_body_is_bytes():
+    response = _image_response("A" * 5000, body_type="bytes")
+    out = _strip_image_b64_payloads(response)
+    payload = json.loads(out["body"]["string"].decode("utf-8"))
+    assert payload["data"][0]["b64_json"] == VCR_IMAGE_B64_PLACEHOLDER
+
+
+def test_strip_image_b64_replaces_payload_when_body_is_str():
+    response = _image_response("A" * 5000, body_type="str")
+    out = _strip_image_b64_payloads(response)
+    payload = json.loads(out["body"]["string"])
+    assert payload["data"][0]["b64_json"] == VCR_IMAGE_B64_PLACEHOLDER
+
+
+def test_strip_image_b64_updates_content_length():
+    response = _image_response("A" * 5000)
+    out = _strip_image_b64_payloads(response)
+    expected_len = len(out["body"]["string"])
+    assert out["headers"]["content-length"] == [str(expected_len)]
+
+
+def test_strip_image_b64_is_idempotent():
+    response = _image_response("A" * 5000)
+    once = _strip_image_b64_payloads(response)
+    twice = _strip_image_b64_payloads(once)
+    assert once["body"]["string"] == twice["body"]["string"]
+
+
+def test_strip_image_b64_handles_nested_data():
+    body_text = json.dumps(
+        {
+            "outer": {
+                "data": [
+                    {"b64_json": "X" * 4000, "label": "first"},
+                    {"b64_json": "Y" * 4000, "label": "second"},
+                ]
+            }
+        }
+    )
+    response = {
+        "status": {"code": 200, "message": "OK"},
+        "headers": {"content-type": ["application/json"]},
+        "body": {"string": body_text.encode("utf-8")},
+    }
+    out = _strip_image_b64_payloads(response)
+    payload = json.loads(out["body"]["string"].decode("utf-8"))
+    assert payload["outer"]["data"][0]["b64_json"] == VCR_IMAGE_B64_PLACEHOLDER
+    assert payload["outer"]["data"][1]["b64_json"] == VCR_IMAGE_B64_PLACEHOLDER
+    assert payload["outer"]["data"][0]["label"] == "first"
+
+
+def test_strip_image_b64_leaves_non_image_response_unchanged():
+    body_text = json.dumps({"choices": [{"message": {"content": "hello"}}]})
+    response = {
+        "status": {"code": 200, "message": "OK"},
+        "headers": {"content-type": ["application/json"]},
+        "body": {"string": body_text.encode("utf-8")},
+    }
+    out = _strip_image_b64_payloads(response)
+    assert json.loads(out["body"]["string"].decode("utf-8")) == json.loads(body_text)
+
+
+def test_strip_image_b64_leaves_invalid_json_unchanged():
+    response = {
+        "status": {"code": 200, "message": "OK"},
+        "headers": {"content-type": ["application/octet-stream"]},
+        "body": {"string": b"\x89PNG\r\n\x1a\n binary stuff not json"},
+    }
+    out = _strip_image_b64_payloads(response)
+    assert out["body"]["string"] == b"\x89PNG\r\n\x1a\n binary stuff not json"
+
+
+def test_strip_image_b64_skips_short_values():
+    """Already-placeholder values aren't re-replaced (idempotency guard)."""
+    body_text = json.dumps({"data": [{"b64_json": VCR_IMAGE_B64_PLACEHOLDER}]})
+    response = {
+        "status": {"code": 200, "message": "OK"},
+        "headers": {"content-type": ["application/json"]},
+        "body": {"string": body_text.encode("utf-8")},
+    }
+    out = _strip_image_b64_payloads(response)
+    payload = json.loads(out["body"]["string"].decode("utf-8"))
+    assert payload["data"][0]["b64_json"] == VCR_IMAGE_B64_PLACEHOLDER
+
+
+# ---------------------------------------------------------------------------
+# Multipart boundary normalizer
+# ---------------------------------------------------------------------------
+
+
+def _multipart_request(boundary: str):
+    body_text = (
+        f"--{boundary}\r\n"
+        'Content-Disposition: form-data; name="file"; filename="audio.wav"\r\n'
+        "Content-Type: audio/wav\r\n"
+        "\r\n"
+        "fake-audio-bytes\r\n"
+        f"--{boundary}--\r\n"
+    )
+    return Request(
+        method="POST",
+        uri="https://api.openai.com/v1/audio/transcriptions",
+        body=body_text.encode("utf-8"),
+        headers={
+            "content-type": f"multipart/form-data; boundary={boundary}",
+        },
+    )
+
+
+def test_normalize_multipart_rewrites_header_and_body():
+    req = _multipart_request("abc123random")
+    _normalize_multipart_boundary(req)
+    assert (
+        req.headers["content-type"]
+        == f"multipart/form-data; boundary={VCR_FIXED_MULTIPART_BOUNDARY}"
+    )
+    assert b"abc123random" not in req.body
+    assert VCR_FIXED_MULTIPART_BOUNDARY.encode("utf-8") in req.body
+
+
+def test_normalize_multipart_is_idempotent():
+    req = _multipart_request("abc123random")
+    _normalize_multipart_boundary(req)
+    body_first = req.body
+    header_first = req.headers["content-type"]
+    _normalize_multipart_boundary(req)
+    assert req.body == body_first
+    assert req.headers["content-type"] == header_first
+
+
+def test_normalize_multipart_two_distinct_boundaries_match_after_normalize():
+    """Whisper-style: two requests with different random boundaries should
+    end up with byte-identical bodies after normalization."""
+    req1 = _multipart_request("boundaryAAA")
+    req2 = _multipart_request("boundaryBBB")
+    _normalize_multipart_boundary(req1)
+    _normalize_multipart_boundary(req2)
+    assert req1.body == req2.body
+    assert req1.headers["content-type"] == req2.headers["content-type"]
+
+
+def test_normalize_multipart_skips_non_multipart_requests():
+    req = Request(
+        method="POST",
+        uri="https://api.openai.com/v1/chat/completions",
+        body=b'{"model":"gpt-4o"}',
+        headers={"content-type": "application/json"},
+    )
+    _normalize_multipart_boundary(req)
+    assert req.headers["content-type"] == "application/json"
+    assert req.body == b'{"model":"gpt-4o"}'
+
+
+def test_normalize_multipart_skips_request_without_content_type():
+    req = Request(
+        method="POST",
+        uri="https://api.openai.com/v1/chat/completions",
+        body=b"unknown body",
+        headers={},
+    )
+    _normalize_multipart_boundary(req)
+    assert req.body == b"unknown body"
+
+
+def test_normalize_multipart_handles_quoted_boundary():
+    req = Request(
+        method="POST",
+        uri="https://api.openai.com/v1/audio/transcriptions",
+        body=b"--quoted-boundary--body content--quoted-boundary--",
+        headers={"content-type": 'multipart/form-data; boundary="quoted-boundary"'},
+    )
+    _normalize_multipart_boundary(req)
+    assert b"quoted-boundary" not in req.body
+    assert VCR_FIXED_MULTIPART_BOUNDARY.encode("utf-8") in req.body
--- a/tests/ocr_tests/base_ocr_unit_tests.py
+++ b/tests/ocr_tests/base_ocr_unit_tests.py
@@ -12,7 +12,15 @@ from abc import ABC, abstractmethod

 # Test resources
 TEST_IMAGE_PATH = "test_image_edit.png"
-TEST_PDF_URL = "https://arxiv.org/pdf/2201.04234"
+# Tiny in-repo PDF served via jsdelivr (sha-pinned, immutable). The arxiv
+# PDF previously used here was several MB — once base64-encoded into the
+# Vertex OCR request it ballooned cassettes past 100 MB per test. Keep
+# the URL stable across runs so cassettes don't churn.
+TEST_PDF_URL = (
+    "https://cdn.jsdelivr.net/gh/BerriAI/litellm"
+    "@d769e81c90d453240c61fc572cdb27fae06a89d0"
+    "/tests/llm_translation/fixtures/dummy.pdf"
+)


 class BaseOCRTest(ABC):