diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index aa63369ef2..96f0870bc1 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -1,15 +1,5 @@ model_list: - - model_name: bedrock/* + - model_name: gemini/* litellm_params: - model: bedrock/* + model: gemini/* - -litellm_settings: - callbacks: ["s3_v2"] - s3_callback_params: - s3_bucket_name: litellm-logs # AWS Bucket Name for S3 - s3_region_name: us-west-2 - -general_settings: - cold_storage_custom_logger: s3_v2 - store_prompts_in_cold_storage: true \ No newline at end of file diff --git a/litellm/responses/litellm_completion_transformation/handler.py b/litellm/responses/litellm_completion_transformation/handler.py index 7dc182747d..7b3243309b 100644 --- a/litellm/responses/litellm_completion_transformation/handler.py +++ b/litellm/responses/litellm_completion_transformation/handler.py @@ -84,6 +84,8 @@ class LiteLLMCompletionTransformationHandler: litellm_custom_stream_wrapper=litellm_completion_response, request_input=input, responses_api_request=responses_api_request, + custom_llm_provider=custom_llm_provider, + litellm_metadata=kwargs.get("litellm_metadata", {}), ) async def async_response_api_handler( @@ -129,4 +131,6 @@ class LiteLLMCompletionTransformationHandler: litellm_custom_stream_wrapper=litellm_completion_response, request_input=request_input, responses_api_request=responses_api_request, + custom_llm_provider=litellm_completion_request.get("custom_llm_provider"), + litellm_metadata=kwargs.get("litellm_metadata", {}), ) diff --git a/litellm/responses/litellm_completion_transformation/streaming_iterator.py b/litellm/responses/litellm_completion_transformation/streaming_iterator.py index 5f9fa8525c..64ea93028f 100644 --- a/litellm/responses/litellm_completion_transformation/streaming_iterator.py +++ b/litellm/responses/litellm_completion_transformation/streaming_iterator.py @@ -6,6 +6,7 @@ from litellm.responses.litellm_completion_transformation.transformation import ( LiteLLMCompletionResponsesConfig, ) from litellm.responses.streaming_iterator import ResponsesAPIStreamingIterator +from litellm.responses.utils import ResponsesAPIRequestUtils from litellm.types.llms.openai import ( OutputTextDeltaEvent, ReasoningSummaryTextDeltaEvent, @@ -34,6 +35,8 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator): litellm_custom_stream_wrapper: litellm.CustomStreamWrapper, request_input: Union[str, ResponseInputParam], responses_api_request: ResponsesAPIOptionalRequestParams, + custom_llm_provider: Optional[str] = None, + litellm_metadata: Optional[dict] = None, ): self.litellm_custom_stream_wrapper: litellm.CustomStreamWrapper = ( litellm_custom_stream_wrapper @@ -42,6 +45,8 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator): self.responses_api_request: ResponsesAPIOptionalRequestParams = ( responses_api_request ) + self.custom_llm_provider: Optional[str] = custom_llm_provider + self.litellm_metadata: Optional[dict] = litellm_metadata or {} self.collected_chat_completion_chunks: List[ModelResponseStream] = [] self.finished: bool = False @@ -164,14 +169,23 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator): Union[ModelResponse, TextCompletionResponse] ] = stream_chunk_builder(chunks=self.collected_chat_completion_chunks) if litellm_model_response and isinstance(litellm_model_response, ModelResponse): + # Transform the response + responses_api_response = LiteLLMCompletionResponsesConfig.transform_chat_completion_response_to_responses_api_response( + request_input=self.request_input, + chat_completion_response=litellm_model_response, + responses_api_request=self.responses_api_request, + ) + + # Encode the response ID to match non-streaming behavior + encoded_response = ResponsesAPIRequestUtils._update_responses_api_response_id_with_model_id( + responses_api_response=responses_api_response, + custom_llm_provider=self.custom_llm_provider, + litellm_metadata=self.litellm_metadata, + ) return ResponseCompletedEvent( type=ResponsesAPIStreamEvents.RESPONSE_COMPLETED, - response=LiteLLMCompletionResponsesConfig.transform_chat_completion_response_to_responses_api_response( - request_input=self.request_input, - chat_completion_response=litellm_model_response, - responses_api_request=self.responses_api_request, - ), + response=encoded_response, ) else: return None diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml index a47a603098..16efe9ffd0 100644 --- a/proxy_server_config.yaml +++ b/proxy_server_config.yaml @@ -137,6 +137,14 @@ model_list: model: openai/my-fake-model api_key: my-fake-key api_base: https://exampleopenaiendpoint-production.up.railway.appxxxx/ + - model_name: gemini-1.5-flash + litellm_params: + model: gemini/gemini-1.5-flash + api_key: os.environ/GOOGLE_API_KEY + - model_name: gpt-4o + litellm_params: + model: gpt-4o + api_key: os.environ/OPENAI_API_KEY litellm_settings: diff --git a/tests/llm_responses_api_testing/test_base_responses_api_streaming_iterator.py b/tests/llm_responses_api_testing/test_base_responses_api_streaming_iterator.py new file mode 100644 index 0000000000..13ee4d7e76 --- /dev/null +++ b/tests/llm_responses_api_testing/test_base_responses_api_streaming_iterator.py @@ -0,0 +1,239 @@ +""" +Unit tests for BaseResponsesAPIStreamingIterator + +Tests core functionality including: +1. Processing chunks and handling ResponseCompletedEvent +2. Ensuring _update_responses_api_response_id_with_model_id is called for final chunk +3. Verifying ID update is NOT called for non-final chunks (delta events) +4. Edge case handling for invalid JSON, empty chunks, and [DONE] markers + +These tests ensure the streaming iterator correctly processes response chunks +and applies model ID updates only to completed responses, as required for proper +response tracking and logging. +""" + +import json +import os +import sys +from datetime import datetime +from typing import Any, Dict, Optional +from unittest.mock import Mock, patch + +import pytest + +sys.path.insert(0, os.path.abspath("../..")) + +from litellm.constants import STREAM_SSE_DONE_STRING +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig +from litellm.responses.streaming_iterator import BaseResponsesAPIStreamingIterator +from litellm.responses.utils import ResponsesAPIRequestUtils +from litellm.types.llms.openai import ( + ResponseCompletedEvent, + ResponsesAPIResponse, + ResponsesAPIStreamEvents, + OutputTextDeltaEvent +) + + +class TestBaseResponsesAPIStreamingIterator: + """Test cases for BaseResponsesAPIStreamingIterator""" + + def test_process_chunk_with_response_completed_event(self): + """ + Test that _process_chunk correctly processes a ResponseCompletedEvent + and calls _update_responses_api_response_id_with_model_id for the final chunk. + """ + # Mock dependencies + mock_response = Mock() + mock_logging_obj = Mock(spec=LiteLLMLoggingObj) + mock_config = Mock(spec=BaseResponsesAPIConfig) + + # Create a mock ResponsesAPIResponse for the completed event + mock_responses_api_response = Mock(spec=ResponsesAPIResponse) + mock_responses_api_response.id = "original_response_id" + + # Create a mock ResponseCompletedEvent + mock_completed_event = Mock(spec=ResponseCompletedEvent) + mock_completed_event.type = ResponsesAPIStreamEvents.RESPONSE_COMPLETED + mock_completed_event.response = mock_responses_api_response + + # Set up the mock transform method to return our completed event + mock_config.transform_streaming_response.return_value = mock_completed_event + + # Mock the _update_responses_api_response_id_with_model_id method + updated_response = Mock(spec=ResponsesAPIResponse) + updated_response.id = "updated_response_id" + + # Create the iterator instance + iterator = BaseResponsesAPIStreamingIterator( + response=mock_response, + model="gpt-4", + responses_api_provider_config=mock_config, + logging_obj=mock_logging_obj, + litellm_metadata={"model_info": {"id": "model_123"}}, + custom_llm_provider="openai" + ) + + # Prepare test chunk data + test_chunk_data = { + "type": "response.completed", + "response": { + "id": "original_response_id", + "output": [{"type": "message", "content": [{"text": "Hello World"}]}] + } + } + + with patch.object( + ResponsesAPIRequestUtils, + '_update_responses_api_response_id_with_model_id', + return_value=updated_response + ) as mock_update_id: + # Process the chunk + result = iterator._process_chunk(json.dumps(test_chunk_data)) + + # Assertions + assert result is not None + assert result.type == ResponsesAPIStreamEvents.RESPONSE_COMPLETED + + # Verify that _update_responses_api_response_id_with_model_id was called + mock_update_id.assert_called_once_with( + responses_api_response=mock_responses_api_response, + litellm_metadata={"model_info": {"id": "model_123"}}, + custom_llm_provider="openai" + ) + + # Verify the completed response was stored + assert iterator.completed_response == result + + # Verify the response was updated on the event + assert result.response == updated_response + + def test_process_chunk_with_delta_event_no_id_update(self): + """ + Test that _process_chunk correctly processes a delta event + and does NOT call _update_responses_api_response_id_with_model_id. + """ + # Mock dependencies + mock_response = Mock() + mock_logging_obj = Mock(spec=LiteLLMLoggingObj) + mock_config = Mock(spec=BaseResponsesAPIConfig) + + # Create a mock OutputTextDeltaEvent (not a completed event) + mock_delta_event = Mock(spec=OutputTextDeltaEvent) + mock_delta_event.type = ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA + mock_delta_event.delta = "Hello" + # Delta events don't have a response attribute + delattr(mock_delta_event, 'response') if hasattr(mock_delta_event, 'response') else None + + # Set up the mock transform method to return our delta event + mock_config.transform_streaming_response.return_value = mock_delta_event + + # Create the iterator instance + iterator = BaseResponsesAPIStreamingIterator( + response=mock_response, + model="gpt-4", + responses_api_provider_config=mock_config, + logging_obj=mock_logging_obj, + litellm_metadata={"model_info": {"id": "model_123"}}, + custom_llm_provider="openai" + ) + + # Prepare test chunk data for a delta event + test_chunk_data = { + "type": "response.output_text.delta", + "delta": "Hello", + "item_id": "item_123", + "output_index": 0, + "content_index": 0 + } + + with patch.object( + ResponsesAPIRequestUtils, + '_update_responses_api_response_id_with_model_id' + ) as mock_update_id: + # Process the chunk + result = iterator._process_chunk(json.dumps(test_chunk_data)) + + # Assertions + assert result is not None + assert result.type == ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA + + # Verify that _update_responses_api_response_id_with_model_id was NOT called + mock_update_id.assert_not_called() + + # Verify no completed response was stored (since this is not a completed event) + assert iterator.completed_response is None + + def test_process_chunk_handles_invalid_json(self): + """ + Test that _process_chunk gracefully handles invalid JSON. + """ + # Mock dependencies + mock_response = Mock() + mock_logging_obj = Mock(spec=LiteLLMLoggingObj) + mock_config = Mock(spec=BaseResponsesAPIConfig) + + # Create the iterator instance + iterator = BaseResponsesAPIStreamingIterator( + response=mock_response, + model="gpt-4", + responses_api_provider_config=mock_config, + logging_obj=mock_logging_obj + ) + + # Test with invalid JSON + result = iterator._process_chunk("invalid json {") + + # Should return None for invalid JSON + assert result is None + assert iterator.completed_response is None + + def test_process_chunk_handles_done_marker(self): + """ + Test that _process_chunk correctly handles the [DONE] marker. + """ + # Mock dependencies + mock_response = Mock() + mock_logging_obj = Mock(spec=LiteLLMLoggingObj) + mock_config = Mock(spec=BaseResponsesAPIConfig) + + # Create the iterator instance + iterator = BaseResponsesAPIStreamingIterator( + response=mock_response, + model="gpt-4", + responses_api_provider_config=mock_config, + logging_obj=mock_logging_obj + ) + + # Test with [DONE] marker + result = iterator._process_chunk(STREAM_SSE_DONE_STRING) + + # Should return None and set finished flag + assert result is None + assert iterator.finished is True + + def test_process_chunk_handles_empty_chunk(self): + """ + Test that _process_chunk correctly handles empty or None chunks. + """ + # Mock dependencies + mock_response = Mock() + mock_logging_obj = Mock(spec=LiteLLMLoggingObj) + mock_config = Mock(spec=BaseResponsesAPIConfig) + + # Create the iterator instance + iterator = BaseResponsesAPIStreamingIterator( + response=mock_response, + model="gpt-4", + responses_api_provider_config=mock_config, + logging_obj=mock_logging_obj + ) + + # Test with empty chunk + result = iterator._process_chunk("") + assert result is None + + # Test with None chunk + result = iterator._process_chunk(None) + assert result is None \ No newline at end of file diff --git a/tests/test_litellm/responses/litellm_completion_transformation/test_reasoning_content_transformation.py b/tests/test_litellm/responses/litellm_completion_transformation/test_reasoning_content_transformation.py index 20bd56ce41..5323589818 100644 --- a/tests/test_litellm/responses/litellm_completion_transformation/test_reasoning_content_transformation.py +++ b/tests/test_litellm/responses/litellm_completion_transformation/test_reasoning_content_transformation.py @@ -253,3 +253,34 @@ class TestReasoningContentFinalResponse: ] assert len(reasoning_items) == 1, "Should have exactly one reasoning item" assert reasoning_items[0].content[0].text == "Reasoning for first answer" + + +def test_streaming_chunk_id_raw(): + """Test that streaming chunk IDs are raw (not encoded) to match OpenAI format""" + chunk = ModelResponseStream( + id="chunk-123", + created=1234567890, + model="test-model", + object="chat.completion.chunk", + choices=[ + StreamingChoices( + finish_reason=None, + index=0, + delta=Delta(content="Hello", role="assistant"), + ) + ], + ) + + iterator = LiteLLMCompletionStreamingIterator( + litellm_custom_stream_wrapper=AsyncMock(), + request_input="Test input", + responses_api_request={}, + custom_llm_provider="openai", + litellm_metadata={"model_info": {"id": "gpt-4"}}, + ) + + result = iterator._transform_chat_completion_chunk_to_response_api_chunk(chunk) + + # Streaming chunk IDs should be raw (like OpenAI's msg_xxx format) + assert result.item_id == "chunk-123" # Should be raw, not encoded + assert not result.item_id.startswith("resp_") # Should NOT have resp_ prefix