@pytest.fixture
def get_model_response():
    """
    Provide a factory that wraps a response object in a canned HTTP reply.

    The returned callable accepts any object exposing ``model_dump()`` and
    gives back a status-200 ``httpx.Response`` whose body is that dump,
    JSON-encoded as UTF-8 bytes, attached to a ``POST /responses`` request.
    """

    def build_response(response_content):
        # Encode the payload up front so the httpx objects below are built
        # from ready-made bytes.
        payload = json.dumps(response_content.model_dump()).encode("utf-8")
        request = httpx.Request("POST", "/responses")
        return httpx.Response(200, request=request, content=payload)

    return build_response
""" @@ -1106,78 +1128,106 @@ def simple_test_tool(message: str) -> str: return f"Tool executed with: {message}" # Create agent with the tool - agent_with_tool = test_agent.clone(tools=[simple_test_tool]) - - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # Create a mock response that includes tool calls - tool_call = ResponseFunctionToolCall( - id="call_123", - call_id="call_123", - name="simple_test_tool", - type="function_call", - arguments='{"message": "hello"}', - ) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[simple_test_tool], model=model) - # First response with tool call - tool_response = ModelResponse( - output=[tool_call], - usage=Usage( - requests=1, input_tokens=10, output_tokens=5, total_tokens=15 + tool_response = get_model_response( + Response( + id="resp_tool_123", + output=[ + ResponseFunctionToolCall( + id="call_123", + call_id="call_123", + name="simple_test_tool", + type="function_call", + arguments='{"message": "hello"}', + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, ), - response_id="resp_tool_123", - ) - - # Second response with final answer - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="Task completed using the tool", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, input_tokens=15, output_tokens=10, total_tokens=25 + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, ), - response_id="resp_final_123", - ) + 
total_tokens=15, + ), + ) + ) - # Return different responses on successive calls - mock_get_response.side_effect = [tool_response, final_response] + final_response = get_model_response( + Response( + id="resp_final_123", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="Task completed using the tool", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=15, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=10, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=25, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - events = capture_events() + events = capture_events() - await agents.Runner.run( - agent_with_tool, - "Please use the simple test tool", - run_config=test_run_config, - ) + await agents.Runner.run( + agent_with_tool, + "Please use the simple test tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"] - ( - agent_span, - ai_client_span1, - tool_span, - ai_client_span2, - ) = spans + agent_span = next(span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT) + ai_client_span1, ai_client_span2 = ( + span for span in spans if span["op"] == OP.GEN_AI_CHAT + ) + tool_span = next(span for span in spans if span["op"] == OP.GEN_AI_EXECUTE_TOOL) available_tools = [ { @@ -2095,7 +2145,9 @@ def test_openai_agents_message_role_mapping( @pytest.mark.asyncio -async def 
test_tool_execution_error_tracing(sentry_init, capture_events, test_agent): +async def test_tool_execution_error_tracing( + sentry_init, capture_events, test_agent, get_model_response +): """ Test that tool execution errors are properly tracked via error tracing patch. @@ -2113,70 +2165,100 @@ def failing_tool(message: str) -> str: raise ValueError("Tool execution failed") # Create agent with the failing tool - agent_with_tool = test_agent.clone(tools=[failing_tool]) - - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # Create a mock response that includes tool call - tool_call = ResponseFunctionToolCall( - id="call_123", - call_id="call_123", - name="failing_tool", - type="function_call", - arguments='{"message": "test"}', - ) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[failing_tool], model=model) - # First response with tool call - tool_response = ModelResponse( - output=[tool_call], - usage=Usage( - requests=1, input_tokens=10, output_tokens=5, total_tokens=15 + tool_response = get_model_response( + Response( + id="resp_1", + output=[ + ResponseFunctionToolCall( + id="call_123", + call_id="call_123", + name="failing_tool", + type="function_call", + arguments='{"message": "test"}', + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, ), - response_id="resp_tool_123", - ) - - # Second response after tool error (agents library handles the error and continues) - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="An error 
occurred while running the tool", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, input_tokens=15, output_tokens=10, total_tokens=25 + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, ), - response_id="resp_final_123", - ) + total_tokens=15, + ), + ) + ) - mock_get_response.side_effect = [tool_response, final_response] + final_response = get_model_response( + Response( + id="resp_2", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="An error occurred while running the tool", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4-0613", + object="response", + usage=ResponseUsage( + input_tokens=15, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=10, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=25, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - events = capture_events() + events = capture_events() - # Note: The agents library catches tool exceptions internally, - # so we don't expect this to raise - await agents.Runner.run( - agent_with_tool, - "Please use the failing tool", - run_config=test_run_config, - ) + # Note: The agents library catches tool exceptions internally, + # so we don't expect this to raise + await agents.Runner.run( + agent_with_tool, + "Please use the failing tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"] @@ 
-2412,7 +2494,7 @@ async def test_ai_client_span_response_model_with_chat_completions( @pytest.mark.asyncio async def test_multiple_llm_calls_aggregate_usage( - sentry_init, capture_events, test_agent + sentry_init, capture_events, test_agent, get_model_response ): """ Test that invoke_agent spans show aggregated usage across multiple LLM calls @@ -2424,79 +2506,100 @@ def calculator(a: int, b: int) -> int: """Add two numbers""" return a + b - agent_with_tool = test_agent.clone(tools=[calculator]) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[calculator], model=model) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # First call: agent decides to use tool (10 input, 5 output tokens) - tool_call_response = ModelResponse( - output=[ - ResponseFunctionToolCall( - id="call_123", - call_id="call_123", - name="calculator", - type="function_call", - arguments='{"a": 5, "b": 3}', - ) - ], - usage=Usage( - requests=1, - input_tokens=10, - output_tokens=5, - total_tokens=15, - input_tokens_details=InputTokensDetails(cached_tokens=0), - output_tokens_details=OutputTokensDetails(reasoning_tokens=0), + tool_call_response = get_model_response( + Response( + id="resp_1", + output=[ + ResponseFunctionToolCall( + id="call_123", + call_id="call_123", + name="calculator", + type="function_call", + arguments='{"a": 5, "b": 3}', + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, ), - response_id="resp_tool_call", - ) - - # Second call: agent uses tool result to respond (20 input, 15 output tokens) - final_response = ModelResponse( - output=[ - 
ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="The result is 8", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, - input_tokens=20, - output_tokens=15, - total_tokens=35, - input_tokens_details=InputTokensDetails(cached_tokens=5), - output_tokens_details=OutputTokensDetails(reasoning_tokens=3), + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, ), - response_id="resp_final", - ) + total_tokens=15, + ), + ) + ) - mock_get_response.side_effect = [tool_call_response, final_response] + final_response = get_model_response( + Response( + id="resp_2", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="The result is 8", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4-0613", + object="response", + usage=ResponseUsage( + input_tokens=20, + input_tokens_details=InputTokensDetails( + cached_tokens=5, + ), + output_tokens=15, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=3, + ), + total_tokens=35, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_call_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - events = capture_events() + events = capture_events() - result = await agents.Runner.run( - agent_with_tool, - "What is 5 + 3?", - run_config=test_run_config, - ) + result = await agents.Runner.run( + agent_with_tool, + "What is 5 + 3?", + run_config=test_run_config, + ) - assert result is not None + assert result is 
not None (transaction,) = events spans = transaction["spans"] @@ -2656,7 +2759,10 @@ async def test_invoke_agent_span_includes_response_model( @pytest.mark.asyncio async def test_invoke_agent_span_uses_last_response_model( - sentry_init, capture_events, test_agent + sentry_init, + capture_events, + test_agent, + get_model_response, ): """ Test that when an agent makes multiple LLM calls (e.g., with tools), @@ -2668,17 +2774,14 @@ def calculator(a: int, b: int) -> int: """Add two numbers""" return a + b - agent_with_tool = test_agent.clone(tools=[calculator]) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[calculator], model=model) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel._fetch_response" - ) as mock_fetch_response: - # First call: gpt-4 model returns tool call - first_response = MagicMock() - first_response.model = "gpt-4-0613" - first_response.id = "resp_1" - first_response.output = [ + first_response = get_model_response( + Response( + id="resp_1", + output=[ ResponseFunctionToolCall( id="call_123", call_id="call_123", @@ -2686,65 +2789,85 @@ def calculator(a: int, b: int) -> int: type="function_call", arguments='{"a": 5, "b": 3}', ) - ] - first_response.usage = MagicMock() - first_response.usage.input_tokens = 10 - first_response.usage.output_tokens = 5 - first_response.usage.total_tokens = 15 - first_response.usage.input_tokens_details = InputTokensDetails( - cached_tokens=0 - ) - first_response.usage.output_tokens_details = OutputTokensDetails( - reasoning_tokens=0 - ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4-0613", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=5, + 
output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=15, + ), + ) + ) - # Second call: different model version returns final message - second_response = MagicMock() - second_response.model = "gpt-4.1-2025-04-14" - second_response.id = "resp_2" - second_response.output = [ + second_response = get_model_response( + Response( + id="resp_2", + output=[ ResponseOutputMessage( id="msg_final", type="message", status="completed", content=[ ResponseOutputText( - text="The result is 8", + text="I'm the specialist and I can help with that!", type="output_text", annotations=[], ) ], role="assistant", ) - ] - second_response.usage = MagicMock() - second_response.usage.input_tokens = 20 - second_response.usage.output_tokens = 15 - second_response.usage.total_tokens = 35 - second_response.usage.input_tokens_details = InputTokensDetails( - cached_tokens=5 - ) - second_response.usage.output_tokens_details = OutputTokensDetails( - reasoning_tokens=3 - ) - - mock_fetch_response.side_effect = [first_response, second_response] + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=20, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=15, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=5, + ), + total_tokens=35, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - events = capture_events() + events = capture_events() - result = await agents.Runner.run( - agent_with_tool, - "What is 5 + 3?", - run_config=test_run_config, - ) + result = await agents.Runner.run( + 
agent_with_tool, + "What is 5 + 3?", + run_config=test_run_config, + ) - assert result is not None + assert result is not None (transaction,) = events spans = transaction["spans"] @@ -2966,7 +3089,9 @@ async def test_conversation_id_on_all_spans( reason="conversation_id support requires openai-agents >= 0.4.0", ) @pytest.mark.asyncio -async def test_conversation_id_on_tool_span(sentry_init, capture_events, test_agent): +async def test_conversation_id_on_tool_span( + sentry_init, capture_events, test_agent, get_model_response +): """ Test that gen_ai.conversation.id is set on tool execution spans when passed to Runner.run(). """ @@ -2976,65 +3101,98 @@ def simple_tool(message: str) -> str: """A simple tool""" return f"Result: {message}" - agent_with_tool = test_agent.clone(tools=[simple_tool]) - - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - tool_call = ResponseFunctionToolCall( - id="call_123", - call_id="call_123", - name="simple_tool", - type="function_call", - arguments='{"message": "hello"}', - ) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[simple_tool], model=model) - tool_response = ModelResponse( - output=[tool_call], - usage=Usage( - requests=1, input_tokens=10, output_tokens=5, total_tokens=15 + tool_response = get_model_response( + Response( + id="call_123", + output=[ + ResponseFunctionToolCall( + id="call_123", + call_id="call_123", + name="simple_tool", + type="function_call", + arguments='{"message": "hello"}', + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, ), - response_id="resp_tool_456", - ) - - final_response = ModelResponse( - 
output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="Done", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, input_tokens=15, output_tokens=10, total_tokens=25 + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, ), - response_id="resp_final_789", - ) + total_tokens=15, + ), + ) + ) - mock_get_response.side_effect = [tool_response, final_response] + final_response = get_model_response( + Response( + id="resp_final_789", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="Done", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=20, + input_tokens_details=InputTokensDetails( + cached_tokens=5, + ), + output_tokens=10, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=8, + ), + total_tokens=30, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[tool_response, final_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + ) - events = capture_events() + events = capture_events() - await agents.Runner.run( - agent_with_tool, - "Use the tool", - run_config=test_run_config, - conversation_id="conv_tool_test_456", - ) + await agents.Runner.run( + agent_with_tool, + "Use the tool", + run_config=test_run_config, + conversation_id="conv_tool_test_456", + ) (transaction,) = events spans = transaction["spans"]