From 769440f5c8bd4655fab83fc3b64f45b116175780 Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Fri, 6 Mar 2026 12:54:59 +0100 Subject: [PATCH 1/6] test(openai-agents): Replace mocks with httpx in multi-turn tests --- .../openai_agents/test_openai_agents.py | 1535 ++++++++++------- 1 file changed, 922 insertions(+), 613 deletions(-) diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index 1390455317..af7c6011a7 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -5,10 +5,11 @@ import os import json import logging +import httpx import sentry_sdk from sentry_sdk import start_span -from sentry_sdk.consts import SPANDATA +from sentry_sdk.consts import SPANDATA, OP from sentry_sdk.integrations.logging import LoggingIntegration from sentry_sdk.integrations.openai_agents import OpenAIAgentsIntegration from sentry_sdk.integrations.openai_agents.utils import _set_input_data, safe_serialize @@ -314,6 +315,25 @@ def test_agent_custom_model(): ) +@pytest.fixture +def get_model_response(): + def inner(response_content): + model_request = httpx.Request( + "POST", + "/responses", + ) + + response = httpx.Response( + 200, + request=model_request, + content=json.dumps(response_content.model_dump()).encode("utf-8"), + ) + + return response + + return inner + + @pytest.mark.asyncio async def test_agent_invocation_span_no_pii( sentry_init, capture_events, test_agent, mock_model_response @@ -917,85 +937,121 @@ def test_agent_invocation_span_sync( @pytest.mark.asyncio -async def test_handoff_span(sentry_init, capture_events, mock_usage): +async def test_handoff_span(sentry_init, capture_events, get_model_response): """ Test that handoff spans are created when agents hand off to other agents. """ + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4-mini", openai_client=client) + # Create two simple agents with a handoff relationship secondary_agent = agents.Agent( name="secondary_agent", instructions="You are a secondary agent.", - model="gpt-4o-mini", + model=model, ) primary_agent = agents.Agent( name="primary_agent", instructions="You are a primary agent that hands off to secondary agent.", - model="gpt-4o-mini", + model=model, handoffs=[secondary_agent], ) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # Mock two responses: - # 1. Primary agent calls handoff tool - # 2. Secondary agent provides final response - handoff_response = ModelResponse( - output=[ - ResponseFunctionToolCall( - id="call_handoff_123", - call_id="call_handoff_123", - name="transfer_to_secondary_agent", - type="function_call", - arguments="{}", - ) - ], - usage=mock_usage, - response_id="resp_handoff_123", - ) - - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="I'm the specialist and I can help with that!", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=mock_usage, - response_id="resp_final_123", - ) + first_response = get_model_response( + Response( + id="resp_tool_123", + output=[ + ResponseFunctionToolCall( + id="call_handoff_123", + call_id="call_handoff_123", + name="transfer_to_secondary_agent", + type="function_call", + arguments="{}", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=15, + ), + ) + ) - mock_get_response.side_effect = [handoff_response, final_response] + second_response = get_model_response( + Response( + id="resp_final_123", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="I'm the specialist and I can help with that!", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=15, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=10, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=25, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + with patch.object( + primary_agent.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + ) - events = capture_events() + events = capture_events() - result = await agents.Runner.run( - primary_agent, - "Please hand off to secondary agent", - run_config=test_run_config, - ) + result = await agents.Runner.run( + primary_agent, + "Please hand off to secondary agent", + run_config=test_run_config, + ) - assert result is not None + assert result is not None (transaction,) = events spans = transaction["spans"] - handoff_span = spans[2] + + handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF) # Verify handoff span was created assert handoff_span is not None @@ -1006,85 +1062,123 @@ async def test_handoff_span(sentry_init, capture_events, mock_usage): @pytest.mark.asyncio -async def test_max_turns_before_handoff_span(sentry_init, capture_events, mock_usage): +async def test_max_turns_before_handoff_span( + sentry_init, capture_events, get_model_response +): """ Example raising agents.exceptions.AgentsException after the agent invocation span is complete. """ + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4-mini", openai_client=client) + # Create two simple agents with a handoff relationship secondary_agent = agents.Agent( name="secondary_agent", instructions="You are a secondary agent.", - model="gpt-4o-mini", + model=model, ) primary_agent = agents.Agent( name="primary_agent", instructions="You are a primary agent that hands off to secondary agent.", - model="gpt-4o-mini", + model=model, handoffs=[secondary_agent], ) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # Mock two responses: - # 1. Primary agent calls handoff tool - # 2. Secondary agent provides final response - handoff_response = ModelResponse( - output=[ - ResponseFunctionToolCall( - id="call_handoff_123", - call_id="call_handoff_123", - name="transfer_to_secondary_agent", - type="function_call", - arguments="{}", - ) - ], - usage=mock_usage, - response_id="resp_handoff_123", - ) - - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="I'm the specialist and I can help with that!", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=mock_usage, - response_id="resp_final_123", - ) + first_response = get_model_response( + Response( + id="resp_tool_123", + output=[ + ResponseFunctionToolCall( + id="call_handoff_123", + call_id="call_handoff_123", + name="transfer_to_secondary_agent", + type="function_call", + arguments="{}", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=15, + ), + ) + ) - mock_get_response.side_effect = [handoff_response, final_response] + second_response = get_model_response( + Response( + id="resp_final_123", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="I'm the specialist and I can help with that!", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=15, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=10, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=25, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + with patch.object( + primary_agent.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + ) - events = capture_events() + events = capture_events() - with pytest.raises(MaxTurnsExceeded): - await agents.Runner.run( - primary_agent, - "Please hand off to secondary agent", - run_config=test_run_config, - max_turns=1, - ) + with pytest.raises(MaxTurnsExceeded): + await agents.Runner.run( + primary_agent, + "Please hand off to secondary agent", + run_config=test_run_config, + max_turns=1, + ) (error, transaction) = events spans = transaction["spans"] - handoff_span = spans[2] + + handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF) # Verify handoff span was created assert handoff_span is not None @@ -1095,7 +1189,9 @@ async def test_max_turns_before_handoff_span(sentry_init, capture_events, mock_u @pytest.mark.asyncio -async def test_tool_execution_span(sentry_init, capture_events, test_agent): +async def test_tool_execution_span( + sentry_init, capture_events, test_agent, get_model_response +): """ Test tool execution span creation. """ @@ -1106,78 +1202,106 @@ def simple_test_tool(message: str) -> str: return f"Tool executed with: {message}" # Create agent with the tool - agent_with_tool = test_agent.clone(tools=[simple_test_tool]) - - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # Create a mock response that includes tool calls - tool_call = ResponseFunctionToolCall( - id="call_123", - call_id="call_123", - name="simple_test_tool", - type="function_call", - arguments='{"message": "hello"}', - ) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[simple_test_tool], model=model) - # First response with tool call - tool_response = ModelResponse( - output=[tool_call], - usage=Usage( - requests=1, input_tokens=10, output_tokens=5, total_tokens=15 + first_response = get_model_response( + Response( + id="resp_tool_123", + output=[ + ResponseFunctionToolCall( + id="call_123", + call_id="call_123", + name="simple_test_tool", + type="function_call", + arguments='{"message": "hello"}', + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, ), - response_id="resp_tool_123", - ) - - # Second response with final answer - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="Task completed using the tool", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, input_tokens=15, output_tokens=10, total_tokens=25 + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, ), - response_id="resp_final_123", - ) - - # Return different responses on successive calls - mock_get_response.side_effect = [tool_response, final_response] + total_tokens=15, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + second_response = get_model_response( + Response( + id="resp_final_123", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="Task completed using the tool", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=15, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=10, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=25, + ), + ) + ) - events = capture_events() + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - await agents.Runner.run( - agent_with_tool, - "Please use the simple test tool", - run_config=test_run_config, - ) + events = capture_events() + + await agents.Runner.run( + agent_with_tool, + "Please use the simple test tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"] - ( - agent_span, - ai_client_span1, - tool_span, - ai_client_span2, - ) = spans + agent_span = next(span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT) + ai_client_span1, ai_client_span2 = ( + span for span in spans if span["op"] == OP.GEN_AI_CHAT + ) + tool_span = next(span for span in spans if span["op"] == OP.GEN_AI_EXECUTE_TOOL) available_tools = [ { @@ -1258,22 +1382,18 @@ def simple_test_tool(message: str) -> str: assert ai_client_span1["data"]["gen_ai.usage.output_tokens"] == 5 assert ai_client_span1["data"]["gen_ai.usage.output_tokens.reasoning"] == 0 assert ai_client_span1["data"]["gen_ai.usage.total_tokens"] == 15 - - tool_call = { - "arguments": '{"message": "hello"}', - "call_id": "call_123", - "name": "simple_test_tool", - "type": "function_call", - "id": "call_123", - "status": None, - } - - if OPENAI_VERSION >= (2, 25, 0): - tool_call["namespace"] = None - - assert json.loads(ai_client_span1["data"]["gen_ai.response.tool_calls"]) == [ - tool_call - ] + assert ai_client_span1["data"]["gen_ai.response.tool_calls"] == safe_serialize( + [ + { + "arguments": '{"message": "hello"}', + "call_id": "call_123", + "name": "simple_test_tool", + "type": "function_call", + "id": "call_123", + "status": None, + } + ] + ) assert tool_span["description"] == "execute_tool simple_test_tool" assert tool_span["data"]["gen_ai.agent.name"] == "test_agent" @@ -1708,79 +1828,106 @@ async def test_span_status_error(sentry_init, capture_events, test_agent): @pytest.mark.asyncio -async def test_mcp_tool_execution_spans(sentry_init, capture_events, test_agent): +async def test_mcp_tool_execution_spans( + sentry_init, capture_events, test_agent, get_model_response +): """ Test that MCP (Model Context Protocol) tool calls create execute_tool spans. """ + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent = test_agent.clone(model=model) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # Create a McpCall object - mcp_call = McpCall( - id="mcp_call_123", - name="test_mcp_tool", - arguments='{"query": "search term"}', - output="MCP tool executed successfully", - error=None, - type="mcp_call", - server_label="test_server", - ) - - # Create a ModelResponse with an McpCall in the output - mcp_response = ModelResponse( - output=[mcp_call], - usage=Usage( - requests=1, - input_tokens=10, - output_tokens=5, - total_tokens=15, + first_response = get_model_response( + Response( + id="resp_mcp_123", + output=[ + McpCall( + id="mcp_call_123", + name="test_mcp_tool", + arguments='{"query": "search term"}', + output="MCP tool executed successfully", + error=None, + type="mcp_call", + server_label="test_server", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, ), - response_id="resp_mcp_123", - ) - - # Final response after MCP tool execution - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="Task completed using MCP tool", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, - input_tokens=15, - output_tokens=10, - total_tokens=25, + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, ), - response_id="resp_final_123", - ) + total_tokens=15, + ), + ) + ) - mock_get_response.side_effect = [mcp_response, final_response] + second_response = get_model_response( + Response( + id="resp_final_123", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="Task completed using MCP tool", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=20, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=5, + ), + total_tokens=30, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + with patch.object( + agent.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - events = capture_events() + events = capture_events() - await agents.Runner.run( - test_agent, - "Please use MCP tool", - run_config=test_run_config, - ) + await agents.Runner.run( + agent, + "Please use MCP tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"] @@ -1811,79 +1958,106 @@ async def test_mcp_tool_execution_spans(sentry_init, capture_events, test_agent) @pytest.mark.asyncio -async def test_mcp_tool_execution_with_error(sentry_init, capture_events, test_agent): +async def test_mcp_tool_execution_with_error( + sentry_init, capture_events, test_agent, get_model_response +): """ Test that MCP tool calls with errors are tracked with error status. """ + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent = test_agent.clone(model=model) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # Create a McpCall object with an error - mcp_call_with_error = McpCall( - id="mcp_call_error_123", - name="failing_mcp_tool", - arguments='{"query": "test"}', - output=None, - error="MCP tool execution failed", - type="mcp_call", - server_label="test_server", - ) - - # Create a ModelResponse with a failing McpCall - mcp_response = ModelResponse( - output=[mcp_call_with_error], - usage=Usage( - requests=1, - input_tokens=10, - output_tokens=5, - total_tokens=15, + first_response = get_model_response( + Response( + id="resp_mcp_123", + output=[ + McpCall( + id="mcp_call_error_123", + name="failing_mcp_tool", + arguments='{"query": "test"}', + output=None, + error="MCP tool execution failed", + type="mcp_call", + server_label="test_server", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, ), - response_id="resp_mcp_error_123", - ) - - # Final response after error - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="The MCP tool encountered an error", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, - input_tokens=15, - output_tokens=10, - total_tokens=25, + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, ), - response_id="resp_final_error_123", - ) + total_tokens=15, + ), + ) + ) - mock_get_response.side_effect = [mcp_response, final_response] + second_response = get_model_response( + Response( + id="resp_final_123", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="Task completed using MCP tool", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=20, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=5, + ), + total_tokens=30, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + with patch.object( + agent.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - events = capture_events() + events = capture_events() - await agents.Runner.run( - test_agent, - "Please use failing MCP tool", - run_config=test_run_config, - ) + await agents.Runner.run( + agent, + "Please use failing MCP tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"] @@ -1912,79 +2086,106 @@ async def test_mcp_tool_execution_with_error(sentry_init, capture_events, test_a @pytest.mark.asyncio -async def test_mcp_tool_execution_without_pii(sentry_init, capture_events, test_agent): +async def test_mcp_tool_execution_without_pii( + sentry_init, capture_events, test_agent, get_model_response +): """ Test that MCP tool input/output are not included when send_default_pii is False. """ + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent = test_agent.clone(model=model) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # Create a McpCall object - mcp_call = McpCall( - id="mcp_call_pii_123", - name="test_mcp_tool", - arguments='{"query": "sensitive data"}', - output="Result with sensitive info", - error=None, - type="mcp_call", - server_label="test_server", - ) - - # Create a ModelResponse with an McpCall - mcp_response = ModelResponse( - output=[mcp_call], - usage=Usage( - requests=1, - input_tokens=10, - output_tokens=5, - total_tokens=15, + first_response = get_model_response( + Response( + id="resp_mcp_123", + output=[ + McpCall( + id="mcp_call_pii_123", + name="test_mcp_tool", + arguments='{"query": "sensitive data"}', + output="Result with sensitive info", + error=None, + type="mcp_call", + server_label="test_server", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, ), - response_id="resp_mcp_123", - ) + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=15, + ), + ) + ) - # Final response - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="Task completed", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, - input_tokens=15, - output_tokens=10, - total_tokens=25, + second_response = get_model_response( + Response( + id="resp_final_123", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="Task completed", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, ), - response_id="resp_final_123", - ) - - mock_get_response.side_effect = [mcp_response, final_response] + output_tokens=20, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=5, + ), + total_tokens=30, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=False, # PII disabled - ) + with patch.object( + agent.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=False, # PII disabled + ) - events = capture_events() + events = capture_events() - await agents.Runner.run( - test_agent, - "Please use MCP tool", - run_config=test_run_config, - ) + await agents.Runner.run( + agent, + "Please use MCP tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"] @@ -2095,7 +2296,9 @@ def test_openai_agents_message_role_mapping( @pytest.mark.asyncio -async def test_tool_execution_error_tracing(sentry_init, capture_events, test_agent): +async def test_tool_execution_error_tracing( + sentry_init, capture_events, test_agent, get_model_response +): """ Test that tool execution errors are properly tracked via error tracing patch. @@ -2113,70 +2316,100 @@ def failing_tool(message: str) -> str: raise ValueError("Tool execution failed") # Create agent with the failing tool - agent_with_tool = test_agent.clone(tools=[failing_tool]) - - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # Create a mock response that includes tool call - tool_call = ResponseFunctionToolCall( - id="call_123", - call_id="call_123", - name="failing_tool", - type="function_call", - arguments='{"message": "test"}', - ) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[failing_tool], model=model) - # First response with tool call - tool_response = ModelResponse( - output=[tool_call], - usage=Usage( - requests=1, input_tokens=10, output_tokens=5, total_tokens=15 + first_response = get_model_response( + Response( + id="resp_1", + output=[ + ResponseFunctionToolCall( + id="call_123", + call_id="call_123", + name="failing_tool", + type="function_call", + arguments='{"message": "test"}', + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=5, ), - response_id="resp_tool_123", - ) - - # Second response after tool error (agents library handles the error and continues) - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="An error occurred while running the tool", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, input_tokens=15, output_tokens=10, total_tokens=25 + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=3, ), - response_id="resp_final_123", - ) + total_tokens=15, + ), + ) + ) - mock_get_response.side_effect = [tool_response, final_response] + second_response = get_model_response( + Response( + id="resp_2", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="An error occurred while running the tool", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4-0613", + object="response", + usage=ResponseUsage( + input_tokens=20, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=15, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=35, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - events = capture_events() + events = capture_events() - # Note: The agents library catches tool exceptions internally, - # so we don't expect this to raise - await agents.Runner.run( - agent_with_tool, - "Please use the failing tool", - run_config=test_run_config, - ) + # Note: The agents library catches tool exceptions internally, + # so we don't expect this to raise + await agents.Runner.run( + agent_with_tool, + "Please use the failing tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"] @@ -2412,7 +2645,7 @@ async def test_ai_client_span_response_model_with_chat_completions( @pytest.mark.asyncio async def test_multiple_llm_calls_aggregate_usage( - sentry_init, capture_events, test_agent + sentry_init, capture_events, test_agent, get_model_response ): """ Test that invoke_agent spans show aggregated usage across multiple LLM calls @@ -2424,79 +2657,100 @@ def calculator(a: int, b: int) -> int: """Add two numbers""" return a + b - agent_with_tool = test_agent.clone(tools=[calculator]) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[calculator], model=model) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - # First call: agent decides to use tool (10 input, 5 output tokens) - tool_call_response = ModelResponse( - output=[ - ResponseFunctionToolCall( - id="call_123", - call_id="call_123", - name="calculator", - type="function_call", - arguments='{"a": 5, "b": 3}', - ) - ], - usage=Usage( - requests=1, - input_tokens=10, - output_tokens=5, - total_tokens=15, - input_tokens_details=InputTokensDetails(cached_tokens=0), - output_tokens_details=OutputTokensDetails(reasoning_tokens=0), + first_response = get_model_response( + Response( + id="resp_1", + output=[ + ResponseFunctionToolCall( + id="call_123", + call_id="call_123", + name="calculator", + type="function_call", + arguments='{"a": 5, "b": 3}', + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=5, ), - response_id="resp_tool_call", - ) - - # Second call: agent uses tool result to respond (20 input, 15 output tokens) - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="The result is 8", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, - input_tokens=20, - output_tokens=15, - total_tokens=35, - input_tokens_details=InputTokensDetails(cached_tokens=5), - output_tokens_details=OutputTokensDetails(reasoning_tokens=3), + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=3, ), - response_id="resp_final", - ) + total_tokens=15, + ), + ) + ) - mock_get_response.side_effect = [tool_call_response, final_response] + second_response = get_model_response( + Response( + id="resp_2", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="The result is 8", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4-0613", + object="response", + usage=ResponseUsage( + input_tokens=20, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=15, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=35, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - events = capture_events() + events = capture_events() - result = await agents.Runner.run( - agent_with_tool, - "What is 5 + 3?", - run_config=test_run_config, - ) + result = await agents.Runner.run( + agent_with_tool, + "What is 5 + 3?", + run_config=test_run_config, + ) - assert result is not None + assert result is not None (transaction,) = events spans = transaction["spans"] @@ -2656,7 +2910,10 @@ async def test_invoke_agent_span_includes_response_model( @pytest.mark.asyncio async def test_invoke_agent_span_uses_last_response_model( - sentry_init, capture_events, test_agent + sentry_init, + capture_events, + test_agent, + get_model_response, ): """ Test that when an agent makes multiple LLM calls (e.g., with tools), @@ -2668,17 +2925,14 @@ def calculator(a: int, b: int) -> int: """Add two numbers""" return a + b - agent_with_tool = test_agent.clone(tools=[calculator]) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[calculator], model=model) - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel._fetch_response" - ) as mock_fetch_response: - # First call: gpt-4 model returns tool call - first_response = MagicMock() - first_response.model = "gpt-4-0613" - first_response.id = "resp_1" - first_response.output = [ + first_response = get_model_response( + Response( + id="resp_1", + output=[ ResponseFunctionToolCall( id="call_123", call_id="call_123", @@ -2686,65 +2940,85 @@ def calculator(a: int, b: int) -> int: type="function_call", arguments='{"a": 5, "b": 3}', ) - ] - first_response.usage = MagicMock() - first_response.usage.input_tokens = 10 - first_response.usage.output_tokens = 5 - first_response.usage.total_tokens = 15 - first_response.usage.input_tokens_details = InputTokensDetails( - cached_tokens=0 - ) - first_response.usage.output_tokens_details = OutputTokensDetails( - reasoning_tokens=0 - ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4-0613", + object="response", + usage=ResponseUsage( + input_tokens=10, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=5, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=15, + ), + ) + ) - # Second call: different model version returns final message - second_response = MagicMock() - second_response.model = "gpt-4.1-2025-04-14" - second_response.id = "resp_2" - second_response.output = [ + second_response = get_model_response( + Response( + id="resp_2", + output=[ ResponseOutputMessage( id="msg_final", type="message", status="completed", content=[ ResponseOutputText( - text="The result is 8", + text="I'm the specialist and I can help with that!", type="output_text", annotations=[], ) ], role="assistant", ) - ] - second_response.usage = MagicMock() - second_response.usage.input_tokens = 20 - second_response.usage.output_tokens = 15 - second_response.usage.total_tokens = 35 - second_response.usage.input_tokens_details = InputTokensDetails( - cached_tokens=5 - ) - second_response.usage.output_tokens_details = OutputTokensDetails( - reasoning_tokens=3 - ) - - mock_fetch_response.side_effect = [first_response, second_response] + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4.1-2025-04-14", + object="response", + usage=ResponseUsage( + input_tokens=20, + input_tokens_details=InputTokensDetails( + cached_tokens=0, + ), + output_tokens=15, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=5, + ), + total_tokens=35, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - events = capture_events() + events = capture_events() - result = await agents.Runner.run( - agent_with_tool, - "What is 5 + 3?", - run_config=test_run_config, - ) + result = await agents.Runner.run( + agent_with_tool, + "What is 5 + 3?", + run_config=test_run_config, + ) - assert result is not None + assert result is not None (transaction,) = events spans = transaction["spans"] @@ -2966,7 +3240,9 @@ async def test_conversation_id_on_all_spans( reason="conversation_id support requires openai-agents >= 0.4.0", ) @pytest.mark.asyncio -async def test_conversation_id_on_tool_span(sentry_init, capture_events, test_agent): +async def test_conversation_id_on_tool_span( + sentry_init, capture_events, test_agent, get_model_response +): """ Test that gen_ai.conversation.id is set on tool execution spans when passed to Runner.run(). """ @@ -2976,65 +3252,98 @@ def simple_tool(message: str) -> str: """A simple tool""" return f"Result: {message}" - agent_with_tool = test_agent.clone(tools=[simple_tool]) - - with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): - with patch( - "agents.models.openai_responses.OpenAIResponsesModel.get_response" - ) as mock_get_response: - tool_call = ResponseFunctionToolCall( - id="call_123", - call_id="call_123", - name="simple_tool", - type="function_call", - arguments='{"message": "hello"}', - ) + client = AsyncOpenAI(api_key="test-key") + model = OpenAIResponsesModel(model="gpt-4", openai_client=client) + agent_with_tool = test_agent.clone(tools=[simple_tool], model=model) - tool_response = ModelResponse( - output=[tool_call], - usage=Usage( - requests=1, input_tokens=10, output_tokens=5, total_tokens=15 + first_response = get_model_response( + Response( + id="call_123", + output=[ + ResponseFunctionToolCall( + id="call_123", + call_id="call_123", + name="simple_tool", + type="function_call", + arguments='{"message": "hello"}', + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=20, + input_tokens_details=InputTokensDetails( + cached_tokens=5, ), - response_id="resp_tool_456", - ) - - final_response = ModelResponse( - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="Done", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - usage=Usage( - requests=1, input_tokens=15, output_tokens=10, total_tokens=25 + output_tokens=10, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=8, ), - response_id="resp_final_789", - ) + total_tokens=30, + ), + ) + ) - mock_get_response.side_effect = [tool_response, final_response] + second_response = get_model_response( + Response( + id="resp_final_789", + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="Done", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model="gpt-4", + object="response", + usage=ResponseUsage( + input_tokens=20, + input_tokens_details=InputTokensDetails( + cached_tokens=5, + ), + output_tokens=10, + output_tokens_details=OutputTokensDetails( + reasoning_tokens=8, + ), + total_tokens=30, + ), + ) + ) - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + with patch.object( + agent_with_tool.model._client._client, + "send", + side_effect=[first_response, second_response], + ) as _: + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + ) - events = capture_events() + events = capture_events() - await agents.Runner.run( - agent_with_tool, - "Use the tool", - run_config=test_run_config, - conversation_id="conv_tool_test_456", - ) + await agents.Runner.run( + agent_with_tool, + "Use the tool", + run_config=test_run_config, + conversation_id="conv_tool_test_456", + ) (transaction,) = events spans = transaction["spans"] From a6c52d69c59d5cb010d10da616459d49477e02be Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Fri, 6 Mar 2026 13:10:39 +0100 Subject: [PATCH 2/6] . --- .../openai_agents/test_openai_agents.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index af7c6011a7..aca7b96461 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -1382,18 +1382,22 @@ def simple_test_tool(message: str) -> str: assert ai_client_span1["data"]["gen_ai.usage.output_tokens"] == 5 assert ai_client_span1["data"]["gen_ai.usage.output_tokens.reasoning"] == 0 assert ai_client_span1["data"]["gen_ai.usage.total_tokens"] == 15 - assert ai_client_span1["data"]["gen_ai.response.tool_calls"] == safe_serialize( - [ - { - "arguments": '{"message": "hello"}', - "call_id": "call_123", - "name": "simple_test_tool", - "type": "function_call", - "id": "call_123", - "status": None, - } - ] - ) + + tool_call = { + "arguments": '{"message": "hello"}', + "call_id": "call_123", + "name": "simple_test_tool", + "type": "function_call", + "id": "call_123", + "status": None, + } + + if OPENAI_VERSION >= (2, 25, 0): + tool_call["namespace"] = None + + assert json.loads(ai_client_span1["data"]["gen_ai.response.tool_calls"]) == [ + tool_call + ] assert tool_span["description"] == "execute_tool simple_test_tool" assert tool_span["data"]["gen_ai.agent.name"] == "test_agent" From 298245da2717b11dd5b8ebb866fcb028cb23b0cc Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Fri, 6 Mar 2026 13:53:06 +0100 Subject: [PATCH 3/6] restore token numbers --- .../openai_agents/test_openai_agents.py | 78 +++++++++---------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index aca7b96461..cd52f30579 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -981,11 +981,11 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response): input_tokens_details=InputTokensDetails( cached_tokens=0, ), - output_tokens=5, + output_tokens=20, output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, + reasoning_tokens=5, ), - total_tokens=15, + total_tokens=30, ), ) ) @@ -1015,15 +1015,15 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response): model="gpt-4", object="response", usage=ResponseUsage( - input_tokens=15, + input_tokens=10, input_tokens_details=InputTokensDetails( cached_tokens=0, ), - output_tokens=10, + output_tokens=20, output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, + reasoning_tokens=5, ), - total_tokens=25, + total_tokens=30, ), ) ) @@ -1108,11 +1108,11 @@ async def test_max_turns_before_handoff_span( input_tokens_details=InputTokensDetails( cached_tokens=0, ), - output_tokens=5, + output_tokens=20, output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, + reasoning_tokens=5, ), - total_tokens=15, + total_tokens=30, ), ) ) @@ -1142,15 +1142,15 @@ async def test_max_turns_before_handoff_span( model="gpt-4", object="response", usage=ResponseUsage( - input_tokens=15, + input_tokens=10, input_tokens_details=InputTokensDetails( cached_tokens=0, ), - output_tokens=10, + output_tokens=20, output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, + reasoning_tokens=5, ), - total_tokens=25, + total_tokens=30, ), ) ) @@ -1901,15 +1901,15 @@ async def test_mcp_tool_execution_spans( model="gpt-4.1-2025-04-14", object="response", usage=ResponseUsage( - input_tokens=10, + input_tokens=15, input_tokens_details=InputTokensDetails( cached_tokens=0, ), - output_tokens=20, + output_tokens=10, output_tokens_details=OutputTokensDetails( - reasoning_tokens=5, + reasoning_tokens=0, ), - total_tokens=30, + total_tokens=25, ), ) ) @@ -2031,15 +2031,15 @@ async def test_mcp_tool_execution_with_error( model="gpt-4.1-2025-04-14", object="response", usage=ResponseUsage( - input_tokens=10, + input_tokens=15, input_tokens_details=InputTokensDetails( cached_tokens=0, ), - output_tokens=20, + output_tokens=10, output_tokens_details=OutputTokensDetails( - reasoning_tokens=5, + reasoning_tokens=0, ), - total_tokens=30, + total_tokens=25, ), ) ) @@ -2159,15 +2159,15 @@ async def test_mcp_tool_execution_without_pii( model="gpt-4.1-2025-04-14", object="response", usage=ResponseUsage( - input_tokens=10, + input_tokens=15, input_tokens_details=InputTokensDetails( cached_tokens=0, ), - output_tokens=20, + output_tokens=10, output_tokens_details=OutputTokensDetails( reasoning_tokens=5, ), - total_tokens=30, + total_tokens=25, ), ) ) @@ -2345,11 +2345,11 @@ def failing_tool(message: str) -> str: usage=ResponseUsage( input_tokens=10, input_tokens_details=InputTokensDetails( - cached_tokens=5, + cached_tokens=0, ), output_tokens=5, output_tokens_details=OutputTokensDetails( - reasoning_tokens=3, + reasoning_tokens=0, ), total_tokens=15, ), @@ -2381,15 +2381,15 @@ def failing_tool(message: str) -> str: model="gpt-4-0613", object="response", usage=ResponseUsage( - input_tokens=20, + input_tokens=15, input_tokens_details=InputTokensDetails( cached_tokens=0, ), - output_tokens=15, + output_tokens=10, output_tokens_details=OutputTokensDetails( reasoning_tokens=0, ), - total_tokens=35, + total_tokens=25, ), ) ) @@ -2686,11 +2686,11 @@ def calculator(a: int, b: int) -> int: usage=ResponseUsage( input_tokens=10, input_tokens_details=InputTokensDetails( - cached_tokens=5, + cached_tokens=0, ), output_tokens=5, output_tokens_details=OutputTokensDetails( - reasoning_tokens=3, + reasoning_tokens=0, ), total_tokens=15, ), @@ -2724,11 +2724,11 @@ def calculator(a: int, b: int) -> int: usage=ResponseUsage( input_tokens=20, input_tokens_details=InputTokensDetails( - cached_tokens=0, + cached_tokens=5, ), output_tokens=15, output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, + reasoning_tokens=3, ), total_tokens=35, ), @@ -3279,15 +3279,15 @@ def simple_tool(message: str) -> str: model="gpt-4", object="response", usage=ResponseUsage( - input_tokens=20, + input_tokens=10, input_tokens_details=InputTokensDetails( - cached_tokens=5, + cached_tokens=0, ), - output_tokens=10, + output_tokens=5, output_tokens_details=OutputTokensDetails( - reasoning_tokens=8, + reasoning_tokens=0, ), - total_tokens=30, + total_tokens=15, ), ) ) From 7d966bd015cb99587b331bba5b3491ba740320ec Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Fri, 6 Mar 2026 14:05:36 +0100 Subject: [PATCH 4/6] rename responses --- .../openai_agents/test_openai_agents.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index cd52f30579..7663ea297f 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -958,7 +958,7 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response): handoffs=[secondary_agent], ) - first_response = get_model_response( + handoff_response = get_model_response( Response( id="resp_tool_123", output=[ @@ -990,7 +990,7 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response): ) ) - second_response = get_model_response( + final_response = get_model_response( Response( id="resp_final_123", output=[ @@ -1031,7 +1031,7 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response): with patch.object( primary_agent.model._client._client, "send", - side_effect=[first_response, second_response], + side_effect=[handoff_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], @@ -1085,7 +1085,7 @@ async def test_max_turns_before_handoff_span( handoffs=[secondary_agent], ) - first_response = get_model_response( + handoff_response = get_model_response( Response( id="resp_tool_123", output=[ @@ -1117,7 +1117,7 @@ async def test_max_turns_before_handoff_span( ) ) - second_response = get_model_response( + final_response = get_model_response( Response( id="resp_final_123", output=[ @@ -1158,7 +1158,7 @@ async def test_max_turns_before_handoff_span( with patch.object( primary_agent.model._client._client, "send", - side_effect=[first_response, second_response], + side_effect=[handoff_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], @@ -1206,7 +1206,7 @@ def simple_test_tool(message: str) -> str: model = OpenAIResponsesModel(model="gpt-4", openai_client=client) agent_with_tool = test_agent.clone(tools=[simple_test_tool], model=model) - first_response = get_model_response( + tool_call_response = get_model_response( Response( id="resp_tool_123", output=[ @@ -1238,7 +1238,7 @@ def simple_test_tool(message: str) -> str: ) ) - second_response = get_model_response( + final_response = get_model_response( Response( id="resp_final_123", output=[ @@ -1279,7 +1279,7 @@ def simple_test_tool(message: str) -> str: with patch.object( agent_with_tool.model._client._client, "send", - side_effect=[first_response, second_response], + side_effect=[tool_call_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], @@ -1842,7 +1842,7 @@ async def test_mcp_tool_execution_spans( model = OpenAIResponsesModel(model="gpt-4", openai_client=client) agent = test_agent.clone(model=model) - first_response = get_model_response( + mcp_response = get_model_response( Response( id="resp_mcp_123", output=[ @@ -1876,7 +1876,7 @@ async def test_mcp_tool_execution_spans( ) ) - second_response = get_model_response( + final_response = get_model_response( Response( id="resp_final_123", output=[ @@ -1917,7 +1917,7 @@ async def test_mcp_tool_execution_spans( with patch.object( agent.model._client._client, "send", - side_effect=[first_response, second_response], + side_effect=[mcp_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], @@ -1972,7 +1972,7 @@ async def test_mcp_tool_execution_with_error( model = OpenAIResponsesModel(model="gpt-4", openai_client=client) agent = test_agent.clone(model=model) - first_response = get_model_response( + mcp_call_with_error_response = get_model_response( Response( id="resp_mcp_123", output=[ @@ -2006,7 +2006,7 @@ async def test_mcp_tool_execution_with_error( ) ) - second_response = get_model_response( + final_response = get_model_response( Response( id="resp_final_123", output=[ @@ -2047,7 +2047,7 @@ async def test_mcp_tool_execution_with_error( with patch.object( agent.model._client._client, "send", - side_effect=[first_response, second_response], + side_effect=[mcp_call_with_error_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], @@ -2100,7 +2100,7 @@ async def test_mcp_tool_execution_without_pii( model = OpenAIResponsesModel(model="gpt-4", openai_client=client) agent = test_agent.clone(model=model) - first_response = get_model_response( + mcp_response = get_model_response( Response( id="resp_mcp_123", output=[ @@ -2134,7 +2134,7 @@ async def test_mcp_tool_execution_without_pii( ) ) - second_response = get_model_response( + final_response = get_model_response( Response( id="resp_final_123", output=[ @@ -2175,7 +2175,7 @@ async def test_mcp_tool_execution_without_pii( with patch.object( agent.model._client._client, "send", - side_effect=[first_response, second_response], + side_effect=[mcp_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], @@ -2324,7 +2324,7 @@ def failing_tool(message: str) -> str: model = OpenAIResponsesModel(model="gpt-4", openai_client=client) agent_with_tool = test_agent.clone(tools=[failing_tool], model=model) - first_response = get_model_response( + tool_response = get_model_response( Response( id="resp_1", output=[ @@ -2356,7 +2356,7 @@ def failing_tool(message: str) -> str: ) ) - second_response = get_model_response( + final_response = get_model_response( Response( id="resp_2", output=[ @@ -2397,7 +2397,7 @@ def failing_tool(message: str) -> str: with patch.object( agent_with_tool.model._client._client, "send", - side_effect=[first_response, second_response], + side_effect=[tool_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], @@ -2665,7 +2665,7 @@ def calculator(a: int, b: int) -> int: model = OpenAIResponsesModel(model="gpt-4", openai_client=client) agent_with_tool = test_agent.clone(tools=[calculator], model=model) - first_response = get_model_response( + tool_call_response = get_model_response( Response( id="resp_1", output=[ @@ -2697,7 +2697,7 @@ def calculator(a: int, b: int) -> int: ) ) - second_response = get_model_response( + final_response = get_model_response( Response( id="resp_2", output=[ @@ -2738,7 +2738,7 @@ def calculator(a: int, b: int) -> int: with patch.object( agent_with_tool.model._client._client, "send", - side_effect=[first_response, second_response], + side_effect=[tool_call_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], @@ -3260,7 +3260,7 @@ def simple_tool(message: str) -> str: model = OpenAIResponsesModel(model="gpt-4", openai_client=client) agent_with_tool = test_agent.clone(tools=[simple_tool], model=model) - first_response = get_model_response( + tool_response = get_model_response( Response( id="call_123", output=[ @@ -3292,7 +3292,7 @@ def simple_tool(message: str) -> str: ) ) - second_response = get_model_response( + final_response = get_model_response( Response( id="resp_final_789", output=[ @@ -3333,7 +3333,7 @@ def simple_tool(message: str) -> str: with patch.object( agent_with_tool.model._client._client, "send", - side_effect=[first_response, second_response], + side_effect=[tool_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], From 085dab99df3dda8682570d73bc696d9a63a92cea Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Fri, 6 Mar 2026 14:07:20 +0100 Subject: [PATCH 5/6] rename responses --- tests/integrations/openai_agents/test_openai_agents.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index 7663ea297f..6d2b782cca 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -1206,7 +1206,7 @@ def simple_test_tool(message: str) -> str: model = OpenAIResponsesModel(model="gpt-4", openai_client=client) agent_with_tool = test_agent.clone(tools=[simple_test_tool], model=model) - tool_call_response = get_model_response( + tool_response = get_model_response( Response( id="resp_tool_123", output=[ @@ -1279,7 +1279,7 @@ def simple_test_tool(message: str) -> str: with patch.object( agent_with_tool.model._client._client, "send", - side_effect=[tool_call_response, final_response], + side_effect=[tool_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], @@ -1972,7 +1972,7 @@ async def test_mcp_tool_execution_with_error( model = OpenAIResponsesModel(model="gpt-4", openai_client=client) agent = test_agent.clone(model=model) - mcp_call_with_error_response = get_model_response( + mcp_response = get_model_response( Response( id="resp_mcp_123", output=[ @@ -2047,7 +2047,7 @@ async def test_mcp_tool_execution_with_error( with patch.object( agent.model._client._client, "send", - side_effect=[mcp_call_with_error_response, final_response], + side_effect=[mcp_response, final_response], ) as _: sentry_init( integrations=[OpenAIAgentsIntegration()], From b8af88b112f03dcd984b926660f224a74f402ac9 Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Fri, 6 Mar 2026 14:33:20 +0100 Subject: [PATCH 6/6] restore handoff and mcp tests --- .../openai_agents/test_openai_agents.py | 759 +++++++----------- 1 file changed, 302 insertions(+), 457 deletions(-) diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index 6d2b782cca..b9cb43887a 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -937,121 +937,85 @@ def test_agent_invocation_span_sync( @pytest.mark.asyncio -async def test_handoff_span(sentry_init, capture_events, get_model_response): +async def test_handoff_span(sentry_init, capture_events, mock_usage): """ Test that handoff spans are created when agents hand off to other agents. """ - client = AsyncOpenAI(api_key="test-key") - model = OpenAIResponsesModel(model="gpt-4-mini", openai_client=client) - # Create two simple agents with a handoff relationship secondary_agent = agents.Agent( name="secondary_agent", instructions="You are a secondary agent.", - model=model, + model="gpt-4o-mini", ) primary_agent = agents.Agent( name="primary_agent", instructions="You are a primary agent that hands off to secondary agent.", - model=model, + model="gpt-4o-mini", handoffs=[secondary_agent], ) - handoff_response = get_model_response( - Response( - id="resp_tool_123", - output=[ - ResponseFunctionToolCall( - id="call_handoff_123", - call_id="call_handoff_123", - name="transfer_to_secondary_agent", - type="function_call", - arguments="{}", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4", - object="response", - usage=ResponseUsage( - input_tokens=10, - input_tokens_details=InputTokensDetails( - cached_tokens=0, - ), - output_tokens=20, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=5, - ), - total_tokens=30, - ), - ) - ) + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + with patch( + "agents.models.openai_responses.OpenAIResponsesModel.get_response" + ) as mock_get_response: + # Mock two responses: + # 1. Primary agent calls handoff tool + # 2. Secondary agent provides final response + handoff_response = ModelResponse( + output=[ + ResponseFunctionToolCall( + id="call_handoff_123", + call_id="call_handoff_123", + name="transfer_to_secondary_agent", + type="function_call", + arguments="{}", + ) + ], + usage=mock_usage, + response_id="resp_handoff_123", + ) - final_response = get_model_response( - Response( - id="resp_final_123", - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="I'm the specialist and I can help with that!", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4", - object="response", - usage=ResponseUsage( - input_tokens=10, - input_tokens_details=InputTokensDetails( - cached_tokens=0, - ), - output_tokens=20, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=5, - ), - total_tokens=30, - ), - ) - ) + final_response = ModelResponse( + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="I'm the specialist and I can help with that!", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + usage=mock_usage, + response_id="resp_final_123", + ) - with patch.object( - primary_agent.model._client._client, - "send", - side_effect=[handoff_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + mock_get_response.side_effect = [handoff_response, final_response] - events = capture_events() + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + ) - result = await agents.Runner.run( - primary_agent, - "Please hand off to secondary agent", - run_config=test_run_config, - ) + events = capture_events() - assert result is not None + result = await agents.Runner.run( + primary_agent, + "Please hand off to secondary agent", + run_config=test_run_config, + ) + + assert result is not None (transaction,) = events spans = transaction["spans"] - - handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF) + handoff_span = spans[2] # Verify handoff span was created assert handoff_span is not None @@ -1062,123 +1026,85 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response): @pytest.mark.asyncio -async def test_max_turns_before_handoff_span( - sentry_init, capture_events, get_model_response -): +async def test_max_turns_before_handoff_span(sentry_init, capture_events, mock_usage): """ Example raising agents.exceptions.AgentsException after the agent invocation span is complete. """ - client = AsyncOpenAI(api_key="test-key") - model = OpenAIResponsesModel(model="gpt-4-mini", openai_client=client) - # Create two simple agents with a handoff relationship secondary_agent = agents.Agent( name="secondary_agent", instructions="You are a secondary agent.", - model=model, + model="gpt-4o-mini", ) primary_agent = agents.Agent( name="primary_agent", instructions="You are a primary agent that hands off to secondary agent.", - model=model, + model="gpt-4o-mini", handoffs=[secondary_agent], ) - handoff_response = get_model_response( - Response( - id="resp_tool_123", - output=[ - ResponseFunctionToolCall( - id="call_handoff_123", - call_id="call_handoff_123", - name="transfer_to_secondary_agent", - type="function_call", - arguments="{}", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4", - object="response", - usage=ResponseUsage( - input_tokens=10, - input_tokens_details=InputTokensDetails( - cached_tokens=0, - ), - output_tokens=20, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=5, - ), - total_tokens=30, - ), - ) - ) - - final_response = get_model_response( - Response( - id="resp_final_123", - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="I'm the specialist and I can help with that!", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4", - object="response", - usage=ResponseUsage( - input_tokens=10, - input_tokens_details=InputTokensDetails( - cached_tokens=0, - ), - output_tokens=20, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=5, - ), - total_tokens=30, - ), - ) - ) + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + with patch( + "agents.models.openai_responses.OpenAIResponsesModel.get_response" + ) as mock_get_response: + # Mock two responses: + # 1. Primary agent calls handoff tool + # 2. Secondary agent provides final response + handoff_response = ModelResponse( + output=[ + ResponseFunctionToolCall( + id="call_handoff_123", + call_id="call_handoff_123", + name="transfer_to_secondary_agent", + type="function_call", + arguments="{}", + ) + ], + usage=mock_usage, + response_id="resp_handoff_123", + ) - with patch.object( - primary_agent.model._client._client, - "send", - side_effect=[handoff_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - ) + final_response = ModelResponse( + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="I'm the specialist and I can help with that!", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + usage=mock_usage, + response_id="resp_final_123", + ) - events = capture_events() + mock_get_response.side_effect = [handoff_response, final_response] - with pytest.raises(MaxTurnsExceeded): - await agents.Runner.run( - primary_agent, - "Please hand off to secondary agent", - run_config=test_run_config, - max_turns=1, + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, ) + events = capture_events() + + with pytest.raises(MaxTurnsExceeded): + await agents.Runner.run( + primary_agent, + "Please hand off to secondary agent", + run_config=test_run_config, + max_turns=1, + ) + (error, transaction) = events spans = transaction["spans"] - - handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF) + handoff_span = spans[2] # Verify handoff span was created assert handoff_span is not None @@ -1832,106 +1758,79 @@ async def test_span_status_error(sentry_init, capture_events, test_agent): @pytest.mark.asyncio -async def test_mcp_tool_execution_spans( - sentry_init, capture_events, test_agent, get_model_response -): +async def test_mcp_tool_execution_spans(sentry_init, capture_events, test_agent): """ Test that MCP (Model Context Protocol) tool calls create execute_tool spans. """ - client = AsyncOpenAI(api_key="test-key") - model = OpenAIResponsesModel(model="gpt-4", openai_client=client) - agent = test_agent.clone(model=model) - mcp_response = get_model_response( - Response( - id="resp_mcp_123", - output=[ - McpCall( - id="mcp_call_123", - name="test_mcp_tool", - arguments='{"query": "search term"}', - output="MCP tool executed successfully", - error=None, - type="mcp_call", - server_label="test_server", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4.1-2025-04-14", - object="response", - usage=ResponseUsage( - input_tokens=10, - input_tokens_details=InputTokensDetails( - cached_tokens=0, + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + with patch( + "agents.models.openai_responses.OpenAIResponsesModel.get_response" + ) as mock_get_response: + # Create a McpCall object + mcp_call = McpCall( + id="mcp_call_123", + name="test_mcp_tool", + arguments='{"query": "search term"}', + output="MCP tool executed successfully", + error=None, + type="mcp_call", + server_label="test_server", + ) + + # Create a ModelResponse with an McpCall in the output + mcp_response = ModelResponse( + output=[mcp_call], + usage=Usage( + requests=1, + input_tokens=10, + output_tokens=5, + total_tokens=15, ), - output_tokens=5, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, - ), - total_tokens=15, - ), - ) - ) + response_id="resp_mcp_123", + ) - final_response = get_model_response( - Response( - id="resp_final_123", - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="Task completed using MCP tool", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4.1-2025-04-14", - object="response", - usage=ResponseUsage( - input_tokens=15, - input_tokens_details=InputTokensDetails( - cached_tokens=0, - ), - output_tokens=10, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, + # Final response after MCP tool execution + final_response = ModelResponse( + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="Task completed using MCP tool", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + usage=Usage( + requests=1, + input_tokens=15, + output_tokens=10, + total_tokens=25, ), - total_tokens=25, - ), - ) - ) + response_id="resp_final_123", + ) - with patch.object( - agent.model._client._client, - "send", - side_effect=[mcp_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + mock_get_response.side_effect = [mcp_response, final_response] - events = capture_events() + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - await agents.Runner.run( - agent, - "Please use MCP tool", - run_config=test_run_config, - ) + events = capture_events() + + await agents.Runner.run( + test_agent, + "Please use MCP tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"] @@ -1962,106 +1861,79 @@ async def test_mcp_tool_execution_spans( @pytest.mark.asyncio -async def test_mcp_tool_execution_with_error( - sentry_init, capture_events, test_agent, get_model_response -): +async def test_mcp_tool_execution_with_error(sentry_init, capture_events, test_agent): """ Test that MCP tool calls with errors are tracked with error status. """ - client = AsyncOpenAI(api_key="test-key") - model = OpenAIResponsesModel(model="gpt-4", openai_client=client) - agent = test_agent.clone(model=model) - mcp_response = get_model_response( - Response( - id="resp_mcp_123", - output=[ - McpCall( - id="mcp_call_error_123", - name="failing_mcp_tool", - arguments='{"query": "test"}', - output=None, - error="MCP tool execution failed", - type="mcp_call", - server_label="test_server", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4.1-2025-04-14", - object="response", - usage=ResponseUsage( - input_tokens=10, - input_tokens_details=InputTokensDetails( - cached_tokens=0, - ), - output_tokens=5, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, - ), - total_tokens=15, - ), - ) - ) + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + with patch( + "agents.models.openai_responses.OpenAIResponsesModel.get_response" + ) as mock_get_response: + # Create a McpCall object with an error + mcp_call_with_error = McpCall( + id="mcp_call_error_123", + name="failing_mcp_tool", + arguments='{"query": "test"}', + output=None, + error="MCP tool execution failed", + type="mcp_call", + server_label="test_server", + ) - final_response = get_model_response( - Response( - id="resp_final_123", - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="Task completed using MCP tool", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4.1-2025-04-14", - object="response", - usage=ResponseUsage( - input_tokens=15, - input_tokens_details=InputTokensDetails( - cached_tokens=0, + # Create a ModelResponse with a failing McpCall + mcp_response = ModelResponse( + output=[mcp_call_with_error], + usage=Usage( + requests=1, + input_tokens=10, + output_tokens=5, + total_tokens=15, ), - output_tokens=10, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, + response_id="resp_mcp_error_123", + ) + + # Final response after error + final_response = ModelResponse( + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="The MCP tool encountered an error", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + usage=Usage( + requests=1, + input_tokens=15, + output_tokens=10, + total_tokens=25, ), - total_tokens=25, - ), - ) - ) + response_id="resp_final_error_123", + ) - with patch.object( - agent.model._client._client, - "send", - side_effect=[mcp_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=True, - ) + mock_get_response.side_effect = [mcp_response, final_response] - events = capture_events() + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=True, + ) - await agents.Runner.run( - agent, - "Please use failing MCP tool", - run_config=test_run_config, - ) + events = capture_events() + + await agents.Runner.run( + test_agent, + "Please use failing MCP tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"] @@ -2090,106 +1962,79 @@ async def test_mcp_tool_execution_with_error( @pytest.mark.asyncio -async def test_mcp_tool_execution_without_pii( - sentry_init, capture_events, test_agent, get_model_response -): +async def test_mcp_tool_execution_without_pii(sentry_init, capture_events, test_agent): """ Test that MCP tool input/output are not included when send_default_pii is False. """ - client = AsyncOpenAI(api_key="test-key") - model = OpenAIResponsesModel(model="gpt-4", openai_client=client) - agent = test_agent.clone(model=model) - mcp_response = get_model_response( - Response( - id="resp_mcp_123", - output=[ - McpCall( - id="mcp_call_pii_123", - name="test_mcp_tool", - arguments='{"query": "sensitive data"}', - output="Result with sensitive info", - error=None, - type="mcp_call", - server_label="test_server", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4.1-2025-04-14", - object="response", - usage=ResponseUsage( - input_tokens=10, - input_tokens_details=InputTokensDetails( - cached_tokens=0, - ), - output_tokens=5, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=0, - ), - total_tokens=15, - ), - ) - ) + with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}): + with patch( + "agents.models.openai_responses.OpenAIResponsesModel.get_response" + ) as mock_get_response: + # Create a McpCall object + mcp_call = McpCall( + id="mcp_call_pii_123", + name="test_mcp_tool", + arguments='{"query": "sensitive data"}', + output="Result with sensitive info", + error=None, + type="mcp_call", + server_label="test_server", + ) - final_response = get_model_response( - Response( - id="resp_final_123", - output=[ - ResponseOutputMessage( - id="msg_final", - type="message", - status="completed", - content=[ - ResponseOutputText( - text="Task completed", - type="output_text", - annotations=[], - ) - ], - role="assistant", - ) - ], - parallel_tool_calls=False, - tool_choice="none", - tools=[], - created_at=10000000, - model="gpt-4.1-2025-04-14", - object="response", - usage=ResponseUsage( - input_tokens=15, - input_tokens_details=InputTokensDetails( - cached_tokens=0, + # Create a ModelResponse with an McpCall + mcp_response = ModelResponse( + output=[mcp_call], + usage=Usage( + requests=1, + input_tokens=10, + output_tokens=5, + total_tokens=15, ), - output_tokens=10, - output_tokens_details=OutputTokensDetails( - reasoning_tokens=5, + response_id="resp_mcp_123", + ) + + # Final response + final_response = ModelResponse( + output=[ + ResponseOutputMessage( + id="msg_final", + type="message", + status="completed", + content=[ + ResponseOutputText( + text="Task completed", + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + usage=Usage( + requests=1, + input_tokens=15, + output_tokens=10, + total_tokens=25, ), - total_tokens=25, - ), - ) - ) + response_id="resp_final_123", + ) - with patch.object( - agent.model._client._client, - "send", - side_effect=[mcp_response, final_response], - ) as _: - sentry_init( - integrations=[OpenAIAgentsIntegration()], - traces_sample_rate=1.0, - send_default_pii=False, # PII disabled - ) + mock_get_response.side_effect = [mcp_response, final_response] - events = capture_events() + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + send_default_pii=False, # PII disabled + ) - await agents.Runner.run( - agent, - "Please use MCP tool", - run_config=test_run_config, - ) + events = capture_events() + + await agents.Runner.run( + test_agent, + "Please use MCP tool", + run_config=test_run_config, + ) (transaction,) = events spans = transaction["spans"]