From 769440f5c8bd4655fab83fc3b64f45b116175780 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb <alexander.webb@sentry.io>
Date: Fri, 6 Mar 2026 12:54:59 +0100
Subject: [PATCH 1/6] test(openai-agents): Replace mocks with httpx in
 multi-turn tests

---
 .../openai_agents/test_openai_agents.py       | 1535 ++++++++++-------
 1 file changed, 922 insertions(+), 613 deletions(-)

diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index 1390455317..af7c6011a7 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -5,10 +5,11 @@
 import os
 import json
 import logging
+import httpx
 
 import sentry_sdk
 from sentry_sdk import start_span
-from sentry_sdk.consts import SPANDATA
+from sentry_sdk.consts import SPANDATA, OP
 from sentry_sdk.integrations.logging import LoggingIntegration
 from sentry_sdk.integrations.openai_agents import OpenAIAgentsIntegration
 from sentry_sdk.integrations.openai_agents.utils import _set_input_data, safe_serialize
@@ -314,6 +315,25 @@ def test_agent_custom_model():
     )
 
 
+@pytest.fixture
+def get_model_response():  # factory fixture: yields `inner`, not a response
+    def inner(response_content):  # must expose .model_dump() (see below)
+        model_request = httpx.Request(
+            "POST",
+            "/responses",  # path only; no scheme or host is supplied here
+        )
+
+        response = httpx.Response(
+            200,  # HTTP status code of the canned reply
+            request=model_request,
+            content=json.dumps(response_content.model_dump()).encode("utf-8"),
+        )
+
+        return response  # tests feed these to a patched `send` as side_effect
+
+    return inner
+
+
 @pytest.mark.asyncio
 async def test_agent_invocation_span_no_pii(
     sentry_init, capture_events, test_agent, mock_model_response
@@ -917,85 +937,121 @@ def test_agent_invocation_span_sync(
 
 
 @pytest.mark.asyncio
-async def test_handoff_span(sentry_init, capture_events, mock_usage):
+async def test_handoff_span(sentry_init, capture_events, get_model_response):
     """
     Test that handoff spans are created when agents hand off to other agents.
     """
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4-mini", openai_client=client)
+
     # Create two simple agents with a handoff relationship
     secondary_agent = agents.Agent(
         name="secondary_agent",
         instructions="You are a secondary agent.",
-        model="gpt-4o-mini",
+        model=model,
     )
 
     primary_agent = agents.Agent(
         name="primary_agent",
         instructions="You are a primary agent that hands off to secondary agent.",
-        model="gpt-4o-mini",
+        model=model,
         handoffs=[secondary_agent],
     )
 
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
-        ) as mock_get_response:
-            # Mock two responses:
-            # 1. Primary agent calls handoff tool
-            # 2. Secondary agent provides final response
-            handoff_response = ModelResponse(
-                output=[
-                    ResponseFunctionToolCall(
-                        id="call_handoff_123",
-                        call_id="call_handoff_123",
-                        name="transfer_to_secondary_agent",
-                        type="function_call",
-                        arguments="{}",
-                    )
-                ],
-                usage=mock_usage,
-                response_id="resp_handoff_123",
-            )
-
-            final_response = ModelResponse(
-                output=[
-                    ResponseOutputMessage(
-                        id="msg_final",
-                        type="message",
-                        status="completed",
-                        content=[
-                            ResponseOutputText(
-                                text="I'm the specialist and I can help with that!",
-                                type="output_text",
-                                annotations=[],
-                            )
-                        ],
-                        role="assistant",
-                    )
-                ],
-                usage=mock_usage,
-                response_id="resp_final_123",
-            )
+    first_response = get_model_response(
+        Response(
+            id="resp_tool_123",
+            output=[
+                ResponseFunctionToolCall(
+                    id="call_handoff_123",
+                    call_id="call_handoff_123",
+                    name="transfer_to_secondary_agent",
+                    type="function_call",
+                    arguments="{}",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=5,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
+                ),
+                total_tokens=15,
+            ),
+        )
+    )
 
-            mock_get_response.side_effect = [handoff_response, final_response]
+    second_response = get_model_response(
+        Response(
+            id="resp_final_123",
+            output=[
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="I'm the specialist and I can help with that!",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=15,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=10,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
+                ),
+                total_tokens=25,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-            )
+    with patch.object(
+        primary_agent.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+        )
 
-            events = capture_events()
+        events = capture_events()
 
-            result = await agents.Runner.run(
-                primary_agent,
-                "Please hand off to secondary agent",
-                run_config=test_run_config,
-            )
+        result = await agents.Runner.run(
+            primary_agent,
+            "Please hand off to secondary agent",
+            run_config=test_run_config,
+        )
 
-            assert result is not None
+        assert result is not None
 
     (transaction,) = events
     spans = transaction["spans"]
-    handoff_span = spans[2]
+
+    handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF)
 
     # Verify handoff span was created
     assert handoff_span is not None
@@ -1006,85 +1062,123 @@ async def test_handoff_span(sentry_init, capture_events, mock_usage):
 
 
 @pytest.mark.asyncio
-async def test_max_turns_before_handoff_span(sentry_init, capture_events, mock_usage):
+async def test_max_turns_before_handoff_span(
+    sentry_init, capture_events, get_model_response
+):
     """
     Example raising agents.exceptions.AgentsException after the agent invocation span is complete.
     """
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4-mini", openai_client=client)
+
     # Create two simple agents with a handoff relationship
     secondary_agent = agents.Agent(
         name="secondary_agent",
         instructions="You are a secondary agent.",
-        model="gpt-4o-mini",
+        model=model,
     )
 
     primary_agent = agents.Agent(
         name="primary_agent",
         instructions="You are a primary agent that hands off to secondary agent.",
-        model="gpt-4o-mini",
+        model=model,
         handoffs=[secondary_agent],
     )
 
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
-        ) as mock_get_response:
-            # Mock two responses:
-            # 1. Primary agent calls handoff tool
-            # 2. Secondary agent provides final response
-            handoff_response = ModelResponse(
-                output=[
-                    ResponseFunctionToolCall(
-                        id="call_handoff_123",
-                        call_id="call_handoff_123",
-                        name="transfer_to_secondary_agent",
-                        type="function_call",
-                        arguments="{}",
-                    )
-                ],
-                usage=mock_usage,
-                response_id="resp_handoff_123",
-            )
-
-            final_response = ModelResponse(
-                output=[
-                    ResponseOutputMessage(
-                        id="msg_final",
-                        type="message",
-                        status="completed",
-                        content=[
-                            ResponseOutputText(
-                                text="I'm the specialist and I can help with that!",
-                                type="output_text",
-                                annotations=[],
-                            )
-                        ],
-                        role="assistant",
-                    )
-                ],
-                usage=mock_usage,
-                response_id="resp_final_123",
-            )
+    first_response = get_model_response(
+        Response(
+            id="resp_tool_123",
+            output=[
+                ResponseFunctionToolCall(
+                    id="call_handoff_123",
+                    call_id="call_handoff_123",
+                    name="transfer_to_secondary_agent",
+                    type="function_call",
+                    arguments="{}",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=5,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
+                ),
+                total_tokens=15,
+            ),
+        )
+    )
 
-            mock_get_response.side_effect = [handoff_response, final_response]
+    second_response = get_model_response(
+        Response(
+            id="resp_final_123",
+            output=[
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="I'm the specialist and I can help with that!",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=15,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=10,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
+                ),
+                total_tokens=25,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-            )
+    with patch.object(
+        primary_agent.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+        )
 
-            events = capture_events()
+        events = capture_events()
 
-            with pytest.raises(MaxTurnsExceeded):
-                await agents.Runner.run(
-                    primary_agent,
-                    "Please hand off to secondary agent",
-                    run_config=test_run_config,
-                    max_turns=1,
-                )
+        with pytest.raises(MaxTurnsExceeded):
+            await agents.Runner.run(
+                primary_agent,
+                "Please hand off to secondary agent",
+                run_config=test_run_config,
+                max_turns=1,
+            )
 
     (error, transaction) = events
     spans = transaction["spans"]
-    handoff_span = spans[2]
+
+    handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF)
 
     # Verify handoff span was created
     assert handoff_span is not None
@@ -1095,7 +1189,9 @@ async def test_max_turns_before_handoff_span(sentry_init, capture_events, mock_u
 
 
 @pytest.mark.asyncio
-async def test_tool_execution_span(sentry_init, capture_events, test_agent):
+async def test_tool_execution_span(
+    sentry_init, capture_events, test_agent, get_model_response
+):
     """
     Test tool execution span creation.
     """
@@ -1106,78 +1202,106 @@ def simple_test_tool(message: str) -> str:
         return f"Tool executed with: {message}"
 
     # Create agent with the tool
-    agent_with_tool = test_agent.clone(tools=[simple_test_tool])
-
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
-        ) as mock_get_response:
-            # Create a mock response that includes tool calls
-            tool_call = ResponseFunctionToolCall(
-                id="call_123",
-                call_id="call_123",
-                name="simple_test_tool",
-                type="function_call",
-                arguments='{"message": "hello"}',
-            )
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
+    agent_with_tool = test_agent.clone(tools=[simple_test_tool], model=model)
 
-            # First response with tool call
-            tool_response = ModelResponse(
-                output=[tool_call],
-                usage=Usage(
-                    requests=1, input_tokens=10, output_tokens=5, total_tokens=15
+    first_response = get_model_response(
+        Response(
+            id="resp_tool_123",
+            output=[
+                ResponseFunctionToolCall(
+                    id="call_123",
+                    call_id="call_123",
+                    name="simple_test_tool",
+                    type="function_call",
+                    arguments='{"message": "hello"}',
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
                 ),
-                response_id="resp_tool_123",
-            )
-
-            # Second response with final answer
-            final_response = ModelResponse(
-                output=[
-                    ResponseOutputMessage(
-                        id="msg_final",
-                        type="message",
-                        status="completed",
-                        content=[
-                            ResponseOutputText(
-                                text="Task completed using the tool",
-                                type="output_text",
-                                annotations=[],
-                            )
-                        ],
-                        role="assistant",
-                    )
-                ],
-                usage=Usage(
-                    requests=1, input_tokens=15, output_tokens=10, total_tokens=25
+                output_tokens=5,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
                 ),
-                response_id="resp_final_123",
-            )
-
-            # Return different responses on successive calls
-            mock_get_response.side_effect = [tool_response, final_response]
+                total_tokens=15,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-                send_default_pii=True,
-            )
+    second_response = get_model_response(
+        Response(
+            id="resp_final_123",
+            output=[
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Task completed using the tool",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=15,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=10,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
+                ),
+                total_tokens=25,
+            ),
+        )
+    )
 
-            events = capture_events()
+    with patch.object(
+        agent_with_tool.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+            send_default_pii=True,
+        )
 
-            await agents.Runner.run(
-                agent_with_tool,
-                "Please use the simple test tool",
-                run_config=test_run_config,
-            )
+        events = capture_events()
+
+        await agents.Runner.run(
+            agent_with_tool,
+            "Please use the simple test tool",
+            run_config=test_run_config,
+        )
 
     (transaction,) = events
     spans = transaction["spans"]
-    (
-        agent_span,
-        ai_client_span1,
-        tool_span,
-        ai_client_span2,
-    ) = spans
+    agent_span = next(span for span in spans if span["op"] == OP.GEN_AI_INVOKE_AGENT)
+    ai_client_span1, ai_client_span2 = (
+        span for span in spans if span["op"] == OP.GEN_AI_CHAT
+    )
+    tool_span = next(span for span in spans if span["op"] == OP.GEN_AI_EXECUTE_TOOL)
 
     available_tools = [
         {
@@ -1258,22 +1382,18 @@ def simple_test_tool(message: str) -> str:
     assert ai_client_span1["data"]["gen_ai.usage.output_tokens"] == 5
     assert ai_client_span1["data"]["gen_ai.usage.output_tokens.reasoning"] == 0
     assert ai_client_span1["data"]["gen_ai.usage.total_tokens"] == 15
-
-    tool_call = {
-        "arguments": '{"message": "hello"}',
-        "call_id": "call_123",
-        "name": "simple_test_tool",
-        "type": "function_call",
-        "id": "call_123",
-        "status": None,
-    }
-
-    if OPENAI_VERSION >= (2, 25, 0):
-        tool_call["namespace"] = None
-
-    assert json.loads(ai_client_span1["data"]["gen_ai.response.tool_calls"]) == [
-        tool_call
-    ]
+    assert ai_client_span1["data"]["gen_ai.response.tool_calls"] == safe_serialize(
+        [
+            {
+                "arguments": '{"message": "hello"}',
+                "call_id": "call_123",
+                "name": "simple_test_tool",
+                "type": "function_call",
+                "id": "call_123",
+                "status": None,
+            }
+        ]
+    )
 
     assert tool_span["description"] == "execute_tool simple_test_tool"
     assert tool_span["data"]["gen_ai.agent.name"] == "test_agent"
@@ -1708,79 +1828,106 @@ async def test_span_status_error(sentry_init, capture_events, test_agent):
 
 
 @pytest.mark.asyncio
-async def test_mcp_tool_execution_spans(sentry_init, capture_events, test_agent):
+async def test_mcp_tool_execution_spans(
+    sentry_init, capture_events, test_agent, get_model_response
+):
     """
     Test that MCP (Model Context Protocol) tool calls create execute_tool spans.
     """
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
+    agent = test_agent.clone(model=model)
 
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
-        ) as mock_get_response:
-            # Create a McpCall object
-            mcp_call = McpCall(
-                id="mcp_call_123",
-                name="test_mcp_tool",
-                arguments='{"query": "search term"}',
-                output="MCP tool executed successfully",
-                error=None,
-                type="mcp_call",
-                server_label="test_server",
-            )
-
-            # Create a ModelResponse with an McpCall in the output
-            mcp_response = ModelResponse(
-                output=[mcp_call],
-                usage=Usage(
-                    requests=1,
-                    input_tokens=10,
-                    output_tokens=5,
-                    total_tokens=15,
+    first_response = get_model_response(
+        Response(
+            id="resp_mcp_123",
+            output=[
+                McpCall(
+                    id="mcp_call_123",
+                    name="test_mcp_tool",
+                    arguments='{"query": "search term"}',
+                    output="MCP tool executed successfully",
+                    error=None,
+                    type="mcp_call",
+                    server_label="test_server",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4.1-2025-04-14",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
                 ),
-                response_id="resp_mcp_123",
-            )
-
-            # Final response after MCP tool execution
-            final_response = ModelResponse(
-                output=[
-                    ResponseOutputMessage(
-                        id="msg_final",
-                        type="message",
-                        status="completed",
-                        content=[
-                            ResponseOutputText(
-                                text="Task completed using MCP tool",
-                                type="output_text",
-                                annotations=[],
-                            )
-                        ],
-                        role="assistant",
-                    )
-                ],
-                usage=Usage(
-                    requests=1,
-                    input_tokens=15,
-                    output_tokens=10,
-                    total_tokens=25,
+                output_tokens=5,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
                 ),
-                response_id="resp_final_123",
-            )
+                total_tokens=15,
+            ),
+        )
+    )
 
-            mock_get_response.side_effect = [mcp_response, final_response]
+    second_response = get_model_response(
+        Response(
+            id="resp_final_123",
+            output=[
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Task completed using MCP tool",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4.1-2025-04-14",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=20,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=5,
+                ),
+                total_tokens=30,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-                send_default_pii=True,
-            )
+    with patch.object(
+        agent.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+            send_default_pii=True,
+        )
 
-            events = capture_events()
+        events = capture_events()
 
-            await agents.Runner.run(
-                test_agent,
-                "Please use MCP tool",
-                run_config=test_run_config,
-            )
+        await agents.Runner.run(
+            agent,
+            "Please use MCP tool",
+            run_config=test_run_config,
+        )
 
     (transaction,) = events
     spans = transaction["spans"]
@@ -1811,79 +1958,106 @@ async def test_mcp_tool_execution_spans(sentry_init, capture_events, test_agent)
 
 
 @pytest.mark.asyncio
-async def test_mcp_tool_execution_with_error(sentry_init, capture_events, test_agent):
+async def test_mcp_tool_execution_with_error(
+    sentry_init, capture_events, test_agent, get_model_response
+):
     """
     Test that MCP tool calls with errors are tracked with error status.
     """
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
+    agent = test_agent.clone(model=model)
 
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
-        ) as mock_get_response:
-            # Create a McpCall object with an error
-            mcp_call_with_error = McpCall(
-                id="mcp_call_error_123",
-                name="failing_mcp_tool",
-                arguments='{"query": "test"}',
-                output=None,
-                error="MCP tool execution failed",
-                type="mcp_call",
-                server_label="test_server",
-            )
-
-            # Create a ModelResponse with a failing McpCall
-            mcp_response = ModelResponse(
-                output=[mcp_call_with_error],
-                usage=Usage(
-                    requests=1,
-                    input_tokens=10,
-                    output_tokens=5,
-                    total_tokens=15,
+    first_response = get_model_response(
+        Response(
+            id="resp_mcp_123",
+            output=[
+                McpCall(
+                    id="mcp_call_error_123",
+                    name="failing_mcp_tool",
+                    arguments='{"query": "test"}',
+                    output=None,
+                    error="MCP tool execution failed",
+                    type="mcp_call",
+                    server_label="test_server",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4.1-2025-04-14",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
                 ),
-                response_id="resp_mcp_error_123",
-            )
-
-            # Final response after error
-            final_response = ModelResponse(
-                output=[
-                    ResponseOutputMessage(
-                        id="msg_final",
-                        type="message",
-                        status="completed",
-                        content=[
-                            ResponseOutputText(
-                                text="The MCP tool encountered an error",
-                                type="output_text",
-                                annotations=[],
-                            )
-                        ],
-                        role="assistant",
-                    )
-                ],
-                usage=Usage(
-                    requests=1,
-                    input_tokens=15,
-                    output_tokens=10,
-                    total_tokens=25,
+                output_tokens=5,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
                 ),
-                response_id="resp_final_error_123",
-            )
+                total_tokens=15,
+            ),
+        )
+    )
 
-            mock_get_response.side_effect = [mcp_response, final_response]
+    second_response = get_model_response(
+        Response(
+            id="resp_final_123",
+            output=[
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Task completed using MCP tool",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4.1-2025-04-14",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=20,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=5,
+                ),
+                total_tokens=30,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-                send_default_pii=True,
-            )
+    with patch.object(
+        agent.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+            send_default_pii=True,
+        )
 
-            events = capture_events()
+        events = capture_events()
 
-            await agents.Runner.run(
-                test_agent,
-                "Please use failing MCP tool",
-                run_config=test_run_config,
-            )
+        await agents.Runner.run(
+            agent,
+            "Please use failing MCP tool",
+            run_config=test_run_config,
+        )
 
     (transaction,) = events
     spans = transaction["spans"]
@@ -1912,79 +2086,106 @@ async def test_mcp_tool_execution_with_error(sentry_init, capture_events, test_a
 
 
 @pytest.mark.asyncio
-async def test_mcp_tool_execution_without_pii(sentry_init, capture_events, test_agent):
+async def test_mcp_tool_execution_without_pii(
+    sentry_init, capture_events, test_agent, get_model_response
+):
     """
     Test that MCP tool input/output are not included when send_default_pii is False.
     """
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
+    agent = test_agent.clone(model=model)
 
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
-        ) as mock_get_response:
-            # Create a McpCall object
-            mcp_call = McpCall(
-                id="mcp_call_pii_123",
-                name="test_mcp_tool",
-                arguments='{"query": "sensitive data"}',
-                output="Result with sensitive info",
-                error=None,
-                type="mcp_call",
-                server_label="test_server",
-            )
-
-            # Create a ModelResponse with an McpCall
-            mcp_response = ModelResponse(
-                output=[mcp_call],
-                usage=Usage(
-                    requests=1,
-                    input_tokens=10,
-                    output_tokens=5,
-                    total_tokens=15,
+    first_response = get_model_response(
+        Response(
+            id="resp_mcp_123",
+            output=[
+                McpCall(
+                    id="mcp_call_pii_123",
+                    name="test_mcp_tool",
+                    arguments='{"query": "sensitive data"}',
+                    output="Result with sensitive info",
+                    error=None,
+                    type="mcp_call",
+                    server_label="test_server",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4.1-2025-04-14",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
                 ),
-                response_id="resp_mcp_123",
-            )
+                output_tokens=5,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
+                ),
+                total_tokens=15,
+            ),
+        )
+    )
 
-            # Final response
-            final_response = ModelResponse(
-                output=[
-                    ResponseOutputMessage(
-                        id="msg_final",
-                        type="message",
-                        status="completed",
-                        content=[
-                            ResponseOutputText(
-                                text="Task completed",
-                                type="output_text",
-                                annotations=[],
-                            )
-                        ],
-                        role="assistant",
-                    )
-                ],
-                usage=Usage(
-                    requests=1,
-                    input_tokens=15,
-                    output_tokens=10,
-                    total_tokens=25,
+    second_response = get_model_response(
+        Response(
+            id="resp_final_123",
+            output=[
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Task completed",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4.1-2025-04-14",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
                 ),
-                response_id="resp_final_123",
-            )
-
-            mock_get_response.side_effect = [mcp_response, final_response]
+                output_tokens=20,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=5,
+                ),
+                total_tokens=30,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-                send_default_pii=False,  # PII disabled
-            )
+    with patch.object(
+        agent.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+            send_default_pii=False,  # PII disabled
+        )
 
-            events = capture_events()
+        events = capture_events()
 
-            await agents.Runner.run(
-                test_agent,
-                "Please use MCP tool",
-                run_config=test_run_config,
-            )
+        await agents.Runner.run(
+            agent,
+            "Please use MCP tool",
+            run_config=test_run_config,
+        )
 
     (transaction,) = events
     spans = transaction["spans"]
@@ -2095,7 +2296,9 @@ def test_openai_agents_message_role_mapping(
 
 
 @pytest.mark.asyncio
-async def test_tool_execution_error_tracing(sentry_init, capture_events, test_agent):
+async def test_tool_execution_error_tracing(
+    sentry_init, capture_events, test_agent, get_model_response
+):
     """
     Test that tool execution errors are properly tracked via error tracing patch.
 
@@ -2113,70 +2316,100 @@ def failing_tool(message: str) -> str:
         raise ValueError("Tool execution failed")
 
     # Create agent with the failing tool
-    agent_with_tool = test_agent.clone(tools=[failing_tool])
-
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
-        ) as mock_get_response:
-            # Create a mock response that includes tool call
-            tool_call = ResponseFunctionToolCall(
-                id="call_123",
-                call_id="call_123",
-                name="failing_tool",
-                type="function_call",
-                arguments='{"message": "test"}',
-            )
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
+    agent_with_tool = test_agent.clone(tools=[failing_tool], model=model)
 
-            # First response with tool call
-            tool_response = ModelResponse(
-                output=[tool_call],
-                usage=Usage(
-                    requests=1, input_tokens=10, output_tokens=5, total_tokens=15
+    first_response = get_model_response(
+        Response(
+            id="resp_1",
+            output=[
+                ResponseFunctionToolCall(
+                    id="call_123",
+                    call_id="call_123",
+                    name="failing_tool",
+                    type="function_call",
+                    arguments='{"message": "test"}',
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4.1-2025-04-14",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=5,
                 ),
-                response_id="resp_tool_123",
-            )
-
-            # Second response after tool error (agents library handles the error and continues)
-            final_response = ModelResponse(
-                output=[
-                    ResponseOutputMessage(
-                        id="msg_final",
-                        type="message",
-                        status="completed",
-                        content=[
-                            ResponseOutputText(
-                                text="An error occurred while running the tool",
-                                type="output_text",
-                                annotations=[],
-                            )
-                        ],
-                        role="assistant",
-                    )
-                ],
-                usage=Usage(
-                    requests=1, input_tokens=15, output_tokens=10, total_tokens=25
+                output_tokens=5,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=3,
                 ),
-                response_id="resp_final_123",
-            )
+                total_tokens=15,
+            ),
+        )
+    )
 
-            mock_get_response.side_effect = [tool_response, final_response]
+    second_response = get_model_response(
+        Response(
+            id="resp_2",
+            output=[
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="An error occurred while running the tool",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4-0613",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=20,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=15,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
+                ),
+                total_tokens=35,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-                send_default_pii=True,
-            )
+    with patch.object(
+        agent_with_tool.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+            send_default_pii=True,
+        )
 
-            events = capture_events()
+        events = capture_events()
 
-            # Note: The agents library catches tool exceptions internally,
-            # so we don't expect this to raise
-            await agents.Runner.run(
-                agent_with_tool,
-                "Please use the failing tool",
-                run_config=test_run_config,
-            )
+        # Note: The agents library catches tool exceptions internally,
+        # so we don't expect this to raise
+        await agents.Runner.run(
+            agent_with_tool,
+            "Please use the failing tool",
+            run_config=test_run_config,
+        )
 
     (transaction,) = events
     spans = transaction["spans"]
@@ -2412,7 +2645,7 @@ async def test_ai_client_span_response_model_with_chat_completions(
 
 @pytest.mark.asyncio
 async def test_multiple_llm_calls_aggregate_usage(
-    sentry_init, capture_events, test_agent
+    sentry_init, capture_events, test_agent, get_model_response
 ):
     """
     Test that invoke_agent spans show aggregated usage across multiple LLM calls
@@ -2424,79 +2657,100 @@ def calculator(a: int, b: int) -> int:
         """Add two numbers"""
         return a + b
 
-    agent_with_tool = test_agent.clone(tools=[calculator])
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
+    agent_with_tool = test_agent.clone(tools=[calculator], model=model)
 
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
-        ) as mock_get_response:
-            # First call: agent decides to use tool (10 input, 5 output tokens)
-            tool_call_response = ModelResponse(
-                output=[
-                    ResponseFunctionToolCall(
-                        id="call_123",
-                        call_id="call_123",
-                        name="calculator",
-                        type="function_call",
-                        arguments='{"a": 5, "b": 3}',
-                    )
-                ],
-                usage=Usage(
-                    requests=1,
-                    input_tokens=10,
-                    output_tokens=5,
-                    total_tokens=15,
-                    input_tokens_details=InputTokensDetails(cached_tokens=0),
-                    output_tokens_details=OutputTokensDetails(reasoning_tokens=0),
+    first_response = get_model_response(
+        Response(
+            id="resp_1",
+            output=[
+                ResponseFunctionToolCall(
+                    id="call_123",
+                    call_id="call_123",
+                    name="calculator",
+                    type="function_call",
+                    arguments='{"a": 5, "b": 3}',
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4.1-2025-04-14",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=5,
                 ),
-                response_id="resp_tool_call",
-            )
-
-            # Second call: agent uses tool result to respond (20 input, 15 output tokens)
-            final_response = ModelResponse(
-                output=[
-                    ResponseOutputMessage(
-                        id="msg_final",
-                        type="message",
-                        status="completed",
-                        content=[
-                            ResponseOutputText(
-                                text="The result is 8",
-                                type="output_text",
-                                annotations=[],
-                            )
-                        ],
-                        role="assistant",
-                    )
-                ],
-                usage=Usage(
-                    requests=1,
-                    input_tokens=20,
-                    output_tokens=15,
-                    total_tokens=35,
-                    input_tokens_details=InputTokensDetails(cached_tokens=5),
-                    output_tokens_details=OutputTokensDetails(reasoning_tokens=3),
+                output_tokens=5,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=3,
                 ),
-                response_id="resp_final",
-            )
+                total_tokens=15,
+            ),
+        )
+    )
 
-            mock_get_response.side_effect = [tool_call_response, final_response]
+    second_response = get_model_response(
+        Response(
+            id="resp_2",
+            output=[
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="The result is 8",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4-0613",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=20,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=15,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
+                ),
+                total_tokens=35,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-                send_default_pii=True,
-            )
+    with patch.object(
+        agent_with_tool.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+            send_default_pii=True,
+        )
 
-            events = capture_events()
+        events = capture_events()
 
-            result = await agents.Runner.run(
-                agent_with_tool,
-                "What is 5 + 3?",
-                run_config=test_run_config,
-            )
+        result = await agents.Runner.run(
+            agent_with_tool,
+            "What is 5 + 3?",
+            run_config=test_run_config,
+        )
 
-            assert result is not None
+        assert result is not None
 
     (transaction,) = events
     spans = transaction["spans"]
@@ -2656,7 +2910,10 @@ async def test_invoke_agent_span_includes_response_model(
 
 @pytest.mark.asyncio
 async def test_invoke_agent_span_uses_last_response_model(
-    sentry_init, capture_events, test_agent
+    sentry_init,
+    capture_events,
+    test_agent,
+    get_model_response,
 ):
     """
     Test that when an agent makes multiple LLM calls (e.g., with tools),
@@ -2668,17 +2925,14 @@ def calculator(a: int, b: int) -> int:
         """Add two numbers"""
         return a + b
 
-    agent_with_tool = test_agent.clone(tools=[calculator])
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
+    agent_with_tool = test_agent.clone(tools=[calculator], model=model)
 
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel._fetch_response"
-        ) as mock_fetch_response:
-            # First call: gpt-4 model returns tool call
-            first_response = MagicMock()
-            first_response.model = "gpt-4-0613"
-            first_response.id = "resp_1"
-            first_response.output = [
+    first_response = get_model_response(
+        Response(
+            id="resp_1",
+            output=[
                 ResponseFunctionToolCall(
                     id="call_123",
                     call_id="call_123",
@@ -2686,65 +2940,85 @@ def calculator(a: int, b: int) -> int:
                     type="function_call",
                     arguments='{"a": 5, "b": 3}',
                 )
-            ]
-            first_response.usage = MagicMock()
-            first_response.usage.input_tokens = 10
-            first_response.usage.output_tokens = 5
-            first_response.usage.total_tokens = 15
-            first_response.usage.input_tokens_details = InputTokensDetails(
-                cached_tokens=0
-            )
-            first_response.usage.output_tokens_details = OutputTokensDetails(
-                reasoning_tokens=0
-            )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4-0613",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=10,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=5,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=0,
+                ),
+                total_tokens=15,
+            ),
+        )
+    )
 
-            # Second call: different model version returns final message
-            second_response = MagicMock()
-            second_response.model = "gpt-4.1-2025-04-14"
-            second_response.id = "resp_2"
-            second_response.output = [
+    second_response = get_model_response(
+        Response(
+            id="resp_2",
+            output=[
                 ResponseOutputMessage(
                     id="msg_final",
                     type="message",
                     status="completed",
                     content=[
                         ResponseOutputText(
-                            text="The result is 8",
+                            text="I'm the specialist and I can help with that!",
                             type="output_text",
                             annotations=[],
                         )
                     ],
                     role="assistant",
                 )
-            ]
-            second_response.usage = MagicMock()
-            second_response.usage.input_tokens = 20
-            second_response.usage.output_tokens = 15
-            second_response.usage.total_tokens = 35
-            second_response.usage.input_tokens_details = InputTokensDetails(
-                cached_tokens=5
-            )
-            second_response.usage.output_tokens_details = OutputTokensDetails(
-                reasoning_tokens=3
-            )
-
-            mock_fetch_response.side_effect = [first_response, second_response]
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4.1-2025-04-14",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=20,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=0,
+                ),
+                output_tokens=15,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=5,
+                ),
+                total_tokens=35,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-                send_default_pii=True,
-            )
+    with patch.object(
+        agent_with_tool.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+            send_default_pii=True,
+        )
 
-            events = capture_events()
+        events = capture_events()
 
-            result = await agents.Runner.run(
-                agent_with_tool,
-                "What is 5 + 3?",
-                run_config=test_run_config,
-            )
+        result = await agents.Runner.run(
+            agent_with_tool,
+            "What is 5 + 3?",
+            run_config=test_run_config,
+        )
 
-            assert result is not None
+        assert result is not None
 
     (transaction,) = events
     spans = transaction["spans"]
@@ -2966,7 +3240,9 @@ async def test_conversation_id_on_all_spans(
     reason="conversation_id support requires openai-agents >= 0.4.0",
 )
 @pytest.mark.asyncio
-async def test_conversation_id_on_tool_span(sentry_init, capture_events, test_agent):
+async def test_conversation_id_on_tool_span(
+    sentry_init, capture_events, test_agent, get_model_response
+):
     """
     Test that gen_ai.conversation.id is set on tool execution spans when passed to Runner.run().
     """
@@ -2976,65 +3252,98 @@ def simple_tool(message: str) -> str:
         """A simple tool"""
         return f"Result: {message}"
 
-    agent_with_tool = test_agent.clone(tools=[simple_tool])
-
-    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
-        with patch(
-            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
-        ) as mock_get_response:
-            tool_call = ResponseFunctionToolCall(
-                id="call_123",
-                call_id="call_123",
-                name="simple_tool",
-                type="function_call",
-                arguments='{"message": "hello"}',
-            )
+    client = AsyncOpenAI(api_key="test-key")
+    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
+    agent_with_tool = test_agent.clone(tools=[simple_tool], model=model)
 
-            tool_response = ModelResponse(
-                output=[tool_call],
-                usage=Usage(
-                    requests=1, input_tokens=10, output_tokens=5, total_tokens=15
+    first_response = get_model_response(
+        Response(
+            id="call_123",
+            output=[
+                ResponseFunctionToolCall(
+                    id="call_123",
+                    call_id="call_123",
+                    name="simple_tool",
+                    type="function_call",
+                    arguments='{"message": "hello"}',
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=20,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=5,
                 ),
-                response_id="resp_tool_456",
-            )
-
-            final_response = ModelResponse(
-                output=[
-                    ResponseOutputMessage(
-                        id="msg_final",
-                        type="message",
-                        status="completed",
-                        content=[
-                            ResponseOutputText(
-                                text="Done",
-                                type="output_text",
-                                annotations=[],
-                            )
-                        ],
-                        role="assistant",
-                    )
-                ],
-                usage=Usage(
-                    requests=1, input_tokens=15, output_tokens=10, total_tokens=25
+                output_tokens=10,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=8,
                 ),
-                response_id="resp_final_789",
-            )
+                total_tokens=30,
+            ),
+        )
+    )
 
-            mock_get_response.side_effect = [tool_response, final_response]
+    second_response = get_model_response(
+        Response(
+            id="resp_final_789",
+            output=[
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Done",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ],
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            created_at=10000000,
+            model="gpt-4",
+            object="response",
+            usage=ResponseUsage(
+                input_tokens=20,
+                input_tokens_details=InputTokensDetails(
+                    cached_tokens=5,
+                ),
+                output_tokens=10,
+                output_tokens_details=OutputTokensDetails(
+                    reasoning_tokens=8,
+                ),
+                total_tokens=30,
+            ),
+        )
+    )
 
-            sentry_init(
-                integrations=[OpenAIAgentsIntegration()],
-                traces_sample_rate=1.0,
-            )
+    with patch.object(
+        agent_with_tool.model._client._client,
+        "send",
+        side_effect=[first_response, second_response],
+    ) as _:
+        sentry_init(
+            integrations=[OpenAIAgentsIntegration()],
+            traces_sample_rate=1.0,
+        )
 
-            events = capture_events()
+        events = capture_events()
 
-            await agents.Runner.run(
-                agent_with_tool,
-                "Use the tool",
-                run_config=test_run_config,
-                conversation_id="conv_tool_test_456",
-            )
+        await agents.Runner.run(
+            agent_with_tool,
+            "Use the tool",
+            run_config=test_run_config,
+            conversation_id="conv_tool_test_456",
+        )
 
     (transaction,) = events
     spans = transaction["spans"]

From a6c52d69c59d5cb010d10da616459d49477e02be Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb <alexander.webb@sentry.io>
Date: Fri, 6 Mar 2026 13:10:39 +0100
Subject: [PATCH 2/6] test(openai-agents): Allow namespace key in tool call assertion

---
 .../openai_agents/test_openai_agents.py       | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index af7c6011a7..aca7b96461 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -1382,18 +1382,22 @@ def simple_test_tool(message: str) -> str:
     assert ai_client_span1["data"]["gen_ai.usage.output_tokens"] == 5
     assert ai_client_span1["data"]["gen_ai.usage.output_tokens.reasoning"] == 0
     assert ai_client_span1["data"]["gen_ai.usage.total_tokens"] == 15
-    assert ai_client_span1["data"]["gen_ai.response.tool_calls"] == safe_serialize(
-        [
-            {
-                "arguments": '{"message": "hello"}',
-                "call_id": "call_123",
-                "name": "simple_test_tool",
-                "type": "function_call",
-                "id": "call_123",
-                "status": None,
-            }
-        ]
-    )
+
+    tool_call = {
+        "arguments": '{"message": "hello"}',
+        "call_id": "call_123",
+        "name": "simple_test_tool",
+        "type": "function_call",
+        "id": "call_123",
+        "status": None,
+    }
+
+    if OPENAI_VERSION >= (2, 25, 0):
+        tool_call["namespace"] = None
+
+    assert json.loads(ai_client_span1["data"]["gen_ai.response.tool_calls"]) == [
+        tool_call
+    ]
 
     assert tool_span["description"] == "execute_tool simple_test_tool"
     assert tool_span["data"]["gen_ai.agent.name"] == "test_agent"

From 298245da2717b11dd5b8ebb866fcb028cb23b0cc Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb <alexander.webb@sentry.io>
Date: Fri, 6 Mar 2026 13:53:06 +0100
Subject: [PATCH 3/6] restore token numbers

---
 .../openai_agents/test_openai_agents.py       | 78 +++++++++----------
 1 file changed, 39 insertions(+), 39 deletions(-)

diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index aca7b96461..cd52f30579 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -981,11 +981,11 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response):
                 input_tokens_details=InputTokensDetails(
                     cached_tokens=0,
                 ),
-                output_tokens=5,
+                output_tokens=20,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                    reasoning_tokens=5,
                 ),
-                total_tokens=15,
+                total_tokens=30,
             ),
         )
     )
@@ -1015,15 +1015,15 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response):
             model="gpt-4",
             object="response",
             usage=ResponseUsage(
-                input_tokens=15,
+                input_tokens=10,
                 input_tokens_details=InputTokensDetails(
                     cached_tokens=0,
                 ),
-                output_tokens=10,
+                output_tokens=20,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                    reasoning_tokens=5,
                 ),
-                total_tokens=25,
+                total_tokens=30,
             ),
         )
     )
@@ -1108,11 +1108,11 @@ async def test_max_turns_before_handoff_span(
                 input_tokens_details=InputTokensDetails(
                     cached_tokens=0,
                 ),
-                output_tokens=5,
+                output_tokens=20,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                    reasoning_tokens=5,
                 ),
-                total_tokens=15,
+                total_tokens=30,
             ),
         )
     )
@@ -1142,15 +1142,15 @@ async def test_max_turns_before_handoff_span(
             model="gpt-4",
             object="response",
             usage=ResponseUsage(
-                input_tokens=15,
+                input_tokens=10,
                 input_tokens_details=InputTokensDetails(
                     cached_tokens=0,
                 ),
-                output_tokens=10,
+                output_tokens=20,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                    reasoning_tokens=5,
                 ),
-                total_tokens=25,
+                total_tokens=30,
             ),
         )
     )
@@ -1901,15 +1901,15 @@ async def test_mcp_tool_execution_spans(
             model="gpt-4.1-2025-04-14",
             object="response",
             usage=ResponseUsage(
-                input_tokens=10,
+                input_tokens=15,
                 input_tokens_details=InputTokensDetails(
                     cached_tokens=0,
                 ),
-                output_tokens=20,
+                output_tokens=10,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=5,
+                    reasoning_tokens=0,
                 ),
-                total_tokens=30,
+                total_tokens=25,
             ),
         )
     )
@@ -2031,15 +2031,15 @@ async def test_mcp_tool_execution_with_error(
             model="gpt-4.1-2025-04-14",
             object="response",
             usage=ResponseUsage(
-                input_tokens=10,
+                input_tokens=15,
                 input_tokens_details=InputTokensDetails(
                     cached_tokens=0,
                 ),
-                output_tokens=20,
+                output_tokens=10,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=5,
+                    reasoning_tokens=0,
                 ),
-                total_tokens=30,
+                total_tokens=25,
             ),
         )
     )
@@ -2159,15 +2159,15 @@ async def test_mcp_tool_execution_without_pii(
             model="gpt-4.1-2025-04-14",
             object="response",
             usage=ResponseUsage(
-                input_tokens=10,
+                input_tokens=15,
                 input_tokens_details=InputTokensDetails(
                     cached_tokens=0,
                 ),
-                output_tokens=20,
+                output_tokens=10,
                 output_tokens_details=OutputTokensDetails(
                     reasoning_tokens=5,
                 ),
-                total_tokens=30,
+                total_tokens=25,
             ),
         )
     )
@@ -2345,11 +2345,11 @@ def failing_tool(message: str) -> str:
             usage=ResponseUsage(
                 input_tokens=10,
                 input_tokens_details=InputTokensDetails(
-                    cached_tokens=5,
+                    cached_tokens=0,
                 ),
                 output_tokens=5,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=3,
+                    reasoning_tokens=0,
                 ),
                 total_tokens=15,
             ),
@@ -2381,15 +2381,15 @@ def failing_tool(message: str) -> str:
             model="gpt-4-0613",
             object="response",
             usage=ResponseUsage(
-                input_tokens=20,
+                input_tokens=15,
                 input_tokens_details=InputTokensDetails(
                     cached_tokens=0,
                 ),
-                output_tokens=15,
+                output_tokens=10,
                 output_tokens_details=OutputTokensDetails(
                     reasoning_tokens=0,
                 ),
-                total_tokens=35,
+                total_tokens=25,
             ),
         )
     )
@@ -2686,11 +2686,11 @@ def calculator(a: int, b: int) -> int:
             usage=ResponseUsage(
                 input_tokens=10,
                 input_tokens_details=InputTokensDetails(
-                    cached_tokens=5,
+                    cached_tokens=0,
                 ),
                 output_tokens=5,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=3,
+                    reasoning_tokens=0,
                 ),
                 total_tokens=15,
             ),
@@ -2724,11 +2724,11 @@ def calculator(a: int, b: int) -> int:
             usage=ResponseUsage(
                 input_tokens=20,
                 input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
+                    cached_tokens=5,
                 ),
                 output_tokens=15,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                    reasoning_tokens=3,
                 ),
                 total_tokens=35,
             ),
@@ -3279,15 +3279,15 @@ def simple_tool(message: str) -> str:
             model="gpt-4",
             object="response",
             usage=ResponseUsage(
-                input_tokens=20,
+                input_tokens=10,
                 input_tokens_details=InputTokensDetails(
-                    cached_tokens=5,
+                    cached_tokens=0,
                 ),
-                output_tokens=10,
+                output_tokens=5,
                 output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=8,
+                    reasoning_tokens=0,
                 ),
-                total_tokens=30,
+                total_tokens=15,
             ),
         )
     )

From 7d966bd015cb99587b331bba5b3491ba740320ec Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb <alexander.webb@sentry.io>
Date: Fri, 6 Mar 2026 14:05:36 +0100
Subject: [PATCH 4/6] rename responses

---
 .../openai_agents/test_openai_agents.py       | 54 +++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index cd52f30579..7663ea297f 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -958,7 +958,7 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response):
         handoffs=[secondary_agent],
     )
 
-    first_response = get_model_response(
+    handoff_response = get_model_response(
         Response(
             id="resp_tool_123",
             output=[
@@ -990,7 +990,7 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response):
         )
     )
 
-    second_response = get_model_response(
+    final_response = get_model_response(
         Response(
             id="resp_final_123",
             output=[
@@ -1031,7 +1031,7 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response):
     with patch.object(
         primary_agent.model._client._client,
         "send",
-        side_effect=[first_response, second_response],
+        side_effect=[handoff_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],
@@ -1085,7 +1085,7 @@ async def test_max_turns_before_handoff_span(
         handoffs=[secondary_agent],
     )
 
-    first_response = get_model_response(
+    handoff_response = get_model_response(
         Response(
             id="resp_tool_123",
             output=[
@@ -1117,7 +1117,7 @@ async def test_max_turns_before_handoff_span(
         )
     )
 
-    second_response = get_model_response(
+    final_response = get_model_response(
         Response(
             id="resp_final_123",
             output=[
@@ -1158,7 +1158,7 @@ async def test_max_turns_before_handoff_span(
     with patch.object(
         primary_agent.model._client._client,
         "send",
-        side_effect=[first_response, second_response],
+        side_effect=[handoff_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],
@@ -1206,7 +1206,7 @@ def simple_test_tool(message: str) -> str:
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent_with_tool = test_agent.clone(tools=[simple_test_tool], model=model)
 
-    first_response = get_model_response(
+    tool_call_response = get_model_response(
         Response(
             id="resp_tool_123",
             output=[
@@ -1238,7 +1238,7 @@ def simple_test_tool(message: str) -> str:
         )
     )
 
-    second_response = get_model_response(
+    final_response = get_model_response(
         Response(
             id="resp_final_123",
             output=[
@@ -1279,7 +1279,7 @@ def simple_test_tool(message: str) -> str:
     with patch.object(
         agent_with_tool.model._client._client,
         "send",
-        side_effect=[first_response, second_response],
+        side_effect=[tool_call_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],
@@ -1842,7 +1842,7 @@ async def test_mcp_tool_execution_spans(
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent = test_agent.clone(model=model)
 
-    first_response = get_model_response(
+    mcp_response = get_model_response(
         Response(
             id="resp_mcp_123",
             output=[
@@ -1876,7 +1876,7 @@ async def test_mcp_tool_execution_spans(
         )
     )
 
-    second_response = get_model_response(
+    final_response = get_model_response(
         Response(
             id="resp_final_123",
             output=[
@@ -1917,7 +1917,7 @@ async def test_mcp_tool_execution_spans(
     with patch.object(
         agent.model._client._client,
         "send",
-        side_effect=[first_response, second_response],
+        side_effect=[mcp_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],
@@ -1972,7 +1972,7 @@ async def test_mcp_tool_execution_with_error(
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent = test_agent.clone(model=model)
 
-    first_response = get_model_response(
+    mcp_call_with_error_response = get_model_response(
         Response(
             id="resp_mcp_123",
             output=[
@@ -2006,7 +2006,7 @@ async def test_mcp_tool_execution_with_error(
         )
     )
 
-    second_response = get_model_response(
+    final_response = get_model_response(
         Response(
             id="resp_final_123",
             output=[
@@ -2047,7 +2047,7 @@ async def test_mcp_tool_execution_with_error(
     with patch.object(
         agent.model._client._client,
         "send",
-        side_effect=[first_response, second_response],
+        side_effect=[mcp_call_with_error_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],
@@ -2100,7 +2100,7 @@ async def test_mcp_tool_execution_without_pii(
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent = test_agent.clone(model=model)
 
-    first_response = get_model_response(
+    mcp_response = get_model_response(
         Response(
             id="resp_mcp_123",
             output=[
@@ -2134,7 +2134,7 @@ async def test_mcp_tool_execution_without_pii(
         )
     )
 
-    second_response = get_model_response(
+    final_response = get_model_response(
         Response(
             id="resp_final_123",
             output=[
@@ -2175,7 +2175,7 @@ async def test_mcp_tool_execution_without_pii(
     with patch.object(
         agent.model._client._client,
         "send",
-        side_effect=[first_response, second_response],
+        side_effect=[mcp_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],
@@ -2324,7 +2324,7 @@ def failing_tool(message: str) -> str:
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent_with_tool = test_agent.clone(tools=[failing_tool], model=model)
 
-    first_response = get_model_response(
+    tool_response = get_model_response(
         Response(
             id="resp_1",
             output=[
@@ -2356,7 +2356,7 @@ def failing_tool(message: str) -> str:
         )
     )
 
-    second_response = get_model_response(
+    final_response = get_model_response(
         Response(
             id="resp_2",
             output=[
@@ -2397,7 +2397,7 @@ def failing_tool(message: str) -> str:
     with patch.object(
         agent_with_tool.model._client._client,
         "send",
-        side_effect=[first_response, second_response],
+        side_effect=[tool_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],
@@ -2665,7 +2665,7 @@ def calculator(a: int, b: int) -> int:
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent_with_tool = test_agent.clone(tools=[calculator], model=model)
 
-    first_response = get_model_response(
+    tool_call_response = get_model_response(
         Response(
             id="resp_1",
             output=[
@@ -2697,7 +2697,7 @@ def calculator(a: int, b: int) -> int:
         )
     )
 
-    second_response = get_model_response(
+    final_response = get_model_response(
         Response(
             id="resp_2",
             output=[
@@ -2738,7 +2738,7 @@ def calculator(a: int, b: int) -> int:
     with patch.object(
         agent_with_tool.model._client._client,
         "send",
-        side_effect=[first_response, second_response],
+        side_effect=[tool_call_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],
@@ -3260,7 +3260,7 @@ def simple_tool(message: str) -> str:
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent_with_tool = test_agent.clone(tools=[simple_tool], model=model)
 
-    first_response = get_model_response(
+    tool_response = get_model_response(
         Response(
             id="call_123",
             output=[
@@ -3292,7 +3292,7 @@ def simple_tool(message: str) -> str:
         )
     )
 
-    second_response = get_model_response(
+    final_response = get_model_response(
         Response(
             id="resp_final_789",
             output=[
@@ -3333,7 +3333,7 @@ def simple_tool(message: str) -> str:
     with patch.object(
         agent_with_tool.model._client._client,
         "send",
-        side_effect=[first_response, second_response],
+        side_effect=[tool_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],

From 085dab99df3dda8682570d73bc696d9a63a92cea Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb <alexander.webb@sentry.io>
Date: Fri, 6 Mar 2026 14:07:20 +0100
Subject: [PATCH 5/6] rename responses

---
 tests/integrations/openai_agents/test_openai_agents.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index 7663ea297f..6d2b782cca 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -1206,7 +1206,7 @@ def simple_test_tool(message: str) -> str:
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent_with_tool = test_agent.clone(tools=[simple_test_tool], model=model)
 
-    tool_call_response = get_model_response(
+    tool_response = get_model_response(
         Response(
             id="resp_tool_123",
             output=[
@@ -1279,7 +1279,7 @@ def simple_test_tool(message: str) -> str:
     with patch.object(
         agent_with_tool.model._client._client,
         "send",
-        side_effect=[tool_call_response, final_response],
+        side_effect=[tool_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],
@@ -1972,7 +1972,7 @@ async def test_mcp_tool_execution_with_error(
     model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
     agent = test_agent.clone(model=model)
 
-    mcp_call_with_error_response = get_model_response(
+    mcp_response = get_model_response(
         Response(
             id="resp_mcp_123",
             output=[
@@ -2047,7 +2047,7 @@ async def test_mcp_tool_execution_with_error(
     with patch.object(
         agent.model._client._client,
         "send",
-        side_effect=[mcp_call_with_error_response, final_response],
+        side_effect=[mcp_response, final_response],
     ) as _:
         sentry_init(
             integrations=[OpenAIAgentsIntegration()],

From b8af88b112f03dcd984b926660f224a74f402ac9 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb <alexander.webb@sentry.io>
Date: Fri, 6 Mar 2026 14:33:20 +0100
Subject: [PATCH 6/6] restore handoff and mcp tests

---
 .../openai_agents/test_openai_agents.py       | 759 +++++++-----------
 1 file changed, 302 insertions(+), 457 deletions(-)

diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index 6d2b782cca..b9cb43887a 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -937,121 +937,85 @@ def test_agent_invocation_span_sync(
 
 
 @pytest.mark.asyncio
-async def test_handoff_span(sentry_init, capture_events, get_model_response):
+async def test_handoff_span(sentry_init, capture_events, mock_usage):
     """
     Test that handoff spans are created when agents hand off to other agents.
     """
-    client = AsyncOpenAI(api_key="test-key")
-    model = OpenAIResponsesModel(model="gpt-4-mini", openai_client=client)
-
     # Create two simple agents with a handoff relationship
     secondary_agent = agents.Agent(
         name="secondary_agent",
         instructions="You are a secondary agent.",
-        model=model,
+        model="gpt-4o-mini",
     )
 
     primary_agent = agents.Agent(
         name="primary_agent",
         instructions="You are a primary agent that hands off to secondary agent.",
-        model=model,
+        model="gpt-4o-mini",
         handoffs=[secondary_agent],
     )
 
-    handoff_response = get_model_response(
-        Response(
-            id="resp_tool_123",
-            output=[
-                ResponseFunctionToolCall(
-                    id="call_handoff_123",
-                    call_id="call_handoff_123",
-                    name="transfer_to_secondary_agent",
-                    type="function_call",
-                    arguments="{}",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=20,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=5,
-                ),
-                total_tokens=30,
-            ),
-        )
-    )
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            # Mock two responses:
+            # 1. Primary agent calls handoff tool
+            # 2. Secondary agent provides final response
+            handoff_response = ModelResponse(
+                output=[
+                    ResponseFunctionToolCall(
+                        id="call_handoff_123",
+                        call_id="call_handoff_123",
+                        name="transfer_to_secondary_agent",
+                        type="function_call",
+                        arguments="{}",
+                    )
+                ],
+                usage=mock_usage,
+                response_id="resp_handoff_123",
+            )
 
-    final_response = get_model_response(
-        Response(
-            id="resp_final_123",
-            output=[
-                ResponseOutputMessage(
-                    id="msg_final",
-                    type="message",
-                    status="completed",
-                    content=[
-                        ResponseOutputText(
-                            text="I'm the specialist and I can help with that!",
-                            type="output_text",
-                            annotations=[],
-                        )
-                    ],
-                    role="assistant",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=20,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=5,
-                ),
-                total_tokens=30,
-            ),
-        )
-    )
+            final_response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_final",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="I'm the specialist and I can help with that!",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=mock_usage,
+                response_id="resp_final_123",
+            )
 
-    with patch.object(
-        primary_agent.model._client._client,
-        "send",
-        side_effect=[handoff_response, final_response],
-    ) as _:
-        sentry_init(
-            integrations=[OpenAIAgentsIntegration()],
-            traces_sample_rate=1.0,
-        )
+            mock_get_response.side_effect = [handoff_response, final_response]
 
-        events = capture_events()
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+            )
 
-        result = await agents.Runner.run(
-            primary_agent,
-            "Please hand off to secondary agent",
-            run_config=test_run_config,
-        )
+            events = capture_events()
 
-        assert result is not None
+            result = await agents.Runner.run(
+                primary_agent,
+                "Please hand off to secondary agent",
+                run_config=test_run_config,
+            )
+
+            assert result is not None
 
     (transaction,) = events
     spans = transaction["spans"]
-
-    handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF)
+    handoff_span = spans[2]
 
     # Verify handoff span was created
     assert handoff_span is not None
@@ -1062,123 +1026,85 @@ async def test_handoff_span(sentry_init, capture_events, get_model_response):
 
 
 @pytest.mark.asyncio
-async def test_max_turns_before_handoff_span(
-    sentry_init, capture_events, get_model_response
-):
+async def test_max_turns_before_handoff_span(sentry_init, capture_events, mock_usage):
     """
     Example raising agents.exceptions.AgentsException after the agent invocation span is complete.
     """
-    client = AsyncOpenAI(api_key="test-key")
-    model = OpenAIResponsesModel(model="gpt-4-mini", openai_client=client)
-
     # Create two simple agents with a handoff relationship
     secondary_agent = agents.Agent(
         name="secondary_agent",
         instructions="You are a secondary agent.",
-        model=model,
+        model="gpt-4o-mini",
     )
 
     primary_agent = agents.Agent(
         name="primary_agent",
         instructions="You are a primary agent that hands off to secondary agent.",
-        model=model,
+        model="gpt-4o-mini",
         handoffs=[secondary_agent],
     )
 
-    handoff_response = get_model_response(
-        Response(
-            id="resp_tool_123",
-            output=[
-                ResponseFunctionToolCall(
-                    id="call_handoff_123",
-                    call_id="call_handoff_123",
-                    name="transfer_to_secondary_agent",
-                    type="function_call",
-                    arguments="{}",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=20,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=5,
-                ),
-                total_tokens=30,
-            ),
-        )
-    )
-
-    final_response = get_model_response(
-        Response(
-            id="resp_final_123",
-            output=[
-                ResponseOutputMessage(
-                    id="msg_final",
-                    type="message",
-                    status="completed",
-                    content=[
-                        ResponseOutputText(
-                            text="I'm the specialist and I can help with that!",
-                            type="output_text",
-                            annotations=[],
-                        )
-                    ],
-                    role="assistant",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=20,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=5,
-                ),
-                total_tokens=30,
-            ),
-        )
-    )
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            # Mock two responses:
+            # 1. Primary agent calls handoff tool
+            # 2. Secondary agent provides final response
+            handoff_response = ModelResponse(
+                output=[
+                    ResponseFunctionToolCall(
+                        id="call_handoff_123",
+                        call_id="call_handoff_123",
+                        name="transfer_to_secondary_agent",
+                        type="function_call",
+                        arguments="{}",
+                    )
+                ],
+                usage=mock_usage,
+                response_id="resp_handoff_123",
+            )
 
-    with patch.object(
-        primary_agent.model._client._client,
-        "send",
-        side_effect=[handoff_response, final_response],
-    ) as _:
-        sentry_init(
-            integrations=[OpenAIAgentsIntegration()],
-            traces_sample_rate=1.0,
-        )
+            final_response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_final",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="I'm the specialist and I can help with that!",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=mock_usage,
+                response_id="resp_final_123",
+            )
 
-        events = capture_events()
+            mock_get_response.side_effect = [handoff_response, final_response]
 
-        with pytest.raises(MaxTurnsExceeded):
-            await agents.Runner.run(
-                primary_agent,
-                "Please hand off to secondary agent",
-                run_config=test_run_config,
-                max_turns=1,
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
             )
 
+            events = capture_events()
+
+            with pytest.raises(MaxTurnsExceeded):
+                await agents.Runner.run(
+                    primary_agent,
+                    "Please hand off to secondary agent",
+                    run_config=test_run_config,
+                    max_turns=1,
+                )
+
     (error, transaction) = events
     spans = transaction["spans"]
-
-    handoff_span = next(span for span in spans if span.get("op") == OP.GEN_AI_HANDOFF)
+    handoff_span = spans[2]
 
     # Verify handoff span was created
     assert handoff_span is not None
@@ -1832,106 +1758,79 @@ async def test_span_status_error(sentry_init, capture_events, test_agent):
 
 
 @pytest.mark.asyncio
-async def test_mcp_tool_execution_spans(
-    sentry_init, capture_events, test_agent, get_model_response
-):
+async def test_mcp_tool_execution_spans(sentry_init, capture_events, test_agent):
     """
     Test that MCP (Model Context Protocol) tool calls create execute_tool spans.
     """
-    client = AsyncOpenAI(api_key="test-key")
-    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
-    agent = test_agent.clone(model=model)
 
-    mcp_response = get_model_response(
-        Response(
-            id="resp_mcp_123",
-            output=[
-                McpCall(
-                    id="mcp_call_123",
-                    name="test_mcp_tool",
-                    arguments='{"query": "search term"}',
-                    output="MCP tool executed successfully",
-                    error=None,
-                    type="mcp_call",
-                    server_label="test_server",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4.1-2025-04-14",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            # Create a McpCall object
+            mcp_call = McpCall(
+                id="mcp_call_123",
+                name="test_mcp_tool",
+                arguments='{"query": "search term"}',
+                output="MCP tool executed successfully",
+                error=None,
+                type="mcp_call",
+                server_label="test_server",
+            )
+
+            # Create a ModelResponse with an McpCall in the output
+            mcp_response = ModelResponse(
+                output=[mcp_call],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=10,
+                    output_tokens=5,
+                    total_tokens=15,
                 ),
-                output_tokens=5,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
-                ),
-                total_tokens=15,
-            ),
-        )
-    )
+                response_id="resp_mcp_123",
+            )
 
-    final_response = get_model_response(
-        Response(
-            id="resp_final_123",
-            output=[
-                ResponseOutputMessage(
-                    id="msg_final",
-                    type="message",
-                    status="completed",
-                    content=[
-                        ResponseOutputText(
-                            text="Task completed using MCP tool",
-                            type="output_text",
-                            annotations=[],
-                        )
-                    ],
-                    role="assistant",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4.1-2025-04-14",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=15,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=10,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+            # Final response after MCP tool execution
+            final_response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_final",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="Task completed using MCP tool",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=15,
+                    output_tokens=10,
+                    total_tokens=25,
                 ),
-                total_tokens=25,
-            ),
-        )
-    )
+                response_id="resp_final_123",
+            )
 
-    with patch.object(
-        agent.model._client._client,
-        "send",
-        side_effect=[mcp_response, final_response],
-    ) as _:
-        sentry_init(
-            integrations=[OpenAIAgentsIntegration()],
-            traces_sample_rate=1.0,
-            send_default_pii=True,
-        )
+            mock_get_response.side_effect = [mcp_response, final_response]
 
-        events = capture_events()
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
 
-        await agents.Runner.run(
-            agent,
-            "Please use MCP tool",
-            run_config=test_run_config,
-        )
+            events = capture_events()
+
+            await agents.Runner.run(
+                test_agent,
+                "Please use MCP tool",
+                run_config=test_run_config,
+            )
 
     (transaction,) = events
     spans = transaction["spans"]
@@ -1962,106 +1861,79 @@ async def test_mcp_tool_execution_spans(
 
 
 @pytest.mark.asyncio
-async def test_mcp_tool_execution_with_error(
-    sentry_init, capture_events, test_agent, get_model_response
-):
+async def test_mcp_tool_execution_with_error(sentry_init, capture_events, test_agent):
     """
     Test that MCP tool calls with errors are tracked with error status.
     """
-    client = AsyncOpenAI(api_key="test-key")
-    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
-    agent = test_agent.clone(model=model)
 
-    mcp_response = get_model_response(
-        Response(
-            id="resp_mcp_123",
-            output=[
-                McpCall(
-                    id="mcp_call_error_123",
-                    name="failing_mcp_tool",
-                    arguments='{"query": "test"}',
-                    output=None,
-                    error="MCP tool execution failed",
-                    type="mcp_call",
-                    server_label="test_server",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4.1-2025-04-14",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=5,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
-                ),
-                total_tokens=15,
-            ),
-        )
-    )
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            # Create a McpCall object with an error
+            mcp_call_with_error = McpCall(
+                id="mcp_call_error_123",
+                name="failing_mcp_tool",
+                arguments='{"query": "test"}',
+                output=None,
+                error="MCP tool execution failed",
+                type="mcp_call",
+                server_label="test_server",
+            )
 
-    final_response = get_model_response(
-        Response(
-            id="resp_final_123",
-            output=[
-                ResponseOutputMessage(
-                    id="msg_final",
-                    type="message",
-                    status="completed",
-                    content=[
-                        ResponseOutputText(
-                            text="Task completed using MCP tool",
-                            type="output_text",
-                            annotations=[],
-                        )
-                    ],
-                    role="assistant",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4.1-2025-04-14",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=15,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
+            # Create a ModelResponse with a failing McpCall
+            mcp_response = ModelResponse(
+                output=[mcp_call_with_error],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=10,
+                    output_tokens=5,
+                    total_tokens=15,
                 ),
-                output_tokens=10,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
+                response_id="resp_mcp_error_123",
+            )
+
+            # Final response after error
+            final_response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_final",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="The MCP tool encountered an error",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=15,
+                    output_tokens=10,
+                    total_tokens=25,
                 ),
-                total_tokens=25,
-            ),
-        )
-    )
+                response_id="resp_final_error_123",
+            )
 
-    with patch.object(
-        agent.model._client._client,
-        "send",
-        side_effect=[mcp_response, final_response],
-    ) as _:
-        sentry_init(
-            integrations=[OpenAIAgentsIntegration()],
-            traces_sample_rate=1.0,
-            send_default_pii=True,
-        )
+            mock_get_response.side_effect = [mcp_response, final_response]
 
-        events = capture_events()
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
 
-        await agents.Runner.run(
-            agent,
-            "Please use failing MCP tool",
-            run_config=test_run_config,
-        )
+            events = capture_events()
+
+            await agents.Runner.run(
+                test_agent,
+                "Please use failing MCP tool",
+                run_config=test_run_config,
+            )
 
     (transaction,) = events
     spans = transaction["spans"]
@@ -2090,106 +1962,79 @@ async def test_mcp_tool_execution_with_error(
 
 
 @pytest.mark.asyncio
-async def test_mcp_tool_execution_without_pii(
-    sentry_init, capture_events, test_agent, get_model_response
-):
+async def test_mcp_tool_execution_without_pii(sentry_init, capture_events, test_agent):
     """
     Test that MCP tool input/output are not included when send_default_pii is False.
     """
-    client = AsyncOpenAI(api_key="test-key")
-    model = OpenAIResponsesModel(model="gpt-4", openai_client=client)
-    agent = test_agent.clone(model=model)
 
-    mcp_response = get_model_response(
-        Response(
-            id="resp_mcp_123",
-            output=[
-                McpCall(
-                    id="mcp_call_pii_123",
-                    name="test_mcp_tool",
-                    arguments='{"query": "sensitive data"}',
-                    output="Result with sensitive info",
-                    error=None,
-                    type="mcp_call",
-                    server_label="test_server",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4.1-2025-04-14",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=10,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
-                ),
-                output_tokens=5,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=0,
-                ),
-                total_tokens=15,
-            ),
-        )
-    )
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            # Create a McpCall object
+            mcp_call = McpCall(
+                id="mcp_call_pii_123",
+                name="test_mcp_tool",
+                arguments='{"query": "sensitive data"}',
+                output="Result with sensitive info",
+                error=None,
+                type="mcp_call",
+                server_label="test_server",
+            )
 
-    final_response = get_model_response(
-        Response(
-            id="resp_final_123",
-            output=[
-                ResponseOutputMessage(
-                    id="msg_final",
-                    type="message",
-                    status="completed",
-                    content=[
-                        ResponseOutputText(
-                            text="Task completed",
-                            type="output_text",
-                            annotations=[],
-                        )
-                    ],
-                    role="assistant",
-                )
-            ],
-            parallel_tool_calls=False,
-            tool_choice="none",
-            tools=[],
-            created_at=10000000,
-            model="gpt-4.1-2025-04-14",
-            object="response",
-            usage=ResponseUsage(
-                input_tokens=15,
-                input_tokens_details=InputTokensDetails(
-                    cached_tokens=0,
+            # Create a ModelResponse with an McpCall
+            mcp_response = ModelResponse(
+                output=[mcp_call],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=10,
+                    output_tokens=5,
+                    total_tokens=15,
                 ),
-                output_tokens=10,
-                output_tokens_details=OutputTokensDetails(
-                    reasoning_tokens=5,
+                response_id="resp_mcp_123",
+            )
+
+            # Final response
+            final_response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_final",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="Task completed",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=15,
+                    output_tokens=10,
+                    total_tokens=25,
                 ),
-                total_tokens=25,
-            ),
-        )
-    )
+                response_id="resp_final_123",
+            )
 
-    with patch.object(
-        agent.model._client._client,
-        "send",
-        side_effect=[mcp_response, final_response],
-    ) as _:
-        sentry_init(
-            integrations=[OpenAIAgentsIntegration()],
-            traces_sample_rate=1.0,
-            send_default_pii=False,  # PII disabled
-        )
+            mock_get_response.side_effect = [mcp_response, final_response]
 
-        events = capture_events()
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=False,  # PII disabled
+            )
 
-        await agents.Runner.run(
-            agent,
-            "Please use MCP tool",
-            run_config=test_run_config,
-        )
+            events = capture_events()
+
+            await agents.Runner.run(
+                test_agent,
+                "Please use MCP tool",
+                run_config=test_run_config,
+            )
 
     (transaction,) = events
     spans = transaction["spans"]