From a2d639b6d2814e0312d9703e4abb588d8f775578 Mon Sep 17 00:00:00 2001
From: jordanchendev <jordan.chen.dev@gmail.com>
Date: Thu, 4 Jun 2026 09:13:19 +0800
Subject: [PATCH 1/8] feat: emit agent spans in RealtimeSession for SDK tracing

RealtimeSession.__aenter__ now creates and starts an agent_span for the
starting agent, and finishes it when the session closes. On handoff, the
outgoing agent's span is finished and a new span is started for the
incoming agent, matching the tracing behaviour of the regular Runner.

The span records the agent name, tool names, and handoff target names.
Passing tracing_disabled=True in run_config suppresses all spans.

Fixes #1845
---
 src/agents/realtime/session.py            |  31 ++++
 tests/realtime/test_session.py            |   8 +
 tests/realtime/test_session_exceptions.py |   2 +
 tests/realtime/test_session_spans.py      | 169 ++++++++++++++++++++++
 4 files changed, 210 insertions(+)
 create mode 100644 tests/realtime/test_session_spans.py

diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
index ca809dd9c4..cbcbd02874 100644
--- a/src/agents/realtime/session.py
+++ b/src/agents/realtime/session.py
@@ -24,6 +24,8 @@
 from ..run_context import RunContextWrapper, TContext
 from ..tool import DEFAULT_APPROVAL_REJECTION_MESSAGE, FunctionTool, invoke_function_tool
 from ..tool_context import ToolContext
+from ..tracing import Span, agent_span
+from ..tracing.span_data import AgentSpanData
 from ..util._approvals import evaluate_needs_approval_setting
 from .agent import RealtimeAgent
 from .config import RealtimeRunConfig, RealtimeSessionModelSettings, RealtimeUserInput
@@ -193,6 +195,7 @@ def __init__(
         self._guardrail_tasks: set[asyncio.Task[Any]] = set()
         self._tool_call_tasks: set[asyncio.Task[Any]] = set()
         self._async_tool_calls: bool = bool(self._run_config.get("async_tool_calls", True))
+        self._current_agent_span: Span[AgentSpanData] | None = None
 
     @property
     def model(self) -> RealtimeModel:
@@ -203,6 +206,10 @@ async def __aenter__(self) -> RealtimeSession:
         """Start the session by connecting to the model. After this, you will be able to stream
         events from the model and send messages and audio to the model.
         """
+        # Start an agent span for the initial agent.
+        self._current_agent_span = self._make_agent_span(self._current_agent)
+        self._current_agent_span.start(mark_as_current=True)
+
         # Add ourselves as a listener
         self._model.add_listener(self)
 
@@ -815,9 +822,16 @@ async def _handle_tool_call(
                 # Store previous agent for event
                 previous_agent = agent
 
+                # Finish the span for the outgoing agent and start one for the new agent.
+                if self._current_agent_span is not None:
+                    self._current_agent_span.finish(reset_current=True)
+
                 # Update current agent
                 self._current_agent = result
 
+                self._current_agent_span = self._make_agent_span(self._current_agent)
+                self._current_agent_span.start(mark_as_current=True)
+
                 # Get updated model settings from new agent
                 updated_settings = await self._get_updated_model_settings_from_agent(
                     starting_settings=None,
@@ -1235,6 +1249,11 @@ async def _cleanup(self) -> None:
             self._wake_event_iterators()
             return
 
+        # Finish the active agent span.
+        if self._current_agent_span is not None:
+            self._current_agent_span.finish(reset_current=True)
+            self._current_agent_span = None
+
         # Cancel and cleanup guardrail tasks
         self._cleanup_guardrail_tasks()
         self._cleanup_tool_call_tasks()
@@ -1253,6 +1272,18 @@ async def _cleanup(self) -> None:
         self._closed = True
         self._wake_event_iterators()
 
+    def _make_agent_span(self, agent: RealtimeAgent) -> Span[AgentSpanData]:
+        """Create a new agent span for the given agent, respecting tracing_disabled."""
+        disabled: bool = bool(self._run_config.get("tracing_disabled", False))
+        handoff_names = [h.agent_name if isinstance(h, Handoff) else h.name for h in agent.handoffs]
+        tool_names = [t.name for t in agent.tools if isinstance(t, FunctionTool)]
+        return agent_span(
+            name=agent.name,
+            handoffs=handoff_names or None,
+            tools=tool_names or None,
+            disabled=disabled,
+        )
+
     async def _get_updated_model_settings_from_agent(
         self,
         starting_settings: RealtimeSessionModelSettings | None,
diff --git a/tests/realtime/test_session.py b/tests/realtime/test_session.py
index 03148c739a..e45db679f5 100644
--- a/tests/realtime/test_session.py
+++ b/tests/realtime/test_session.py
@@ -325,9 +325,11 @@ async def close(self):
 @pytest.fixture
 def mock_agent():
     agent = Mock(spec=RealtimeAgent)
+    agent.name = "mock_agent"
     agent.get_all_tools = AsyncMock(return_value=[])
 
     type(agent).handoffs = PropertyMock(return_value=[])
+    type(agent).tools = PropertyMock(return_value=[])
     type(agent).output_guardrails = PropertyMock(return_value=[])
     return agent
 
@@ -2463,9 +2465,11 @@ async def test_session_gets_model_settings_from_agent_during_connection(self):
 
         # Create agent with specific settings
         agent = Mock(spec=RealtimeAgent)
+        agent.name = "test_agent"
         agent.get_system_prompt = AsyncMock(return_value="Test agent instructions")
         agent.get_all_tools = AsyncMock(return_value=[{"type": "function", "name": "test_tool"}])
         agent.handoffs = []
+        agent.tools = []
 
         session = RealtimeSession(mock_model, agent, None)
 
@@ -2492,9 +2496,11 @@ async def test_model_config_overrides_model_settings_not_agent(self):
         mock_model.add_listener = Mock()
 
         agent = Mock(spec=RealtimeAgent)
+        agent.name = "test_agent"
         agent.get_system_prompt = AsyncMock(return_value="Agent instructions")
         agent.get_all_tools = AsyncMock(return_value=[{"type": "function", "name": "agent_tool"}])
         agent.handoffs = []
+        agent.tools = []
 
         # Provide model config with settings
         model_config: RealtimeModelConfig = {
@@ -2530,8 +2536,10 @@ async def test_handoffs_are_included_in_model_settings(self):
 
         # Create agent with handoffs
         agent = Mock(spec=RealtimeAgent)
+        agent.name = "test_agent"
         agent.get_system_prompt = AsyncMock(return_value="Agent with handoffs")
         agent.get_all_tools = AsyncMock(return_value=[])
+        agent.tools = []
 
         # Create a mock handoff
         handoff_agent = Mock(spec=RealtimeAgent)
diff --git a/tests/realtime/test_session_exceptions.py b/tests/realtime/test_session_exceptions.py
index da93902368..f306761154 100644
--- a/tests/realtime/test_session_exceptions.py
+++ b/tests/realtime/test_session_exceptions.py
@@ -89,9 +89,11 @@ async def interrupt(self) -> None:
 def fake_agent():
     """Create a fake agent for testing."""
     agent = Mock()
+    agent.name = "fake_agent"
     agent.get_all_tools = AsyncMock(return_value=[])
     agent.get_system_prompt = AsyncMock(return_value="test instructions")
     agent.handoffs = []
+    agent.tools = []
     return agent
 
 
diff --git a/tests/realtime/test_session_spans.py b/tests/realtime/test_session_spans.py
new file mode 100644
index 0000000000..3eea62c975
--- /dev/null
+++ b/tests/realtime/test_session_spans.py
@@ -0,0 +1,169 @@
+"""Tests that RealtimeSession creates agent spans for SDK-level tracing."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from agents.realtime.agent import RealtimeAgent
+from agents.realtime.model import RealtimeModel, RealtimeModelConfig, RealtimeModelListener
+from agents.realtime.model_events import RealtimeModelEvent
+from agents.realtime.session import RealtimeSession
+from agents.tracing import trace
+from agents.tracing.span_data import AgentSpanData
+from tests.testing_processor import SPAN_PROCESSOR_TESTING
+
+
+class _FakeRealtimeModel(RealtimeModel):
+    """Minimal fake that never sends events and succeeds immediately."""
+
+    def __init__(self) -> None:
+        self._listeners: list[RealtimeModelListener] = []
+
+    def add_listener(self, listener: RealtimeModelListener) -> None:
+        self._listeners.append(listener)
+
+    def remove_listener(self, listener: RealtimeModelListener) -> None:
+        if listener in self._listeners:
+            self._listeners.remove(listener)
+
+    async def connect(self, options: RealtimeModelConfig) -> None:
+        pass
+
+    async def close(self) -> None:
+        pass
+
+    async def send_event(self, event: Any) -> None:
+        pass
+
+    async def send_message(
+        self, message: Any, other_event_data: dict[str, Any] | None = None
+    ) -> None:
+        pass
+
+    async def send_audio(self, audio: bytes, *, commit: bool = False) -> None:
+        pass
+
+    async def send_tool_output(self, tool_call: Any, output: str, start_response: bool) -> None:
+        pass
+
+    async def interrupt(self) -> None:
+        pass
+
+    async def dispatch(self, event: RealtimeModelEvent) -> None:
+        """Send an event to all listeners (test helper)."""
+        for listener in self._listeners:
+            await listener.on_event(event)
+
+
+def _make_session(
+    agent: RealtimeAgent,
+    model: _FakeRealtimeModel | None = None,
+    *,
+    tracing_disabled: bool = False,
+) -> RealtimeSession:
+    return RealtimeSession(
+        model=model or _FakeRealtimeModel(),
+        agent=agent,
+        context=None,
+        run_config={"tracing_disabled": tracing_disabled} if tracing_disabled else {},
+    )
+
+
+@pytest.mark.asyncio
+async def test_session_creates_agent_span_on_enter():
+    """Entering a RealtimeSession context must create an agent span."""
+    agent = RealtimeAgent(name="greeter")
+    session = _make_session(agent)
+
+    with trace("test"):
+        async with session:
+            pass
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert len(agent_spans) == 1, f"Expected 1 agent span, got {len(agent_spans)}"
+
+
+@pytest.mark.asyncio
+async def test_session_agent_span_has_correct_name():
+    """The agent span name must match the RealtimeAgent name."""
+    agent = RealtimeAgent(name="support_bot")
+    session = _make_session(agent)
+
+    with trace("test"):
+        async with session:
+            pass
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert agent_spans[0].span_data.name == "support_bot"
+
+
+@pytest.mark.asyncio
+async def test_session_agent_span_finished_after_close():
+    """The agent span must be finished (exported) once the session closes."""
+    agent = RealtimeAgent(name="closer")
+    session = _make_session(agent)
+
+    with trace("test"):
+        async with session:
+            pass
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert agent_spans[0].ended_at is not None
+
+
+@pytest.mark.asyncio
+async def test_session_span_includes_tool_names():
+    """The agent span records the names of tools available to the agent."""
+    from agents.tool import function_tool
+
+    @function_tool
+    def my_tool() -> str:
+        """A test tool."""
+        return "ok"
+
+    agent = RealtimeAgent(name="tool_agent", tools=[my_tool])
+    session = _make_session(agent)
+
+    with trace("test"):
+        async with session:
+            pass
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert agent_spans[0].span_data.tools == ["my_tool"]
+
+
+@pytest.mark.asyncio
+async def test_session_span_includes_handoff_names():
+    """The agent span records the names of handoff targets."""
+    child = RealtimeAgent(name="specialist")
+    agent = RealtimeAgent(name="router", handoffs=[child])
+    session = _make_session(agent)
+
+    with trace("test"):
+        async with session:
+            pass
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert agent_spans[0].span_data.handoffs == ["specialist"]
+
+
+@pytest.mark.asyncio
+async def test_tracing_disabled_creates_no_agent_spans():
+    """When tracing_disabled=True, no agent spans should be emitted."""
+    agent = RealtimeAgent(name="silent")
+    session = _make_session(agent, tracing_disabled=True)
+
+    with trace("test"):
+        async with session:
+            pass
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert len(agent_spans) == 0, f"Expected 0 agent spans, got {len(agent_spans)}"

From ed34935eea40a88187e5939c350d52751ece9710 Mon Sep 17 00:00:00 2001
From: jordanchendev <jordan.chen.dev@gmail.com>
Date: Thu, 4 Jun 2026 09:29:52 +0800
Subject: [PATCH 2/8] fix: avoid cross-context span token errors and use
 resolved tool list

Two issues raised in automated review of the initial commit:

P1 (crash): When async_tool_calls=True (the default), handoffs execute
inside asyncio.create_task(). start(mark_as_current=True) stores a
contextvars token in the background task's context. _cleanup() runs in
the main task and cannot reset that token; finish(reset_current=True)
raises ValueError: Token was created in a different Context.

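Fix: _cleanup uses reset_current=False (the session is ending, so there
is no need to restore the previous current span). In the handoff
handler, the outgoing agent's span is likewise finished with
reset_current=False, and the new agent span is started with
mark_as_current=False, so that no context token is created in the
background task that _cleanup (running in the main task) would later
be unable to reset.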

P2 (inaccurate metadata): _make_agent_span read agent.tools (static
list) instead of the resolved list from get_all_tools(), so disabled
function tools appeared in the span and MCP tools were omitted.

Fix: tool names are no longer set in _make_agent_span; both __aenter__
and the handoff handler call agent.get_all_tools() after obtaining model
settings and write the resolved names to span_data.tools via
get_tool_trace_name_for_tool(), matching the pattern used by Runner.
---
 src/agents/realtime/session.py | 43 +++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
index cbcbd02874..281a1d847c 100644
--- a/src/agents/realtime/session.py
+++ b/src/agents/realtime/session.py
@@ -14,6 +14,7 @@
     FunctionToolLookupKey,
     get_function_tool_lookup_key_for_tool,
     get_function_tool_namespace,
+    get_tool_trace_name_for_tool,
 )
 from ..agent import Agent
 from ..exceptions import UserError
@@ -219,6 +220,13 @@ async def __aenter__(self) -> RealtimeSession:
             agent=self._current_agent,
         )
 
+        # Update span with the resolved tool list (includes MCP tools and respects is_enabled).
+        if self._current_agent_span is not None:
+            resolved_tools = await self._current_agent.get_all_tools(self._context_wrapper)
+            self._current_agent_span.span_data.tools = [
+                n for t in resolved_tools if (n := get_tool_trace_name_for_tool(t)) is not None
+            ] or None
+
         # Connect to the model
         await self._model.connect(model_config)
 
@@ -822,15 +830,20 @@ async def _handle_tool_call(
                 # Store previous agent for event
                 previous_agent = agent
 
-                # Finish the span for the outgoing agent and start one for the new agent.
+                # Finish the span for the outgoing agent. Use reset_current=False because this
+                # runs inside an asyncio background task; resetting a token from a different
+                # context raises ValueError.
                 if self._current_agent_span is not None:
-                    self._current_agent_span.finish(reset_current=True)
+                    self._current_agent_span.finish(reset_current=False)
 
                 # Update current agent
                 self._current_agent = result
 
+                # Start a span for the new agent. Use mark_as_current=False for the same
+                # cross-context reason: _cleanup runs in the main task and cannot reset a
+                # token created here.
                 self._current_agent_span = self._make_agent_span(self._current_agent)
-                self._current_agent_span.start(mark_as_current=True)
+                self._current_agent_span.start(mark_as_current=False)
 
                 # Get updated model settings from new agent
                 updated_settings = await self._get_updated_model_settings_from_agent(
@@ -838,6 +851,15 @@ async def _handle_tool_call(
                     agent=self._current_agent,
                 )
 
+                # Update span with the resolved tool list for the new agent.
+                if self._current_agent_span is not None:
+                    resolved_tools = await self._current_agent.get_all_tools(self._context_wrapper)
+                    self._current_agent_span.span_data.tools = [
+                        n
+                        for t in resolved_tools
+                        if (n := get_tool_trace_name_for_tool(t)) is not None
+                    ] or None
+
                 # Send handoff event
                 await self._put_event(
                     RealtimeHandoffEvent(
@@ -1249,9 +1271,12 @@ async def _cleanup(self) -> None:
             self._wake_event_iterators()
             return
 
-        # Finish the active agent span.
+        # Finish the active agent span. Use reset_current=False because _cleanup may be called
+        # from a different asyncio context than the one that started the span (e.g. after a
+        # handoff that ran in a background task), and resetting a token across contexts raises
+        # ValueError.
         if self._current_agent_span is not None:
-            self._current_agent_span.finish(reset_current=True)
+            self._current_agent_span.finish(reset_current=False)
             self._current_agent_span = None
 
         # Cancel and cleanup guardrail tasks
@@ -1273,14 +1298,16 @@ async def _cleanup(self) -> None:
         self._wake_event_iterators()
 
     def _make_agent_span(self, agent: RealtimeAgent) -> Span[AgentSpanData]:
-        """Create a new agent span for the given agent, respecting tracing_disabled."""
+        """Create a new agent span for the given agent, respecting tracing_disabled.
+
+        Tool names are intentionally omitted here; callers must update span_data.tools
+        asynchronously via get_all_tools() to include MCP tools and respect is_enabled.
+        """
         disabled: bool = bool(self._run_config.get("tracing_disabled", False))
         handoff_names = [h.agent_name if isinstance(h, Handoff) else h.name for h in agent.handoffs]
-        tool_names = [t.name for t in agent.tools if isinstance(t, FunctionTool)]
         return agent_span(
             name=agent.name,
             handoffs=handoff_names or None,
-            tools=tool_names or None,
             disabled=disabled,
         )
 

From ceb50e07e90fe14aedd673e9f4ddf5de40adc2d0 Mon Sep 17 00:00:00 2001
From: jordanchendev <jordan.chen.dev@gmail.com>
Date: Thu, 4 Jun 2026 09:44:24 +0800
Subject: [PATCH 3/8] fix: reset initial agent span context token in _cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two issues fixed in RealtimeSession tracing:

P1 – After closing a session without a handoff, the agent span started in
__aenter__ with mark_as_current=True left its context-var token unreset
because _cleanup always used reset_current=False. Subsequent spans in the
same task were incorrectly parented under the stale (finished) span.

Fix: manage the context-var token explicitly via Scope.set_current_span() in
__aenter__ and store it in self._initial_span_token. _cleanup now calls
Scope.reset_current_span() on that token, which is always safe because
_cleanup runs via __aexit__ in the same task as __aenter__. The current span
(which may be a handoff span started with mark_as_current=False) is still
finished with reset_current=False to avoid cross-context token errors.

P2 – If __aenter__ raised an exception after span start (e.g. model.connect()
failed), __aexit__ was not called, leaving the span unfinished and current
in the context. Fix: wrap the fallible setup section in a try/except that
finishes the span and resets the token before re-raising.

Fixes #1845
---
 src/agents/realtime/session.py | 83 ++++++++++++++++++++++------------
 1 file changed, 54 insertions(+), 29 deletions(-)

diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
index 281a1d847c..289539dab9 100644
--- a/src/agents/realtime/session.py
+++ b/src/agents/realtime/session.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import contextvars
 import dataclasses
 import inspect
 import json
@@ -26,6 +27,7 @@
 from ..tool import DEFAULT_APPROVAL_REJECTION_MESSAGE, FunctionTool, invoke_function_tool
 from ..tool_context import ToolContext
 from ..tracing import Span, agent_span
+from ..tracing.scope import Scope
 from ..tracing.span_data import AgentSpanData
 from ..util._approvals import evaluate_needs_approval_setting
 from .agent import RealtimeAgent
@@ -197,6 +199,9 @@ def __init__(
         self._tool_call_tasks: set[asyncio.Task[Any]] = set()
         self._async_tool_calls: bool = bool(self._run_config.get("async_tool_calls", True))
         self._current_agent_span: Span[AgentSpanData] | None = None
+        # Context-var token from the span created in __aenter__. Tracked here so _cleanup can
+        # reset it even if a handoff already called finish() on that span in a background task.
+        self._initial_span_token: contextvars.Token[Any] | None = None
 
     @property
     def model(self) -> RealtimeModel:
@@ -207,38 +212,53 @@ async def __aenter__(self) -> RealtimeSession:
         """Start the session by connecting to the model. After this, you will be able to stream
         events from the model and send messages and audio to the model.
         """
-        # Start an agent span for the initial agent.
+        # Start an agent span for the initial agent. Use mark_as_current=False and manage
+        # the context-var token ourselves so _cleanup can reset it even when a handoff
+        # already called finish() on this span from a background task.
         self._current_agent_span = self._make_agent_span(self._current_agent)
-        self._current_agent_span.start(mark_as_current=True)
+        self._current_agent_span.start(mark_as_current=False)
+        self._initial_span_token = Scope.set_current_span(self._current_agent_span)
 
-        # Add ourselves as a listener
-        self._model.add_listener(self)
-
-        model_config = self._model_config.copy()
-        model_config["initial_model_settings"] = await self._get_updated_model_settings_from_agent(
-            starting_settings=self._model_config.get("initial_model_settings", None),
-            agent=self._current_agent,
-        )
+        try:
+            # Add ourselves as a listener
+            self._model.add_listener(self)
+
+            model_config = self._model_config.copy()
+            model_config[
+                "initial_model_settings"
+            ] = await self._get_updated_model_settings_from_agent(
+                starting_settings=self._model_config.get("initial_model_settings", None),
+                agent=self._current_agent,
+            )
 
-        # Update span with the resolved tool list (includes MCP tools and respects is_enabled).
-        if self._current_agent_span is not None:
-            resolved_tools = await self._current_agent.get_all_tools(self._context_wrapper)
-            self._current_agent_span.span_data.tools = [
-                n for t in resolved_tools if (n := get_tool_trace_name_for_tool(t)) is not None
-            ] or None
+            # Update span with the resolved tool list (includes MCP tools and respects is_enabled).
+            if self._current_agent_span is not None:
+                resolved_tools = await self._current_agent.get_all_tools(self._context_wrapper)
+                self._current_agent_span.span_data.tools = [
+                    n for t in resolved_tools if (n := get_tool_trace_name_for_tool(t)) is not None
+                ] or None
 
-        # Connect to the model
-        await self._model.connect(model_config)
+            # Connect to the model
+            await self._model.connect(model_config)
 
-        # Emit initial history update
-        await self._put_event(
-            RealtimeHistoryUpdated(
-                history=self._history,
-                info=self._event_info,
+            # Emit initial history update
+            await self._put_event(
+                RealtimeHistoryUpdated(
+                    history=self._history,
+                    info=self._event_info,
+                )
             )
-        )
 
-        return self
+            return self
+        except BaseException:
+            # __aexit__ is not called when __aenter__ raises, so clean up the span here.
+            if self._current_agent_span is not None:
+                self._current_agent_span.finish(reset_current=False)
+                self._current_agent_span = None
+            if self._initial_span_token is not None:
+                Scope.reset_current_span(self._initial_span_token)
+                self._initial_span_token = None
+            raise
 
     async def enter(self) -> RealtimeSession:
         """Enter the async context manager. We strongly recommend using the async context manager
@@ -1271,13 +1291,18 @@ async def _cleanup(self) -> None:
             self._wake_event_iterators()
             return
 
-        # Finish the active agent span. Use reset_current=False because _cleanup may be called
-        # from a different asyncio context than the one that started the span (e.g. after a
-        # handoff that ran in a background task), and resetting a token across contexts raises
-        # ValueError.
+        # Finish the active agent span. Use reset_current=False because, after a handoff,
+        # the current span was started in a background task (mark_as_current=False) and has
+        # no token to reset. The context-var token for the *initial* span (created in
+        # __aenter__) is reset separately below.
         if self._current_agent_span is not None:
             self._current_agent_span.finish(reset_current=False)
             self._current_agent_span = None
+        # Reset the context-var token that __aenter__ stored. _cleanup is always called from
+        # __aexit__ (same task context as __aenter__), so this reset is always safe here.
+        if self._initial_span_token is not None:
+            Scope.reset_current_span(self._initial_span_token)
+            self._initial_span_token = None
 
         # Cancel and cleanup guardrail tasks
         self._cleanup_guardrail_tasks()

From b61f80f55743ae24bd15b663d6c6eae312c6a1c1 Mon Sep 17 00:00:00 2001
From: jordanchendev <jordan.chen.dev@gmail.com>
Date: Thu, 4 Jun 2026 10:09:22 +0800
Subject: [PATCH 4/8] fix: guard NoOpSpan context install, clear span for
 handoff siblings, filter handoffs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three issues fixed per tracing conventions (local_dev/notes/tracing-conventions.md):

Convention 3 — NoOpSpan poisoning:
__aenter__ now skips Scope.set_current_span() when _make_agent_span() returns a
NoOpSpan (no active trace, or tracing_disabled). Installing a NoOpSpan as current
would cause provider.create_span() to return NoOpSpan for every subsequent child
span in the same context (_is_noop_span check in provider.py).

Convention 5 — Span parent fixed at create_span() call time:
Before creating the incoming agent span in the handoff handler (background task),
the current span context is temporarily cleared via Scope.set_current_span(None)
and immediately restored. Without this, the outgoing agent span (still "current"
in the background task's copied context because finish(reset_current=False) does
not reset the context var) would be used as parent_id for the new span, producing
incorrect parent-child nesting instead of sibling spans under the trace root.

Convention 7 — is_enabled filtering for handoffs:
_make_agent_span() no longer reads agent.handoffs directly. Both __aenter__ and
the handoff handler now call _get_handoffs() (which applies each Handoff.is_enabled
predicate) and write the filtered result to span_data.handoffs, matching the pattern
used by Runner for handoff metadata.

Tests added for all three behaviors:
- test_no_active_trace_does_not_poison_span_context
- test_disabled_handoff_excluded_from_span_metadata
- test_handoff_span_is_sibling_not_child_of_initial_span

Fixes #1845
---
 src/agents/realtime/session.py       | 53 +++++++++++-----
 tests/realtime/test_session_spans.py | 93 +++++++++++++++++++++++++++-
 2 files changed, 129 insertions(+), 17 deletions(-)

diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
index 289539dab9..16b50c1ff5 100644
--- a/src/agents/realtime/session.py
+++ b/src/agents/realtime/session.py
@@ -29,6 +29,7 @@
 from ..tracing import Span, agent_span
 from ..tracing.scope import Scope
 from ..tracing.span_data import AgentSpanData
+from ..tracing.spans import NoOpSpan
 from ..util._approvals import evaluate_needs_approval_setting
 from .agent import RealtimeAgent
 from .config import RealtimeRunConfig, RealtimeSessionModelSettings, RealtimeUserInput
@@ -217,7 +218,11 @@ async def __aenter__(self) -> RealtimeSession:
         # already called finish() on this span from a background task.
         self._current_agent_span = self._make_agent_span(self._current_agent)
         self._current_agent_span.start(mark_as_current=False)
-        self._initial_span_token = Scope.set_current_span(self._current_agent_span)
+        # Only install the span as current when it is a real span. Setting a NoOpSpan as
+        # current poisons the context: provider.create_span() returns NoOpSpan for every
+        # child span when it detects a no-op parent (provider.py _is_noop_span check).
+        if not isinstance(self._current_agent_span, NoOpSpan):
+            self._initial_span_token = Scope.set_current_span(self._current_agent_span)
 
         try:
             # Add ourselves as a listener
@@ -231,12 +236,19 @@ async def __aenter__(self) -> RealtimeSession:
                 agent=self._current_agent,
             )
 
-            # Update span with the resolved tool list (includes MCP tools and respects is_enabled).
-            if self._current_agent_span is not None:
+            # Update span metadata: tools (respects is_enabled, includes MCP) and
+            # handoffs (filtered by _get_handoffs to exclude disabled ones).
+            if not isinstance(self._current_agent_span, NoOpSpan):
                 resolved_tools = await self._current_agent.get_all_tools(self._context_wrapper)
                 self._current_agent_span.span_data.tools = [
                     n for t in resolved_tools if (n := get_tool_trace_name_for_tool(t)) is not None
                 ] or None
+                enabled_handoffs = await self._get_handoffs(
+                    self._current_agent, self._context_wrapper
+                )
+                self._current_agent_span.span_data.handoffs = [
+                    h.agent_name for h in enabled_handoffs
+                ] or None
 
             # Connect to the model
             await self._model.connect(model_config)
@@ -859,11 +871,17 @@ async def _handle_tool_call(
                 # Update current agent
                 self._current_agent = result
 
-                # Start a span for the new agent. Use mark_as_current=False for the same
-                # cross-context reason: _cleanup runs in the main task and cannot reset a
-                # token created here.
+                # Create the incoming agent span with a clear current-span context so that
+                # its parent is the trace root, not the finished outgoing agent span.
+                # The outgoing span is still "current" in this background task context
+                # (finish(reset_current=False) does not reset the context var), so we must
+                # temporarily clear it before calling _make_agent_span() / agent_span() —
+                # provider.create_span() reads Scope.get_current_span() at creation time to
+                # determine parent_id.
+                _handoff_clear_token = Scope.set_current_span(None)
                 self._current_agent_span = self._make_agent_span(self._current_agent)
                 self._current_agent_span.start(mark_as_current=False)
+                Scope.reset_current_span(_handoff_clear_token)
 
                 # Get updated model settings from new agent
                 updated_settings = await self._get_updated_model_settings_from_agent(
@@ -871,14 +889,20 @@ async def _handle_tool_call(
                     agent=self._current_agent,
                 )
 
-                # Update span with the resolved tool list for the new agent.
-                if self._current_agent_span is not None:
+                # Update span metadata: tools and filtered handoffs.
+                if not isinstance(self._current_agent_span, NoOpSpan):
                     resolved_tools = await self._current_agent.get_all_tools(self._context_wrapper)
                     self._current_agent_span.span_data.tools = [
                         n
                         for t in resolved_tools
                         if (n := get_tool_trace_name_for_tool(t)) is not None
                     ] or None
+                    enabled_handoffs = await self._get_handoffs(
+                        self._current_agent, self._context_wrapper
+                    )
+                    self._current_agent_span.span_data.handoffs = [
+                        h.agent_name for h in enabled_handoffs
+                    ] or None
 
                 # Send handoff event
                 await self._put_event(
@@ -1325,16 +1349,13 @@ async def _cleanup(self) -> None:
     def _make_agent_span(self, agent: RealtimeAgent) -> Span[AgentSpanData]:
         """Create a new agent span for the given agent, respecting tracing_disabled.
 
-        Tool names are intentionally omitted here; callers must update span_data.tools
-        asynchronously via get_all_tools() to include MCP tools and respect is_enabled.
+        Both tool names and handoff names are intentionally omitted here. Callers must
+        update span_data.tools via get_all_tools() and span_data.handoffs via
+        _get_handoffs() asynchronously to reflect only what is actually sent to the model
+        (respects is_enabled on both tools and handoffs).
         """
         disabled: bool = bool(self._run_config.get("tracing_disabled", False))
-        handoff_names = [h.agent_name if isinstance(h, Handoff) else h.name for h in agent.handoffs]
-        return agent_span(
-            name=agent.name,
-            handoffs=handoff_names or None,
-            disabled=disabled,
-        )
+        return agent_span(name=agent.name, disabled=disabled)
 
     async def _get_updated_model_settings_from_agent(
         self,
diff --git a/tests/realtime/test_session_spans.py b/tests/realtime/test_session_spans.py
index 3eea62c975..80a13185a3 100644
--- a/tests/realtime/test_session_spans.py
+++ b/tests/realtime/test_session_spans.py
@@ -2,15 +2,17 @@
 
 from __future__ import annotations
 
+import asyncio
 from typing import Any
 
 import pytest
 
 from agents.realtime.agent import RealtimeAgent
 from agents.realtime.model import RealtimeModel, RealtimeModelConfig, RealtimeModelListener
-from agents.realtime.model_events import RealtimeModelEvent
+from agents.realtime.model_events import RealtimeModelEvent, RealtimeModelToolCallEvent
 from agents.realtime.session import RealtimeSession
 from agents.tracing import trace
+from agents.tracing.scope import Scope
 from agents.tracing.span_data import AgentSpanData
 from tests.testing_processor import SPAN_PROCESSOR_TESTING
 
@@ -167,3 +169,92 @@ async def test_tracing_disabled_creates_no_agent_spans():
     spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
     agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
     assert len(agent_spans) == 0, f"Expected 0 agent spans, got {len(agent_spans)}"
+
+
+@pytest.mark.asyncio
+async def test_no_active_trace_does_not_poison_span_context():
+    """Without an outer trace(), the session must not install a NoOpSpan as current.
+
+    Convention: provider returns NoOpSpan when no active trace exists. Installing
+    a NoOpSpan as current would make every span created afterward also a NoOpSpan
+    (provider._is_noop_span check). The session must skip Scope.set_current_span()
+    for NoOpSpans so ambient context is unchanged after the session closes.
+    """
+    span_before = Scope.get_current_span()
+    agent = RealtimeAgent(name="agent")
+    session = _make_session(agent)
+
+    # Enter/exit WITHOUT any enclosing trace — span will be a NoOpSpan.
+    async with session:
+        pass
+
+    span_after = Scope.get_current_span()
+    assert span_before is span_after, (
+        "Session must not permanently alter the current span context when no active trace exists."
+    )
+
+
+@pytest.mark.asyncio
+async def test_disabled_handoff_excluded_from_span_metadata():
+    """Handoffs with is_enabled=False must not appear in span handoff metadata.
+
+    Convention: span metadata must reflect what was actually sent to the model.
+    _get_handoffs() filters by is_enabled; raw agent.handoffs must not be used.
+    """
+    from agents.realtime.handoffs import realtime_handoff
+
+    specialist = RealtimeAgent(name="specialist")
+    disabled_handoff = realtime_handoff(specialist, is_enabled=False)
+    agent = RealtimeAgent(name="router", handoffs=[disabled_handoff])
+    session = _make_session(agent)
+
+    with trace("test"):
+        async with session:
+            pass
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert agent_spans[0].span_data.handoffs is None, (
+        f"Disabled handoff should not appear in span metadata, "
+        f"got: {agent_spans[0].span_data.handoffs}"
+    )
+
+
+@pytest.mark.asyncio
+async def test_handoff_span_is_sibling_not_child_of_initial_span():
+    """After a handoff the new agent span must be a sibling of the first, not its child.
+
+    Convention: the incoming agent span's parent_id must not equal the outgoing agent
+    span's span_id. Both should be direct children of the trace root (parent_id=None).
+    """
+    specialist = RealtimeAgent(name="specialist")
+    router = RealtimeAgent(name="router", handoffs=[specialist])
+    model = _FakeRealtimeModel()
+    session = _make_session(router, model)
+
+    with trace("test"):
+        async with session:
+            # Fire the handoff tool call that the model would send.
+            await model.dispatch(
+                RealtimeModelToolCallEvent(
+                    name="transfer_to_specialist",
+                    call_id="call_001",
+                    arguments="{}",
+                )
+            )
+            # Let the background task spawned by async_tool_calls complete.
+            await asyncio.sleep(0.05)
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert len(agent_spans) == 2, (
+        f"Expected 2 agent spans (router + specialist), got {len(agent_spans)}"
+    )
+
+    router_span = next(s for s in agent_spans if s.span_data.name == "router")
+    specialist_span = next(s for s in agent_spans if s.span_data.name == "specialist")
+
+    assert specialist_span.parent_id != router_span.span_id, (
+        "Specialist span must not be a child of the router span. "
+        f"specialist.parent_id={specialist_span.parent_id}, router.span_id={router_span.span_id}"
+    )

From 0470b2d0e3ed595ce735257742dc0a8546b2c2cf Mon Sep 17 00:00:00 2001
From: jordanchendev <jordan.chen.dev@gmail.com>
Date: Thu, 4 Jun 2026 10:47:53 +0800
Subject: [PATCH 5/8] fix: return (settings, tools, handoffs) tuple from
 _get_updated_model_settings_from_agent

Eliminates duplicate get_all_tools() / _get_handoffs() calls by returning
already-computed results as a tuple. Callers in __aenter__, update_agent(),
and handoff handler now unpack the tuple; test callers updated to match.
Also guard _cleanup token reset against cross-task ValueError; defer span
context reset to __aexit__ which is guaranteed to run in the same task as
__aenter__. Add 2 new regression tests covering cross-task cleanup and direct
close() context reset, complementing the existing regression tests for NoOpSpan
poisoning, disabled handoff filtering, and sibling span hierarchy.
---
 src/agents/realtime/session.py       | 68 +++++++++++++++++-----------
 tests/realtime/test_session.py       |  8 ++--
 tests/realtime/test_session_spans.py | 42 +++++++++++++++++
 tests/realtime/test_tracing.py       |  2 +-
 4 files changed, 89 insertions(+), 31 deletions(-)

diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
index 16b50c1ff5..24fc374e50 100644
--- a/src/agents/realtime/session.py
+++ b/src/agents/realtime/session.py
@@ -229,23 +229,22 @@ async def __aenter__(self) -> RealtimeSession:
             self._model.add_listener(self)
 
             model_config = self._model_config.copy()
-            model_config[
-                "initial_model_settings"
-            ] = await self._get_updated_model_settings_from_agent(
+            (
+                initial_settings,
+                resolved_tools,
+                enabled_handoffs,
+            ) = await self._get_updated_model_settings_from_agent(
                 starting_settings=self._model_config.get("initial_model_settings", None),
                 agent=self._current_agent,
             )
+            model_config["initial_model_settings"] = initial_settings
 
-            # Update span metadata: tools (respects is_enabled, includes MCP) and
-            # handoffs (filtered by _get_handoffs to exclude disabled ones).
+            # Reuse the tools/handoffs already resolved above — avoids a second call to
+            # get_all_tools()/get_handoffs() and ensures span metadata matches the model.
             if not isinstance(self._current_agent_span, NoOpSpan):
-                resolved_tools = await self._current_agent.get_all_tools(self._context_wrapper)
                 self._current_agent_span.span_data.tools = [
                     n for t in resolved_tools if (n := get_tool_trace_name_for_tool(t)) is not None
                 ] or None
-                enabled_handoffs = await self._get_handoffs(
-                    self._current_agent, self._context_wrapper
-                )
                 self._current_agent_span.span_data.handoffs = [
                     h.agent_name for h in enabled_handoffs
                 ] or None
@@ -281,6 +280,14 @@ async def enter(self) -> RealtimeSession:
 
     async def __aexit__(self, _exc_type: Any, _exc_val: Any, _exc_tb: Any) -> None:
         """End the session."""
+        # Reset the initial span token here. __aexit__ is always invoked in the same
+        # asyncio task as __aenter__, so this reset is unconditionally safe. We do it
+        # here rather than relying solely on _cleanup because _cleanup is also reachable
+        # from close() and __aiter__, which may run in different tasks and would raise
+        # ValueError on ContextVar.reset().
+        if self._initial_span_token is not None:
+            Scope.reset_current_span(self._initial_span_token)
+            self._initial_span_token = None
         await self.close()
 
     async def __aiter__(self) -> AsyncIterator[RealtimeSessionEvent]:
@@ -327,7 +334,7 @@ async def update_agent(self, agent: RealtimeAgent) -> None:
         """Update the active agent for this session and apply its settings to the model."""
         self._current_agent = agent
 
-        updated_settings = await self._get_updated_model_settings_from_agent(
+        updated_settings, _, _ = await self._get_updated_model_settings_from_agent(
             starting_settings=None,
             agent=self._current_agent,
         )
@@ -883,23 +890,23 @@ async def _handle_tool_call(
                 self._current_agent_span.start(mark_as_current=False)
                 Scope.reset_current_span(_handoff_clear_token)
 
-                # Get updated model settings from new agent
-                updated_settings = await self._get_updated_model_settings_from_agent(
+                # Get updated model settings from new agent; reuse resolved tools and
+                # handoffs for span metadata to avoid a redundant second call.
+                (
+                    updated_settings,
+                    resolved_tools,
+                    enabled_handoffs,
+                ) = await self._get_updated_model_settings_from_agent(
                     starting_settings=None,
                     agent=self._current_agent,
                 )
 
-                # Update span metadata: tools and filtered handoffs.
                 if not isinstance(self._current_agent_span, NoOpSpan):
-                    resolved_tools = await self._current_agent.get_all_tools(self._context_wrapper)
                     self._current_agent_span.span_data.tools = [
                         n
                         for t in resolved_tools
                         if (n := get_tool_trace_name_for_tool(t)) is not None
                     ] or None
-                    enabled_handoffs = await self._get_handoffs(
-                        self._current_agent, self._context_wrapper
-                    )
                     self._current_agent_span.span_data.handoffs = [
                         h.agent_name for h in enabled_handoffs
                     ] or None
@@ -1315,17 +1322,20 @@ async def _cleanup(self) -> None:
             self._wake_event_iterators()
             return
 
-        # Finish the active agent span. Use reset_current=False because, after a handoff,
-        # the current span was started in a background task (mark_as_current=False) and has
-        # no token to reset. The context-var token for the *initial* span (created in
-        # __aenter__) is reset separately below.
+        # Finish the active agent span.
         if self._current_agent_span is not None:
             self._current_agent_span.finish(reset_current=False)
             self._current_agent_span = None
-        # Reset the context-var token that __aenter__ stored. _cleanup is always called from
-        # __aexit__ (same task context as __aenter__), so this reset is always safe here.
+        # Reset the initial span's context-var token. __aexit__ handles this
+        # unconditionally (it runs in the same task as __aenter__). This fallback
+        # handles direct close() calls from the same task; if close() or __aiter__
+        # triggers _cleanup from a different task the ValueError is caught and the
+        # token is left for __aexit__ to reset.
         if self._initial_span_token is not None:
-            Scope.reset_current_span(self._initial_span_token)
+            try:
+                Scope.reset_current_span(self._initial_span_token)
+            except ValueError:
+                pass  # Cross-task call; __aexit__ will reset from the correct task.
             self._initial_span_token = None
 
         # Cancel and cleanup guardrail tasks
@@ -1361,7 +1371,13 @@ async def _get_updated_model_settings_from_agent(
         self,
         starting_settings: RealtimeSessionModelSettings | None,
         agent: RealtimeAgent,
-    ) -> RealtimeSessionModelSettings:
+    ) -> tuple[RealtimeSessionModelSettings, list[Any], list[Handoff[Any, RealtimeAgent[Any]]]]:
+        """Return (settings, resolved_tools, enabled_handoffs).
+
+        resolved_tools and enabled_handoffs are captured before starting_settings overrides
+        so callers can use them for span metadata without re-invoking get_all_tools() or
+        _get_handoffs() a second time.
+        """
         # Start with the merged base settings from run and model configuration.
         updated_settings = self._base_model_settings.copy()
 
@@ -1385,7 +1401,7 @@ async def _get_updated_model_settings_from_agent(
         if disable_tracing:
             updated_settings["tracing"] = None
 
-        return updated_settings
+        return updated_settings, list(tools or []), list(handoffs or [])
 
     @classmethod
     async def _get_handoffs(
diff --git a/tests/realtime/test_session.py b/tests/realtime/test_session.py
index e45db679f5..d4816f3a32 100644
--- a/tests/realtime/test_session.py
+++ b/tests/realtime/test_session.py
@@ -2627,7 +2627,7 @@ async def mock_get_handoffs(cls, agent, context_wrapper):
             m.setattr("agents.realtime.session.RealtimeSession._get_handoffs", mock_get_handoffs)
 
             # Test the method directly
-            model_settings = await session._get_updated_model_settings_from_agent(
+            model_settings, _, _ = await session._get_updated_model_settings_from_agent(
                 starting_settings=model_config_initial_settings, agent=agent
             )
 
@@ -2677,7 +2677,7 @@ async def mock_get_handoffs(cls, agent, context_wrapper):
         with pytest.MonkeyPatch().context() as m:
             m.setattr("agents.realtime.session.RealtimeSession._get_handoffs", mock_get_handoffs)
 
-            model_settings = await session._get_updated_model_settings_from_agent(
+            model_settings, _, _ = await session._get_updated_model_settings_from_agent(
                 starting_settings=None,  # No initial settings
                 agent=agent,
             )
@@ -2723,7 +2723,7 @@ async def mock_get_handoffs(cls, agent, context_wrapper):
         with pytest.MonkeyPatch().context() as m:
             m.setattr("agents.realtime.session.RealtimeSession._get_handoffs", mock_get_handoffs)
 
-            model_settings = await session._get_updated_model_settings_from_agent(
+            model_settings, _, _ = await session._get_updated_model_settings_from_agent(
                 starting_settings=model_config_settings, agent=agent
             )
 
@@ -2770,7 +2770,7 @@ async def mock_get_handoffs(cls, agent, context_wrapper):
                 mock_get_handoffs,
             )
 
-            model_settings = await session._get_updated_model_settings_from_agent(
+            model_settings, _, _ = await session._get_updated_model_settings_from_agent(
                 starting_settings=None,
                 agent=agent,
             )
diff --git a/tests/realtime/test_session_spans.py b/tests/realtime/test_session_spans.py
index 80a13185a3..b37c38a5a5 100644
--- a/tests/realtime/test_session_spans.py
+++ b/tests/realtime/test_session_spans.py
@@ -220,6 +220,48 @@ async def test_disabled_handoff_excluded_from_span_metadata():
     )
 
 
+@pytest.mark.asyncio
+async def test_cleanup_from_different_task_does_not_raise():
+    """_cleanup called from a task other than __aenter__'s task must not raise ValueError.
+
+    close() is public and __aiter__ also calls _cleanup when _stored_exception is set.
+    Both can run in a different asyncio task than __aenter__. Resetting a contextvars
+    token from a different task raises ValueError — this must be caught gracefully.
+    """
+    agent = RealtimeAgent(name="agent")
+    session = _make_session(agent)
+
+    with trace("test"):
+        await session.enter()  # open the session in this (main) task
+
+        # Call _cleanup from a background task — it gets a copied context, so the
+        # token stored by __aenter__ in the main task cannot be reset here; must not raise.
+        async def close_from_other_task() -> None:
+            await session._cleanup()
+
+        await asyncio.create_task(close_from_other_task())
+
+    assert session._closed is True
+
+
+@pytest.mark.asyncio
+async def test_span_context_clean_after_close_called_directly():
+    """Span context must be reset even when close() is called directly (no async with).
+
+    Method: enter via session.enter(), call close() directly, verify Scope is clean.
+    """
+    span_before = Scope.get_current_span()
+    agent = RealtimeAgent(name="agent")
+    session = _make_session(agent)
+
+    with trace("test"):
+        await session.enter()
+        await session.close()
+
+    span_after = Scope.get_current_span()
+    assert span_before is span_after, "Calling close() directly must still reset the span context."
+
+
 @pytest.mark.asyncio
 async def test_handoff_span_is_sibling_not_child_of_initial_span():
     """After a handoff the new agent span must be a sibling of the first, not its child.
diff --git a/tests/realtime/test_tracing.py b/tests/realtime/test_tracing.py
index bacde6703c..ff6d297ec4 100644
--- a/tests/realtime/test_tracing.py
+++ b/tests/realtime/test_tracing.py
@@ -257,7 +257,7 @@ async def test_tracing_disabled_prevents_tracing(self, mock_websocket):
         )
 
         # Test the _get_updated_model_settings_from_agent method directly
-        model_settings = await session._get_updated_model_settings_from_agent(
+        model_settings, _, _ = await session._get_updated_model_settings_from_agent(
             starting_settings=None, agent=agent
         )
 

From 94084651fae1ff25684f831de1ed35a0b02a2e9e Mon Sep 17 00:00:00 2001
From: jordanchendev <jordan.chen.dev@gmail.com>
Date: Thu, 4 Jun 2026 11:03:46 +0800
Subject: [PATCH 6/8] fix: don't install agent spans as current ContextVar;
 derive span metadata from final settings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Resolves two root-cause issues:

1. Stale current span after handoff (#3353121454): installing the initial agent
   span as the ContextVar current span in the main task causes it to remain
   current even after a handoff finishes that span in a background task.
   asyncio tasks inherit a snapshot of the parent context — the background task
   cannot update the main task's ContextVar. Remove all Scope.set_current_span
   calls; agent spans are created and finished without being set as current.
   This also eliminates _initial_span_token and all related cross-task token-
   reset complexity.

2. Span metadata mismatch after model_config override (#3353121446):
   _get_updated_model_settings_from_agent returned the pre-override tools and
   handoffs even when starting_settings had overridden them. Now returns
   updated_settings["tools"/"handoffs"] after the update() call so span metadata
   matches what was actually sent to the model.

Add test_span_tool_metadata_reflects_model_config_override to cover case 2.
Update docstrings on no-longer-ContextVar-dependent tests to reflect new design.
---
 src/agents/realtime/session.py       | 76 +++++++++-------------------
 tests/realtime/test_session_spans.py | 70 ++++++++++++++++++-------
 2 files changed, 74 insertions(+), 72 deletions(-)

diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
index 24fc374e50..2dc49fb88d 100644
--- a/src/agents/realtime/session.py
+++ b/src/agents/realtime/session.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import asyncio
-import contextvars
 import dataclasses
 import inspect
 import json
@@ -27,7 +26,6 @@
 from ..tool import DEFAULT_APPROVAL_REJECTION_MESSAGE, FunctionTool, invoke_function_tool
 from ..tool_context import ToolContext
 from ..tracing import Span, agent_span
-from ..tracing.scope import Scope
 from ..tracing.span_data import AgentSpanData
 from ..tracing.spans import NoOpSpan
 from ..util._approvals import evaluate_needs_approval_setting
@@ -200,9 +198,6 @@ def __init__(
         self._tool_call_tasks: set[asyncio.Task[Any]] = set()
         self._async_tool_calls: bool = bool(self._run_config.get("async_tool_calls", True))
         self._current_agent_span: Span[AgentSpanData] | None = None
-        # Context-var token from the span created in __aenter__. Tracked here so _cleanup can
-        # reset it even if a handoff already called finish() on that span in a background task.
-        self._initial_span_token: contextvars.Token[Any] | None = None
 
     @property
     def model(self) -> RealtimeModel:
@@ -213,16 +208,14 @@ async def __aenter__(self) -> RealtimeSession:
         """Start the session by connecting to the model. After this, you will be able to stream
         events from the model and send messages and audio to the model.
         """
-        # Start an agent span for the initial agent. Use mark_as_current=False and manage
-        # the context-var token ourselves so _cleanup can reset it even when a handoff
-        # already called finish() on this span from a background task.
+        # Create the agent span. Do not install it as the current ContextVar span:
+        # asyncio tasks inherit a snapshot of their parent's context, so a bg task
+        # cannot update the main task's context var. Installing the span would leave a
+        # stale (finished) span as "current" after any handoff that runs in a bg task.
+        # Agent spans are emitted as children of the enclosing trace without being set
+        # as current, which is correct and avoids all cross-task ContextVar management.
         self._current_agent_span = self._make_agent_span(self._current_agent)
         self._current_agent_span.start(mark_as_current=False)
-        # Only install the span as current when it is a real span. Setting a NoOpSpan as
-        # current poisons the context: provider.create_span() returns NoOpSpan for every
-        # child span when it detects a no-op parent (provider.py _is_noop_span check).
-        if not isinstance(self._current_agent_span, NoOpSpan):
-            self._initial_span_token = Scope.set_current_span(self._current_agent_span)
 
         try:
             # Add ourselves as a listener
@@ -239,8 +232,9 @@ async def __aenter__(self) -> RealtimeSession:
             )
             model_config["initial_model_settings"] = initial_settings
 
-            # Reuse the tools/handoffs already resolved above — avoids a second call to
-            # get_all_tools()/get_handoffs() and ensures span metadata matches the model.
+            # Reuse the resolved tools/handoffs returned above — avoids a second call and
+            # ensures span metadata matches what was actually sent to the model, including
+            # any overrides applied by starting_settings.
             if not isinstance(self._current_agent_span, NoOpSpan):
                 self._current_agent_span.span_data.tools = [
                     n for t in resolved_tools if (n := get_tool_trace_name_for_tool(t)) is not None
@@ -266,9 +260,6 @@ async def __aenter__(self) -> RealtimeSession:
             if self._current_agent_span is not None:
                 self._current_agent_span.finish(reset_current=False)
                 self._current_agent_span = None
-            if self._initial_span_token is not None:
-                Scope.reset_current_span(self._initial_span_token)
-                self._initial_span_token = None
             raise
 
     async def enter(self) -> RealtimeSession:
@@ -280,14 +271,6 @@ async def enter(self) -> RealtimeSession:
 
     async def __aexit__(self, _exc_type: Any, _exc_val: Any, _exc_tb: Any) -> None:
         """End the session."""
-        # Reset the initial span token here. __aexit__ is always invoked in the same
-        # asyncio task as __aenter__, so this reset is unconditionally safe. We do it
-        # here rather than relying solely on _cleanup because _cleanup is also reachable
-        # from close() and __aiter__, which may run in different tasks and would raise
-        # ValueError on ContextVar.reset().
-        if self._initial_span_token is not None:
-            Scope.reset_current_span(self._initial_span_token)
-            self._initial_span_token = None
         await self.close()
 
     async def __aiter__(self) -> AsyncIterator[RealtimeSessionEvent]:
@@ -878,17 +861,12 @@ async def _handle_tool_call(
                 # Update current agent
                 self._current_agent = result
 
-                # Create the incoming agent span with a clear current-span context so that
-                # its parent is the trace root, not the finished outgoing agent span.
-                # The outgoing span is still "current" in this background task context
-                # (finish(reset_current=False) does not reset the context var), so we must
-                # temporarily clear it before calling _make_agent_span() / agent_span() —
-                # provider.create_span() reads Scope.get_current_span() at creation time to
-                # determine parent_id.
-                _handoff_clear_token = Scope.set_current_span(None)
+                # Create the incoming agent span. Because we never install agent spans as
+                # current (see __aenter__), this background task's context already holds the
+                # trace root as the current span — provider.create_span() will parent the new
+                # span to the trace root, making it a sibling of the outgoing agent span.
                 self._current_agent_span = self._make_agent_span(self._current_agent)
                 self._current_agent_span.start(mark_as_current=False)
-                Scope.reset_current_span(_handoff_clear_token)
 
                 # Get updated model settings from new agent; reuse resolved tools and
                 # handoffs for span metadata to avoid a redundant second call.
@@ -1326,17 +1304,6 @@ async def _cleanup(self) -> None:
         if self._current_agent_span is not None:
             self._current_agent_span.finish(reset_current=False)
             self._current_agent_span = None
-        # Reset the initial span's context-var token. __aexit__ handles this
-        # unconditionally (it runs in the same task as __aenter__). This fallback
-        # handles direct close() calls from the same task; if close() or __aiter__
-        # triggers _cleanup from a different task the ValueError is caught and the
-        # token is left for __aexit__ to reset.
-        if self._initial_span_token is not None:
-            try:
-                Scope.reset_current_span(self._initial_span_token)
-            except ValueError:
-                pass  # Cross-task call; __aexit__ will reset from the correct task.
-            self._initial_span_token = None
 
         # Cancel and cleanup guardrail tasks
         self._cleanup_guardrail_tasks()
@@ -1371,12 +1338,12 @@ async def _get_updated_model_settings_from_agent(
         self,
         starting_settings: RealtimeSessionModelSettings | None,
         agent: RealtimeAgent,
-    ) -> tuple[RealtimeSessionModelSettings, list[Any], list[Handoff[Any, RealtimeAgent[Any]]]]:
-        """Return (settings, resolved_tools, enabled_handoffs).
+    ) -> tuple[RealtimeSessionModelSettings, list[Any], list[Any]]:
+        """Return (settings, final_tools, final_handoffs).
 
-        resolved_tools and enabled_handoffs are captured before starting_settings overrides
-        so callers can use them for span metadata without re-invoking get_all_tools() or
-        _get_handoffs() a second time.
+        final_tools and final_handoffs reflect the values in the returned settings after
+        starting_settings overrides are applied. Callers must use these for span metadata
+        to ensure the span reports exactly what was sent to the model.
         """
         # Start with the merged base settings from run and model configuration.
         updated_settings = self._base_model_settings.copy()
@@ -1393,7 +1360,7 @@ async def _get_updated_model_settings_from_agent(
         updated_settings["tools"] = tools or []
         updated_settings["handoffs"] = handoffs or []
 
-        # Apply starting settings (from model config) next
+        # Apply starting_settings (from model config) — may override tools and handoffs.
         if starting_settings:
             updated_settings.update(starting_settings)
 
@@ -1401,7 +1368,10 @@ async def _get_updated_model_settings_from_agent(
         if disable_tracing:
             updated_settings["tracing"] = None
 
-        return updated_settings, list(tools or []), list(handoffs or [])
+        # Return the final tools/handoffs AFTER overrides so span metadata matches the model.
+        final_tools = list(updated_settings.get("tools") or [])
+        final_handoffs = list(updated_settings.get("handoffs") or [])
+        return updated_settings, final_tools, final_handoffs
 
     @classmethod
     async def _get_handoffs(
diff --git a/tests/realtime/test_session_spans.py b/tests/realtime/test_session_spans.py
index b37c38a5a5..e006029bbf 100644
--- a/tests/realtime/test_session_spans.py
+++ b/tests/realtime/test_session_spans.py
@@ -173,25 +173,22 @@ async def test_tracing_disabled_creates_no_agent_spans():
 
 @pytest.mark.asyncio
 async def test_no_active_trace_does_not_poison_span_context():
-    """Without an outer trace(), the session must not install a NoOpSpan as current.
+    """Without an outer trace(), the session must not alter the ambient span context.
 
-    Convention: provider returns NoOpSpan when no active trace exists. Installing
-    a NoOpSpan as current would make every span created afterward also a NoOpSpan
-    (provider._is_noop_span check). The session must skip Scope.set_current_span()
-    for NoOpSpans so ambient context is unchanged after the session closes.
+    Convention: RealtimeSession never installs agent spans as the ContextVar current span,
+    so the context is always unchanged before and after the session regardless of whether
+    a real trace exists.
     """
     span_before = Scope.get_current_span()
     agent = RealtimeAgent(name="agent")
     session = _make_session(agent)
 
-    # Enter/exit WITHOUT any enclosing trace — span will be a NoOpSpan.
+    # Enter/exit WITHOUT any enclosing trace.
     async with session:
         pass
 
     span_after = Scope.get_current_span()
-    assert span_before is span_after, (
-        "Session must not permanently alter the current span context when no active trace exists."
-    )
+    assert span_before is span_after, "Session must not permanently alter the current span context."
 
 
 @pytest.mark.asyncio
@@ -222,20 +219,17 @@ async def test_disabled_handoff_excluded_from_span_metadata():
 
 @pytest.mark.asyncio
 async def test_cleanup_from_different_task_does_not_raise():
-    """_cleanup called from a task other than __aenter__'s task must not raise ValueError.
+    """_cleanup called from a different asyncio task must not raise and must close the session.
 
     close() is public and __aiter__ also calls _cleanup when _stored_exception is set.
-    Both can run in a different asyncio task than __aenter__. Resetting a contextvars
-    token from a different task raises ValueError — this must be caught gracefully.
+    Both can run in a different asyncio task than __aenter__.
     """
     agent = RealtimeAgent(name="agent")
     session = _make_session(agent)
 
     with trace("test"):
-        await session.enter()  # open the session in this (main) task
+        await session.enter()
 
-        # Call _cleanup from a background task — it gets a copied context, so the
-        # token stored by __aenter__ in the main task cannot be reset here; must not raise.
         async def close_from_other_task() -> None:
             await session._cleanup()
 
@@ -245,10 +239,11 @@ async def close_from_other_task() -> None:
 
 
 @pytest.mark.asyncio
-async def test_span_context_clean_after_close_called_directly():
-    """Span context must be reset even when close() is called directly (no async with).
+async def test_span_context_unchanged_after_close_called_directly():
+    """Ambient span context must be unchanged whether exited via async with or close().
 
-    Method: enter via session.enter(), call close() directly, verify Scope is clean.
+    Convention: RealtimeSession never installs agent spans as the ContextVar current span,
+    so close() has no context cleanup to perform; state before and after is identical.
     """
     span_before = Scope.get_current_span()
     agent = RealtimeAgent(name="agent")
@@ -259,7 +254,7 @@ async def test_span_context_clean_after_close_called_directly():
         await session.close()
 
     span_after = Scope.get_current_span()
-    assert span_before is span_after, "Calling close() directly must still reset the span context."
+    assert span_before is span_after, "Calling close() directly must not alter the span context."
 
 
 @pytest.mark.asyncio
@@ -300,3 +295,40 @@ async def test_handoff_span_is_sibling_not_child_of_initial_span():
         "Specialist span must not be a child of the router span. "
         f"specialist.parent_id={specialist_span.parent_id}, router.span_id={router_span.span_id}"
     )
+
+
+@pytest.mark.asyncio
+async def test_span_tool_metadata_reflects_model_config_override():
+    """model_config.initial_model_settings tool override must be reflected in span metadata.
+
+    Convention: span metadata must match what was actually sent to the model. When
+    initial_model_settings overrides tools (e.g. to empty), the span must show the
+    override — not the agent's default tool list.
+    """
+    from agents.tool import function_tool
+
+    @function_tool
+    def my_tool() -> str:
+        """A test tool."""
+        return "ok"
+
+    agent = RealtimeAgent(name="tool_agent", tools=[my_tool])
+    # model_config overrides tools with an empty list, wiping the agent's tool.
+    session = RealtimeSession(
+        model=_FakeRealtimeModel(),
+        agent=agent,
+        context=None,
+        model_config={"initial_model_settings": {"tools": []}},
+        run_config={},
+    )
+
+    with trace("test"):
+        async with session:
+            pass
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert agent_spans[0].span_data.tools is None, (
+        f"model_config tool override must clear tools from span, "
+        f"got: {agent_spans[0].span_data.tools}"
+    )

From ee9f76b67ccacbe9aed3c5ae0fa56d43c53455e2 Mon Sep 17 00:00:00 2001
From: jordanchendev <jordan.chen.dev@gmail.com>
Date: Thu, 4 Jun 2026 14:06:24 +0800
Subject: [PATCH 7/8] fix: add __aenter__ failure test and update
 _make_agent_span docstring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add test_aenter_failure_finishes_span to cover the except BaseException cleanup
path in __aenter__ — the final missing entry from the tracing conventions test
checklist (Rule 9). Update the _make_agent_span docstring to reflect the current
design: callers now populate span_data from the tuple returned by
_get_updated_model_settings_from_agent() rather than calling
get_all_tools() / _get_handoffs() separately.
---
 src/agents/realtime/session.py       |  8 +++----
 tests/realtime/test_session_spans.py | 33 ++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
index 2dc49fb88d..266e7b2723 100644
--- a/src/agents/realtime/session.py
+++ b/src/agents/realtime/session.py
@@ -1326,10 +1326,10 @@ async def _cleanup(self) -> None:
     def _make_agent_span(self, agent: RealtimeAgent) -> Span[AgentSpanData]:
         """Create a new agent span for the given agent, respecting tracing_disabled.
 
-        Both tool names and handoff names are intentionally omitted here. Callers must
-        update span_data.tools via get_all_tools() and span_data.handoffs via
-        _get_handoffs() asynchronously to reflect only what is actually sent to the model
-        (respects is_enabled on both tools and handoffs).
+        Tool and handoff names are intentionally omitted here. Callers must populate
+        span_data.tools and span_data.handoffs from the tuple returned by
+        _get_updated_model_settings_from_agent() so that metadata reflects what was
+        actually sent to the model (after is_enabled filtering and any model_config overrides).
         """
         disabled: bool = bool(self._run_config.get("tracing_disabled", False))
         return agent_span(name=agent.name, disabled=disabled)
diff --git a/tests/realtime/test_session_spans.py b/tests/realtime/test_session_spans.py
index e006029bbf..9c16035ed5 100644
--- a/tests/realtime/test_session_spans.py
+++ b/tests/realtime/test_session_spans.py
@@ -297,6 +297,39 @@ async def test_handoff_span_is_sibling_not_child_of_initial_span():
     )
 
 
+@pytest.mark.asyncio
+async def test_aenter_failure_finishes_span():
+    """If __aenter__ raises after the span is started, the span must still be finished.
+
+    Python does not call __aexit__ when __aenter__ raises, so the except BaseException
+    block in __aenter__ is the only cleanup path. Verify no unfinished span is leaked.
+    """
+
+    class _FailingConnectModel(_FakeRealtimeModel):
+        async def connect(self, options: Any) -> None:
+            raise RuntimeError("simulated connection failure")
+
+    agent = RealtimeAgent(name="agent")
+    session = RealtimeSession(
+        model=_FailingConnectModel(),
+        agent=agent,
+        context=None,
+        run_config={},
+    )
+
+    with trace("test"):
+        with pytest.raises(RuntimeError):
+            async with session:
+                pass
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert len(agent_spans) == 1, f"Expected 1 agent span, got {len(agent_spans)}"
+    assert agent_spans[0].ended_at is not None, (
+        "Agent span must be finished (not leaked) when __aenter__ raises."
+    )
+
+
 @pytest.mark.asyncio
 async def test_span_tool_metadata_reflects_model_config_override():
     """model_config.initial_model_settings tool override must be reflected in span metadata.

From 483f41e706a309fbd6f5129bc0c99e1cc2f592fe Mon Sep 17 00:00:00 2001
From: jordanchendev <jordan.chen.dev@gmail.com>
Date: Thu, 4 Jun 2026 14:32:06 +0800
Subject: [PATCH 8/8] fix: finish outgoing span and start new one in
 update_agent()

update_agent() now mirrors the handoff path: it finishes the outgoing
agent span (reset_current=False), creates and starts a new span for
the incoming agent (mark_as_current=False), and populates the span
metadata from the tuple returned by
_get_updated_model_settings_from_agent().

Adds test_update_agent_finishes_old_span_and_starts_new_one to
verify both agent spans are emitted and the original span is closed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/agents/realtime/session.py       | 20 +++++++++++++++-
 tests/realtime/test_session_spans.py | 35 ++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/agents/realtime/session.py b/src/agents/realtime/session.py
index 266e7b2723..cf745919a2 100644
--- a/src/agents/realtime/session.py
+++ b/src/agents/realtime/session.py
@@ -315,13 +315,31 @@ async def interrupt(self) -> None:
 
     async def update_agent(self, agent: RealtimeAgent) -> None:
         """Update the active agent for this session and apply its settings to the model."""
+        # Finish the outgoing agent span before switching agents, mirroring the handoff path.
+        if self._current_agent_span is not None:
+            self._current_agent_span.finish(reset_current=False)
+
         self._current_agent = agent
+        self._current_agent_span = self._make_agent_span(self._current_agent)
+        self._current_agent_span.start(mark_as_current=False)
 
-        updated_settings, _, _ = await self._get_updated_model_settings_from_agent(
+        (
+            updated_settings,
+            resolved_tools,
+            enabled_handoffs,
+        ) = await self._get_updated_model_settings_from_agent(
             starting_settings=None,
             agent=self._current_agent,
         )
 
+        if not isinstance(self._current_agent_span, NoOpSpan):
+            self._current_agent_span.span_data.tools = [
+                n for t in resolved_tools if (n := get_tool_trace_name_for_tool(t)) is not None
+            ] or None
+            self._current_agent_span.span_data.handoffs = [
+                h.agent_name for h in enabled_handoffs
+            ] or None
+
         await self._model.send_event(
             RealtimeModelSendSessionUpdate(session_settings=updated_settings)
         )
diff --git a/tests/realtime/test_session_spans.py b/tests/realtime/test_session_spans.py
index 9c16035ed5..9d038e9753 100644
--- a/tests/realtime/test_session_spans.py
+++ b/tests/realtime/test_session_spans.py
@@ -365,3 +365,38 @@ def my_tool() -> str:
         f"model_config tool override must clear tools from span, "
         f"got: {agent_spans[0].span_data.tools}"
     )
+
+
+@pytest.mark.asyncio
+async def test_update_agent_finishes_old_span_and_starts_new_one():
+    """update_agent() must finish the outgoing span and emit a new span for the incoming agent.
+
+    Convention: update_agent() is the public API equivalent of a handoff. It must mirror
+    the handoff path: finish the current agent span, then create and start a new one for
+    the incoming agent. Without this, activity after the switch is attributed to the wrong
+    agent and no span is emitted for the new agent.
+    """
+    original = RealtimeAgent(name="original_agent")
+    replacement = RealtimeAgent(name="replacement_agent")
+    model = _FakeRealtimeModel()
+    session = _make_session(original, model)
+
+    with trace("test"):
+        async with session:
+            await session.update_agent(replacement)
+
+    spans = SPAN_PROCESSOR_TESTING.get_ordered_spans()
+    agent_spans = [s for s in spans if isinstance(s.span_data, AgentSpanData)]
+    assert len(agent_spans) == 2, (
+        f"Expected 2 agent spans (original + replacement), got {len(agent_spans)}"
+    )
+
+    names = {s.span_data.name for s in agent_spans}
+    assert names == {"original_agent", "replacement_agent"}, (
+        f"Expected spans for both agents, got: {names}"
+    )
+
+    original_span = next(s for s in agent_spans if s.span_data.name == "original_agent")
+    assert original_span.ended_at is not None, (
+        "Original agent span must be finished after update_agent()"
+    )