From 5e31ae41f58326ba1f1d4ccbdbdd8549562091c4 Mon Sep 17 00:00:00 2001
From: Kazuhiro Sera <seratch@openai.com>
Date: Sat, 2 May 2026 14:46:59 +0900
Subject: [PATCH 1/3] test: add more unit tests for uncovered patterns

---
 .../experiemental/codex/test_payloads.py      |  45 +++++++
 tests/sandbox/test_session_state_roundtrip.py |  95 ++++++++++++++
 tests/sandbox/test_token_truncation.py        |  96 ++++++++++++++
 tests/sandbox/test_workspace_payloads.py      | 123 ++++++++++++++++++
 tests/test_pretty_print.py                    |  58 ++++++++-
 tests/test_run_internal_approvals.py          | 123 ++++++++++++++++++
 6 files changed, 538 insertions(+), 2 deletions(-)
 create mode 100644 tests/extensions/experiemental/codex/test_payloads.py
 create mode 100644 tests/sandbox/test_token_truncation.py
 create mode 100644 tests/sandbox/test_workspace_payloads.py
 create mode 100644 tests/test_run_internal_approvals.py

diff --git a/tests/extensions/experiemental/codex/test_payloads.py b/tests/extensions/experiemental/codex/test_payloads.py
new file mode 100644
index 0000000000..3041e7d324
--- /dev/null
+++ b/tests/extensions/experiemental/codex/test_payloads.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+import pytest
+
+from agents.extensions.experimental.codex.items import AgentMessageItem, TodoItem, TodoListItem
+
+
+def test_dict_like_supports_mapping_access_for_dataclass_fields() -> None:
+    """Dataclass-backed items answer ``[]``, ``get``, ``in`` and ``keys`` like a mapping."""
+    message = AgentMessageItem(id="item-1", text="hello")
+    assert message["id"] == "item-1"
+    assert message["text"] == "hello"
+    assert message["type"] == "agent_message"
+    assert message.get("text") == "hello"
+    assert message.get("missing", "fallback") == "fallback"
+    assert "id" in message
+    assert "missing" not in message
+    assert object() not in message
+    assert list(message.keys()) == ["id", "text", "type"]
+
+
+def test_dict_like_raises_key_error_for_unknown_fields() -> None:
+    """Subscripting with an unknown field name raises ``KeyError``."""
+    message = AgentMessageItem(id="item-1", text="hello")
+    with pytest.raises(KeyError, match="missing"):
+        _ = message["missing"]
+
+
+def test_dict_like_as_dict_recursively_converts_nested_dataclasses() -> None:
+    """``as_dict`` turns the nested ``TodoItem`` entries into plain dictionaries."""
+    todos = [
+        TodoItem(text="write tests", completed=True),
+        TodoItem(text="run tests", completed=False),
+    ]
+    todo_list = TodoListItem(id="todo-list-1", items=todos)
+    expected = {
+        "id": "todo-list-1",
+        "items": [
+            {"text": "write tests", "completed": True},
+            {"text": "run tests", "completed": False},
+        ],
+        "type": "todo_list",
+    }
+
+    assert todo_list.as_dict() == expected
diff --git a/tests/sandbox/test_session_state_roundtrip.py b/tests/sandbox/test_session_state_roundtrip.py
index f90d0b8bba..7c0ac73ec7 100644
--- a/tests/sandbox/test_session_state_roundtrip.py
+++ b/tests/sandbox/test_session_state_roundtrip.py
@@ -12,6 +12,9 @@
 from pathlib import Path
 from typing import Literal
 
+import pytest
+from pydantic import ValidationError
+
 from agents.sandbox import Manifest
 from agents.sandbox.session import SandboxSessionState
 from agents.sandbox.snapshot import LocalSnapshot
@@ -27,6 +30,21 @@ class _StubSessionState(SandboxSessionState):
     custom_field: str
 
 
+class _PlainTypeSessionState(SandboxSessionState):
+    __test__ = False  # pytest convention: never collect this helper as a test class
+    type: str = "plain-type"  # plain ``str`` annotation, deliberately not a ``Literal``
+
+
+class _EmptyDefaultSessionState(SandboxSessionState):
+    __test__ = False  # pytest convention: never collect this helper as a test class
+    type: Literal[""] = ""  # ``Literal`` annotation whose default is the empty string
+
+
+class _SimpleSessionState(SandboxSessionState):
+    __test__ = False  # pytest convention: never collect this helper as a test class
+    type: Literal["simple-roundtrip"] = "simple-roundtrip"  # non-empty ``Literal`` default
+
+
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -93,3 +111,80 @@ def test_model_dump_preserves_snapshot_subclass_fields(self) -> None:
         dumped = state.model_dump()
 
         assert "base_path" in dumped["snapshot"]
+
+    def test_parse_returns_subclass_instances_as_is(self) -> None:
+        """``parse`` returns the identical object for a state from ``_make_session_state``."""
+        original = _make_session_state()
+        assert SandboxSessionState.parse(original) is original
+
+    def test_parse_upgrades_base_instance_through_registry(self) -> None:
+        """A base-class instance carrying a registered ``type`` is parsed into the subclass."""
+        session_id = uuid.UUID("bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb")
+        typed_state = _SimpleSessionState(
+            session_id=session_id,
+            snapshot=LocalSnapshot(id="snap-1", base_path=Path("/tmp/snapshots")),
+            manifest=Manifest(),
+        )
+        as_base = SandboxSessionState.model_validate(typed_state.model_dump())
+        upgraded = SandboxSessionState.parse(as_base)
+        assert type(upgraded) is _SimpleSessionState
+        assert upgraded.session_id == session_id
+
+    @pytest.mark.parametrize(
+        ("payload", "error_type", "message"),
+        [
+            ({}, ValueError, "must include a string `type`"),  # mapping without ``type``
+            ({"type": "missing"}, ValueError, "unknown sandbox session state type `missing`"),
+            ("not-a-state", TypeError, "session state payload must be"),  # bare string payload
+        ],
+    )
+    def test_parse_rejects_invalid_payloads(
+        self,
+        payload: object,
+        error_type: type[Exception],
+        message: str,
+    ) -> None:
+        with pytest.raises(error_type, match=message):  # ``match`` is a regex search
+            SandboxSessionState.parse(payload)
+
+    def test_subclass_registration_skips_non_literal_or_empty_type_defaults(self) -> None:
+        registry = SandboxSessionState._subclass_registry  # neither default may appear in it
+        assert "plain-type" not in registry and "" not in registry
+
+    @pytest.mark.parametrize(
+        ("raw_ports", "expected"),
+        [
+            (None, ()),  # missing -> empty tuple
+            (8080, (8080,)),  # scalar -> one-element tuple
+            ([8080, 9000, 8080], (8080, 9000)),  # duplicate dropped, first-seen order kept
+        ],
+    )
+    def test_exposed_ports_are_normalized(
+        self, raw_ports: object, expected: tuple[int, ...]
+    ) -> None:
+        """``exposed_ports`` input is normalized to a tuple when the state is constructed."""
+        session = _StubSessionState(
+            snapshot=LocalSnapshot(id="snap-1", base_path=Path("/tmp/snapshots")),
+            manifest=Manifest(),
+            custom_field="my-value",
+            exposed_ports=raw_ports,  # type: ignore[arg-type]
+        )
+        assert session.exposed_ports == expected
+
+    @pytest.mark.parametrize(
+        ("raw_ports", "message"),
+        [
+            ("8080", "exposed_ports must be an iterable"),  # a lone string is rejected
+            ([8080, "9000"], "exposed_ports must contain integers"),  # string element
+            ([0], "exposed_ports entries must be between 1 and 65535"),  # below range
+            ([65536], "exposed_ports entries must be between 1 and 65535"),  # above range
+        ],
+    )
+    def test_exposed_ports_reject_invalid_values(self, raw_ports: object, message: str) -> None:
+        with pytest.raises((TypeError, ValidationError), match=message):
+            _StubSessionState(
+                snapshot=LocalSnapshot(id="snap-1", base_path=Path("/tmp/snapshots")),
+                manifest=Manifest(),
+                custom_field="my-value",
+                exposed_ports=raw_ports,  # type: ignore[arg-type]
+            )
diff --git a/tests/sandbox/test_token_truncation.py b/tests/sandbox/test_token_truncation.py
new file mode 100644
index 0000000000..fdd0f0627c
--- /dev/null
+++ b/tests/sandbox/test_token_truncation.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+from agents.sandbox.util.token_truncation import (
+    TruncationPolicy,
+    approx_bytes_for_tokens,
+    approx_token_count,
+    approx_tokens_from_byte_count,
+    format_truncation_marker,
+    formatted_truncate_text,
+    formatted_truncate_text_with_token_count,
+    removed_units_for_source,
+    split_budget,
+    split_string,
+    truncate_text,
+    truncate_with_byte_estimate,
+    truncate_with_token_budget,
+)
+
+
+def test_truncation_policy_clamps_negative_limits_and_converts_budgets() -> None:
+    """Negative limits clamp to zero, and both derived budgets are zero as well."""
+    clamped_policies = (
+        TruncationPolicy.bytes(-10),
+        TruncationPolicy.tokens(-2),
+    )
+    for policy in clamped_policies:
+        assert policy.limit == 0
+        assert policy.token_budget() == 0
+        assert policy.byte_budget() == 0
+
+
+def test_formatted_truncate_text_returns_short_content_unchanged() -> None:
+    assert formatted_truncate_text("short", TruncationPolicy.bytes(20)) == "short"  # 5 < 20 bytes
+
+
+def test_formatted_truncate_text_adds_line_count_when_truncated() -> None:
+    """Truncated output starts with a total-line-count header and carries a marker."""
+    truncated = formatted_truncate_text("alpha\nbeta\ngamma", TruncationPolicy.bytes(8))
+    assert truncated.startswith("Total output lines: 3\n\n")
+    assert "chars truncated" in truncated
+
+
+def test_formatted_truncate_text_with_token_count_handles_none_and_short_content() -> None:
+    for max_tokens in (None, 10):  # no limit, then a limit the text already fits
+        assert formatted_truncate_text_with_token_count("short", max_tokens) == ("short", None)
+
+
+def test_formatted_truncate_text_with_token_count_reports_original_count() -> None:
+    """Truncating returns the rendered text plus the input's approximate token count."""
+    source = "abcdefghi"
+    rendered, reported_tokens = formatted_truncate_text_with_token_count(source, 1)
+    assert rendered.startswith("Total output lines: 1\n\n") and "tokens truncated" in rendered
+    assert reported_tokens == approx_token_count(source)
+
+
+def test_truncate_text_dispatches_byte_and_token_modes() -> None:  # one call per policy kind
+    assert truncate_text("abcdef", TruncationPolicy.bytes(4)).startswith("a")  # head is kept
+    assert "tokens truncated" in truncate_text("abcdefghi", TruncationPolicy.tokens(1))
+
+
+def test_truncate_with_token_budget_handles_empty_and_short_content() -> None:
+    for text in ("", "abc"):  # empty input, then input that comes back untruncated
+        assert truncate_with_token_budget(text, TruncationPolicy.tokens(1)) == (text, None)
+
+
+def test_truncate_with_byte_estimate_handles_empty_zero_and_short_content() -> None:
+    assert truncate_with_byte_estimate("", TruncationPolicy.bytes(0)) == ""  # nothing to cut
+    assert "chars truncated" in truncate_with_byte_estimate("abc", TruncationPolicy.bytes(0))
+    assert truncate_with_byte_estimate("abc", TruncationPolicy.bytes(10)) == "abc"  # fits budget
+
+
+def test_split_string_preserves_utf8_boundaries() -> None:
+    """The 2-byte prefix budget stops before ``あ`` rather than cutting it mid-character."""
+    removed, head, tail = split_string("aあbいc", 2, 4)
+    assert head == "a"
+    assert tail == "いc"
+    assert removed == 2
+
+
+def test_split_string_handles_empty_content() -> None:
+    assert split_string("", 10, 10) == (0, "", "")  # nothing removed, empty prefix and suffix
+
+
+def test_formatting_and_estimate_helpers() -> None:
+    """Spot-check the marker formatter, budget splitter and byte/token estimators."""
+    by_bytes = TruncationPolicy.bytes(8)
+    by_tokens = TruncationPolicy.tokens(2)
+    assert "chars truncated" in format_truncation_marker(by_bytes, 3)
+    assert "tokens truncated" in format_truncation_marker(by_tokens, 2)
+    assert split_budget(5) == (2, 3)
+    assert removed_units_for_source(by_bytes, removed_bytes=10, removed_chars=4) == 4
+    assert removed_units_for_source(by_tokens, removed_bytes=9, removed_chars=4) == 3
+    assert approx_token_count("abcde") == 2
+    assert approx_bytes_for_tokens(-1) == 0
+    assert approx_tokens_from_byte_count(0) == 0
+    assert approx_tokens_from_byte_count(5) == 2
diff --git a/tests/sandbox/test_workspace_payloads.py b/tests/sandbox/test_workspace_payloads.py
new file mode 100644
index 0000000000..3a5b8d2b2c
--- /dev/null
+++ b/tests/sandbox/test_workspace_payloads.py
@@ -0,0 +1,123 @@
+from __future__ import annotations
+
+import io
+from pathlib import Path
+from typing import Any, cast
+
+import pytest
+
+from agents.sandbox.errors import ErrorCode, WorkspaceWriteTypeError
+from agents.sandbox.session.workspace_payloads import coerce_write_payload
+
+
+class _Headers:  # header-mapping stand-in that stores one fixed value
+    def __init__(self, value: str | None) -> None:
+        self._value = value
+
+    def get(self, name: str) -> str | None:
+        assert name == "Content-Length"  # only this header is expected to be looked up
+        return self._value
+
+
+class _HeaderStream(io.BytesIO):  # in-memory stream that also carries ``headers``
+    def __init__(self, data: bytes, content_length: str | None) -> None:
+        super().__init__(data)
+        self.headers = _Headers(content_length)  # stub whose ``get`` returns content_length
+
+
+class _LengthStream(io.BytesIO):  # in-memory stream that also carries ``length``
+    def __init__(self, data: bytes, length: int) -> None:
+        super().__init__(data)
+        self.length = length  # independent of len(data); the caller picks the value
+
+
+class _NoneReadStream:  # reader whose ``read`` always returns None
+    def read(self, size: int = -1) -> Any:
+        _ = size  # argument is intentionally unused
+        return None
+
+
+class _BytearrayReadStream:  # reader whose ``read`` returns a ``bytearray`` every time
+    def read(self, size: int = -1) -> Any:
+        _ = size  # argument is intentionally unused
+        return bytearray(b"abc")
+
+
+class _TextReadStream:  # reader whose ``read`` returns ``str`` instead of bytes
+    def read(self, size: int = -1) -> Any:
+        _ = size  # argument is intentionally unused
+        return "not-bytes"
+
+
+class _UnseekableStream(io.BytesIO):  # in-memory stream whose position cannot be queried
+    def tell(self) -> int:
+        raise OSError("not seekable")
+
+
+def test_coerce_write_payload_adapts_binary_reads() -> None:
+    """A ``BytesIO`` source yields a readable stream with a content length of 3."""
+    coerced = coerce_write_payload(path=Path("/workspace/file.bin"), data=io.BytesIO(b"abc"))
+    assert coerced.content_length == 3
+    assert coerced.stream.readable() is True
+    assert coerced.stream.read(1) == b"a"
+    assert coerced.stream.read() == b"bc"
+
+
+def test_coerce_write_payload_adapts_bytearray_and_none_reads() -> None:
+    """Sources whose ``read`` returns ``bytearray`` or ``None`` still read back as byte data."""
+    from_bytearray = coerce_write_payload(
+        path=Path("/workspace/file.bin"),
+        data=cast(io.IOBase, _BytearrayReadStream()),
+    )
+    from_none = coerce_write_payload(
+        path=Path("/workspace/empty.bin"),
+        data=cast(io.IOBase, _NoneReadStream()),
+    )
+    assert from_bytearray.stream.read() == b"abc"
+    assert from_none.stream.read() == b""
+
+
+def test_coerce_write_payload_supports_readinto_seek_and_tell() -> None:
+    """The coerced stream supports ``readinto``, ``tell``, ``seek`` and sized reads."""
+    source = io.BytesIO(b"abcdef")
+    coerced = coerce_write_payload(path=Path("/workspace/file.bin"), data=source)
+    scratch = bytearray(3)
+    assert cast(Any, coerced.stream).readinto(scratch) == 3 and bytes(scratch) == b"abc"
+    assert coerced.stream.tell() == 3
+    assert coerced.stream.seek(1) == 1
+    assert coerced.stream.read(2) == b"bc"
+
+
+def test_coerce_write_payload_rejects_text_chunks() -> None:
+    """Reading a ``str`` chunk raises ``WorkspaceWriteTypeError`` with path and type context."""
+    target = Path("/workspace/file.txt")
+    coerced = coerce_write_payload(path=target, data=cast(io.IOBase, _TextReadStream()))
+    # Coercion itself succeeds; the error is only expected once the stream is read.
+    with pytest.raises(WorkspaceWriteTypeError) as raised:
+        coerced.stream.read()
+
+    error = raised.value
+    assert error.error_code is ErrorCode.WORKSPACE_WRITE_TYPE_ERROR
+    assert error.context == {
+        "path": "/workspace/file.txt",
+        "actual_type": "str",
+    }
+
+
+@pytest.mark.parametrize(
+    ("stream", "expected"),
+    [
+        (_LengthStream(b"abc", 5), 5),  # ``length`` attribute wins over the 3-byte data
+        (_HeaderStream(b"abc", "7"), 7),  # numeric Content-Length header is used
+        (_HeaderStream(b"abc", "-1"), 3),  # negative header rejected; 3 == len(b"abc")
+        (_HeaderStream(b"abc", "invalid"), 3),  # non-numeric header rejected likewise
+        (_UnseekableStream(b"abc"), None),  # ``tell`` raises, so no length is reported
+    ],
+)
+def test_coerce_write_payload_uses_best_effort_content_length(
+    stream: io.IOBase,
+    expected: int | None,
+) -> None:
+    """``content_length`` matches the expectation for each kind of length hint."""
+    coerced = coerce_write_payload(path=Path("/workspace/file.bin"), data=stream)
+    assert coerced.content_length == expected
diff --git a/tests/test_pretty_print.py b/tests/test_pretty_print.py
index b2218a279d..79327cfb92 100644
--- a/tests/test_pretty_print.py
+++ b/tests/test_pretty_print.py
@@ -4,9 +4,13 @@
 from inline_snapshot import snapshot
 from pydantic import BaseModel
 
-from agents import Agent, Runner
+from agents import Agent, RunContextWrapper, RunErrorDetails, Runner, RunResult
 from agents.agent_output import _WRAPPER_DICT_KEY
-from agents.util._pretty_print import pretty_print_result, pretty_print_run_result_streaming
+from agents.util._pretty_print import (
+    pretty_print_result,
+    pretty_print_run_error_details,
+    pretty_print_run_result_streaming,
+)
 from tests.fake_model import FakeModel
 
 from .test_responses import get_final_output_message, get_text_message
@@ -33,6 +37,56 @@ async def test_pretty_result():
 """)
 
 
+def test_pretty_result_handles_none_final_output():
+    agent = Agent(name="none_agent")
+    result = RunResult(
+        input="Hello",
+        new_items=[],
+        raw_responses=[],
+        final_output=None,
+        input_guardrail_results=[],
+        output_guardrail_results=[],
+        tool_input_guardrail_results=[],
+        tool_output_guardrail_results=[],
+        context_wrapper=RunContextWrapper(context=None),
+        _last_agent=agent,
+    )
+    # With ``final_output=None`` the expected text shows ``NoneType`` and a literal ``None``.
+    assert pretty_print_result(result) == snapshot("""\
+RunResult:
+- Last agent: Agent(name="none_agent", ...)
+- Final output (NoneType):
+    None
+- 0 new item(s)
+- 0 raw response(s)
+- 0 input guardrail result(s)
+- 0 output guardrail result(s)
+(See `RunResult` for more details)\
+""")
+
+
+def test_pretty_run_error_details():
+    agent = Agent(name="error_agent")
+    details = RunErrorDetails(
+        input="Hello",
+        new_items=[],
+        raw_responses=[],
+        last_agent=agent,
+        context_wrapper=RunContextWrapper(context=None),
+        input_guardrail_results=[],
+        output_guardrail_results=[],
+    )
+    # Expected text lists the last agent, new items, raw responses and input guardrails.
+    assert pretty_print_run_error_details(details) == snapshot("""\
+RunErrorDetails:
+- Last agent: Agent(name="error_agent", ...)
+- 0 new item(s)
+- 0 raw response(s)
+- 0 input guardrail result(s)
+(See `RunErrorDetails` for more details)\
+""")
+
+
 @pytest.mark.asyncio
 async def test_pretty_run_result_streaming():
     model = FakeModel()
diff --git a/tests/test_run_internal_approvals.py b/tests/test_run_internal_approvals.py
new file mode 100644
index 0000000000..44c57f137e
--- /dev/null
+++ b/tests/test_run_internal_approvals.py
@@ -0,0 +1,123 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, cast
+
+from openai.types.responses import ResponseFunctionToolCall
+
+from agents import Agent
+from agents.items import MessageOutputItem, ToolCallOutputItem, TResponseInputItem
+from agents.run_internal.approvals import (
+    _build_function_tool_call_for_approval_error,
+    append_approval_error_output,
+    append_input_items_excluding_approvals,
+    approvals_from_step,
+    filter_tool_approvals,
+)
+from tests.utils.factories import make_message_output, make_tool_approval_item, make_tool_call
+
+
+@dataclass
+class _Step:  # minimal step stand-in that carries ``interruptions``
+    interruptions: list[Any]
+
+
+@dataclass
+class _NoInterruptionsStep:  # step stand-in with no ``interruptions`` attribute at all
+    value: str
+
+
+class _NamespacedToolCall:  # non-dict tool call exposing ``namespace`` as a class attribute
+    namespace = "object_namespace"
+
+
+def test_filter_tool_approvals_keeps_only_approval_items() -> None:
+    """Non-approval entries (a string, a bare object) are dropped from the result."""
+    approval_item = make_tool_approval_item(Agent(name="test"))
+    mixed_items = ["text", approval_item, object()]
+    assert filter_tool_approvals(mixed_items) == [approval_item]
+
+
+def test_approvals_from_step_handles_missing_and_mixed_interruptions() -> None:
+    """Steps lacking ``interruptions`` give ``[]``; mixed lists keep only approvals."""
+    approval_item = make_tool_approval_item(Agent(name="test"))
+    bare_step, mixed_step = _NoInterruptionsStep("none"), _Step(["other", approval_item])
+    assert approvals_from_step(bare_step) == []
+    assert approvals_from_step(mixed_step) == [approval_item]
+
+
+def test_append_input_items_excluding_approvals_skips_approval_placeholders() -> None:
+    """Only the message item reaches the input list; the approval item is left out."""
+    owner = Agent(name="test")
+    collected: list[TResponseInputItem] = []
+    new_items = [
+        MessageOutputItem(agent=owner, raw_item=make_message_output(text="done")),
+        make_tool_approval_item(owner, call_id="call_approval"),
+    ]
+    append_input_items_excluding_approvals(collected, new_items)
+    assert [cast(dict[str, Any], entry)["type"] for entry in collected] == ["message"]
+
+
+def test_append_approval_error_output_emits_function_tool_output() -> None:
+    """With ``call_id=None`` the emitted raw item carries the call id ``unknown``."""
+    owner = Agent(name="test")
+    emitted: list[Any] = []
+    append_approval_error_output(
+        generated_items=emitted,
+        agent=owner,
+        tool_call={"namespace": "dict_namespace"},
+        tool_name="needs_approval",
+        call_id=None,
+        message="approval denied",
+    )
+
+    (error_output,) = emitted  # exactly one item must have been appended
+    assert isinstance(error_output, ToolCallOutputItem)
+    assert error_output.agent is owner
+    assert error_output.output == "approval denied"
+    expected_raw = {
+        "type": "function_call_output",
+        "call_id": "unknown",
+        "output": "approval denied",
+    }
+    assert error_output.raw_item == expected_raw
+
+
+def test_build_function_tool_call_for_approval_error_reuses_typed_calls() -> None:
+    """A call built by ``make_tool_call`` is handed back as the same object, not rebuilt."""
+    typed_call = make_tool_call(call_id="call_1", name="typed_tool")
+    returned = _build_function_tool_call_for_approval_error(typed_call, "ignored", "ignored")
+
+    assert returned is typed_call
+
+
+def test_build_function_tool_call_for_approval_error_preserves_namespace_sources() -> None:
+    """``namespace`` is carried over from both a dict key and an object attribute."""
+    dict_call = _build_function_tool_call_for_approval_error(
+        {"namespace": "dict_namespace"},
+        "dict_tool",
+        "call_dict",
+    )
+    object_call = _build_function_tool_call_for_approval_error(
+        _NamespacedToolCall(),
+        "object_tool",
+        "call_object",
+    )
+    assert isinstance(dict_call, ResponseFunctionToolCall)
+    assert dict_call.namespace == "dict_namespace"
+    assert dict_call.call_id == "call_dict"
+    assert object_call.namespace == "object_namespace"
+    assert object_call.call_id == "call_object"
+
+
+def test_build_function_tool_call_for_approval_error_ignores_empty_namespaces() -> None:
+    """An empty ``namespace`` string is not propagated onto the built tool call."""
+    built = _build_function_tool_call_for_approval_error(
+        {"namespace": ""},
+        "tool",
+        "call_1",
+    )
+    assert getattr(built, "namespace", None) is None
+    assert built.name == "tool"
+    assert built.arguments == "{}"
+    assert built.status == "completed"

From 6d06268ab02ab6ff6482d006a526a09fde69da6a Mon Sep 17 00:00:00 2001
From: Kazuhiro Sera <seratch@openai.com>
Date: Sat, 2 May 2026 15:01:56 +0900
Subject: [PATCH 2/3] test: organize tests into per-area directories

---
 tests/conftest.py                                            | 2 +-
 .../{test_sandbox_blaxel.py => sandbox/test_blaxel.py}       | 0
 .../test_cloudflare.py}                                      | 0
 .../{test_sandbox_daytona.py => sandbox/test_daytona.py}     | 0
 .../extensions/{test_sandbox_e2b.py => sandbox/test_e2b.py}  | 0
 .../{test_sandbox_modal.py => sandbox/test_modal.py}         | 0
 .../{test_sandbox_runloop.py => sandbox/test_runloop.py}     | 0
 .../{ => sandbox}/test_runloop_capabilities_example.py       | 4 ++--
 .../test_runloop_mounts.py}                                  | 0
 .../{test_sandbox_vercel.py => sandbox/test_vercel.py}       | 0
 tests/{ => memory}/test_openai_conversations_session.py      | 0
 tests/{ => memory}/test_session.py                           | 5 ++---
 tests/{ => memory}/test_session_limit.py                     | 2 +-
 tests/{ => models}/test_anthropic_thinking_blocks.py         | 0
 tests/{ => models}/test_extended_thinking_message_order.py   | 0
 tests/{ => models}/test_gemini_thought_signatures.py         | 0
 tests/{ => models}/test_gemini_thought_signatures_stream.py  | 0
 tests/{ => models}/test_model_payload_iterators.py           | 0
 tests/{ => models}/test_model_retry.py                       | 3 +--
 tests/{ => models}/test_openai_chatcompletions.py            | 0
 tests/{ => models}/test_openai_chatcompletions_converter.py  | 0
 tests/{ => models}/test_openai_chatcompletions_stream.py     | 0
 tests/{ => models}/test_openai_client_utils.py               | 0
 tests/{ => models}/test_openai_responses.py                  | 0
 tests/{ => models}/test_openai_responses_converter.py        | 0
 tests/{ => models}/test_reasoning_content.py                 | 0
 .../test_remove_openai_responses_api_incompatible_fields.py  | 0
 tests/{ => models}/test_responses_websocket_session.py       | 0
 tests/{ => realtime}/test_session_exceptions.py              | 0
 tests/{test_sandbox_memory.py => sandbox/test_memory.py}     | 0
 .../test_runtime_agent_preparation.py}                       | 0
 31 files changed, 7 insertions(+), 9 deletions(-)
 rename tests/extensions/{test_sandbox_blaxel.py => sandbox/test_blaxel.py} (100%)
 rename tests/extensions/{test_sandbox_cloudflare.py => sandbox/test_cloudflare.py} (100%)
 rename tests/extensions/{test_sandbox_daytona.py => sandbox/test_daytona.py} (100%)
 rename tests/extensions/{test_sandbox_e2b.py => sandbox/test_e2b.py} (100%)
 rename tests/extensions/{test_sandbox_modal.py => sandbox/test_modal.py} (100%)
 rename tests/extensions/{test_sandbox_runloop.py => sandbox/test_runloop.py} (100%)
 rename tests/extensions/{ => sandbox}/test_runloop_capabilities_example.py (98%)
 rename tests/extensions/{test_sandbox_runloop_mounts.py => sandbox/test_runloop_mounts.py} (100%)
 rename tests/extensions/{test_sandbox_vercel.py => sandbox/test_vercel.py} (100%)
 rename tests/{ => memory}/test_openai_conversations_session.py (100%)
 rename tests/{ => memory}/test_session.py (99%)
 rename tests/{ => memory}/test_session_limit.py (99%)
 rename tests/{ => models}/test_anthropic_thinking_blocks.py (100%)
 rename tests/{ => models}/test_extended_thinking_message_order.py (100%)
 rename tests/{ => models}/test_gemini_thought_signatures.py (100%)
 rename tests/{ => models}/test_gemini_thought_signatures_stream.py (100%)
 rename tests/{ => models}/test_model_payload_iterators.py (100%)
 rename tests/{ => models}/test_model_retry.py (99%)
 rename tests/{ => models}/test_openai_chatcompletions.py (100%)
 rename tests/{ => models}/test_openai_chatcompletions_converter.py (100%)
 rename tests/{ => models}/test_openai_chatcompletions_stream.py (100%)
 rename tests/{ => models}/test_openai_client_utils.py (100%)
 rename tests/{ => models}/test_openai_responses.py (100%)
 rename tests/{ => models}/test_openai_responses_converter.py (100%)
 rename tests/{ => models}/test_reasoning_content.py (100%)
 rename tests/{ => models}/test_remove_openai_responses_api_incompatible_fields.py (100%)
 rename tests/{ => models}/test_responses_websocket_session.py (100%)
 rename tests/{ => realtime}/test_session_exceptions.py (100%)
 rename tests/{test_sandbox_memory.py => sandbox/test_memory.py} (100%)
 rename tests/{test_sandbox_runtime_agent_preparation.py => sandbox/test_runtime_agent_preparation.py} (100%)

diff --git a/tests/conftest.py b/tests/conftest.py
index 21a3f6d7b5..c279b6c9ef 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -20,12 +20,12 @@
         [
             "test_example_workflows.py",
             "test_run_state.py",
-            "test_sandbox_memory.py",
             "sandbox/capabilities/test_filesystem_capability.py",
             "sandbox/integration_tests/test_runner_pause_resume.py",
             "sandbox/test_client_options.py",
             "sandbox/test_exposed_ports.py",
             "sandbox/test_extract.py",
+            "sandbox/test_memory.py",
             "sandbox/test_runtime.py",
             "sandbox/test_session_manager.py",
             "sandbox/test_session_sinks.py",
diff --git a/tests/extensions/test_sandbox_blaxel.py b/tests/extensions/sandbox/test_blaxel.py
similarity index 100%
rename from tests/extensions/test_sandbox_blaxel.py
rename to tests/extensions/sandbox/test_blaxel.py
diff --git a/tests/extensions/test_sandbox_cloudflare.py b/tests/extensions/sandbox/test_cloudflare.py
similarity index 100%
rename from tests/extensions/test_sandbox_cloudflare.py
rename to tests/extensions/sandbox/test_cloudflare.py
diff --git a/tests/extensions/test_sandbox_daytona.py b/tests/extensions/sandbox/test_daytona.py
similarity index 100%
rename from tests/extensions/test_sandbox_daytona.py
rename to tests/extensions/sandbox/test_daytona.py
diff --git a/tests/extensions/test_sandbox_e2b.py b/tests/extensions/sandbox/test_e2b.py
similarity index 100%
rename from tests/extensions/test_sandbox_e2b.py
rename to tests/extensions/sandbox/test_e2b.py
diff --git a/tests/extensions/test_sandbox_modal.py b/tests/extensions/sandbox/test_modal.py
similarity index 100%
rename from tests/extensions/test_sandbox_modal.py
rename to tests/extensions/sandbox/test_modal.py
diff --git a/tests/extensions/test_sandbox_runloop.py b/tests/extensions/sandbox/test_runloop.py
similarity index 100%
rename from tests/extensions/test_sandbox_runloop.py
rename to tests/extensions/sandbox/test_runloop.py
diff --git a/tests/extensions/test_runloop_capabilities_example.py b/tests/extensions/sandbox/test_runloop_capabilities_example.py
similarity index 98%
rename from tests/extensions/test_runloop_capabilities_example.py
rename to tests/extensions/sandbox/test_runloop_capabilities_example.py
index fafacb521f..c87a3ffcc0 100644
--- a/tests/extensions/test_runloop_capabilities_example.py
+++ b/tests/extensions/sandbox/test_runloop_capabilities_example.py
@@ -11,14 +11,14 @@
 
 def _load_example_module() -> Any:
     path = (
-        Path(__file__).resolve().parents[2]
+        Path(__file__).resolve().parents[3]
         / "examples"
         / "sandbox"
         / "extensions"
         / "runloop"
         / "capabilities.py"
     )
-    module_name = "tests.extensions.runloop_capabilities_example"
+    module_name = "tests.extensions.sandbox.runloop_capabilities_example"
     spec = importlib.util.spec_from_file_location(module_name, path)
     assert spec is not None
     assert spec.loader is not None
diff --git a/tests/extensions/test_sandbox_runloop_mounts.py b/tests/extensions/sandbox/test_runloop_mounts.py
similarity index 100%
rename from tests/extensions/test_sandbox_runloop_mounts.py
rename to tests/extensions/sandbox/test_runloop_mounts.py
diff --git a/tests/extensions/test_sandbox_vercel.py b/tests/extensions/sandbox/test_vercel.py
similarity index 100%
rename from tests/extensions/test_sandbox_vercel.py
rename to tests/extensions/sandbox/test_vercel.py
diff --git a/tests/test_openai_conversations_session.py b/tests/memory/test_openai_conversations_session.py
similarity index 100%
rename from tests/test_openai_conversations_session.py
rename to tests/memory/test_openai_conversations_session.py
diff --git a/tests/test_session.py b/tests/memory/test_session.py
similarity index 99%
rename from tests/test_session.py
rename to tests/memory/test_session.py
index aa8211500a..27b5c6fa7b 100644
--- a/tests/test_session.py
+++ b/tests/memory/test_session.py
@@ -8,9 +8,8 @@
 import pytest
 
 from agents import Agent, RunConfig, Runner, SQLiteSession, TResponseInputItem
-
-from .fake_model import FakeModel
-from .test_responses import get_text_message
+from tests.fake_model import FakeModel
+from tests.test_responses import get_text_message
 
 
 # Helper functions for parametrized testing of different Runner methods
diff --git a/tests/test_session_limit.py b/tests/memory/test_session_limit.py
similarity index 99%
rename from tests/test_session_limit.py
rename to tests/memory/test_session_limit.py
index f8625f05c5..5b908ee967 100644
--- a/tests/test_session_limit.py
+++ b/tests/memory/test_session_limit.py
@@ -8,8 +8,8 @@
 from agents import Agent, RunConfig, SQLiteSession
 from agents.memory import SessionSettings
 from tests.fake_model import FakeModel
+from tests.memory.test_session import run_agent_async
 from tests.test_responses import get_text_message
-from tests.test_session import run_agent_async
 
 
 @pytest.mark.parametrize("runner_method", ["run", "run_sync", "run_streamed"])
diff --git a/tests/test_anthropic_thinking_blocks.py b/tests/models/test_anthropic_thinking_blocks.py
similarity index 100%
rename from tests/test_anthropic_thinking_blocks.py
rename to tests/models/test_anthropic_thinking_blocks.py
diff --git a/tests/test_extended_thinking_message_order.py b/tests/models/test_extended_thinking_message_order.py
similarity index 100%
rename from tests/test_extended_thinking_message_order.py
rename to tests/models/test_extended_thinking_message_order.py
diff --git a/tests/test_gemini_thought_signatures.py b/tests/models/test_gemini_thought_signatures.py
similarity index 100%
rename from tests/test_gemini_thought_signatures.py
rename to tests/models/test_gemini_thought_signatures.py
diff --git a/tests/test_gemini_thought_signatures_stream.py b/tests/models/test_gemini_thought_signatures_stream.py
similarity index 100%
rename from tests/test_gemini_thought_signatures_stream.py
rename to tests/models/test_gemini_thought_signatures_stream.py
diff --git a/tests/test_model_payload_iterators.py b/tests/models/test_model_payload_iterators.py
similarity index 100%
rename from tests/test_model_payload_iterators.py
rename to tests/models/test_model_payload_iterators.py
diff --git a/tests/test_model_retry.py b/tests/models/test_model_retry.py
similarity index 99%
rename from tests/test_model_retry.py
rename to tests/models/test_model_retry.py
index 98b87fbea0..5a99efd282 100644
--- a/tests/test_model_retry.py
+++ b/tests/models/test_model_retry.py
@@ -25,8 +25,7 @@
 )
 from agents.run_internal.model_retry import get_response_with_retry, stream_response_with_retry
 from agents.usage import Usage
-
-from .test_responses import get_text_message
+from tests.test_responses import get_text_message
 
 
 def _connection_error(message: str = "connection error") -> APIConnectionError:
diff --git a/tests/test_openai_chatcompletions.py b/tests/models/test_openai_chatcompletions.py
similarity index 100%
rename from tests/test_openai_chatcompletions.py
rename to tests/models/test_openai_chatcompletions.py
diff --git a/tests/test_openai_chatcompletions_converter.py b/tests/models/test_openai_chatcompletions_converter.py
similarity index 100%
rename from tests/test_openai_chatcompletions_converter.py
rename to tests/models/test_openai_chatcompletions_converter.py
diff --git a/tests/test_openai_chatcompletions_stream.py b/tests/models/test_openai_chatcompletions_stream.py
similarity index 100%
rename from tests/test_openai_chatcompletions_stream.py
rename to tests/models/test_openai_chatcompletions_stream.py
diff --git a/tests/test_openai_client_utils.py b/tests/models/test_openai_client_utils.py
similarity index 100%
rename from tests/test_openai_client_utils.py
rename to tests/models/test_openai_client_utils.py
diff --git a/tests/test_openai_responses.py b/tests/models/test_openai_responses.py
similarity index 100%
rename from tests/test_openai_responses.py
rename to tests/models/test_openai_responses.py
diff --git a/tests/test_openai_responses_converter.py b/tests/models/test_openai_responses_converter.py
similarity index 100%
rename from tests/test_openai_responses_converter.py
rename to tests/models/test_openai_responses_converter.py
diff --git a/tests/test_reasoning_content.py b/tests/models/test_reasoning_content.py
similarity index 100%
rename from tests/test_reasoning_content.py
rename to tests/models/test_reasoning_content.py
diff --git a/tests/test_remove_openai_responses_api_incompatible_fields.py b/tests/models/test_remove_openai_responses_api_incompatible_fields.py
similarity index 100%
rename from tests/test_remove_openai_responses_api_incompatible_fields.py
rename to tests/models/test_remove_openai_responses_api_incompatible_fields.py
diff --git a/tests/test_responses_websocket_session.py b/tests/models/test_responses_websocket_session.py
similarity index 100%
rename from tests/test_responses_websocket_session.py
rename to tests/models/test_responses_websocket_session.py
diff --git a/tests/test_session_exceptions.py b/tests/realtime/test_session_exceptions.py
similarity index 100%
rename from tests/test_session_exceptions.py
rename to tests/realtime/test_session_exceptions.py
diff --git a/tests/test_sandbox_memory.py b/tests/sandbox/test_memory.py
similarity index 100%
rename from tests/test_sandbox_memory.py
rename to tests/sandbox/test_memory.py
diff --git a/tests/test_sandbox_runtime_agent_preparation.py b/tests/sandbox/test_runtime_agent_preparation.py
similarity index 100%
rename from tests/test_sandbox_runtime_agent_preparation.py
rename to tests/sandbox/test_runtime_agent_preparation.py

From 6fbdacee6b3d1152a29ced299b1f8a16bb410643 Mon Sep 17 00:00:00 2001
From: Kazuhiro Sera <seratch@openai.com>
Date: Sat, 2 May 2026 15:09:25 +0900
Subject: [PATCH 3/3] fix test failures on Windows

---
 tests/sandbox/test_workspace_payloads.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/sandbox/test_workspace_payloads.py b/tests/sandbox/test_workspace_payloads.py
index 3a5b8d2b2c..5084da6ff2 100644
--- a/tests/sandbox/test_workspace_payloads.py
+++ b/tests/sandbox/test_workspace_payloads.py
@@ -89,8 +89,9 @@ def test_coerce_write_payload_supports_readinto_seek_and_tell() -> None:
 
 
 def test_coerce_write_payload_rejects_text_chunks() -> None:
+    path = Path("/workspace/file.txt")
     payload = coerce_write_payload(
-        path=Path("/workspace/file.txt"),
+        path=path,
         data=cast(io.IOBase, _TextReadStream()),
     )
 
@@ -99,7 +100,7 @@ def test_coerce_write_payload_rejects_text_chunks() -> None:
 
     assert exc_info.value.error_code is ErrorCode.WORKSPACE_WRITE_TYPE_ERROR
     assert exc_info.value.context == {
-        "path": "/workspace/file.txt",
+        "path": str(path),
         "actual_type": "str",
     }