diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py
index 1969ec75..8e3af61d 100644
--- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py
+++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py
@@ -1,8 +1,8 @@
-from typing import Any
+from typing import Any, Dict, Optional

 from ldai import log
-from ldai.providers import AgentResult, AgentRunner
-from ldai.providers.types import LDAIMetrics
+from ldai.providers.runner import Runner
+from ldai.providers.types import LDAIMetrics, RunnerResult

 from ldai_langchain.langchain_helper import (
     extract_last_message_content,
@@ -10,25 +10,32 @@
 )


-class LangChainAgentRunner(AgentRunner):
+class LangChainAgentRunner(Runner):
     """
     CAUTION: This feature is experimental and should NOT be considered ready for production use.
     It may change or be removed without notice and is not subject to backwards compatibility guarantees.

-    AgentRunner implementation for LangChain.
+    Runner implementation for LangChain agents.

     Wraps a compiled LangChain agent graph (from ``langchain.agents.create_agent``)
     and delegates execution to it. Tool calling and loop management are handled
     internally by the graph.

     Returned by LangChainRunnerFactory.create_agent(config, tools).
+
+    Implements the unified :class:`~ldai.providers.runner.Runner` protocol via
+    :meth:`run`.
     """

     def __init__(self, agent: Any):
         self._agent = agent

-    async def run(self, input: Any) -> AgentResult:
+    async def run(
+        self,
+        input: Any,
+        output_type: Optional[Dict[str, Any]] = None,
+    ) -> RunnerResult:
         """
         Run the agent with the given input string.

@@ -36,7 +43,10 @@
         the tool-calling loop internally.

         :param input: The user prompt or input to the agent
-        :return: AgentResult with output, raw response, and aggregated metrics
+        :param output_type: Reserved for future structured output support;
+            currently ignored.
+        :return: :class:`RunnerResult` with ``content``, ``raw`` response, and
+            aggregated metrics.
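+
+        Example (illustrative sketch only; ``agent_graph`` stands in for the
+        compiled graph you get from ``langchain.agents.create_agent``)::
+
+            runner = LangChainAgentRunner(agent_graph)
+            result = await runner.run("What is our refund policy?")
+            if result.metrics.success:
+                print(result.content)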
""" try: result = await self._agent.ainvoke({ @@ -44,19 +54,18 @@ async def run(self, input: Any) -> AgentResult: }) messages = result.get("messages", []) output = extract_last_message_content(messages) - return AgentResult( - output=output, - raw=result, + return RunnerResult( + content=output, metrics=LDAIMetrics( success=True, usage=sum_token_usage_from_messages(messages), ), + raw=result, ) except Exception as error: log.warning(f"LangChain agent run failed: {error}") - return AgentResult( - output="", - raw=None, + return RunnerResult( + content="", metrics=LDAIMetrics(success=False, usage=None), ) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py index d504030b..213c072d 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py @@ -1,10 +1,10 @@ -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import BaseMessage from ldai import LDMessage, log -from ldai.providers.model_runner import ModelRunner -from ldai.providers.types import LDAIMetrics, ModelResponse, StructuredResponse +from ldai.providers.runner import Runner +from ldai.providers.types import LDAIMetrics, RunnerResult from ldai_langchain.langchain_helper import ( convert_messages_to_langchain, @@ -13,12 +13,15 @@ ) -class LangChainModelRunner(ModelRunner): +class LangChainModelRunner(Runner): """ - ModelRunner implementation for LangChain. + Runner implementation for LangChain chat models. Holds a fully-configured BaseChatModel. - Returned by LangChainConnector.create_model(config). + Returned by LangChainRunnerFactory.create_model(config). + + Implements the unified :class:`~ldai.providers.runner.Runner` protocol via + :meth:`run`. """ def __init__(self, llm: BaseChatModel): @@ -32,13 +35,38 @@ def get_llm(self) -> BaseChatModel: """ return self._llm - async def invoke_model(self, messages: List[LDMessage]) -> ModelResponse: + async def run( + self, + input: Any, + output_type: Optional[Dict[str, Any]] = None, + ) -> RunnerResult: """ - Invoke the LangChain model with an array of messages. - - :param messages: Array of LDMessage objects representing the conversation - :return: ModelResponse containing the model's response and metrics + Run the LangChain model with the given input. + + :param input: A string prompt or a list of :class:`LDMessage` objects + :param output_type: Optional JSON schema dict requesting structured output. + When provided, ``parsed`` on the returned :class:`RunnerResult` is + populated with the parsed JSON document. + :return: :class:`RunnerResult` containing ``content``, ``metrics``, + ``raw`` and (when ``output_type`` is set) ``parsed``. 
""" + messages = self._coerce_input(input) + + if output_type is not None: + return await self._run_structured(messages, output_type) + return await self._run_completion(messages) + + @staticmethod + def _coerce_input(input: Any) -> List[LDMessage]: + if isinstance(input, str): + return [LDMessage(role='user', content=input)] + if isinstance(input, list): + return input + raise TypeError( + f"Unsupported input type for LangChainModelRunner.run: {type(input).__name__}" + ) + + async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult: try: langchain_messages = convert_messages_to_langchain(messages) response: BaseMessage = await self._llm.ainvoke(langchain_messages) @@ -52,58 +80,63 @@ async def invoke_model(self, messages: List[LDMessage]) -> ModelResponse: f'Multimodal response not supported, expecting a string. ' f'Content type: {type(response.content)}, Content: {response.content}' ) - metrics = LDAIMetrics(success=False, usage=metrics.usage) + return RunnerResult( + content='', + metrics=LDAIMetrics(success=False, usage=metrics.usage), + raw=response, + ) - return ModelResponse( - message=LDMessage(role='assistant', content=content), - metrics=metrics, - ) + return RunnerResult(content=content, metrics=metrics, raw=response) except Exception as error: log.warning(f'LangChain model invocation failed: {error}') - return ModelResponse( - message=LDMessage(role='assistant', content=''), + return RunnerResult( + content='', metrics=LDAIMetrics(success=False, usage=None), ) - async def invoke_structured_model( + async def _run_structured( self, messages: List[LDMessage], - response_structure: Dict[str, Any], - ) -> StructuredResponse: - """ - Invoke the LangChain model with structured output support. - - :param messages: Array of LDMessage objects representing the conversation - :param response_structure: Dictionary defining the output structure - :return: StructuredResponse containing the structured data - """ - structured_response = StructuredResponse( - data={}, - raw_response='', - metrics=LDAIMetrics(success=False, usage=None), - ) + output_type: Dict[str, Any], + ) -> RunnerResult: try: langchain_messages = convert_messages_to_langchain(messages) - structured_llm = self._llm.with_structured_output(response_structure, include_raw=True) + structured_llm = self._llm.with_structured_output(output_type, include_raw=True) response = await structured_llm.ainvoke(langchain_messages) if not isinstance(response, dict): log.warning(f'Structured output did not return a dict. 
-                return structured_response
+                return RunnerResult(
+                    content='',
+                    metrics=LDAIMetrics(success=False, usage=None),
+                )

             raw_response = response.get('raw')
+            usage = None
+            raw_content = ''
             if raw_response is not None:
                 if hasattr(raw_response, 'content'):
-                    structured_response.raw_response = raw_response.content
-                    structured_response.metrics.usage = get_ai_usage_from_response(raw_response)
+                    raw_content = raw_response.content or ''
+                    usage = get_ai_usage_from_response(raw_response)

             if response.get('parsing_error'):
                 log.warning('LangChain structured model invocation had a parsing error')
-                return structured_response
+                return RunnerResult(
+                    content=raw_content,
+                    metrics=LDAIMetrics(success=False, usage=usage),
+                    raw=raw_response,
+                )

-            structured_response.metrics.success = True
-            structured_response.data = response.get('parsed') or {}
-            return structured_response
+            parsed = response.get('parsed') or {}
+            return RunnerResult(
+                content=raw_content,
+                metrics=LDAIMetrics(success=True, usage=usage),
+                raw=raw_response,
+                parsed=parsed,
+            )
         except Exception as error:
             log.warning(f'LangChain structured model invocation failed: {error}')
-            return structured_response
+            return RunnerResult(
+                content='',
+                metrics=LDAIMetrics(success=False, usage=None),
+            )
diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py
index 9ecb2351..15eee41f 100644
--- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py
+++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py
@@ -329,8 +329,10 @@ async def run(self, input: Any) -> AgentGraphResult:
         messages = result.get('messages', [])
         output = extract_last_message_content(messages)

-        # Flush per-node metrics to LD trackers
-        all_eval_results = await handler.flush(self._graph, pending_eval_tasks)
+        # Flush per-node metrics to LD trackers; eval results are tracked
+        # internally and intentionally not exposed on AgentGraphResult here;
+        # judge dispatch is the managed layer's responsibility.
+        await handler.flush(self._graph, pending_eval_tasks)

         tracker.track_path(handler.path)
         tracker.track_duration(duration)
@@ -341,7 +343,6 @@
                 output=output,
                 raw=result,
                 metrics=LDAIMetrics(success=True),
-                evaluations=all_eval_results,
             )

         except Exception as exc:
diff --git a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
index 4018e7c3..a8fc46cf 100644
--- a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
+++ b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
@@ -219,8 +219,8 @@ def test_returns_provider_name_unchanged_for_unmapped_providers(self):
         assert map_provider('unknown') == 'unknown'


-class TestInvokeModel:
-    """Tests for invoke_model instance method."""
+class TestRunCompletion:
+    """Tests for run() without structured output."""

     @pytest.fixture
     def mock_llm(self):
@@ -235,10 +235,10 @@ async def test_returns_success_true_for_string_content(self, mock_llm):
         provider = LangChainModelRunner(mock_llm)
         messages = [LDMessage(role='user', content='Hello')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

         assert result.metrics.success is True
-        assert result.message.content == 'Test response'
+        assert result.content == 'Test response'

     @pytest.mark.asyncio
     async def test_returns_success_false_for_non_string_content_and_logs_warning(self, mock_llm):
@@ -248,10 +248,10 @@ async def test_returns_success_false_for_non_string_content_and_logs_warning(sel
         provider = LangChainModelRunner(mock_llm)
         messages = [LDMessage(role='user', content='Hello')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

         assert result.metrics.success is False
-        assert result.message.content == ''
+        assert result.content == ''

     @pytest.mark.asyncio
     async def test_returns_success_false_when_model_invocation_throws_error(self, mock_llm):
@@ -261,15 +261,14 @@ async def test_returns_success_false_when_model_invocation_throws_error(self, mo
         provider = LangChainModelRunner(mock_llm)
         messages = [LDMessage(role='user', content='Hello')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

         assert result.metrics.success is False
-        assert result.message.content == ''
-        assert result.message.role == 'assistant'


-class TestInvokeStructuredModel:
-    """Tests for invoke_structured_model instance method."""
+class TestRunStructured:
+    """Tests for run() with structured output."""

     @pytest.fixture
     def mock_llm(self):
@@ -288,10 +287,10 @@ async def test_returns_success_true_for_successful_invocation(self, mock_llm):
         messages = [LDMessage(role='user', content='Hello')]
         response_structure = {'type': 'object', 'properties': {}}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

         assert result.metrics.success is True
-        assert result.data == parsed_data
+        assert result.parsed == parsed_data

     @pytest.mark.asyncio
     async def test_returns_success_false_when_structured_model_invocation_throws_error(self, mock_llm):
@@ -304,11 +303,11 @@ async def test_returns_success_false_when_structured_model_invocation_throws_err
         messages = [LDMessage(role='user', content='Hello')]
         response_structure = {'type': 'object', 'properties': {}}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

         assert result.metrics.success is False
-        assert result.data == {}
-        assert result.raw_response == ''
+        assert result.parsed is None
+        assert result.raw is None
         assert result.metrics.usage is None


@@ -464,7 +463,7 @@ class TestLangChainAgentRunner:
     @pytest.mark.asyncio
     async def test_runs_agent_and_returns_result(self):
-        """Should return AgentResult with the last message content from the graph."""
+        """Should return RunnerResult with the last message content from the graph."""
         from ldai_langchain import LangChainAgentRunner

         final_msg = AIMessage(content="The answer is 42.")
@@ -474,7 +473,7 @@
         runner = LangChainAgentRunner(mock_agent)
         result = await runner.run("What is the answer?")

-        assert result.output == "The answer is 42."
+        assert result.content == "The answer is 42."
         assert result.metrics.success is True
         mock_agent.ainvoke.assert_called_once_with(
             {"messages": [{"role": "user", "content": "What is the answer?"}]}
@@ -496,7 +495,7 @@
         runner = LangChainAgentRunner(mock_agent)
         result = await runner.run("Hello")

-        assert result.output == "final answer"
+        assert result.content == "final answer"
         assert result.metrics.success is True
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 30
@@ -505,7 +504,7 @@
     @pytest.mark.asyncio
     async def test_returns_failure_when_exception_thrown(self):
-        """Should return unsuccessful AgentResult when exception is thrown."""
+        """Should return unsuccessful RunnerResult when exception is thrown."""
         from ldai_langchain import LangChainAgentRunner

         mock_agent = MagicMock()
@@ -514,7 +513,7 @@
         runner = LangChainAgentRunner(mock_agent)
         result = await runner.run("Hello")

-        assert result.output == ""
+        assert result.content == ""
         assert result.metrics.success is False

diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py
index 7e79c836..4f6866cc 100644
--- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py
+++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py
@@ -1,28 +1,31 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from ldai import log
-from ldai.providers import AgentResult, AgentRunner, ToolRegistry
+from ldai.providers import RunnerResult, ToolRegistry
+from ldai.providers.runner import Runner
 from ldai.providers.types import LDAIMetrics
 from ldai_openai.openai_helper import (
     get_ai_usage_from_response,
+    get_tool_calls_from_run_items,
     registry_value_to_agent_tool,
 )


-class OpenAIAgentRunner(AgentRunner):
+class OpenAIAgentRunner(Runner):
     """
     CAUTION: This feature is experimental and should NOT be considered ready for production use.
     It may change or be removed without notice and is not subject to backwards compatibility guarantees.

-    AgentRunner implementation for OpenAI.
+    Runner implementation for a single OpenAI agent.

     Executes a single agent using the OpenAI Agents SDK (``openai-agents``).
     Tool calling and the agentic loop are handled internally by ``Runner.run``.

-    Returned by OpenAIRunnerFactory.create_agent(config, tools).
+    Returned by ``OpenAIRunnerFactory.create_agent(config, tools)``.
+
+    Implements the unified :class:`~ldai.providers.runner.Runner` protocol.

     Requires ``openai-agents`` to be installed.
     """
@@ -40,15 +43,22 @@ def __init__(
         self._tool_definitions = tool_definitions
         self._tools = tools

-    async def run(self, input: Any) -> AgentResult:
+    async def run(
+        self,
+        input: Any,
+        output_type: Optional[Dict[str, Any]] = None,
+    ) -> RunnerResult:
         """
-        Run the agent with the given input string.
+        Run the agent with the given input.

         Delegates to the OpenAI Agents SDK ``Runner.run``, which handles
         the tool-calling loop internally.

         :param input: The user prompt or input to the agent
-        :return: AgentResult with output, raw response, and aggregated metrics
+        :param output_type: Reserved for future structured output support;
+            currently ignored.
+        :return: :class:`RunnerResult` with ``content``, ``raw`` response, and
+            metrics including aggregated token usage and observed ``tool_calls``.
         """
         try:
             from agents import Agent, Runner
@@ -57,7 +67,10 @@
                 "openai-agents is required for OpenAIAgentRunner. "
                 "Install it with: pip install openai-agents"
             )
-            return AgentResult(output="", raw=None, metrics=LDAIMetrics(success=False, usage=None))
+            return RunnerResult(
+                content="",
+                metrics=LDAIMetrics(success=False, usage=None),
+            )

         try:
             agent_tools = self._build_agent_tools()
@@ -73,17 +86,26 @@
             result = await Runner.run(agent, str(input), max_turns=25)

-            return AgentResult(
-                output=str(result.final_output),
-                raw=result,
+            tool_calls = [
+                tool_name
+                for _agent_name, tool_name in get_tool_calls_from_run_items(result.new_items)
+            ]
+
+            return RunnerResult(
+                content=str(result.final_output),
                 metrics=LDAIMetrics(
                     success=True,
                     usage=get_ai_usage_from_response(result),
+                    tool_calls=tool_calls if tool_calls else None,
                 ),
+                raw=result,
             )
         except Exception as error:
             log.warning(f"OpenAI agent run failed: {error}")
-            return AgentResult(output="", raw=None, metrics=LDAIMetrics(success=False, usage=None))
+            return RunnerResult(
+                content="",
+                metrics=LDAIMetrics(success=False, usage=None),
+            )

     def _build_agent_tools(self) -> List[Any]:
         """Build tool instances from LD tool definitions and registry."""
diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py
index 9c4a34d8..75695ba0 100644
--- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py
+++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py
@@ -1,9 +1,9 @@
 import json
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from ldai import LDMessage, log
-from ldai.providers.model_runner import ModelRunner
-from ldai.providers.types import LDAIMetrics, ModelResponse, StructuredResponse
+from ldai.providers.runner import Runner
+from ldai.providers.types import LDAIMetrics, RunnerResult
 from openai import AsyncOpenAI

 from ldai_openai.openai_helper import (
@@ -12,12 +12,15 @@
 )


-class OpenAIModelRunner(ModelRunner):
+class OpenAIModelRunner(Runner):
     """
-    ModelRunner implementation for OpenAI.
+    Runner implementation for OpenAI chat completions.

     Holds a fully-configured AsyncOpenAI client, model name, and parameters.
-    Returned by OpenAIConnector.create_model(config).
+    Returned by ``OpenAIRunnerFactory.create_model(config)``.
+
+    Implements the unified :class:`~ldai.providers.runner.Runner` protocol via
+    :meth:`run`.
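+
+    Example (illustrative sketch only; ``client`` stands in for a configured
+    ``AsyncOpenAI`` instance, and the model name is just a placeholder)::
+
+        runner = OpenAIModelRunner(client, 'gpt-3.5-turbo', {})
+        result = await runner.run('Hello!')
+        print(result.content, result.metrics.success)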
""" def __init__( @@ -30,13 +33,38 @@ def __init__( self._model_name = model_name self._parameters = parameters - async def invoke_model(self, messages: List[LDMessage]) -> ModelResponse: + async def run( + self, + input: Any, + output_type: Optional[Dict[str, Any]] = None, + ) -> RunnerResult: """ - Invoke the OpenAI model with an array of messages. - - :param messages: Array of LDMessage objects representing the conversation - :return: ModelResponse containing the model's response and metrics + Run the OpenAI model with the given input. + + :param input: A string prompt or a list of :class:`LDMessage` objects + :param output_type: Optional JSON schema dict requesting structured output. + When provided, ``parsed`` on the returned :class:`RunnerResult` is + populated with the parsed JSON document. + :return: :class:`RunnerResult` containing ``content``, ``metrics``, + ``raw`` and (when ``output_type`` is set) ``parsed``. """ + messages = self._coerce_input(input) + + if output_type is not None: + return await self._run_structured(messages, output_type) + return await self._run_completion(messages) + + @staticmethod + def _coerce_input(input: Any) -> List[LDMessage]: + if isinstance(input, str): + return [LDMessage(role='user', content=input)] + if isinstance(input, list): + return input + raise TypeError( + f"Unsupported input type for OpenAIModelRunner.run: {type(input).__name__}" + ) + + async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult: try: response = await self._client.chat.completions.create( model=self._model_name, @@ -45,40 +73,29 @@ async def invoke_model(self, messages: List[LDMessage]) -> ModelResponse: ) metrics = get_ai_metrics_from_response(response) - - content = '' - if response.choices and len(response.choices) > 0: - message = response.choices[0].message - if message and message.content: - content = message.content + content = self._extract_content(response) if not content: log.warning('OpenAI response has no content available') - metrics = LDAIMetrics(success=False, usage=metrics.usage) + return RunnerResult( + content='', + metrics=LDAIMetrics(success=False, usage=metrics.usage), + raw=response, + ) - return ModelResponse( - message=LDMessage(role='assistant', content=content), - metrics=metrics, - ) + return RunnerResult(content=content, metrics=metrics, raw=response) except Exception as error: log.warning(f'OpenAI model invocation failed: {error}') - return ModelResponse( - message=LDMessage(role='assistant', content=''), + return RunnerResult( + content='', metrics=LDAIMetrics(success=False, usage=None), ) - async def invoke_structured_model( + async def _run_structured( self, messages: List[LDMessage], - response_structure: Dict[str, Any], - ) -> StructuredResponse: - """ - Invoke the OpenAI model with structured output support. 
-
-        :param messages: Array of LDMessage objects representing the conversation
-        :param response_structure: Dictionary defining the JSON schema for output structure
-        :return: StructuredResponse containing the structured data
-        """
+        output_type: Dict[str, Any],
+    ) -> RunnerResult:
         try:
             response = await self._client.chat.completions.create(
                 model=self._model_name,
@@ -87,7 +104,7 @@
                     'type': 'json_schema',
                     'json_schema': {
                         'name': 'structured_output',
-                        'schema': response_structure,
+                        'schema': output_type,
                         'strict': True,
                     },
                 },
@@ -95,35 +112,42 @@
             )

             metrics = get_ai_metrics_from_response(response)
-
-            content = ''
-            if response.choices and len(response.choices) > 0:
-                message = response.choices[0].message
-                if message and message.content:
-                    content = message.content
+            content = self._extract_content(response)

             if not content:
                 log.warning('OpenAI structured response has no content available')
-                return StructuredResponse(
-                    data={},
-                    raw_response='',
+                return RunnerResult(
+                    content='',
                     metrics=LDAIMetrics(success=False, usage=metrics.usage),
+                    raw=response,
                 )

             try:
-                data = json.loads(content)
-                return StructuredResponse(data=data, raw_response=content, metrics=metrics)
+                parsed = json.loads(content)
+                return RunnerResult(
+                    content=content,
+                    metrics=metrics,
+                    raw=response,
+                    parsed=parsed,
+                )
             except json.JSONDecodeError as parse_error:
                 log.warning(f'OpenAI structured response contains invalid JSON: {parse_error}')
-                return StructuredResponse(
-                    data={},
-                    raw_response=content,
+                return RunnerResult(
+                    content=content,
                     metrics=LDAIMetrics(success=False, usage=metrics.usage),
+                    raw=response,
                 )
         except Exception as error:
             log.warning(f'OpenAI structured model invocation failed: {error}')
-            return StructuredResponse(
-                data={},
-                raw_response='',
+            return RunnerResult(
+                content='',
                 metrics=LDAIMetrics(success=False, usage=None),
             )
+
+    @staticmethod
+    def _extract_content(response: Any) -> str:
+        if response.choices and len(response.choices) > 0:
+            message = response.choices[0].message
+            if message and message.content:
+                return message.content
+        return ''
diff --git a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py
index 19d2cff7..3b69d3f6 100644
--- a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py
+++ b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py
@@ -120,8 +120,8 @@ def test_handles_partial_usage_data(self):
         assert result.usage.output == 0


-class TestInvokeModel:
-    """Tests for invoke_model instance method."""
+class TestRunCompletion:
+    """Tests for the unified run() method (chat-completion path)."""

     @pytest.fixture
     def mock_client(self):
@@ -144,15 +144,14 @@ async def test_invokes_openai_chat_completions_and_returns_response(self, mock_c
         provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {})
         messages = [LDMessage(role='user', content='Hello!')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

         mock_client.chat.completions.create.assert_called_once_with(
             model='gpt-3.5-turbo',
             messages=[{'role': 'user', 'content': 'Hello!'}],
         )

-        assert result.message.role == 'assistant'
-        assert result.message.content == 'Hello! How can I help you today?'
+        assert result.content == 'Hello! How can I help you today?'
         assert result.metrics.success is True
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 25

@@ -174,10 +173,9 @@ async def test_returns_unsuccessful_response_when_no_content(self, mock_client):
         provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {})
         messages = [LDMessage(role='user', content='Hello!')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

-        assert result.message.role == 'assistant'
-        assert result.message.content == ''
+        assert result.content == ''
         assert result.metrics.success is False

     @pytest.mark.asyncio
@@ -193,10 +191,9 @@ async def test_returns_unsuccessful_response_when_choices_empty(self, mock_clien
         provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {})
         messages = [LDMessage(role='user', content='Hello!')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

-        assert result.message.role == 'assistant'
-        assert result.message.content == ''
+        assert result.content == ''
         assert result.metrics.success is False

     @pytest.mark.asyncio
@@ -208,15 +205,14 @@ async def test_returns_unsuccessful_response_when_exception_thrown(self, mock_cl
         provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {})
         messages = [LDMessage(role='user', content='Hello!')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

-        assert result.message.role == 'assistant'
-        assert result.message.content == ''
+        assert result.content == ''
         assert result.metrics.success is False


-class TestInvokeStructuredModel:
-    """Tests for invoke_structured_model instance method."""
+class TestRunStructured:
+    """Tests for the unified run() method (structured-output path)."""

     @pytest.fixture
     def mock_client(self):
@@ -249,10 +245,10 @@ async def test_invokes_openai_with_structured_output(self, mock_client):
             'required': ['name', 'age', 'city'],
         }

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

-        assert result.data == {'name': 'John', 'age': 30, 'city': 'New York'}
-        assert result.raw_response == '{"name": "John", "age": 30, "city": "New York"}'
+        assert result.parsed == {'name': 'John', 'age': 30, 'city': 'New York'}
+        assert result.content == '{"name": "John", "age": 30, "city": "New York"}'
         assert result.metrics.success is True
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 30
@@ -276,10 +272,10 @@ async def test_returns_unsuccessful_when_no_content_in_structured_response(self,
         messages = [LDMessage(role='user', content='Tell me about a person')]
         response_structure = {'type': 'object'}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

-        assert result.data == {}
-        assert result.raw_response == ''
+        assert result.parsed is None
+        assert result.content == ''
         assert result.metrics.success is False

     @pytest.mark.asyncio
@@ -300,10 +296,10 @@ async def test_handles_json_parsing_errors(self, mock_client):
         messages = [LDMessage(role='user', content='Tell me about a person')]
         response_structure = {'type': 'object'}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

-        assert result.data == {}
-        assert result.raw_response == 'invalid json content'
+        assert result.parsed is None
+        assert result.content == 'invalid json content'
         assert result.metrics.success is False
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 15
@@ -319,10 +315,10 @@ async def test_returns_unsuccessful_response_when_exception_thrown(self, mock_cl
         messages = [LDMessage(role='user', content='Tell me about a person')]
         response_structure = {'type': 'object'}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

-        assert result.data == {}
-        assert result.raw_response == ''
+        assert result.parsed is None
+        assert result.content == ''
         assert result.metrics.success is False


@@ -465,19 +461,20 @@ def _make_run_result(self, output: str, total: int = 15, input_tokens: int = 10,
     @pytest.mark.asyncio
     async def test_runs_agent_and_returns_result_with_no_tool_calls(self):
-        """Should return AgentResult when Runner.run returns a final output."""
+        """Should return RunnerResult when Runner.run returns a final output."""
         import sys

         from ldai_openai import OpenAIAgentRunner

         mock_run_result = self._make_run_result("The answer is 42.", total=15, input_tokens=10, output_tokens=5)
+        mock_run_result.new_items = []
         agents_mock, tc_mock = _make_agents_mock(AsyncMock(return_value=mock_run_result))

         runner = OpenAIAgentRunner('gpt-4', {}, 'You are helpful.', [], {})
         with patch.dict(sys.modules, {'agents': agents_mock, 'agents.tool_context': tc_mock}):
             result = await runner.run("What is the answer?")

-        assert result.output == "The answer is 42."
+        assert result.content == "The answer is 42."
         assert result.metrics.success is True
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 15
@@ -490,6 +487,7 @@ async def test_executes_tool_calls_and_returns_final_response(self):
         from ldai_openai import OpenAIAgentRunner

         mock_run_result = self._make_run_result("It is sunny in Paris.", total=43, input_tokens=30, output_tokens=13)
+        mock_run_result.new_items = []
         agents_mock, tc_mock = _make_agents_mock(AsyncMock(return_value=mock_run_result))

         weather_fn = MagicMock(return_value="Sunny, 25°C")
@@ -501,13 +499,13 @@
         with patch.dict(sys.modules, {'agents': agents_mock, 'agents.tool_context': tc_mock}):
             result = await runner.run("What is the weather in Paris?")

-        assert result.output == "It is sunny in Paris."
+        assert result.content == "It is sunny in Paris."
         assert result.metrics.success is True
         assert result.metrics.usage.total == 43

     @pytest.mark.asyncio
     async def test_returns_failure_when_exception_thrown(self):
-        """Should return unsuccessful AgentResult when Runner.run raises."""
+        """Should return unsuccessful RunnerResult when Runner.run raises."""
         import sys

         from ldai_openai import OpenAIAgentRunner
@@ -518,12 +516,12 @@
         with patch.dict(sys.modules, {'agents': agents_mock, 'agents.tool_context': tc_mock}):
             result = await runner.run("Hello")

-        assert result.output == ""
+        assert result.content == ""
         assert result.metrics.success is False

     @pytest.mark.asyncio
     async def test_returns_failure_when_openai_agents_not_installed(self):
-        """Should return unsuccessful AgentResult when openai-agents is not installed."""
+        """Should return unsuccessful RunnerResult when openai-agents is not installed."""
         import sys

         from ldai_openai import OpenAIAgentRunner
@@ -532,5 +530,5 @@
         with patch.dict(sys.modules, {'agents': None}):
             result = await runner.run("Hello")

-        assert result.output == ""
+        assert result.content == ""
         assert result.metrics.success is False
diff --git a/packages/sdk/server-ai/src/ldai/__init__.py b/packages/sdk/server-ai/src/ldai/__init__.py
index 405ec5a8..56d780d3 100644
--- a/packages/sdk/server-ai/src/ldai/__init__.py
+++ b/packages/sdk/server-ai/src/ldai/__init__.py
@@ -34,12 +34,19 @@
 from ldai.providers import (
     AgentGraphResult,
     AgentGraphRunner,
+    AgentGraphRunnerResult,
     AgentResult,
     AgentRunner,
+    GraphMetrics,
+    GraphMetricSummary,
+    ManagedGraphResult,
+    ManagedResult,
+    Runner,
+    RunnerResult,
     ToolRegistry,
 )
 from ldai.providers.types import JudgeResult
-from ldai.tracker import AIGraphTracker
+from ldai.tracker import AIGraphTracker, LDAIMetricSummary

 __all__ = [
     'LDAIClient',
@@ -48,6 +55,14 @@
     'AgentGraphRunner',
     'AgentResult',
     'AgentGraphResult',
+    'AgentGraphRunnerResult',
+    'GraphMetrics',
+    'GraphMetricSummary',
+    'ManagedGraphResult',
+    'ManagedResult',
+    'Runner',
+    'RunnerResult',
+    'LDAIMetricSummary',
     'ToolRegistry',
     'AIAgentConfig',
     'AIAgentConfigDefault',
diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py
index 448d5c55..eeff1f1b 100644
--- a/packages/sdk/server-ai/src/ldai/client.py
+++ b/packages/sdk/server-ai/src/ldai/client.py
@@ -815,7 +815,7 @@
         if not runner:
             return None

-        return ManagedAgentGraph(runner)
+        return ManagedAgentGraph(runner, graph=graph)

     def agents(
         self,
diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py
index f2e8c362..6919a7aa 100644
--- a/packages/sdk/server-ai/src/ldai/judge/__init__.py
+++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -8,8 +8,8 @@
 from ldai import log
 from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
 from ldai.models import AIJudgeConfig, LDMessage
-from ldai.providers.model_runner import ModelRunner
-from ldai.providers.types import JudgeResult, ModelResponse
+from ldai.providers.runner import Runner
+from ldai.providers.types import JudgeResult, RunnerResult


 class Judge:
@@ -23,7 +23,7 @@
     def __init__(
         self,
         ai_config: AIJudgeConfig,
-        model_runner: ModelRunner,
+        model_runner: Runner,
     ):
         """
         Initialize the Judge.
@@ -76,10 +76,14 @@
         response = await tracker.track_metrics_of_async(
             lambda result: result.metrics,
-            lambda: self._model_runner.invoke_structured_model(messages, self._evaluation_response_structure),
+            lambda: self._model_runner.run(messages, output_type=self._evaluation_response_structure),
         )

-        parsed = self._parse_evaluation_response(response.data)
+        if response.parsed is None:
+            log.warning('Judge evaluation did not return structured output')
+            return judge_result
+
+        parsed = self._parse_evaluation_response(response.parsed)
         if parsed is None:
             log.warning('Judge evaluation did not return the expected evaluation')
@@ -99,19 +103,19 @@
     async def evaluate_messages(
         self,
         messages: list[LDMessage],
-        response: ModelResponse,
+        response: RunnerResult,
         sampling_ratio: float = 1.0,
     ) -> JudgeResult:
         """
         Evaluates an AI response from chat messages and response.

         :param messages: Array of messages representing the conversation history
-        :param response: The AI response to be evaluated
+        :param response: The runner result to be evaluated
         :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
         :return: The result of the judge evaluation.
         """
         input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
-        output_text = response.message.content
+        output_text = response.content

         return await self.evaluate(input_text, output_text, sampling_ratio)

@@ -123,7 +127,7 @@ def get_ai_config(self) -> AIJudgeConfig:
         """
         return self._ai_config

-    def get_model_runner(self) -> ModelRunner:
+    def get_model_runner(self) -> Runner:
         """
         Returns the model runner used by this judge.
diff --git a/packages/sdk/server-ai/src/ldai/managed_agent.py b/packages/sdk/server-ai/src/ldai/managed_agent.py
index ab3ee5e6..9d582ae4 100644
--- a/packages/sdk/server-ai/src/ldai/managed_agent.py
+++ b/packages/sdk/server-ai/src/ldai/managed_agent.py
@@ -1,43 +1,49 @@
 """ManagedAgent — LaunchDarkly managed wrapper for agent invocations."""

 from ldai.models import AIAgentConfig
-from ldai.providers import AgentResult, AgentRunner
+from ldai.providers.runner import Runner
+from ldai.providers.types import ManagedResult


 class ManagedAgent:
     """
     LaunchDarkly managed wrapper for AI agent invocations.

-    Holds an AgentRunner. Handles tracking automatically via ``create_tracker()``.
+    Holds a Runner. Handles tracking automatically via ``create_tracker()``.

     Obtain an instance via ``LDAIClient.create_agent()``.
     """

     def __init__(
         self,
         ai_config: AIAgentConfig,
-        agent_runner: AgentRunner,
+        agent_runner: Runner,
     ):
         self._ai_config = ai_config
         self._agent_runner = agent_runner

-    async def run(self, input: str) -> AgentResult:
+    async def run(self, input: str) -> ManagedResult:
         """
         Run the agent with the given input string.

         :param input: The user prompt or input to the agent
-        :return: AgentResult containing the agent's output and metrics
+        :return: ManagedResult containing the agent's output and metric summary
         """
         tracker = self._ai_config.create_tracker()

-        return await tracker.track_metrics_of_async(
-            lambda result: result.metrics,
+        result = await tracker.track_metrics_of_async(
+            lambda r: r.metrics,
             lambda: self._agent_runner.run(input),
         )
+        return ManagedResult(
+            content=result.content,
+            metrics=tracker.get_summary(),
+            raw=result.raw,
+        )

-    def get_agent_runner(self) -> AgentRunner:
+    def get_agent_runner(self) -> Runner:
         """
-        Return the underlying AgentRunner for advanced use.
+        Return the underlying runner for advanced use.

-        :return: The AgentRunner instance.
+        :return: The Runner instance.
         """
         return self._agent_runner
diff --git a/packages/sdk/server-ai/src/ldai/managed_agent_graph.py b/packages/sdk/server-ai/src/ldai/managed_agent_graph.py
index a146e60e..94026973 100644
--- a/packages/sdk/server-ai/src/ldai/managed_agent_graph.py
+++ b/packages/sdk/server-ai/src/ldai/managed_agent_graph.py
@@ -1,17 +1,33 @@
 """ManagedAgentGraph — LaunchDarkly managed wrapper for agent graph execution."""

-from typing import Any
+from typing import Any, Optional

 from ldai.providers import AgentGraphResult, AgentGraphRunner
+from ldai.providers.types import (
+    AgentGraphRunnerResult,
+    GraphMetricSummary,
+    LDAIMetrics,
+    ManagedGraphResult,
+)


 class ManagedAgentGraph:
     """
     LaunchDarkly managed wrapper for AI agent graph execution.

-    Holds an AgentGraphRunner. Auto-tracking of path,
-    tool calls, handoffs, latency, and invocation success/failure is handled
-    by the runner implementation.
+    Holds an AgentGraphRunner and an optional AgentGraphDefinition. Wraps the
+    runner result in a :class:`~ldai.providers.types.ManagedGraphResult` and
+    builds a :class:`~ldai.providers.types.GraphMetricSummary` from the runner's
+    metrics.
+
+    When the runner returns an :class:`~ldai.providers.types.AgentGraphRunnerResult`
+    (new shape), the managed layer drives all graph-level tracking from
+    ``result.metrics``. When the runner returns the legacy
+    :class:`~ldai.providers.AgentGraphResult`, tracking has already been performed
+    inside the runner; the managed layer simply wraps the result. This detection
+    branch exists as a deliberate bridge: once PR 11-openai and PR 11-langchain
+    migrate both runners to return ``AgentGraphRunnerResult``, the legacy branch
+    becomes dead code and will be removed in PR 11-langchain's final cleanup commit.

     Obtain an instance via ``LDAIClient.create_agent_graph()``.
     """
@@ -19,25 +35,104 @@
     def __init__(
         self,
         runner: AgentGraphRunner,
+        graph: Optional[Any] = None,
     ):
         """
         Initialize ManagedAgentGraph.

         :param runner: The AgentGraphRunner to delegate execution to
+        :param graph: Optional AgentGraphDefinition used to create the
+            graph-level tracker when the runner returns an
+            :class:`AgentGraphRunnerResult` (new shape). Not needed for
+            legacy runners that still return :class:`AgentGraphResult`.
         """
         self._runner = runner
+        self._graph = graph

-    async def run(self, input: Any) -> AgentGraphResult:
+    async def run(self, input: Any) -> ManagedGraphResult:
         """
         Run the agent graph with the given input.

-        Delegates to the underlying AgentGraphRunner, which handles
-        execution and all auto-tracking internally.
+        Delegates to the underlying AgentGraphRunner. The returned type
+        determines which tracking path is taken:
+
+        - :class:`AgentGraphRunnerResult` (new shape): the managed layer drives
+          graph-level tracking from ``result.metrics`` via the graph tracker.
+          Per-node tracking from ``result.metrics.node_metrics`` will be wired
+          in a follow-up commit once the runners populate ``node_metrics``.
+        - :class:`AgentGraphResult` (legacy shape): tracking already occurred
+          inside the runner; the managed layer wraps the result without
+          additional tracking.

         :param input: The input prompt or structured input for the graph
-        :return: AgentGraphResult containing the output, raw response, and metrics
+        :return: ManagedGraphResult containing the content, metric summary,
+            raw response, and an optional evaluations task (always ``None``
+            for now; per-graph evaluations will be added in a future PR).
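+
+        Example (illustrative sketch only; ``managed_graph`` stands in for the
+        instance returned by ``LDAIClient.create_agent_graph()``)::
+
+            result = await managed_graph.run("Plan a three-step rollout")
+            print(result.content)
+            print(result.metrics.success, result.metrics.path)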
+ """ + raw_result = await self._runner.run(input) + + if isinstance(raw_result, AgentGraphRunnerResult): + # New shape: managed layer drives all tracking. + summary = self._build_summary_from_runner_result(raw_result) + if self._graph is not None: + self._flush_graph_tracking(raw_result, self._graph.create_tracker()) + return ManagedGraphResult( + content=raw_result.content, + metrics=summary, + raw=raw_result.raw, + evaluations=None, + ) + + # Legacy shape (AgentGraphResult): tracking already happened in the runner. + # Build a GraphMetricSummary from the runner result's LDAIMetrics. + # path and node_metrics will be populated once graph runners are migrated + # to return AgentGraphRunnerResult with GraphMetrics (PR 11-openai/langchain). + metrics: LDAIMetrics = raw_result.metrics + summary = GraphMetricSummary( + success=metrics.success, + usage=metrics.usage, + duration_ms=getattr(metrics, 'duration_ms', None), + ) + return ManagedGraphResult( + content=raw_result.output, + metrics=summary, + raw=raw_result.raw, + evaluations=None, + ) + + def _build_summary_from_runner_result( + self, + result: AgentGraphRunnerResult, + ) -> GraphMetricSummary: + """Build a GraphMetricSummary from an AgentGraphRunnerResult.""" + m = result.metrics + return GraphMetricSummary( + success=m.success, + path=list(m.path), + duration_ms=m.duration_ms, + usage=m.usage, + node_metrics=dict(m.node_metrics), + ) + + def _flush_graph_tracking(self, result: AgentGraphRunnerResult, tracker: Any) -> None: + """ + Drive graph-level LaunchDarkly tracking events from runner result metrics. + + Called only when the runner returns the new ``AgentGraphRunnerResult`` + shape. Node-level tracking (from ``result.metrics.node_metrics``) will + be wired once the runners start populating that field. """ - return await self._runner.run(input) + m = result.metrics + if m.path: + tracker.track_path(m.path) + if m.duration_ms is not None: + tracker.track_duration(m.duration_ms) + if m.success: + tracker.track_invocation_success() + else: + tracker.track_invocation_failure() + if m.usage is not None: + tracker.track_total_tokens(m.usage) def get_agent_graph_runner(self) -> AgentGraphRunner: """ diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py index 9cfb503a..94605eab 100644 --- a/packages/sdk/server-ai/src/ldai/managed_model.py +++ b/packages/sdk/server-ai/src/ldai/managed_model.py @@ -1,10 +1,10 @@ import asyncio -from typing import List, Optional +from typing import List from ldai import log from ldai.models import AICompletionConfig, LDMessage -from ldai.providers.model_runner import ModelRunner -from ldai.providers.types import JudgeResult, ModelResponse +from ldai.providers.runner import Runner +from ldai.providers.types import JudgeResult, ManagedResult from ldai.tracker import LDAIConfigTracker @@ -12,7 +12,7 @@ class ManagedModel: """ LaunchDarkly managed wrapper for AI model invocations. - Holds a ModelRunner. Handles conversation management, judge evaluation + Holds a Runner. Handles conversation management, judge evaluation dispatch, and tracking automatically via ``create_tracker()``. Obtain an instance via ``LDAIClient.create_model()``. 
""" @@ -20,22 +20,23 @@ class ManagedModel: def __init__( self, ai_config: AICompletionConfig, - model_runner: ModelRunner, + model_runner: Runner, ): self._ai_config = ai_config self._model_runner = model_runner self._messages: List[LDMessage] = [] - async def invoke(self, prompt: str) -> ModelResponse: + async def run(self, prompt: str) -> ManagedResult: """ - Invoke the model with a prompt string. + Run the model with a prompt string. Appends the prompt to the conversation history, prepends any system messages from the config, delegates to the runner, and appends the response to the history. :param prompt: The user prompt to send to the model - :return: ModelResponse containing the model's response and metrics + :return: ManagedResult containing the model's response, metric summary, + and an optional evaluations task """ tracker = self._ai_config.create_tracker() @@ -45,17 +46,26 @@ async def invoke(self, prompt: str) -> ModelResponse: config_messages = self._ai_config.messages or [] all_messages = config_messages + self._messages - response = await tracker.track_metrics_of_async( - lambda result: result.metrics, - lambda: self._model_runner.invoke_model(all_messages), + result = await tracker.track_metrics_of_async( + lambda r: r.metrics, + lambda: self._model_runner.run(all_messages), ) + assistant_message = LDMessage(role='assistant', content=result.content) + input_text = '\r\n'.join(m.content for m in self._messages) if self._messages else '' - output_text = response.message.content - response.evaluations = self._track_judge_results(tracker, input_text, output_text) - self._messages.append(response.message) - return response + evaluations_task = self._track_judge_results(tracker, input_text, result.content) + + self._messages.append(assistant_message) + + return ManagedResult( + content=result.content, + metrics=tracker.get_summary(), + raw=result.raw, + parsed=result.parsed, + evaluations=evaluations_task, + ) def _track_judge_results( self, @@ -98,11 +108,11 @@ def append_messages(self, messages: List[LDMessage]) -> None: """ self._messages.extend(messages) - def get_model_runner(self) -> ModelRunner: + def get_model_runner(self) -> Runner: """ - Return the underlying ModelRunner for advanced use. + Return the underlying runner for advanced use. - :return: The ModelRunner instance. + :return: The Runner instance. 
""" return self._model_runner diff --git a/packages/sdk/server-ai/src/ldai/providers/__init__.py b/packages/sdk/server-ai/src/ldai/providers/__init__.py index b2bfa72e..22dce784 100644 --- a/packages/sdk/server-ai/src/ldai/providers/__init__.py +++ b/packages/sdk/server-ai/src/ldai/providers/__init__.py @@ -2,13 +2,20 @@ from ldai.providers.agent_runner import AgentRunner from ldai.providers.ai_provider import AIProvider from ldai.providers.model_runner import ModelRunner +from ldai.providers.runner import Runner from ldai.providers.runner_factory import RunnerFactory from ldai.providers.types import ( AgentGraphResult, + AgentGraphRunnerResult, AgentResult, + GraphMetrics, + GraphMetricSummary, JudgeResult, LDAIMetrics, + ManagedGraphResult, + ManagedResult, ModelResponse, + RunnerResult, StructuredResponse, ToolRegistry, ) @@ -17,13 +24,20 @@ 'AIProvider', 'AgentGraphResult', 'AgentGraphRunner', + 'AgentGraphRunnerResult', 'AgentResult', 'AgentRunner', + 'GraphMetrics', + 'GraphMetricSummary', 'JudgeResult', 'LDAIMetrics', + 'ManagedGraphResult', + 'ManagedResult', 'ModelResponse', 'ModelRunner', + 'Runner', 'RunnerFactory', + 'RunnerResult', 'StructuredResponse', 'ToolRegistry', ] diff --git a/packages/sdk/server-ai/src/ldai/providers/runner.py b/packages/sdk/server-ai/src/ldai/providers/runner.py new file mode 100644 index 00000000..5e1b9abc --- /dev/null +++ b/packages/sdk/server-ai/src/ldai/providers/runner.py @@ -0,0 +1,29 @@ +"""Unified Runner protocol for AI providers.""" + +from typing import Any, Dict, Optional, Protocol, runtime_checkable + +from ldai.providers.types import RunnerResult + + +@runtime_checkable +class Runner(Protocol): + """ + Unified runtime capability interface for all AI provider runners. + + A :class:`Runner` is a focused, configured object that performs a single + AI invocation. + """ + + async def run( + self, + input: Any, + output_type: Optional[Dict[str, Any]] = None, + ) -> RunnerResult: + """ + Execute the runner with the given input. + + :param input: The input to the runner. + :param output_type: Optional JSON schema for structured output. + :return: RunnerResult containing content, metrics, raw, and parsed fields. + """ + ... diff --git a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py index 9363f8e0..b7548791 100644 --- a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py +++ b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py @@ -4,9 +4,8 @@ from ldai import log from ldai.models import AIConfigKind from ldai.providers.agent_graph_runner import AgentGraphRunner -from ldai.providers.agent_runner import AgentRunner from ldai.providers.ai_provider import AIProvider -from ldai.providers.model_runner import ModelRunner +from ldai.providers.runner import Runner T = TypeVar('T') @@ -118,13 +117,13 @@ def _get_providers_to_try( def create_model( config: AIConfigKind, default_ai_provider: Optional[str] = None, - ) -> Optional[ModelRunner]: + ) -> Optional[Runner]: """ Create a model executor for the given AI completion config. 
         :param config: LaunchDarkly AI config (completion or judge)
         :param default_ai_provider: Optional provider override ('openai', 'langchain', …)
-        :return: Configured ModelRunner ready to invoke the model, or None
+        :return: Configured Runner ready to invoke the model, or None
         """
         provider_name = config.provider.name.lower() if config.provider else None
         providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name)
@@ -135,7 +134,7 @@ def create_agent(
         config: Any,
         tools: Any,
         default_ai_provider: Optional[str] = None,
-    ) -> Optional[AgentRunner]:
+    ) -> Optional[Runner]:
         """
         CAUTION: This feature is experimental and should NOT be considered ready for production use.

@@ -147,7 +146,7 @@
         :param config: LaunchDarkly AI agent config
         :param tools: Tool registry mapping tool names to callables
         :param default_ai_provider: Optional provider override
-        :return: AgentRunner instance, or None
+        :return: Runner instance, or None
         """
         provider_name = config.provider.name.lower() if config.provider else None
         providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name)
diff --git a/packages/sdk/server-ai/src/ldai/providers/types.py b/packages/sdk/server-ai/src/ldai/providers/types.py
index aa537880..5bb8ce47 100644
--- a/packages/sdk/server-ai/src/ldai/providers/types.py
+++ b/packages/sdk/server-ai/src/ldai/providers/types.py
@@ -3,11 +3,11 @@
 from __future__ import annotations

 import asyncio
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional

 from ldai.models import LDMessage
-from ldai.tracker import TokenUsage
+from ldai.tracker import LDAIMetricSummary, TokenUsage

 # Type alias for a registry of tools available to an agent.
 # Keys are tool names; values are the callable implementations.
@@ -16,11 +16,19 @@

 @dataclass
 class LDAIMetrics:
-    """
-    Metrics information for AI operations that includes success status and token usage.
-    """
+    """Contains metrics for a single AI invocation."""
+
     success: bool
+    """Whether the invocation succeeded."""
+
     usage: Optional[TokenUsage] = None
+    """Optional token usage information."""
+
+    tool_calls: Optional[List[str]] = None
+    """Ordered list of tool-call names observed during the invocation."""
+
+    duration_ms: Optional[int] = None
+    """Wall-clock duration of the runner invocation in milliseconds."""

     def to_dict(self) -> Dict[str, Any]:
         """
@@ -35,13 +43,58 @@
                 'input': self.usage.input,
                 'output': self.usage.output,
             }
+        if self.tool_calls is not None:
+            result['toolCalls'] = self.tool_calls
+        if self.duration_ms is not None:
+            result['durationMs'] = self.duration_ms
         return result


+@dataclass
+class RunnerResult:
+    """Contains the result of a single AI model invocation."""
+
+    content: str
+    """The text content returned by the model."""
+
+    metrics: LDAIMetrics
+    """Metrics for this invocation."""
+
+    raw: Optional[Any] = None
+    """Optional provider-native response object for advanced consumers."""
+
+    parsed: Optional[Dict[str, Any]] = None
+    """Optional parsed structured output, populated when ``output_type`` was supplied."""
+
+
+@dataclass
+class ManagedResult:
+    """Contains the result of a managed AI invocation, including metrics and optional judge evaluations."""
+
+    content: str
+    """The text content returned by the model."""
+
+    metrics: LDAIMetricSummary
+    """Aggregated metric summary from the tracker for this invocation."""
+
+    raw: Optional[Any] = None
+    """Optional provider-native response object for advanced consumers."""
+
+    parsed: Optional[Dict[str, Any]] = None
+    """Optional parsed structured output, populated when ``output_type`` was supplied."""
+
+    evaluations: Optional[asyncio.Task[List[JudgeResult]]] = None
+    """Optional asyncio Task that resolves to the list of :class:`JudgeResult` instances when awaited."""
+
+
 @dataclass
 class ModelResponse:
     """
     Response from a model invocation.
+
+    .. deprecated::
+        Use :class:`RunnerResult` (from a runner) and :class:`ManagedResult`
+        (from the managed layer) instead.
     """
     message: LDMessage
     metrics: LDAIMetrics
@@ -52,24 +105,113 @@ class StructuredResponse:
     """
     Structured response from AI models.
+
+    .. deprecated::
+        Structured output is now represented by :attr:`RunnerResult.parsed`.
""" data: Dict[str, Any] raw_response: str metrics: LDAIMetrics +@dataclass +class GraphMetrics: + """Contains raw metrics from a single agent graph run.""" + + success: bool + """Whether the graph run succeeded.""" + + path: List[str] = field(default_factory=list) + """Ordered list of node keys visited during the run.""" + + duration_ms: Optional[int] = None + """Wall-clock duration of the graph run in milliseconds.""" + + usage: Optional[TokenUsage] = None + """Optional aggregate token usage information across all nodes in the graph run.""" + + node_metrics: Dict[str, LDAIMetrics] = field(default_factory=dict) + """Per-node metrics keyed by node key.""" + + +@dataclass +class GraphMetricSummary: + """Contains a summary of metrics for an agent graph run.""" + + success: bool + """Whether the graph run succeeded.""" + + path: List[str] = field(default_factory=list) + """Ordered list of node keys visited during the run.""" + + duration_ms: Optional[int] = None + """Wall-clock duration of the graph run in milliseconds.""" + + usage: Optional[TokenUsage] = None + """Optional aggregate token usage information across all nodes in the graph run.""" + + node_metrics: Dict[str, LDAIMetrics] = field(default_factory=dict) + """Per-node metrics keyed by node key.""" + + resumption_token: Optional[str] = None + """Optional resumption token from the graph tracker for cross-process resumption.""" + + +@dataclass +class ManagedGraphResult: + """Contains the result of a managed agent graph run, including metrics and optional judge evaluations.""" + + content: str + """The graph's final output content.""" + + metrics: GraphMetricSummary + """Aggregated metric summary from the graph tracker for this run.""" + + raw: Optional[Any] = None + """Optional provider-native response object for advanced consumers.""" + + evaluations: Optional[asyncio.Task[List[JudgeResult]]] = None + """Optional asyncio Task that resolves to the list of :class:`JudgeResult` instances when awaited.""" + + +@dataclass +class AgentGraphRunnerResult: + """Contains the result of an agent graph runner invocation.""" + + content: str + """The graph's final output content.""" + + metrics: GraphMetrics + """Metrics from the graph run.""" + + raw: Optional[Any] = None + """Optional provider-native response object for advanced consumers.""" + + @dataclass class JudgeResult: - """ - Result from a judge evaluation. - """ + """Contains the result of a single judge evaluation.""" + judge_config_key: Optional[str] = None + """The configuration key of the judge that produced this result.""" + success: bool = False + """Whether the judge evaluation completed successfully.""" + error_message: Optional[str] = None - sampled: bool = False # True when the evaluation was sampled and run + """Error message describing why the evaluation failed, if any.""" + + sampled: bool = False + """True when the evaluation was sampled and run.""" + metric_key: Optional[str] = None + """The metric key under which this judge's score is reported.""" + score: Optional[float] = None + """The numeric score (0-1) returned by the judge.""" + reasoning: Optional[str] = None + """The judge's reasoning text accompanying the score.""" def to_dict(self) -> Dict[str, Any]: """ @@ -96,6 +238,10 @@ def to_dict(self) -> Dict[str, Any]: class AgentResult: """ Result from a single-agent run. + + .. deprecated:: + Use :class:`ManagedResult` (managed layer) or :class:`RunnerResult` + (runner layer) instead. 
""" output: str raw: Any @@ -104,10 +250,16 @@ class AgentResult: @dataclass class AgentGraphResult: - """ - Result from an agent graph run. - """ + """Contains the result of an agent graph run.""" + output: str + """The agent graph's final output content.""" + raw: Any + """The provider-native response object from the graph run.""" + metrics: LDAIMetrics + """Metrics recorded during the graph run.""" + evaluations: Optional[List[JudgeResult]] = None + """Optional list of judge evaluation results produced for the graph run.""" diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index 0f5a32c5..31416649 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -1,15 +1,20 @@ +from __future__ import annotations + import base64 import json import time import warnings from dataclasses import dataclass from enum import Enum -from typing import Any, Callable, Dict, Iterable, List, Optional +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional from ldclient import Context, LDClient, Result from ldai import log +if TYPE_CHECKING: + from ldai.providers.types import LDAIMetrics + class FeedbackKind(Enum): """ @@ -41,15 +46,31 @@ class LDAIMetricSummary: """ def __init__(self): - self._duration = None - self._success = None - self._feedback = None - self._usage = None - self._time_to_first_token = None + self._duration_ms: Optional[int] = None + self._success: Optional[bool] = None + self._feedback: Optional[Dict[str, FeedbackKind]] = None + self._usage: Optional[TokenUsage] = None + self._time_to_first_token: Optional[int] = None + self._tool_calls: Optional[List[str]] = None + self._resumption_token: Optional[str] = None + + @property + def duration_ms(self) -> Optional[int]: + """Duration of the AI operation in milliseconds.""" + return self._duration_ms @property def duration(self) -> Optional[int]: - return self._duration + """ + .. deprecated:: + Use :attr:`duration_ms` instead. + """ + warnings.warn( + "LDAIMetricSummary.duration is deprecated. Use duration_ms instead.", + DeprecationWarning, + stacklevel=2, + ) + return self._duration_ms @property def success(self) -> Optional[bool]: @@ -67,6 +88,20 @@ def usage(self) -> Optional[TokenUsage]: def time_to_first_token(self) -> Optional[int]: return self._time_to_first_token + @property + def tool_calls(self) -> Optional[List[str]]: + """List of tool keys that were invoked during this operation.""" + return self._tool_calls + + @property + def resumption_token(self) -> Optional[str]: + """ + URL-safe Base64-encoded resumption token captured at tracker + instantiation. Useful for deferred feedback flows where a downstream + process needs to associate events with the original execution. + """ + return self._resumption_token + class LDAIConfigTracker: """ @@ -107,8 +142,10 @@ def __init__( self._provider_name = provider_name self._context = context self._graph_key = graph_key - self._summary = LDAIMetricSummary() self._run_id = run_id + self._summary = LDAIMetricSummary() + # Capture resumption_token immediately so it's available on the summary at instantiation. + self._summary._resumption_token = self.resumption_token @property def resumption_token(self) -> str: @@ -200,10 +237,10 @@ def track_duration(self, duration: int) -> None: :param duration: Duration in milliseconds. 
""" - if self._summary.duration is not None: + if self._summary.duration_ms is not None: log.warning("Duration has already been tracked for this execution. %s", self.__get_track_data()) return - self._summary._duration = duration + self._summary._duration_ms = duration self._ld_client.track( "$ld:ai:duration:total", self._context, self.__get_track_data(), duration ) @@ -250,20 +287,32 @@ def track_duration_of(self, func): def _track_from_metrics_extractor( self, result: Any, - metrics_extractor: Callable[[Any], Any], - ) -> Any: - metrics = metrics_extractor(result) + metrics_extractor: Callable[[Any], Optional[LDAIMetrics]], + elapsed_ms: int, + ) -> None: + metrics = None + try: + metrics = metrics_extractor(result) + except Exception as exc: + log.warning("Failed to extract metrics: %s", exc) + + if metrics is None: + self.track_duration(elapsed_ms) + return + + self.track_duration(metrics.duration_ms if metrics.duration_ms is not None else elapsed_ms) if metrics.success: self.track_success() else: self.track_error() if metrics.usage: self.track_tokens(metrics.usage) - return result + if metrics.tool_calls is not None: + self.track_tool_calls(metrics.tool_calls) def track_metrics_of( self, - metrics_extractor: Callable[[Any], Any], + metrics_extractor: Callable[[Any], Optional[LDAIMetrics]], func: Callable[[], Any], ) -> Any: """ @@ -278,6 +327,10 @@ def track_metrics_of( For async operations, use :meth:`track_metrics_of_async`. + When the extracted :class:`~ldai.providers.types.LDAIMetrics` object has a + non-``None`` ``duration_ms`` field, that value is used as the measured duration + instead of the wall-clock elapsed time. + :param metrics_extractor: Function that extracts LDAIMetrics from the operation result :param func: Synchronous callable that runs the operation :return: The result of the operation @@ -291,16 +344,24 @@ def track_metrics_of( self.track_error() raise err - duration = (time.perf_counter_ns() - start_ns) // 1_000_000 - self.track_duration(duration) - return self._track_from_metrics_extractor(result, metrics_extractor) + elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 + self._track_from_metrics_extractor(result, metrics_extractor, elapsed_ms) + return result - async def track_metrics_of_async(self, metrics_extractor, func): + async def track_metrics_of_async( + self, + metrics_extractor: Callable[[Any], Optional[LDAIMetrics]], + func: Callable[[], Any], + ) -> Any: """ Track metrics for an async AI operation (``func`` is awaited). Same event semantics as :meth:`track_metrics_of`. + When the extracted :class:`~ldai.providers.types.LDAIMetrics` object has a + non-``None`` ``duration_ms`` field, that value is used as the measured duration + instead of the wall-clock elapsed time. 
+ :param metrics_extractor: Function that extracts LDAIMetrics from the operation result :param func: Async callable or zero-arg callable that returns an awaitable when called :return: The result of the operation @@ -315,9 +376,9 @@ async def track_metrics_of_async(self, metrics_extractor, func): self.track_error() raise err - duration = (time.perf_counter_ns() - start_ns) // 1_000_000 - self.track_duration(duration) - return self._track_from_metrics_extractor(result, metrics_extractor) + elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 + self._track_from_metrics_extractor(result, metrics_extractor, elapsed_ms) + return result def track_judge_result(self, judge_result: Any) -> None: """ @@ -364,6 +425,24 @@ def track_feedback(self, feedback: Dict[str, FeedbackKind]) -> None: 1, ) + def track_tool_calls(self, tool_calls: Iterable[str]) -> None: + """ + Track the tool calls made during an AI operation. + + Stores the tool call names on the summary (guarding against duplicate + tracking) and fires a ``$ld:ai:tool_call`` event for each tool. + + :param tool_calls: Tool identifiers (e.g. from a model response). + """ + if self._summary.tool_calls is not None: + log.warning("Tool calls have already been tracked for this execution. %s", self.__get_track_data()) + return + tool_calls_list = list(tool_calls) + self._summary._tool_calls = tool_calls_list + for tool_key in tool_calls_list: + self.track_tool_call(tool_key) + + def track_success(self) -> None: """ Track a successful AI generation. @@ -499,15 +578,6 @@ def track_tool_call(self, tool_key: str) -> None: 1, ) - def track_tool_calls(self, tool_keys: Iterable[str]) -> None: - """ - Track multiple tool invocations for this configuration. - - :param tool_keys: Tool identifiers (e.g. from a model response). - """ - for tool_key in tool_keys: - self.track_tool_call(tool_key) - def get_summary(self) -> LDAIMetricSummary: """ Get the current summary of AI metrics. 
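For reviewers, a minimal usage sketch of the tracker API updated above — the `runner` and `prompt` names are placeholders invented for this example, not part of the change. It shows the new extractor signature (`Callable[[Any], Optional[LDAIMetrics]]`) and relies on the documented behavior that the tracker prefers `metrics.duration_ms` over wall-clock time and records tool calls when present.

from typing import Optional

from ldai.providers.types import LDAIMetrics, RunnerResult
from ldai.tracker import LDAIConfigTracker


def extract_metrics(result: RunnerResult) -> Optional[LDAIMetrics]:
    # The runner already aggregates success, usage, tool_calls and duration_ms;
    # returning None would make the tracker record only the wall-clock duration.
    return result.metrics


async def run_with_tracking(tracker: LDAIConfigTracker, runner, prompt: str) -> RunnerResult:
    # track_metrics_of_async awaits the callable, then records duration
    # (preferring metrics.duration_ms when set), success or error, token usage,
    # and tool calls from the extracted LDAIMetrics.
    return await tracker.track_metrics_of_async(
        extract_metrics,
        lambda: runner.run(prompt),
    )
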
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index c2690b6a..3ca0750b 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -9,7 +9,7 @@ from ldai.judge import Judge from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import AIJudgeConfig, AIJudgeConfigDefault, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import JudgeResult, LDAIMetrics, StructuredResponse +from ldai.providers.types import JudgeResult, LDAIMetrics, RunnerResult from ldai.tracker import LDAIConfigTracker @@ -40,9 +40,9 @@ def client(td: TestData) -> LDClient: @pytest.fixture def mock_runner(): - """Create a mock AI provider.""" + """Create a mock AI runner.""" provider = MagicMock() - provider.invoke_structured_model = AsyncMock() + provider.run = AsyncMock() return provider @@ -137,7 +137,7 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing( assert isinstance(result, JudgeResult) assert result.success is False assert result.sampled is False - mock_runner.invoke_structured_model.assert_not_called() + mock_runner.run.assert_not_called() @pytest.mark.asyncio async def test_evaluate_returns_failure_when_messages_missing( @@ -151,23 +151,23 @@ async def test_evaluate_returns_failure_when_messages_missing( assert isinstance(result, JudgeResult) assert result.success is False assert result.sampled is False - mock_runner.invoke_structured_model.assert_not_called() + mock_runner.run.assert_not_called() @pytest.mark.asyncio async def test_evaluate_success_with_valid_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should return JudgeResponse with valid evaluation.""" - mock_response = StructuredResponse( - data={ + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={ 'score': 0.85, 'reasoning': 'The response is highly relevant to the input.' 
}, - raw_response='{"score": 0.85, "reasoning": "..."}', - metrics=LDAIMetrics(success=True) ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -187,15 +187,15 @@ async def test_evaluate_success_with_evaluation_response_shape( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should accept shape { score, reasoning } and key by metric.""" - mock_response = StructuredResponse( - data={ + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={ 'score': 0.9, 'reasoning': 'The response is accurate and complete.', }, - raw_response='{"score": 0.9, "reasoning": "..."}', - metrics=LDAIMetrics(success=True), ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -214,13 +214,13 @@ async def test_evaluate_handles_missing_evaluation_in_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle missing score/reasoning in response.""" - mock_response = StructuredResponse( - data={}, - raw_response='{}', - metrics=LDAIMetrics(success=True) + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={}, ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -236,16 +236,16 @@ async def test_evaluate_handles_invalid_score( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle invalid score values.""" - mock_response = StructuredResponse( - data={ + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={ 'score': 1.5, - 'reasoning': 'Some reasoning' + 'reasoning': 'Some reasoning', }, - raw_response='{"score": 1.5, "reasoning": "..."}', - metrics=LDAIMetrics(success=True) ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -261,13 +261,13 @@ async def test_evaluate_handles_missing_reasoning( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle missing reasoning.""" - mock_response = StructuredResponse( - data={'score': 0.8}, - raw_response='{"score": 0.8}', - metrics=LDAIMetrics(success=True) + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.8}, ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -283,7 +283,7 @@ async def test_evaluate_handles_exception( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle exceptions gracefully.""" - mock_runner.invoke_structured_model.side_effect = Exception("Provider error") + mock_runner.run.side_effect = 
Exception("Provider error") tracker.track_metrics_of_async = AsyncMock(side_effect=Exception("Provider error")) judge = Judge(judge_config_with_key, mock_runner) @@ -306,7 +306,7 @@ async def test_evaluate_respects_sampling_rate( assert isinstance(result, JudgeResult) assert result.sampled is False assert result.success is False - mock_runner.invoke_structured_model.assert_not_called() + mock_runner.run.assert_not_called() class TestJudgeEvaluateMessages: @@ -317,15 +317,13 @@ async def test_evaluate_messages_calls_evaluate( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """evaluate_messages should call evaluate with constructed input/output.""" - from ldai.providers.types import ModelResponse - - mock_response = StructuredResponse( - data={'score': 0.9, 'reasoning': 'Very relevant'}, - raw_response='{"score": 0.9, "reasoning": "..."}', - metrics=LDAIMetrics(success=True) + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.9, 'reasoning': 'Very relevant'}, ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -334,9 +332,9 @@ async def test_evaluate_messages_calls_evaluate( LDMessage(role='user', content='Question 1'), LDMessage(role='assistant', content='Answer 1'), ] - chat_response = ModelResponse( - message=LDMessage(role='assistant', content='Answer 2'), - metrics=LDAIMetrics(success=True) + chat_response = RunnerResult( + content='Answer 2', + metrics=LDAIMetrics(success=True), ) result = await judge.evaluate_messages(messages, chat_response) diff --git a/packages/sdk/server-ai/tests/test_managed_agent.py b/packages/sdk/server-ai/tests/test_managed_agent.py index 144641fc..0c30637a 100644 --- a/packages/sdk/server-ai/tests/test_managed_agent.py +++ b/packages/sdk/server-ai/tests/test_managed_agent.py @@ -6,13 +6,19 @@ from ldai import LDAIClient, ManagedAgent from ldai.managed_agent import ManagedAgent from ldai.models import AIAgentConfig, AIAgentConfigDefault, ModelConfig, ProviderConfig -from ldai.providers import AgentResult -from ldai.providers.types import LDAIMetrics +from ldai.providers.types import LDAIMetrics, ManagedResult, RunnerResult +from ldai.tracker import LDAIMetricSummary from ldclient import Config, Context, LDClient from ldclient.integrations.test_data import TestData +def _make_summary(success: bool = True) -> LDAIMetricSummary: + summary = LDAIMetricSummary() + summary._success = success + return summary + + @pytest.fixture def td() -> TestData: td = TestData.data_source() @@ -53,30 +59,32 @@ class TestManagedAgentRun: @pytest.mark.asyncio async def test_run_delegates_to_agent_runner(self): - """Should delegate run() to the underlying AgentRunner.""" + """Should delegate run() to the underlying AgentRunner and return ManagedResult.""" mock_config = MagicMock(spec=AIAgentConfig) mock_tracker = MagicMock() mock_tracker.track_metrics_of_async = AsyncMock( - return_value=AgentResult( - output="Test response", - raw=None, + return_value=RunnerResult( + content="Test response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) + mock_tracker.get_summary = MagicMock(return_value=_make_summary(True)) mock_config.create_tracker = MagicMock(return_value=mock_tracker) mock_runner = MagicMock() mock_runner.run = AsyncMock( - return_value=AgentResult( - output="Test response", - raw=None, 
+ return_value=RunnerResult( + content="Test response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) agent = ManagedAgent(mock_config, mock_runner) result = await agent.run("Hello") - assert result.output == "Test response" + assert isinstance(result, ManagedResult) + assert result.content == "Test response" assert result.metrics.success is True mock_config.create_tracker.assert_called_once() mock_tracker.track_metrics_of_async.assert_called_once() @@ -87,12 +95,13 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self): mock_config = MagicMock(spec=AIAgentConfig) fresh_tracker = MagicMock() fresh_tracker.track_metrics_of_async = AsyncMock( - return_value=AgentResult( - output="Fresh tracker response", - raw=None, + return_value=RunnerResult( + content="Fresh tracker response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) + fresh_tracker.get_summary = MagicMock(return_value=_make_summary(True)) mock_config.create_tracker = MagicMock(return_value=fresh_tracker) mock_runner = MagicMock() @@ -100,7 +109,8 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self): agent = ManagedAgent(mock_config, mock_runner) result = await agent.run("Hello") - assert result.output == "Fresh tracker response" + assert isinstance(result, ManagedResult) + assert result.content == "Fresh tracker response" mock_config.create_tracker.assert_called_once() fresh_tracker.track_metrics_of_async.assert_called_once() @@ -152,7 +162,7 @@ async def test_returns_managed_agent_when_runner_available(self, ldai_client: LD mock_runner = MagicMock() mock_runner.run = AsyncMock( - return_value=AgentResult(output="Hello!", raw=None, metrics=LDAIMetrics(success=True, usage=None)) + return_value=RunnerResult(content="Hello!", metrics=LDAIMetrics(success=True, usage=None), raw=None) ) original = rf.RunnerFactory.create_agent diff --git a/packages/sdk/server-ai/tests/test_managed_agent_graph.py b/packages/sdk/server-ai/tests/test_managed_agent_graph.py index 35be2766..05b0ed27 100644 --- a/packages/sdk/server-ai/tests/test_managed_agent_graph.py +++ b/packages/sdk/server-ai/tests/test_managed_agent_graph.py @@ -5,14 +5,16 @@ from ldclient import Config, Context, LDClient from ldclient.integrations.test_data import TestData -from ldai import LDAIClient, ManagedAgentGraph -from ldai.providers.types import LDAIMetrics +from ldai import LDAIClient, ManagedAgentGraph, ManagedGraphResult +from ldai.providers.types import AgentGraphRunnerResult, GraphMetrics, LDAIMetrics from ldai.providers import AgentGraphResult, AgentGraphRunner, ToolRegistry +from ldai.tracker import TokenUsage -# --- Test double --- +# --- Test doubles --- class StubAgentGraphRunner(AgentGraphRunner): + """Legacy runner that returns AgentGraphResult (old shape).""" def __init__(self, output: str = "stub output"): self._output = output @@ -24,14 +26,35 @@ async def run(self, input) -> AgentGraphResult: ) -# --- ManagedAgentGraph unit tests --- +class StubNewShapeRunner(AgentGraphRunner): + """New-shape runner that returns AgentGraphRunnerResult.""" + def __init__(self, content: str = "new shape output"): + self._content = content + + async def run(self, input) -> AgentGraphRunnerResult: + return AgentGraphRunnerResult( + content=self._content, + metrics=GraphMetrics( + success=True, + path=["root", "specialist"], + duration_ms=42, + usage=TokenUsage(total=10, input=5, output=5), + node_metrics={}, + ), + raw={"input": input}, + ) + + +# --- ManagedAgentGraph unit tests (legacy shape) --- @pytest.mark.asyncio async 
def test_managed_agent_graph_run_delegates_to_runner(): + """Legacy AgentGraphResult shape: content comes from output field.""" runner = StubAgentGraphRunner("hello world") managed = ManagedAgentGraph(runner) result = await managed.run("test input") - assert result.output == "hello world" + assert isinstance(result, ManagedGraphResult) + assert result.content == "hello world" assert result.metrics.success is True @@ -41,6 +64,56 @@ def test_managed_agent_graph_get_runner(): assert managed.get_agent_graph_runner() is runner +# --- ManagedAgentGraph unit tests (new AgentGraphRunnerResult shape) --- + +@pytest.mark.asyncio +async def test_managed_agent_graph_run_handles_new_shape(): + """New AgentGraphRunnerResult shape: content and GraphMetrics are surfaced.""" + runner = StubNewShapeRunner("final answer") + mock_graph = MagicMock() + mock_tracker = MagicMock() + mock_graph.create_tracker = MagicMock(return_value=mock_tracker) + + managed = ManagedAgentGraph(runner, graph=mock_graph) + result = await managed.run("test input") + + assert isinstance(result, ManagedGraphResult) + assert result.content == "final answer" + assert result.metrics.success is True + assert result.metrics.path == ["root", "specialist"] + assert result.metrics.duration_ms == 42 + assert result.metrics.usage is not None + assert result.metrics.usage.total == 10 + + +@pytest.mark.asyncio +async def test_managed_agent_graph_new_shape_drives_tracking(): + """New shape: managed layer calls tracker methods from result.metrics.""" + runner = StubNewShapeRunner() + mock_graph = MagicMock() + mock_tracker = MagicMock() + mock_graph.create_tracker = MagicMock(return_value=mock_tracker) + + managed = ManagedAgentGraph(runner, graph=mock_graph) + await managed.run("test input") + + mock_tracker.track_path.assert_called_once_with(["root", "specialist"]) + mock_tracker.track_duration.assert_called_once_with(42) + mock_tracker.track_invocation_success.assert_called_once() + mock_tracker.track_total_tokens.assert_called_once() + + +@pytest.mark.asyncio +async def test_managed_agent_graph_new_shape_no_graph_skips_tracking(): + """New shape without graph: no tracking called (graph not available).""" + runner = StubNewShapeRunner() + managed = ManagedAgentGraph(runner, graph=None) + # Should not raise even without a graph reference + result = await managed.run("test input") + assert result.content == "new shape output" + assert result.metrics.success is True + + # --- LDAIClient.create_agent_graph() integration tests --- @@ -172,7 +245,8 @@ async def test_create_agent_graph_run_produces_result(ldai_client: LDAIClient): assert managed is not None result = await managed.run("find restaurants") - assert result.output == "final answer" + assert isinstance(result, ManagedGraphResult) + assert result.content == "final answer" assert result.metrics.success is True diff --git a/packages/sdk/server-ai/tests/test_managed_model.py b/packages/sdk/server-ai/tests/test_managed_model.py index 36802a14..6d679552 100644 --- a/packages/sdk/server-ai/tests/test_managed_model.py +++ b/packages/sdk/server-ai/tests/test_managed_model.py @@ -2,48 +2,65 @@ import asyncio from typing import List -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, MagicMock import pytest from ldai.evaluator import Evaluator from ldai.managed_model import ManagedModel from ldai.models import AICompletionConfig, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import JudgeResult, LDAIMetrics, ModelResponse -from ldai.tracker 
import LDAIConfigTracker +from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, RunnerResult +from ldai.tracker import LDAIConfigTracker, LDAIMetricSummary -def _make_model_response(content: str = 'response text') -> ModelResponse: - return ModelResponse( - message=LDMessage(role='assistant', content=content), +def _make_runner_result(content: str = 'response text') -> RunnerResult: + return RunnerResult( + content=content, metrics=LDAIMetrics(success=True, usage=None), ) -class TestManagedModelInvokeReturnsImmediately: - """invoke() must return before the evaluations task resolves.""" +def _make_summary() -> LDAIMetricSummary: + summary = LDAIMetricSummary() + summary._success = True + return summary + + +def _make_config_with_tracker(evaluator: Evaluator) -> tuple[AICompletionConfig, MagicMock]: + """Build an AICompletionConfig with a fully-mocked tracker.""" + mock_tracker = MagicMock(spec=LDAIConfigTracker) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_runner_result()) + mock_tracker.get_summary = MagicMock(return_value=_make_summary()) + config = AICompletionConfig( + key='test-config', + enabled=True, + create_tracker=MagicMock(return_value=mock_tracker), + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + messages=[], + evaluator=evaluator, + ) + return config, mock_tracker - @pytest.mark.asyncio - async def test_invoke_returns_before_evaluations_resolve(self): - """invoke() should return a ModelResponse before evaluations complete.""" - # Set up a barrier so the evaluation coroutine doesn't complete until we release it - barrier = asyncio.Event() - async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]: - await barrier.wait() - return [] +class TestManagedModelRunReturnsImmediately: + """run() must return before the evaluations task resolves.""" + @pytest.mark.asyncio + async def test_run_returns_managed_result(self): + """run() should return a ManagedResult with content from the runner.""" evaluator = MagicMock(spec=Evaluator) evaluator.evaluate = MagicMock( - side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o)) + side_effect=lambda i, o: asyncio.create_task(_empty_eval()) ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result('hi')) mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_runner_result('hi')) + mock_tracker.get_summary = MagicMock(return_value=_make_summary()) config = AICompletionConfig( key='test-config', enabled=True, @@ -55,20 +72,46 @@ async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult] ) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') + + assert isinstance(result, ManagedResult) + assert result.content == 'hi' + assert isinstance(result.metrics, LDAIMetricSummary) + # Cleanup the still-pending evaluations task. 
+ if result.evaluations is not None: + await result.evaluations + + @pytest.mark.asyncio + async def test_run_returns_before_evaluations_resolve(self): + """run() should return a ManagedResult before evaluations complete.""" + barrier = asyncio.Event() + + async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]: + await barrier.wait() + return [] + + evaluator = MagicMock(spec=Evaluator) + evaluator.evaluate = MagicMock( + side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o)) + ) + + mock_runner = MagicMock() + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) + + config, _tracker = _make_config_with_tracker(evaluator) + model = ManagedModel(config, mock_runner) + result = await model.run('Hello') - # invoke() returned — evaluations task should still be pending - assert response is not None - assert response.evaluations is not None - assert not response.evaluations.done(), "evaluations task should still be pending" + assert result is not None + assert result.evaluations is not None + assert not result.evaluations.done(), "evaluations task should still be pending" - # Release the barrier and let it finish cleanly barrier.set() - await response.evaluations + await result.evaluations @pytest.mark.asyncio async def test_await_evaluations_collects_results(self): - """await response.evaluations should return the list of JudgeResult instances.""" + """await result.evaluations should return the list of JudgeResult instances.""" judge_result = JudgeResult( judge_config_key='judge-key', success=True, @@ -87,24 +130,13 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) + config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') - results = await response.evaluations # type: ignore[misc] + results = await result.evaluations # type: ignore[misc] assert results == [judge_result] @pytest.mark.asyncio @@ -128,30 +160,19 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, mock_tracker = _make_config_with_tracker(evaluator) mock_tracker.track_judge_result = MagicMock() - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) - model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') # Tracking should NOT have fired yet (before we await evaluations) 
mock_tracker.track_judge_result.assert_not_called() # Now await the evaluations task — tracking fires inside the chain - await response.evaluations # type: ignore[misc] + await result.evaluations # type: ignore[misc] mock_tracker.track_judge_result.assert_called_once_with(judge_result) @@ -174,25 +195,14 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, mock_tracker = _make_config_with_tracker(evaluator) mock_tracker.track_judge_result = MagicMock() - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) - model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') - await response.evaluations # type: ignore[misc] + result = await model.run('Hello') + await result.evaluations # type: ignore[misc] mock_tracker.track_judge_result.assert_not_called() @@ -202,23 +212,15 @@ async def test_noop_evaluator_returns_empty_list(self): evaluator = Evaluator.noop() mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) - - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) + config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') - results = await response.evaluations # type: ignore[misc] + result = await model.run('Hello') + results = await result.evaluations # type: ignore[misc] assert results == [] + + +async def _empty_eval() -> List[JudgeResult]: + return [] diff --git a/packages/sdk/server-ai/tests/test_runner_abcs.py b/packages/sdk/server-ai/tests/test_runner_abcs.py index d5136fd0..7e8087cd 100644 --- a/packages/sdk/server-ai/tests/test_runner_abcs.py +++ b/packages/sdk/server-ai/tests/test_runner_abcs.py @@ -1,17 +1,17 @@ import pytest -from ldai.providers import AgentGraphResult, AgentGraphRunner, AgentResult, AgentRunner, ToolRegistry -from ldai.providers.types import LDAIMetrics +from ldai.providers import AgentGraphResult, AgentGraphRunner, AgentRunner, ToolRegistry +from ldai.providers.types import LDAIMetrics, RunnerResult # --- Concrete test doubles --- class ConcreteAgentRunner: async def run(self, input): - return AgentResult( - output=f"agent response to: {input}", - raw={"raw": input}, + return RunnerResult( + content=f"agent response to: {input}", metrics=LDAIMetrics(success=True), + raw={"raw": input}, ) @@ -39,20 +39,20 @@ def test_agent_runner_structural_check_fails_when_run_missing(): @pytest.mark.asyncio -async def test_agent_runner_run_returns_agent_result(): +async def test_agent_runner_run_returns_runner_result(): runner = ConcreteAgentRunner() result = await runner.run("hello") - assert isinstance(result, 
AgentResult) - assert result.output == "agent response to: hello" + assert isinstance(result, RunnerResult) + assert result.content == "agent response to: hello" assert result.raw == {"raw": "hello"} assert result.metrics.success is True @pytest.mark.asyncio -async def test_agent_result_fields(): +async def test_runner_result_fields(): metrics = LDAIMetrics(success=True) - result = AgentResult(output="done", raw={"key": "val"}, metrics=metrics) - assert result.output == "done" + result = RunnerResult(content="done", metrics=metrics, raw={"key": "val"}) + assert result.content == "done" assert result.raw == {"key": "val"} assert result.metrics is metrics @@ -103,6 +103,6 @@ def test_top_level_exports(): import ldai assert hasattr(ldai, 'AgentRunner') assert hasattr(ldai, 'AgentGraphRunner') - assert hasattr(ldai, 'AgentResult') assert hasattr(ldai, 'AgentGraphResult') + assert hasattr(ldai, 'RunnerResult') assert hasattr(ldai, 'ToolRegistry') diff --git a/packages/sdk/server-ai/tests/test_tracker.py b/packages/sdk/server-ai/tests/test_tracker.py index c2ae2dde..4ed53441 100644 --- a/packages/sdk/server-ai/tests/test_tracker.py +++ b/packages/sdk/server-ai/tests/test_tracker.py @@ -909,3 +909,108 @@ def test_client_create_tracker_fails_on_invalid_json(): result = ai_client.create_tracker(bad_token, context) assert not result.is_success() assert "Invalid resumption token" in result.error + + +def test_ldai_metrics_to_dict_includes_tool_calls_and_duration_ms(): + metrics = LDAIMetrics( + success=True, + usage=TokenUsage(total=10, input=4, output=6), + tool_calls=["search", "lookup"], + duration_ms=123, + ) + d = metrics.to_dict() + assert d["success"] is True + assert d["usage"] == {"total": 10, "input": 4, "output": 6} + assert d["toolCalls"] == ["search", "lookup"] + assert d["durationMs"] == 123 + + +def test_ldai_metrics_to_dict_omits_optional_fields_when_none(): + metrics = LDAIMetrics(success=False) + d = metrics.to_dict() + assert d == {"success": False} + + +def test_track_metrics_of_uses_metrics_duration_ms_when_set(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, duration_ms=999) + + tracker.track_metrics_of(extract, fn) + assert tracker.get_summary().duration_ms == 999 + + +@pytest.mark.asyncio +async def test_track_metrics_of_async_uses_metrics_duration_ms_when_set(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + async def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, duration_ms=42) + + await tracker.track_metrics_of_async(extract, fn) + assert tracker.get_summary().duration_ms == 42 + + +def test_track_metrics_of_calls_track_tool_calls_when_present(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, tool_calls=["foo", "bar"]) + + tracker.track_metrics_of(extract, 
fn) + summary = tracker.get_summary() + assert summary.tool_calls == ["foo", "bar"] + # One $ld:ai:tool_call event per tool key. + tool_call_events = [ + c for c in client.track.mock_calls # type: ignore + if c.args[0] == "$ld:ai:tool_call" + ] + assert len(tool_call_events) == 2 + + +def test_track_metrics_of_skips_track_tool_calls_when_absent(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, usage=None) + + tracker.track_metrics_of(extract, fn) + assert tracker.get_summary().tool_calls is None + tool_call_events = [ + c for c in client.track.mock_calls # type: ignore + if c.args[0] == "$ld:ai:tool_call" + ] + assert tool_call_events == []
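
To summarize the caller-facing migration these changes imply, here is a sketch only — the `ask` helper and its arguments are invented for illustration. Code that previously called invoke() and read ModelResponse.message.content now calls run() and reads ManagedResult.content, with the aggregated summary and the optional background evaluations task alongside.

from ldai.managed_model import ManagedModel
from ldai.providers.types import ManagedResult


async def ask(model: ManagedModel, prompt: str) -> str:
    result: ManagedResult = await model.run(prompt)

    # metrics is now the tracker's aggregated LDAIMetricSummary rather than a
    # per-call LDAIMetrics; duration_ms replaces the deprecated duration property.
    if result.metrics.success:
        print("duration_ms:", result.metrics.duration_ms)

    # Judge evaluations (when an evaluator is configured) run in a background
    # asyncio.Task; run() returns before they resolve, and awaiting is optional.
    if result.evaluations is not None:
        for judge_result in await result.evaluations:
            print(judge_result.metric_key, judge_result.score)

    return result.content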