diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py
index 1969ec75..8e3af61d 100644
--- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py
+++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py
@@ -1,8 +1,8 @@
-from typing import Any
+from typing import Any, Dict, Optional

 from ldai import log
-from ldai.providers import AgentResult, AgentRunner
-from ldai.providers.types import LDAIMetrics
+from ldai.providers.runner import Runner
+from ldai.providers.types import LDAIMetrics, RunnerResult

 from ldai_langchain.langchain_helper import (
     extract_last_message_content,
@@ -10,25 +10,32 @@
 )


-class LangChainAgentRunner(AgentRunner):
+class LangChainAgentRunner(Runner):
     """
     CAUTION: This feature is experimental and should NOT be considered ready for production use.
     It may change or be removed without notice and is not subject to backwards compatibility guarantees.

-    AgentRunner implementation for LangChain.
+    Runner implementation for LangChain agents.

     Wraps a compiled LangChain agent graph (from ``langchain.agents.create_agent``)
     and delegates execution to it. Tool calling and loop management are handled
     internally by the graph.

     Returned by LangChainRunnerFactory.create_agent(config, tools).
+
+    Implements the unified :class:`~ldai.providers.runner.Runner` protocol via
+    :meth:`run`.
     """

     def __init__(self, agent: Any):
         self._agent = agent

-    async def run(self, input: Any) -> AgentResult:
+    async def run(
+        self,
+        input: Any,
+        output_type: Optional[Dict[str, Any]] = None,
+    ) -> RunnerResult:
         """
         Run the agent with the given input string.

@@ -36,7 +43,10 @@
         the tool-calling loop internally.

         :param input: The user prompt or input to the agent
-        :return: AgentResult with output, raw response, and aggregated metrics
+        :param output_type: Reserved for future structured output support;
+            currently ignored.
+        :return: :class:`RunnerResult` with ``content``, ``raw`` response, and
+            aggregated metrics.
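+
+        Example (illustrative sketch only; ``agent_graph`` stands in for the
+        compiled graph you get from ``langchain.agents.create_agent``)::
+
+            runner = LangChainAgentRunner(agent_graph)
+            result = await runner.run("What is our refund policy?")
+            if result.metrics.success:
+                print(result.content)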
""" try: result = await self._agent.ainvoke({ @@ -44,19 +54,18 @@ async def run(self, input: Any) -> AgentResult: }) messages = result.get("messages", []) output = extract_last_message_content(messages) - return AgentResult( - output=output, - raw=result, + return RunnerResult( + content=output, metrics=LDAIMetrics( success=True, usage=sum_token_usage_from_messages(messages), ), + raw=result, ) except Exception as error: log.warning(f"LangChain agent run failed: {error}") - return AgentResult( - output="", - raw=None, + return RunnerResult( + content="", metrics=LDAIMetrics(success=False, usage=None), ) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py index d504030b..213c072d 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py @@ -1,10 +1,10 @@ -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import BaseMessage from ldai import LDMessage, log -from ldai.providers.model_runner import ModelRunner -from ldai.providers.types import LDAIMetrics, ModelResponse, StructuredResponse +from ldai.providers.runner import Runner +from ldai.providers.types import LDAIMetrics, RunnerResult from ldai_langchain.langchain_helper import ( convert_messages_to_langchain, @@ -13,12 +13,15 @@ ) -class LangChainModelRunner(ModelRunner): +class LangChainModelRunner(Runner): """ - ModelRunner implementation for LangChain. + Runner implementation for LangChain chat models. Holds a fully-configured BaseChatModel. - Returned by LangChainConnector.create_model(config). + Returned by LangChainRunnerFactory.create_model(config). + + Implements the unified :class:`~ldai.providers.runner.Runner` protocol via + :meth:`run`. """ def __init__(self, llm: BaseChatModel): @@ -32,13 +35,38 @@ def get_llm(self) -> BaseChatModel: """ return self._llm - async def invoke_model(self, messages: List[LDMessage]) -> ModelResponse: + async def run( + self, + input: Any, + output_type: Optional[Dict[str, Any]] = None, + ) -> RunnerResult: """ - Invoke the LangChain model with an array of messages. - - :param messages: Array of LDMessage objects representing the conversation - :return: ModelResponse containing the model's response and metrics + Run the LangChain model with the given input. + + :param input: A string prompt or a list of :class:`LDMessage` objects + :param output_type: Optional JSON schema dict requesting structured output. + When provided, ``parsed`` on the returned :class:`RunnerResult` is + populated with the parsed JSON document. + :return: :class:`RunnerResult` containing ``content``, ``metrics``, + ``raw`` and (when ``output_type`` is set) ``parsed``. 
""" + messages = self._coerce_input(input) + + if output_type is not None: + return await self._run_structured(messages, output_type) + return await self._run_completion(messages) + + @staticmethod + def _coerce_input(input: Any) -> List[LDMessage]: + if isinstance(input, str): + return [LDMessage(role='user', content=input)] + if isinstance(input, list): + return input + raise TypeError( + f"Unsupported input type for LangChainModelRunner.run: {type(input).__name__}" + ) + + async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult: try: langchain_messages = convert_messages_to_langchain(messages) response: BaseMessage = await self._llm.ainvoke(langchain_messages) @@ -52,58 +80,63 @@ async def invoke_model(self, messages: List[LDMessage]) -> ModelResponse: f'Multimodal response not supported, expecting a string. ' f'Content type: {type(response.content)}, Content: {response.content}' ) - metrics = LDAIMetrics(success=False, usage=metrics.usage) + return RunnerResult( + content='', + metrics=LDAIMetrics(success=False, usage=metrics.usage), + raw=response, + ) - return ModelResponse( - message=LDMessage(role='assistant', content=content), - metrics=metrics, - ) + return RunnerResult(content=content, metrics=metrics, raw=response) except Exception as error: log.warning(f'LangChain model invocation failed: {error}') - return ModelResponse( - message=LDMessage(role='assistant', content=''), + return RunnerResult( + content='', metrics=LDAIMetrics(success=False, usage=None), ) - async def invoke_structured_model( + async def _run_structured( self, messages: List[LDMessage], - response_structure: Dict[str, Any], - ) -> StructuredResponse: - """ - Invoke the LangChain model with structured output support. - - :param messages: Array of LDMessage objects representing the conversation - :param response_structure: Dictionary defining the output structure - :return: StructuredResponse containing the structured data - """ - structured_response = StructuredResponse( - data={}, - raw_response='', - metrics=LDAIMetrics(success=False, usage=None), - ) + output_type: Dict[str, Any], + ) -> RunnerResult: try: langchain_messages = convert_messages_to_langchain(messages) - structured_llm = self._llm.with_structured_output(response_structure, include_raw=True) + structured_llm = self._llm.with_structured_output(output_type, include_raw=True) response = await structured_llm.ainvoke(langchain_messages) if not isinstance(response, dict): log.warning(f'Structured output did not return a dict. 
-                return structured_response
+                return RunnerResult(
+                    content='',
+                    metrics=LDAIMetrics(success=False, usage=None),
+                )

             raw_response = response.get('raw')
+            usage = None
+            raw_content = ''
             if raw_response is not None:
                 if hasattr(raw_response, 'content'):
-                    structured_response.raw_response = raw_response.content
-                    structured_response.metrics.usage = get_ai_usage_from_response(raw_response)
+                    raw_content = raw_response.content or ''
+                    usage = get_ai_usage_from_response(raw_response)

             if response.get('parsing_error'):
                 log.warning('LangChain structured model invocation had a parsing error')
-                return structured_response
+                return RunnerResult(
+                    content=raw_content,
+                    metrics=LDAIMetrics(success=False, usage=usage),
+                    raw=raw_response,
+                )

-            structured_response.metrics.success = True
-            structured_response.data = response.get('parsed') or {}
-            return structured_response
+            parsed = response.get('parsed') or {}
+            return RunnerResult(
+                content=raw_content,
+                metrics=LDAIMetrics(success=True, usage=usage),
+                raw=raw_response,
+                parsed=parsed,
+            )
         except Exception as error:
             log.warning(f'LangChain structured model invocation failed: {error}')
-            return structured_response
+            return RunnerResult(
+                content='',
+                metrics=LDAIMetrics(success=False, usage=None),
+            )
diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py
index 9ecb2351..15eee41f 100644
--- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py
+++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py
@@ -329,8 +329,10 @@ async def run(self, input: Any) -> AgentGraphResult:
         messages = result.get('messages', [])
         output = extract_last_message_content(messages)

-        # Flush per-node metrics to LD trackers
-        all_eval_results = await handler.flush(self._graph, pending_eval_tasks)
+        # Flush per-node metrics to LD trackers; eval results are tracked
+        # internally and intentionally not exposed on AgentGraphResult here;
+        # judge dispatch is the managed layer's responsibility.
+        await handler.flush(self._graph, pending_eval_tasks)

         tracker.track_path(handler.path)
         tracker.track_duration(duration)
@@ -341,7 +343,6 @@
                 output=output,
                 raw=result,
                 metrics=LDAIMetrics(success=True),
-                evaluations=all_eval_results,
             )

         except Exception as exc:
diff --git a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
index 4018e7c3..a8fc46cf 100644
--- a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
+++ b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
@@ -219,8 +219,8 @@ def test_returns_provider_name_unchanged_for_unmapped_providers(self):
         assert map_provider('unknown') == 'unknown'


-class TestInvokeModel:
-    """Tests for invoke_model instance method."""
+class TestRunCompletion:
+    """Tests for run() without structured output."""

     @pytest.fixture
     def mock_llm(self):
@@ -235,10 +235,10 @@ async def test_returns_success_true_for_string_content(self, mock_llm):
         provider = LangChainModelRunner(mock_llm)
         messages = [LDMessage(role='user', content='Hello')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

         assert result.metrics.success is True
-        assert result.message.content == 'Test response'
+        assert result.content == 'Test response'

     @pytest.mark.asyncio
     async def test_returns_success_false_for_non_string_content_and_logs_warning(self, mock_llm):
@@ -248,10 +248,10 @@ async def test_returns_success_false_for_non_string_content_and_logs_warning(sel
         provider = LangChainModelRunner(mock_llm)
         messages = [LDMessage(role='user', content='Hello')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

         assert result.metrics.success is False
-        assert result.message.content == ''
+        assert result.content == ''

     @pytest.mark.asyncio
     async def test_returns_success_false_when_model_invocation_throws_error(self, mock_llm):
@@ -261,15 +261,14 @@ async def test_returns_success_false_when_model_invocation_throws_error(self, mo
         provider = LangChainModelRunner(mock_llm)
         messages = [LDMessage(role='user', content='Hello')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

         assert result.metrics.success is False
-        assert result.message.content == ''
-        assert result.message.role == 'assistant'


-class TestInvokeStructuredModel:
-    """Tests for invoke_structured_model instance method."""
+class TestRunStructured:
+    """Tests for run() with structured output."""

     @pytest.fixture
     def mock_llm(self):
@@ -288,10 +287,10 @@ async def test_returns_success_true_for_successful_invocation(self, mock_llm):
         messages = [LDMessage(role='user', content='Hello')]
         response_structure = {'type': 'object', 'properties': {}}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

         assert result.metrics.success is True
-        assert result.data == parsed_data
+        assert result.parsed == parsed_data

     @pytest.mark.asyncio
     async def test_returns_success_false_when_structured_model_invocation_throws_error(self, mock_llm):
@@ -304,11 +303,11 @@ async def test_returns_success_false_when_structured_model_invocation_throws_err
         messages = [LDMessage(role='user', content='Hello')]
         response_structure = {'type': 'object', 'properties': {}}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

         assert result.metrics.success is False
-        assert result.data == {}
-        assert result.raw_response == ''
+        assert result.parsed is None
+        assert result.raw is None
         assert result.metrics.usage is None


@@ -464,7 +463,7 @@ class TestLangChainAgentRunner:
     @pytest.mark.asyncio
     async def test_runs_agent_and_returns_result(self):
-        """Should return AgentResult with the last message content from the graph."""
+        """Should return RunnerResult with the last message content from the graph."""
         from ldai_langchain import LangChainAgentRunner

         final_msg = AIMessage(content="The answer is 42.")
@@ -474,7 +473,7 @@
         runner = LangChainAgentRunner(mock_agent)
         result = await runner.run("What is the answer?")

-        assert result.output == "The answer is 42."
+        assert result.content == "The answer is 42."
         assert result.metrics.success is True
         mock_agent.ainvoke.assert_called_once_with(
             {"messages": [{"role": "user", "content": "What is the answer?"}]}
@@ -496,7 +495,7 @@
         runner = LangChainAgentRunner(mock_agent)
         result = await runner.run("Hello")

-        assert result.output == "final answer"
+        assert result.content == "final answer"
         assert result.metrics.success is True
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 30
@@ -505,7 +504,7 @@
     @pytest.mark.asyncio
     async def test_returns_failure_when_exception_thrown(self):
-        """Should return unsuccessful AgentResult when exception is thrown."""
+        """Should return unsuccessful RunnerResult when exception is thrown."""
         from ldai_langchain import LangChainAgentRunner

         mock_agent = MagicMock()
@@ -514,7 +513,7 @@
         runner = LangChainAgentRunner(mock_agent)
         result = await runner.run("Hello")

-        assert result.output == ""
+        assert result.content == ""
         assert result.metrics.success is False

diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py
index 7e79c836..4f6866cc 100644
--- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py
+++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py
@@ -1,28 +1,31 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from ldai import log
-from ldai.providers import AgentResult, AgentRunner, ToolRegistry
+from ldai.providers import RunnerResult, ToolRegistry
+from ldai.providers.runner import Runner
 from ldai.providers.types import LDAIMetrics
 from ldai_openai.openai_helper import (
     get_ai_usage_from_response,
+    get_tool_calls_from_run_items,
     registry_value_to_agent_tool,
 )


-class OpenAIAgentRunner(AgentRunner):
+class OpenAIAgentRunner(Runner):
     """
     CAUTION: This feature is experimental and should NOT be considered ready for production use.
     It may change or be removed without notice and is not subject to backwards compatibility guarantees.

-    AgentRunner implementation for OpenAI.
+    Runner implementation for a single OpenAI agent.

     Executes a single agent using the OpenAI Agents SDK (``openai-agents``).
     Tool calling and the agentic loop are handled internally by ``Runner.run``.

-    Returned by OpenAIRunnerFactory.create_agent(config, tools).
+    Returned by ``OpenAIRunnerFactory.create_agent(config, tools)``.
+
+    Implements the unified :class:`~ldai.providers.runner.Runner` protocol.

     Requires ``openai-agents`` to be installed.
     """
@@ -40,15 +43,22 @@ def __init__(
         self._tool_definitions = tool_definitions
         self._tools = tools

-    async def run(self, input: Any) -> AgentResult:
+    async def run(
+        self,
+        input: Any,
+        output_type: Optional[Dict[str, Any]] = None,
+    ) -> RunnerResult:
         """
-        Run the agent with the given input string.
+        Run the agent with the given input.

         Delegates to the OpenAI Agents SDK ``Runner.run``, which handles
         the tool-calling loop internally.

         :param input: The user prompt or input to the agent
-        :return: AgentResult with output, raw response, and aggregated metrics
+        :param output_type: Reserved for future structured output support;
+            currently ignored.
+        :return: :class:`RunnerResult` with ``content``, ``raw`` response, and
+            metrics including aggregated token usage and observed ``tool_calls``.
         """
         try:
             from agents import Agent, Runner
@@ -57,7 +67,10 @@
                 "openai-agents is required for OpenAIAgentRunner. "
                 "Install it with: pip install openai-agents"
             )
-            return AgentResult(output="", raw=None, metrics=LDAIMetrics(success=False, usage=None))
+            return RunnerResult(
+                content="",
+                metrics=LDAIMetrics(success=False, usage=None),
+            )

         try:
             agent_tools = self._build_agent_tools()
@@ -73,17 +86,26 @@
             result = await Runner.run(agent, str(input), max_turns=25)

-            return AgentResult(
-                output=str(result.final_output),
-                raw=result,
+            tool_calls = [
+                tool_name
+                for _agent_name, tool_name in get_tool_calls_from_run_items(result.new_items)
+            ]
+
+            return RunnerResult(
+                content=str(result.final_output),
                 metrics=LDAIMetrics(
                     success=True,
                     usage=get_ai_usage_from_response(result),
+                    tool_calls=tool_calls if tool_calls else None,
                 ),
+                raw=result,
             )
         except Exception as error:
             log.warning(f"OpenAI agent run failed: {error}")
-            return AgentResult(output="", raw=None, metrics=LDAIMetrics(success=False, usage=None))
+            return RunnerResult(
+                content="",
+                metrics=LDAIMetrics(success=False, usage=None),
+            )

     def _build_agent_tools(self) -> List[Any]:
         """Build tool instances from LD tool definitions and registry."""
diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py
index 9c4a34d8..75695ba0 100644
--- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py
+++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py
@@ -1,9 +1,9 @@
 import json
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from ldai import LDMessage, log
-from ldai.providers.model_runner import ModelRunner
-from ldai.providers.types import LDAIMetrics, ModelResponse, StructuredResponse
+from ldai.providers.runner import Runner
+from ldai.providers.types import LDAIMetrics, RunnerResult
 from openai import AsyncOpenAI

 from ldai_openai.openai_helper import (
@@ -12,12 +12,15 @@
 )


-class OpenAIModelRunner(ModelRunner):
+class OpenAIModelRunner(Runner):
     """
-    ModelRunner implementation for OpenAI.
+    Runner implementation for OpenAI chat completions.

     Holds a fully-configured AsyncOpenAI client, model name, and parameters.
-    Returned by OpenAIConnector.create_model(config).
+    Returned by ``OpenAIRunnerFactory.create_model(config)``.
+
+    Implements the unified :class:`~ldai.providers.runner.Runner` protocol via
+    :meth:`run`.
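+
+    Example (illustrative sketch only; ``client`` stands in for a configured
+    ``AsyncOpenAI`` instance, and the model name is just a placeholder)::
+
+        runner = OpenAIModelRunner(client, 'gpt-3.5-turbo', {})
+        result = await runner.run('Hello!')
+        print(result.content, result.metrics.success)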
""" def __init__( @@ -30,13 +33,38 @@ def __init__( self._model_name = model_name self._parameters = parameters - async def invoke_model(self, messages: List[LDMessage]) -> ModelResponse: + async def run( + self, + input: Any, + output_type: Optional[Dict[str, Any]] = None, + ) -> RunnerResult: """ - Invoke the OpenAI model with an array of messages. - - :param messages: Array of LDMessage objects representing the conversation - :return: ModelResponse containing the model's response and metrics + Run the OpenAI model with the given input. + + :param input: A string prompt or a list of :class:`LDMessage` objects + :param output_type: Optional JSON schema dict requesting structured output. + When provided, ``parsed`` on the returned :class:`RunnerResult` is + populated with the parsed JSON document. + :return: :class:`RunnerResult` containing ``content``, ``metrics``, + ``raw`` and (when ``output_type`` is set) ``parsed``. """ + messages = self._coerce_input(input) + + if output_type is not None: + return await self._run_structured(messages, output_type) + return await self._run_completion(messages) + + @staticmethod + def _coerce_input(input: Any) -> List[LDMessage]: + if isinstance(input, str): + return [LDMessage(role='user', content=input)] + if isinstance(input, list): + return input + raise TypeError( + f"Unsupported input type for OpenAIModelRunner.run: {type(input).__name__}" + ) + + async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult: try: response = await self._client.chat.completions.create( model=self._model_name, @@ -45,40 +73,29 @@ async def invoke_model(self, messages: List[LDMessage]) -> ModelResponse: ) metrics = get_ai_metrics_from_response(response) - - content = '' - if response.choices and len(response.choices) > 0: - message = response.choices[0].message - if message and message.content: - content = message.content + content = self._extract_content(response) if not content: log.warning('OpenAI response has no content available') - metrics = LDAIMetrics(success=False, usage=metrics.usage) + return RunnerResult( + content='', + metrics=LDAIMetrics(success=False, usage=metrics.usage), + raw=response, + ) - return ModelResponse( - message=LDMessage(role='assistant', content=content), - metrics=metrics, - ) + return RunnerResult(content=content, metrics=metrics, raw=response) except Exception as error: log.warning(f'OpenAI model invocation failed: {error}') - return ModelResponse( - message=LDMessage(role='assistant', content=''), + return RunnerResult( + content='', metrics=LDAIMetrics(success=False, usage=None), ) - async def invoke_structured_model( + async def _run_structured( self, messages: List[LDMessage], - response_structure: Dict[str, Any], - ) -> StructuredResponse: - """ - Invoke the OpenAI model with structured output support. 
-
-        :param messages: Array of LDMessage objects representing the conversation
-        :param response_structure: Dictionary defining the JSON schema for output structure
-        :return: StructuredResponse containing the structured data
-        """
+        output_type: Dict[str, Any],
+    ) -> RunnerResult:
         try:
             response = await self._client.chat.completions.create(
                 model=self._model_name,
@@ -87,7 +104,7 @@
                     'type': 'json_schema',
                     'json_schema': {
                         'name': 'structured_output',
-                        'schema': response_structure,
+                        'schema': output_type,
                         'strict': True,
                     },
                 },
@@ -95,35 +112,42 @@
             )

             metrics = get_ai_metrics_from_response(response)
-
-            content = ''
-            if response.choices and len(response.choices) > 0:
-                message = response.choices[0].message
-                if message and message.content:
-                    content = message.content
+            content = self._extract_content(response)

             if not content:
                 log.warning('OpenAI structured response has no content available')
-                return StructuredResponse(
-                    data={},
-                    raw_response='',
+                return RunnerResult(
+                    content='',
                     metrics=LDAIMetrics(success=False, usage=metrics.usage),
+                    raw=response,
                 )

             try:
-                data = json.loads(content)
-                return StructuredResponse(data=data, raw_response=content, metrics=metrics)
+                parsed = json.loads(content)
+                return RunnerResult(
+                    content=content,
+                    metrics=metrics,
+                    raw=response,
+                    parsed=parsed,
+                )
             except json.JSONDecodeError as parse_error:
                 log.warning(f'OpenAI structured response contains invalid JSON: {parse_error}')
-                return StructuredResponse(
-                    data={},
-                    raw_response=content,
+                return RunnerResult(
+                    content=content,
                     metrics=LDAIMetrics(success=False, usage=metrics.usage),
+                    raw=response,
                 )
         except Exception as error:
             log.warning(f'OpenAI structured model invocation failed: {error}')
-            return StructuredResponse(
-                data={},
-                raw_response='',
+            return RunnerResult(
+                content='',
                 metrics=LDAIMetrics(success=False, usage=None),
             )
+
+    @staticmethod
+    def _extract_content(response: Any) -> str:
+        if response.choices and len(response.choices) > 0:
+            message = response.choices[0].message
+            if message and message.content:
+                return message.content
+        return ''
diff --git a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py
index 19d2cff7..3b69d3f6 100644
--- a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py
+++ b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py
@@ -120,8 +120,8 @@ def test_handles_partial_usage_data(self):
         assert result.usage.output == 0


-class TestInvokeModel:
-    """Tests for invoke_model instance method."""
+class TestRunCompletion:
+    """Tests for the unified run() method (chat-completion path)."""

     @pytest.fixture
     def mock_client(self):
@@ -144,15 +144,14 @@ async def test_invokes_openai_chat_completions_and_returns_response(self, mock_c
         provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {})
         messages = [LDMessage(role='user', content='Hello!')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

         mock_client.chat.completions.create.assert_called_once_with(
             model='gpt-3.5-turbo',
             messages=[{'role': 'user', 'content': 'Hello!'}],
         )

-        assert result.message.role == 'assistant'
-        assert result.message.content == 'Hello! How can I help you today?'
+        assert result.content == 'Hello! How can I help you today?'
         assert result.metrics.success is True
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 25

@@ -174,10 +173,9 @@ async def test_returns_unsuccessful_response_when_no_content(self, mock_client):
         provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {})
         messages = [LDMessage(role='user', content='Hello!')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

-        assert result.message.role == 'assistant'
-        assert result.message.content == ''
+        assert result.content == ''
         assert result.metrics.success is False

     @pytest.mark.asyncio
@@ -193,10 +191,9 @@ async def test_returns_unsuccessful_response_when_choices_empty(self, mock_clien
         provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {})
         messages = [LDMessage(role='user', content='Hello!')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

-        assert result.message.role == 'assistant'
-        assert result.message.content == ''
+        assert result.content == ''
         assert result.metrics.success is False

     @pytest.mark.asyncio
@@ -208,15 +205,14 @@ async def test_returns_unsuccessful_response_when_exception_thrown(self, mock_cl
         provider = OpenAIModelRunner(mock_client, 'gpt-3.5-turbo', {})
         messages = [LDMessage(role='user', content='Hello!')]

-        result = await provider.invoke_model(messages)
+        result = await provider.run(messages)

-        assert result.message.role == 'assistant'
-        assert result.message.content == ''
+        assert result.content == ''
         assert result.metrics.success is False


-class TestInvokeStructuredModel:
-    """Tests for invoke_structured_model instance method."""
+class TestRunStructured:
+    """Tests for the unified run() method (structured-output path)."""

     @pytest.fixture
     def mock_client(self):
@@ -249,10 +245,10 @@ async def test_invokes_openai_with_structured_output(self, mock_client):
             'required': ['name', 'age', 'city'],
         }

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

-        assert result.data == {'name': 'John', 'age': 30, 'city': 'New York'}
-        assert result.raw_response == '{"name": "John", "age": 30, "city": "New York"}'
+        assert result.parsed == {'name': 'John', 'age': 30, 'city': 'New York'}
+        assert result.content == '{"name": "John", "age": 30, "city": "New York"}'
         assert result.metrics.success is True
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 30
@@ -276,10 +272,10 @@ async def test_returns_unsuccessful_when_no_content_in_structured_response(self,
         messages = [LDMessage(role='user', content='Tell me about a person')]
         response_structure = {'type': 'object'}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

-        assert result.data == {}
-        assert result.raw_response == ''
+        assert result.parsed is None
+        assert result.content == ''
         assert result.metrics.success is False

     @pytest.mark.asyncio
@@ -300,10 +296,10 @@ async def test_handles_json_parsing_errors(self, mock_client):
         messages = [LDMessage(role='user', content='Tell me about a person')]
         response_structure = {'type': 'object'}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

-        assert result.data == {}
-        assert result.raw_response == 'invalid json content'
+        assert result.parsed is None
+        assert result.content == 'invalid json content'
         assert result.metrics.success is False
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 15
@@ -319,10 +315,10 @@ async def test_returns_unsuccessful_response_when_exception_thrown(self, mock_cl
         messages = [LDMessage(role='user', content='Tell me about a person')]
         response_structure = {'type': 'object'}

-        result = await provider.invoke_structured_model(messages, response_structure)
+        result = await provider.run(messages, output_type=response_structure)

-        assert result.data == {}
-        assert result.raw_response == ''
+        assert result.parsed is None
+        assert result.content == ''
         assert result.metrics.success is False


@@ -465,19 +461,20 @@ def _make_run_result(self, output: str, total: int = 15, input_tokens: int = 10,
     @pytest.mark.asyncio
     async def test_runs_agent_and_returns_result_with_no_tool_calls(self):
-        """Should return AgentResult when Runner.run returns a final output."""
+        """Should return RunnerResult when Runner.run returns a final output."""
         import sys

         from ldai_openai import OpenAIAgentRunner

         mock_run_result = self._make_run_result("The answer is 42.", total=15, input_tokens=10, output_tokens=5)
+        mock_run_result.new_items = []
         agents_mock, tc_mock = _make_agents_mock(AsyncMock(return_value=mock_run_result))

         runner = OpenAIAgentRunner('gpt-4', {}, 'You are helpful.', [], {})
         with patch.dict(sys.modules, {'agents': agents_mock, 'agents.tool_context': tc_mock}):
             result = await runner.run("What is the answer?")

-        assert result.output == "The answer is 42."
+        assert result.content == "The answer is 42."
         assert result.metrics.success is True
         assert result.metrics.usage is not None
         assert result.metrics.usage.total == 15
@@ -490,6 +487,7 @@ async def test_executes_tool_calls_and_returns_final_response(self):
         from ldai_openai import OpenAIAgentRunner

         mock_run_result = self._make_run_result("It is sunny in Paris.", total=43, input_tokens=30, output_tokens=13)
+        mock_run_result.new_items = []
         agents_mock, tc_mock = _make_agents_mock(AsyncMock(return_value=mock_run_result))

         weather_fn = MagicMock(return_value="Sunny, 25°C")
@@ -501,13 +499,13 @@
         with patch.dict(sys.modules, {'agents': agents_mock, 'agents.tool_context': tc_mock}):
             result = await runner.run("What is the weather in Paris?")

-        assert result.output == "It is sunny in Paris."
+        assert result.content == "It is sunny in Paris."
         assert result.metrics.success is True
         assert result.metrics.usage.total == 43

     @pytest.mark.asyncio
     async def test_returns_failure_when_exception_thrown(self):
-        """Should return unsuccessful AgentResult when Runner.run raises."""
+        """Should return unsuccessful RunnerResult when Runner.run raises."""
         import sys

         from ldai_openai import OpenAIAgentRunner
@@ -518,12 +516,12 @@
         with patch.dict(sys.modules, {'agents': agents_mock, 'agents.tool_context': tc_mock}):
             result = await runner.run("Hello")

-        assert result.output == ""
+        assert result.content == ""
         assert result.metrics.success is False

     @pytest.mark.asyncio
     async def test_returns_failure_when_openai_agents_not_installed(self):
-        """Should return unsuccessful AgentResult when openai-agents is not installed."""
+        """Should return unsuccessful RunnerResult when openai-agents is not installed."""
         import sys

         from ldai_openai import OpenAIAgentRunner
@@ -532,5 +530,5 @@
         with patch.dict(sys.modules, {'agents': None}):
             result = await runner.run("Hello")

-        assert result.output == ""
+        assert result.content == ""
         assert result.metrics.success is False
diff --git a/packages/sdk/server-ai/src/ldai/__init__.py b/packages/sdk/server-ai/src/ldai/__init__.py
index 405ec5a8..56d780d3 100644
--- a/packages/sdk/server-ai/src/ldai/__init__.py
+++ b/packages/sdk/server-ai/src/ldai/__init__.py
@@ -34,12 +34,19 @@
 from ldai.providers import (
     AgentGraphResult,
     AgentGraphRunner,
+    AgentGraphRunnerResult,
     AgentResult,
     AgentRunner,
+    GraphMetrics,
+    GraphMetricSummary,
+    ManagedGraphResult,
+    ManagedResult,
+    Runner,
+    RunnerResult,
     ToolRegistry,
 )
 from ldai.providers.types import JudgeResult
-from ldai.tracker import AIGraphTracker
+from ldai.tracker import AIGraphTracker, LDAIMetricSummary

 __all__ = [
     'LDAIClient',
@@ -48,6 +55,14 @@
     'AgentGraphRunner',
     'AgentResult',
     'AgentGraphResult',
+    'AgentGraphRunnerResult',
+    'GraphMetrics',
+    'GraphMetricSummary',
+    'ManagedGraphResult',
+    'ManagedResult',
+    'Runner',
+    'RunnerResult',
+    'LDAIMetricSummary',
     'ToolRegistry',
     'AIAgentConfig',
     'AIAgentConfigDefault',
diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py
index 448d5c55..eeff1f1b 100644
--- a/packages/sdk/server-ai/src/ldai/client.py
+++ b/packages/sdk/server-ai/src/ldai/client.py
@@ -815,7 +815,7 @@
         if not runner:
             return None

-        return ManagedAgentGraph(runner)
+        return ManagedAgentGraph(runner, graph=graph)

     def agents(
         self,
diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py
index f2e8c362..6919a7aa 100644
--- a/packages/sdk/server-ai/src/ldai/judge/__init__.py
+++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -8,8 +8,8 @@
 from ldai import log
 from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
 from ldai.models import AIJudgeConfig, LDMessage
-from ldai.providers.model_runner import ModelRunner
-from ldai.providers.types import JudgeResult, ModelResponse
+from ldai.providers.runner import Runner
+from ldai.providers.types import JudgeResult, RunnerResult


 class Judge:
@@ -23,7 +23,7 @@
     def __init__(
         self,
         ai_config: AIJudgeConfig,
-        model_runner: ModelRunner,
+        model_runner: Runner,
     ):
         """
         Initialize the Judge.
@@ -76,10 +76,14 @@
         response = await tracker.track_metrics_of_async(
             lambda result: result.metrics,
-            lambda: self._model_runner.invoke_structured_model(messages, self._evaluation_response_structure),
+            lambda: self._model_runner.run(messages, output_type=self._evaluation_response_structure),
         )

-        parsed = self._parse_evaluation_response(response.data)
+        if response.parsed is None:
+            log.warning('Judge evaluation did not return structured output')
+            return judge_result
+
+        parsed = self._parse_evaluation_response(response.parsed)
         if parsed is None:
             log.warning('Judge evaluation did not return the expected evaluation')
@@ -99,19 +103,19 @@
     async def evaluate_messages(
         self,
         messages: list[LDMessage],
-        response: ModelResponse,
+        response: RunnerResult,
         sampling_ratio: float = 1.0,
     ) -> JudgeResult:
         """
         Evaluates an AI response from chat messages and response.

         :param messages: Array of messages representing the conversation history
-        :param response: The AI response to be evaluated
+        :param response: The runner result to be evaluated
         :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
         :return: The result of the judge evaluation.
         """
         input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
-        output_text = response.message.content
+        output_text = response.content

         return await self.evaluate(input_text, output_text, sampling_ratio)

@@ -123,7 +127,7 @@ def get_ai_config(self) -> AIJudgeConfig:
         """
         return self._ai_config

-    def get_model_runner(self) -> ModelRunner:
+    def get_model_runner(self) -> Runner:
         """
         Returns the model runner used by this judge.
diff --git a/packages/sdk/server-ai/src/ldai/managed_agent.py b/packages/sdk/server-ai/src/ldai/managed_agent.py
index ab3ee5e6..9d582ae4 100644
--- a/packages/sdk/server-ai/src/ldai/managed_agent.py
+++ b/packages/sdk/server-ai/src/ldai/managed_agent.py
@@ -1,43 +1,49 @@
 """ManagedAgent — LaunchDarkly managed wrapper for agent invocations."""

 from ldai.models import AIAgentConfig
-from ldai.providers import AgentResult, AgentRunner
+from ldai.providers.runner import Runner
+from ldai.providers.types import ManagedResult


 class ManagedAgent:
     """
     LaunchDarkly managed wrapper for AI agent invocations.

-    Holds an AgentRunner. Handles tracking automatically via ``create_tracker()``.
+    Holds a Runner. Handles tracking automatically via ``create_tracker()``.

     Obtain an instance via ``LDAIClient.create_agent()``.
     """

     def __init__(
         self,
         ai_config: AIAgentConfig,
-        agent_runner: AgentRunner,
+        agent_runner: Runner,
     ):
         self._ai_config = ai_config
         self._agent_runner = agent_runner

-    async def run(self, input: str) -> AgentResult:
+    async def run(self, input: str) -> ManagedResult:
         """
         Run the agent with the given input string.

         :param input: The user prompt or input to the agent
-        :return: AgentResult containing the agent's output and metrics
+        :return: ManagedResult containing the agent's output and metric summary
         """
         tracker = self._ai_config.create_tracker()

-        return await tracker.track_metrics_of_async(
-            lambda result: result.metrics,
+        result = await tracker.track_metrics_of_async(
+            lambda r: r.metrics,
             lambda: self._agent_runner.run(input),
         )
+        return ManagedResult(
+            content=result.content,
+            metrics=tracker.get_summary(),
+            raw=result.raw,
+        )

-    def get_agent_runner(self) -> AgentRunner:
+    def get_agent_runner(self) -> Runner:
         """
-        Return the underlying AgentRunner for advanced use.
+        Return the underlying runner for advanced use.

-        :return: The AgentRunner instance.
+        :return: The Runner instance.
         """
         return self._agent_runner
diff --git a/packages/sdk/server-ai/src/ldai/managed_agent_graph.py b/packages/sdk/server-ai/src/ldai/managed_agent_graph.py
index a146e60e..94026973 100644
--- a/packages/sdk/server-ai/src/ldai/managed_agent_graph.py
+++ b/packages/sdk/server-ai/src/ldai/managed_agent_graph.py
@@ -1,17 +1,33 @@
 """ManagedAgentGraph — LaunchDarkly managed wrapper for agent graph execution."""

-from typing import Any
+from typing import Any, Optional

 from ldai.providers import AgentGraphResult, AgentGraphRunner
+from ldai.providers.types import (
+    AgentGraphRunnerResult,
+    GraphMetricSummary,
+    LDAIMetrics,
+    ManagedGraphResult,
+)


 class ManagedAgentGraph:
     """
     LaunchDarkly managed wrapper for AI agent graph execution.

-    Holds an AgentGraphRunner. Auto-tracking of path,
-    tool calls, handoffs, latency, and invocation success/failure is handled
-    by the runner implementation.
+    Holds an AgentGraphRunner and an optional AgentGraphDefinition. Wraps the
+    runner result in a :class:`~ldai.providers.types.ManagedGraphResult` and
+    builds a :class:`~ldai.providers.types.GraphMetricSummary` from the runner's
+    metrics.
+
+    When the runner returns an :class:`~ldai.providers.types.AgentGraphRunnerResult`
+    (new shape), the managed layer drives all graph-level tracking from
+    ``result.metrics``. When the runner returns the legacy
+    :class:`~ldai.providers.AgentGraphResult`, tracking has already been performed
+    inside the runner; the managed layer simply wraps the result. This detection
+    branch exists as a deliberate bridge: once PR 11-openai and PR 11-langchain
+    migrate both runners to return ``AgentGraphRunnerResult``, the legacy branch
+    becomes dead code and will be removed in PR 11-langchain's final cleanup commit.

     Obtain an instance via ``LDAIClient.create_agent_graph()``.
     """
@@ -19,25 +35,104 @@
     def __init__(
         self,
         runner: AgentGraphRunner,
+        graph: Optional[Any] = None,
     ):
         """
         Initialize ManagedAgentGraph.

         :param runner: The AgentGraphRunner to delegate execution to
+        :param graph: Optional AgentGraphDefinition used to create the
+            graph-level tracker when the runner returns an
+            :class:`AgentGraphRunnerResult` (new shape). Not needed for
+            legacy runners that still return :class:`AgentGraphResult`.
         """
         self._runner = runner
+        self._graph = graph

-    async def run(self, input: Any) -> AgentGraphResult:
+    async def run(self, input: Any) -> ManagedGraphResult:
         """
         Run the agent graph with the given input.

-        Delegates to the underlying AgentGraphRunner, which handles
-        execution and all auto-tracking internally.
+        Delegates to the underlying AgentGraphRunner. The returned type
+        determines which tracking path is taken:
+
+        - :class:`AgentGraphRunnerResult` (new shape): the managed layer drives
+          graph-level tracking from ``result.metrics`` via the graph tracker.
+          Per-node tracking from ``result.metrics.node_metrics`` will be wired
+          in a follow-up commit once the runners populate ``node_metrics``.
+        - :class:`AgentGraphResult` (legacy shape): tracking already occurred
+          inside the runner; the managed layer wraps the result without
+          additional tracking.

         :param input: The input prompt or structured input for the graph
-        :return: AgentGraphResult containing the output, raw response, and metrics
+        :return: ManagedGraphResult containing the content, metric summary,
+            raw response, and an optional evaluations task (always ``None``
+            for now; per-graph evaluations will be added in a future PR).
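+
+        Example (illustrative sketch only; ``managed_graph`` stands in for the
+        instance returned by ``LDAIClient.create_agent_graph()``)::
+
+            result = await managed_graph.run("Plan a three-step rollout")
+            print(result.content)
+            print(result.metrics.success, result.metrics.path)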
+ """ + raw_result = await self._runner.run(input) + + if isinstance(raw_result, AgentGraphRunnerResult): + # New shape: managed layer drives all tracking. + summary = self._build_summary_from_runner_result(raw_result) + if self._graph is not None: + self._flush_graph_tracking(raw_result, self._graph.create_tracker()) + return ManagedGraphResult( + content=raw_result.content, + metrics=summary, + raw=raw_result.raw, + evaluations=None, + ) + + # Legacy shape (AgentGraphResult): tracking already happened in the runner. + # Build a GraphMetricSummary from the runner result's LDAIMetrics. + # path and node_metrics will be populated once graph runners are migrated + # to return AgentGraphRunnerResult with GraphMetrics (PR 11-openai/langchain). + metrics: LDAIMetrics = raw_result.metrics + summary = GraphMetricSummary( + success=metrics.success, + usage=metrics.usage, + duration_ms=getattr(metrics, 'duration_ms', None), + ) + return ManagedGraphResult( + content=raw_result.output, + metrics=summary, + raw=raw_result.raw, + evaluations=None, + ) + + def _build_summary_from_runner_result( + self, + result: AgentGraphRunnerResult, + ) -> GraphMetricSummary: + """Build a GraphMetricSummary from an AgentGraphRunnerResult.""" + m = result.metrics + return GraphMetricSummary( + success=m.success, + path=list(m.path), + duration_ms=m.duration_ms, + usage=m.usage, + node_metrics=dict(m.node_metrics), + ) + + def _flush_graph_tracking(self, result: AgentGraphRunnerResult, tracker: Any) -> None: + """ + Drive graph-level LaunchDarkly tracking events from runner result metrics. + + Called only when the runner returns the new ``AgentGraphRunnerResult`` + shape. Node-level tracking (from ``result.metrics.node_metrics``) will + be wired once the runners start populating that field. """ - return await self._runner.run(input) + m = result.metrics + if m.path: + tracker.track_path(m.path) + if m.duration_ms is not None: + tracker.track_duration(m.duration_ms) + if m.success: + tracker.track_invocation_success() + else: + tracker.track_invocation_failure() + if m.usage is not None: + tracker.track_total_tokens(m.usage) def get_agent_graph_runner(self) -> AgentGraphRunner: """ diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py index 9cfb503a..94605eab 100644 --- a/packages/sdk/server-ai/src/ldai/managed_model.py +++ b/packages/sdk/server-ai/src/ldai/managed_model.py @@ -1,10 +1,10 @@ import asyncio -from typing import List, Optional +from typing import List from ldai import log from ldai.models import AICompletionConfig, LDMessage -from ldai.providers.model_runner import ModelRunner -from ldai.providers.types import JudgeResult, ModelResponse +from ldai.providers.runner import Runner +from ldai.providers.types import JudgeResult, ManagedResult from ldai.tracker import LDAIConfigTracker @@ -12,7 +12,7 @@ class ManagedModel: """ LaunchDarkly managed wrapper for AI model invocations. - Holds a ModelRunner. Handles conversation management, judge evaluation + Holds a Runner. Handles conversation management, judge evaluation dispatch, and tracking automatically via ``create_tracker()``. Obtain an instance via ``LDAIClient.create_model()``. 
""" @@ -20,22 +20,23 @@ class ManagedModel: def __init__( self, ai_config: AICompletionConfig, - model_runner: ModelRunner, + model_runner: Runner, ): self._ai_config = ai_config self._model_runner = model_runner self._messages: List[LDMessage] = [] - async def invoke(self, prompt: str) -> ModelResponse: + async def run(self, prompt: str) -> ManagedResult: """ - Invoke the model with a prompt string. + Run the model with a prompt string. Appends the prompt to the conversation history, prepends any system messages from the config, delegates to the runner, and appends the response to the history. :param prompt: The user prompt to send to the model - :return: ModelResponse containing the model's response and metrics + :return: ManagedResult containing the model's response, metric summary, + and an optional evaluations task """ tracker = self._ai_config.create_tracker() @@ -45,17 +46,26 @@ async def invoke(self, prompt: str) -> ModelResponse: config_messages = self._ai_config.messages or [] all_messages = config_messages + self._messages - response = await tracker.track_metrics_of_async( - lambda result: result.metrics, - lambda: self._model_runner.invoke_model(all_messages), + result = await tracker.track_metrics_of_async( + lambda r: r.metrics, + lambda: self._model_runner.run(all_messages), ) + assistant_message = LDMessage(role='assistant', content=result.content) + input_text = '\r\n'.join(m.content for m in self._messages) if self._messages else '' - output_text = response.message.content - response.evaluations = self._track_judge_results(tracker, input_text, output_text) - self._messages.append(response.message) - return response + evaluations_task = self._track_judge_results(tracker, input_text, result.content) + + self._messages.append(assistant_message) + + return ManagedResult( + content=result.content, + metrics=tracker.get_summary(), + raw=result.raw, + parsed=result.parsed, + evaluations=evaluations_task, + ) def _track_judge_results( self, @@ -98,11 +108,11 @@ def append_messages(self, messages: List[LDMessage]) -> None: """ self._messages.extend(messages) - def get_model_runner(self) -> ModelRunner: + def get_model_runner(self) -> Runner: """ - Return the underlying ModelRunner for advanced use. + Return the underlying runner for advanced use. - :return: The ModelRunner instance. + :return: The Runner instance. 
""" return self._model_runner diff --git a/packages/sdk/server-ai/src/ldai/providers/__init__.py b/packages/sdk/server-ai/src/ldai/providers/__init__.py index b2bfa72e..22dce784 100644 --- a/packages/sdk/server-ai/src/ldai/providers/__init__.py +++ b/packages/sdk/server-ai/src/ldai/providers/__init__.py @@ -2,13 +2,20 @@ from ldai.providers.agent_runner import AgentRunner from ldai.providers.ai_provider import AIProvider from ldai.providers.model_runner import ModelRunner +from ldai.providers.runner import Runner from ldai.providers.runner_factory import RunnerFactory from ldai.providers.types import ( AgentGraphResult, + AgentGraphRunnerResult, AgentResult, + GraphMetrics, + GraphMetricSummary, JudgeResult, LDAIMetrics, + ManagedGraphResult, + ManagedResult, ModelResponse, + RunnerResult, StructuredResponse, ToolRegistry, ) @@ -17,13 +24,20 @@ 'AIProvider', 'AgentGraphResult', 'AgentGraphRunner', + 'AgentGraphRunnerResult', 'AgentResult', 'AgentRunner', + 'GraphMetrics', + 'GraphMetricSummary', 'JudgeResult', 'LDAIMetrics', + 'ManagedGraphResult', + 'ManagedResult', 'ModelResponse', 'ModelRunner', + 'Runner', 'RunnerFactory', + 'RunnerResult', 'StructuredResponse', 'ToolRegistry', ] diff --git a/packages/sdk/server-ai/src/ldai/providers/runner.py b/packages/sdk/server-ai/src/ldai/providers/runner.py new file mode 100644 index 00000000..5e1b9abc --- /dev/null +++ b/packages/sdk/server-ai/src/ldai/providers/runner.py @@ -0,0 +1,29 @@ +"""Unified Runner protocol for AI providers.""" + +from typing import Any, Dict, Optional, Protocol, runtime_checkable + +from ldai.providers.types import RunnerResult + + +@runtime_checkable +class Runner(Protocol): + """ + Unified runtime capability interface for all AI provider runners. + + A :class:`Runner` is a focused, configured object that performs a single + AI invocation. + """ + + async def run( + self, + input: Any, + output_type: Optional[Dict[str, Any]] = None, + ) -> RunnerResult: + """ + Execute the runner with the given input. + + :param input: The input to the runner. + :param output_type: Optional JSON schema for structured output. + :return: RunnerResult containing content, metrics, raw, and parsed fields. + """ + ... diff --git a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py index 9363f8e0..b7548791 100644 --- a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py +++ b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py @@ -4,9 +4,8 @@ from ldai import log from ldai.models import AIConfigKind from ldai.providers.agent_graph_runner import AgentGraphRunner -from ldai.providers.agent_runner import AgentRunner from ldai.providers.ai_provider import AIProvider -from ldai.providers.model_runner import ModelRunner +from ldai.providers.runner import Runner T = TypeVar('T') @@ -118,13 +117,13 @@ def _get_providers_to_try( def create_model( config: AIConfigKind, default_ai_provider: Optional[str] = None, - ) -> Optional[ModelRunner]: + ) -> Optional[Runner]: """ Create a model executor for the given AI completion config. 
         :param config: LaunchDarkly AI config (completion or judge)
         :param default_ai_provider: Optional provider override ('openai', 'langchain', …)
-        :return: Configured ModelRunner ready to invoke the model, or None
+        :return: Configured Runner ready to invoke the model, or None
         """
         provider_name = config.provider.name.lower() if config.provider else None
         providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name)
@@ -135,7 +134,7 @@ def create_agent(
         config: Any,
         tools: Any,
         default_ai_provider: Optional[str] = None,
-    ) -> Optional[AgentRunner]:
+    ) -> Optional[Runner]:
         """
         CAUTION: This feature is experimental and should NOT be considered ready for production use.

@@ -147,7 +146,7 @@
         :param config: LaunchDarkly AI agent config
         :param tools: Tool registry mapping tool names to callables
         :param default_ai_provider: Optional provider override
-        :return: AgentRunner instance, or None
+        :return: Runner instance, or None
         """
         provider_name = config.provider.name.lower() if config.provider else None
         providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name)
diff --git a/packages/sdk/server-ai/src/ldai/providers/types.py b/packages/sdk/server-ai/src/ldai/providers/types.py
index aa537880..5bb8ce47 100644
--- a/packages/sdk/server-ai/src/ldai/providers/types.py
+++ b/packages/sdk/server-ai/src/ldai/providers/types.py
@@ -3,11 +3,11 @@
 from __future__ import annotations

 import asyncio
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional

 from ldai.models import LDMessage
-from ldai.tracker import TokenUsage
+from ldai.tracker import LDAIMetricSummary, TokenUsage

 # Type alias for a registry of tools available to an agent.
 # Keys are tool names; values are the callable implementations.
@@ -16,11 +16,19 @@

 @dataclass
 class LDAIMetrics:
-    """
-    Metrics information for AI operations that includes success status and token usage.
-    """
+    """Contains metrics for a single AI invocation."""
+
     success: bool
+    """Whether the invocation succeeded."""
+
     usage: Optional[TokenUsage] = None
+    """Optional token usage information."""
+
+    tool_calls: Optional[List[str]] = None
+    """Ordered list of tool-call names observed during the invocation."""
+
+    duration_ms: Optional[int] = None
+    """Wall-clock duration of the runner invocation in milliseconds."""

     def to_dict(self) -> Dict[str, Any]:
         """
@@ -35,13 +43,58 @@
                 'input': self.usage.input,
                 'output': self.usage.output,
             }
+        if self.tool_calls is not None:
+            result['toolCalls'] = self.tool_calls
+        if self.duration_ms is not None:
+            result['durationMs'] = self.duration_ms
         return result


+@dataclass
+class RunnerResult:
+    """Contains the result of a single AI model invocation."""
+
+    content: str
+    """The text content returned by the model."""
+
+    metrics: LDAIMetrics
+    """Metrics for this invocation."""
+
+    raw: Optional[Any] = None
+    """Optional provider-native response object for advanced consumers."""
+
+    parsed: Optional[Dict[str, Any]] = None
+    """Optional parsed structured output, populated when ``output_type`` was supplied."""
+
+
+@dataclass
+class ManagedResult:
+    """Contains the result of a managed AI invocation, including metrics and optional judge evaluations."""
+
+    content: str
+    """The text content returned by the model."""
+
+    metrics: LDAIMetricSummary
+    """Aggregated metric summary from the tracker for this invocation."""
+
+    raw: Optional[Any] = None
+    """Optional provider-native response object for advanced consumers."""
+
+    parsed: Optional[Dict[str, Any]] = None
+    """Optional parsed structured output, populated when ``output_type`` was supplied."""
+
+    evaluations: Optional[asyncio.Task[List[JudgeResult]]] = None
+    """Optional asyncio Task that resolves to the list of :class:`JudgeResult` instances when awaited."""
+
+
 @dataclass
 class ModelResponse:
     """
     Response from a model invocation.
+
+    .. deprecated::
+        Use :class:`RunnerResult` (from a runner) and :class:`ManagedResult`
+        (from the managed layer) instead.
     """
     message: LDMessage
     metrics: LDAIMetrics
@@ -52,24 +105,113 @@ class StructuredResponse:
     """
     Structured response from AI models.
+
+    .. deprecated::
+        Structured output is now represented by :attr:`RunnerResult.parsed`.
""" data: Dict[str, Any] raw_response: str metrics: LDAIMetrics +@dataclass +class GraphMetrics: + """Contains raw metrics from a single agent graph run.""" + + success: bool + """Whether the graph run succeeded.""" + + path: List[str] = field(default_factory=list) + """Ordered list of node keys visited during the run.""" + + duration_ms: Optional[int] = None + """Wall-clock duration of the graph run in milliseconds.""" + + usage: Optional[TokenUsage] = None + """Optional aggregate token usage information across all nodes in the graph run.""" + + node_metrics: Dict[str, LDAIMetrics] = field(default_factory=dict) + """Per-node metrics keyed by node key.""" + + +@dataclass +class GraphMetricSummary: + """Contains a summary of metrics for an agent graph run.""" + + success: bool + """Whether the graph run succeeded.""" + + path: List[str] = field(default_factory=list) + """Ordered list of node keys visited during the run.""" + + duration_ms: Optional[int] = None + """Wall-clock duration of the graph run in milliseconds.""" + + usage: Optional[TokenUsage] = None + """Optional aggregate token usage information across all nodes in the graph run.""" + + node_metrics: Dict[str, LDAIMetrics] = field(default_factory=dict) + """Per-node metrics keyed by node key.""" + + resumption_token: Optional[str] = None + """Optional resumption token from the graph tracker for cross-process resumption.""" + + +@dataclass +class ManagedGraphResult: + """Contains the result of a managed agent graph run, including metrics and optional judge evaluations.""" + + content: str + """The graph's final output content.""" + + metrics: GraphMetricSummary + """Aggregated metric summary from the graph tracker for this run.""" + + raw: Optional[Any] = None + """Optional provider-native response object for advanced consumers.""" + + evaluations: Optional[asyncio.Task[List[JudgeResult]]] = None + """Optional asyncio Task that resolves to the list of :class:`JudgeResult` instances when awaited.""" + + +@dataclass +class AgentGraphRunnerResult: + """Contains the result of an agent graph runner invocation.""" + + content: str + """The graph's final output content.""" + + metrics: GraphMetrics + """Metrics from the graph run.""" + + raw: Optional[Any] = None + """Optional provider-native response object for advanced consumers.""" + + @dataclass class JudgeResult: - """ - Result from a judge evaluation. - """ + """Contains the result of a single judge evaluation.""" + judge_config_key: Optional[str] = None + """The configuration key of the judge that produced this result.""" + success: bool = False + """Whether the judge evaluation completed successfully.""" + error_message: Optional[str] = None - sampled: bool = False # True when the evaluation was sampled and run + """Error message describing why the evaluation failed, if any.""" + + sampled: bool = False + """True when the evaluation was sampled and run.""" + metric_key: Optional[str] = None + """The metric key under which this judge's score is reported.""" + score: Optional[float] = None + """The numeric score (0-1) returned by the judge.""" + reasoning: Optional[str] = None + """The judge's reasoning text accompanying the score.""" def to_dict(self) -> Dict[str, Any]: """ @@ -96,6 +238,10 @@ def to_dict(self) -> Dict[str, Any]: class AgentResult: """ Result from a single-agent run. + + .. deprecated:: + Use :class:`ManagedResult` (managed layer) or :class:`RunnerResult` + (runner layer) instead. 
""" output: str raw: Any @@ -104,10 +250,16 @@ class AgentResult: @dataclass class AgentGraphResult: - """ - Result from an agent graph run. - """ + """Contains the result of an agent graph run.""" + output: str + """The agent graph's final output content.""" + raw: Any + """The provider-native response object from the graph run.""" + metrics: LDAIMetrics + """Metrics recorded during the graph run.""" + evaluations: Optional[List[JudgeResult]] = None + """Optional list of judge evaluation results produced for the graph run.""" diff --git a/packages/sdk/server-ai/src/ldai/tracker.py b/packages/sdk/server-ai/src/ldai/tracker.py index 0f5a32c5..31416649 100644 --- a/packages/sdk/server-ai/src/ldai/tracker.py +++ b/packages/sdk/server-ai/src/ldai/tracker.py @@ -1,15 +1,20 @@ +from __future__ import annotations + import base64 import json import time import warnings from dataclasses import dataclass from enum import Enum -from typing import Any, Callable, Dict, Iterable, List, Optional +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional from ldclient import Context, LDClient, Result from ldai import log +if TYPE_CHECKING: + from ldai.providers.types import LDAIMetrics + class FeedbackKind(Enum): """ @@ -41,15 +46,31 @@ class LDAIMetricSummary: """ def __init__(self): - self._duration = None - self._success = None - self._feedback = None - self._usage = None - self._time_to_first_token = None + self._duration_ms: Optional[int] = None + self._success: Optional[bool] = None + self._feedback: Optional[Dict[str, FeedbackKind]] = None + self._usage: Optional[TokenUsage] = None + self._time_to_first_token: Optional[int] = None + self._tool_calls: Optional[List[str]] = None + self._resumption_token: Optional[str] = None + + @property + def duration_ms(self) -> Optional[int]: + """Duration of the AI operation in milliseconds.""" + return self._duration_ms @property def duration(self) -> Optional[int]: - return self._duration + """ + .. deprecated:: + Use :attr:`duration_ms` instead. + """ + warnings.warn( + "LDAIMetricSummary.duration is deprecated. Use duration_ms instead.", + DeprecationWarning, + stacklevel=2, + ) + return self._duration_ms @property def success(self) -> Optional[bool]: @@ -67,6 +88,20 @@ def usage(self) -> Optional[TokenUsage]: def time_to_first_token(self) -> Optional[int]: return self._time_to_first_token + @property + def tool_calls(self) -> Optional[List[str]]: + """List of tool keys that were invoked during this operation.""" + return self._tool_calls + + @property + def resumption_token(self) -> Optional[str]: + """ + URL-safe Base64-encoded resumption token captured at tracker + instantiation. Useful for deferred feedback flows where a downstream + process needs to associate events with the original execution. + """ + return self._resumption_token + class LDAIConfigTracker: """ @@ -107,8 +142,10 @@ def __init__( self._provider_name = provider_name self._context = context self._graph_key = graph_key - self._summary = LDAIMetricSummary() self._run_id = run_id + self._summary = LDAIMetricSummary() + # Capture resumption_token immediately so it's available on the summary at instantiation. + self._summary._resumption_token = self.resumption_token @property def resumption_token(self) -> str: @@ -200,10 +237,10 @@ def track_duration(self, duration: int) -> None: :param duration: Duration in milliseconds. 
""" - if self._summary.duration is not None: + if self._summary.duration_ms is not None: log.warning("Duration has already been tracked for this execution. %s", self.__get_track_data()) return - self._summary._duration = duration + self._summary._duration_ms = duration self._ld_client.track( "$ld:ai:duration:total", self._context, self.__get_track_data(), duration ) @@ -250,20 +287,32 @@ def track_duration_of(self, func): def _track_from_metrics_extractor( self, result: Any, - metrics_extractor: Callable[[Any], Any], - ) -> Any: - metrics = metrics_extractor(result) + metrics_extractor: Callable[[Any], Optional[LDAIMetrics]], + elapsed_ms: int, + ) -> None: + metrics = None + try: + metrics = metrics_extractor(result) + except Exception as exc: + log.warning("Failed to extract metrics: %s", exc) + + if metrics is None: + self.track_duration(elapsed_ms) + return + + self.track_duration(metrics.duration_ms if metrics.duration_ms is not None else elapsed_ms) if metrics.success: self.track_success() else: self.track_error() if metrics.usage: self.track_tokens(metrics.usage) - return result + if metrics.tool_calls is not None: + self.track_tool_calls(metrics.tool_calls) def track_metrics_of( self, - metrics_extractor: Callable[[Any], Any], + metrics_extractor: Callable[[Any], Optional[LDAIMetrics]], func: Callable[[], Any], ) -> Any: """ @@ -278,6 +327,10 @@ def track_metrics_of( For async operations, use :meth:`track_metrics_of_async`. + When the extracted :class:`~ldai.providers.types.LDAIMetrics` object has a + non-``None`` ``duration_ms`` field, that value is used as the measured duration + instead of the wall-clock elapsed time. + :param metrics_extractor: Function that extracts LDAIMetrics from the operation result :param func: Synchronous callable that runs the operation :return: The result of the operation @@ -291,16 +344,24 @@ def track_metrics_of( self.track_error() raise err - duration = (time.perf_counter_ns() - start_ns) // 1_000_000 - self.track_duration(duration) - return self._track_from_metrics_extractor(result, metrics_extractor) + elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 + self._track_from_metrics_extractor(result, metrics_extractor, elapsed_ms) + return result - async def track_metrics_of_async(self, metrics_extractor, func): + async def track_metrics_of_async( + self, + metrics_extractor: Callable[[Any], Optional[LDAIMetrics]], + func: Callable[[], Any], + ) -> Any: """ Track metrics for an async AI operation (``func`` is awaited). Same event semantics as :meth:`track_metrics_of`. + When the extracted :class:`~ldai.providers.types.LDAIMetrics` object has a + non-``None`` ``duration_ms`` field, that value is used as the measured duration + instead of the wall-clock elapsed time. 
+ :param metrics_extractor: Function that extracts LDAIMetrics from the operation result :param func: Async callable or zero-arg callable that returns an awaitable when called :return: The result of the operation @@ -315,9 +376,9 @@ async def track_metrics_of_async(self, metrics_extractor, func): self.track_error() raise err - duration = (time.perf_counter_ns() - start_ns) // 1_000_000 - self.track_duration(duration) - return self._track_from_metrics_extractor(result, metrics_extractor) + elapsed_ms = (time.perf_counter_ns() - start_ns) // 1_000_000 + self._track_from_metrics_extractor(result, metrics_extractor, elapsed_ms) + return result def track_judge_result(self, judge_result: Any) -> None: """ @@ -364,6 +425,24 @@ def track_feedback(self, feedback: Dict[str, FeedbackKind]) -> None: 1, ) + def track_tool_calls(self, tool_calls: Iterable[str]) -> None: + """ + Track the tool calls made during an AI operation. + + Stores the tool call names on the summary (guarding against duplicate + tracking) and fires a ``$ld:ai:tool_call`` event for each tool. + + :param tool_calls: Tool identifiers (e.g. from a model response). + """ + if self._summary.tool_calls is not None: + log.warning("Tool calls have already been tracked for this execution. %s", self.__get_track_data()) + return + tool_calls_list = list(tool_calls) + self._summary._tool_calls = tool_calls_list + for tool_key in tool_calls_list: + self.track_tool_call(tool_key) + + def track_success(self) -> None: """ Track a successful AI generation. @@ -499,15 +578,6 @@ def track_tool_call(self, tool_key: str) -> None: 1, ) - def track_tool_calls(self, tool_keys: Iterable[str]) -> None: - """ - Track multiple tool invocations for this configuration. - - :param tool_keys: Tool identifiers (e.g. from a model response). - """ - for tool_key in tool_keys: - self.track_tool_call(tool_key) - def get_summary(self) -> LDAIMetricSummary: """ Get the current summary of AI metrics. 
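For reviewers, a minimal usage sketch of the tracker API updated above — the `runner` and `prompt` names are placeholders invented for this example, not part of the change. It shows the new extractor signature (`Callable[[Any], Optional[LDAIMetrics]]`) and relies on the documented behavior that the tracker prefers `metrics.duration_ms` over wall-clock time and records tool calls when present.

from typing import Optional

from ldai.providers.types import LDAIMetrics, RunnerResult
from ldai.tracker import LDAIConfigTracker


def extract_metrics(result: RunnerResult) -> Optional[LDAIMetrics]:
    # The runner already aggregates success, usage, tool_calls and duration_ms;
    # returning None would make the tracker record only the wall-clock duration.
    return result.metrics


async def run_with_tracking(tracker: LDAIConfigTracker, runner, prompt: str) -> RunnerResult:
    # track_metrics_of_async awaits the callable, then records duration
    # (preferring metrics.duration_ms when set), success or error, token usage,
    # and tool calls from the extracted LDAIMetrics.
    return await tracker.track_metrics_of_async(
        extract_metrics,
        lambda: runner.run(prompt),
    )
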
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index c2690b6a..3ca0750b 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -9,7 +9,7 @@ from ldai.judge import Judge from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import AIJudgeConfig, AIJudgeConfigDefault, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import JudgeResult, LDAIMetrics, StructuredResponse +from ldai.providers.types import JudgeResult, LDAIMetrics, RunnerResult from ldai.tracker import LDAIConfigTracker @@ -40,9 +40,9 @@ def client(td: TestData) -> LDClient: @pytest.fixture def mock_runner(): - """Create a mock AI provider.""" + """Create a mock AI runner.""" provider = MagicMock() - provider.invoke_structured_model = AsyncMock() + provider.run = AsyncMock() return provider @@ -137,7 +137,7 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing( assert isinstance(result, JudgeResult) assert result.success is False assert result.sampled is False - mock_runner.invoke_structured_model.assert_not_called() + mock_runner.run.assert_not_called() @pytest.mark.asyncio async def test_evaluate_returns_failure_when_messages_missing( @@ -151,23 +151,23 @@ async def test_evaluate_returns_failure_when_messages_missing( assert isinstance(result, JudgeResult) assert result.success is False assert result.sampled is False - mock_runner.invoke_structured_model.assert_not_called() + mock_runner.run.assert_not_called() @pytest.mark.asyncio async def test_evaluate_success_with_valid_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should return JudgeResponse with valid evaluation.""" - mock_response = StructuredResponse( - data={ + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={ 'score': 0.85, 'reasoning': 'The response is highly relevant to the input.' 
}, - raw_response='{"score": 0.85, "reasoning": "..."}', - metrics=LDAIMetrics(success=True) ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -187,15 +187,15 @@ async def test_evaluate_success_with_evaluation_response_shape( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should accept shape { score, reasoning } and key by metric.""" - mock_response = StructuredResponse( - data={ + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={ 'score': 0.9, 'reasoning': 'The response is accurate and complete.', }, - raw_response='{"score": 0.9, "reasoning": "..."}', - metrics=LDAIMetrics(success=True), ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -214,13 +214,13 @@ async def test_evaluate_handles_missing_evaluation_in_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle missing score/reasoning in response.""" - mock_response = StructuredResponse( - data={}, - raw_response='{}', - metrics=LDAIMetrics(success=True) + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={}, ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -236,16 +236,16 @@ async def test_evaluate_handles_invalid_score( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle invalid score values.""" - mock_response = StructuredResponse( - data={ + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={ 'score': 1.5, - 'reasoning': 'Some reasoning' + 'reasoning': 'Some reasoning', }, - raw_response='{"score": 1.5, "reasoning": "..."}', - metrics=LDAIMetrics(success=True) ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -261,13 +261,13 @@ async def test_evaluate_handles_missing_reasoning( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle missing reasoning.""" - mock_response = StructuredResponse( - data={'score': 0.8}, - raw_response='{"score": 0.8}', - metrics=LDAIMetrics(success=True) + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.8}, ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -283,7 +283,7 @@ async def test_evaluate_handles_exception( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """Evaluate should handle exceptions gracefully.""" - mock_runner.invoke_structured_model.side_effect = Exception("Provider error") + mock_runner.run.side_effect = 
Exception("Provider error") tracker.track_metrics_of_async = AsyncMock(side_effect=Exception("Provider error")) judge = Judge(judge_config_with_key, mock_runner) @@ -306,7 +306,7 @@ async def test_evaluate_respects_sampling_rate( assert isinstance(result, JudgeResult) assert result.sampled is False assert result.success is False - mock_runner.invoke_structured_model.assert_not_called() + mock_runner.run.assert_not_called() class TestJudgeEvaluateMessages: @@ -317,15 +317,13 @@ async def test_evaluate_messages_calls_evaluate( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): """evaluate_messages should call evaluate with constructed input/output.""" - from ldai.providers.types import ModelResponse - - mock_response = StructuredResponse( - data={'score': 0.9, 'reasoning': 'Very relevant'}, - raw_response='{"score": 0.9, "reasoning": "..."}', - metrics=LDAIMetrics(success=True) + mock_response = RunnerResult( + content='', + metrics=LDAIMetrics(success=True), + parsed={'score': 0.9, 'reasoning': 'Very relevant'}, ) - mock_runner.invoke_structured_model.return_value = mock_response + mock_runner.run.return_value = mock_response tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) judge = Judge(judge_config_with_key, mock_runner) @@ -334,9 +332,9 @@ async def test_evaluate_messages_calls_evaluate( LDMessage(role='user', content='Question 1'), LDMessage(role='assistant', content='Answer 1'), ] - chat_response = ModelResponse( - message=LDMessage(role='assistant', content='Answer 2'), - metrics=LDAIMetrics(success=True) + chat_response = RunnerResult( + content='Answer 2', + metrics=LDAIMetrics(success=True), ) result = await judge.evaluate_messages(messages, chat_response) diff --git a/packages/sdk/server-ai/tests/test_managed_agent.py b/packages/sdk/server-ai/tests/test_managed_agent.py index 144641fc..0c30637a 100644 --- a/packages/sdk/server-ai/tests/test_managed_agent.py +++ b/packages/sdk/server-ai/tests/test_managed_agent.py @@ -6,13 +6,19 @@ from ldai import LDAIClient, ManagedAgent from ldai.managed_agent import ManagedAgent from ldai.models import AIAgentConfig, AIAgentConfigDefault, ModelConfig, ProviderConfig -from ldai.providers import AgentResult -from ldai.providers.types import LDAIMetrics +from ldai.providers.types import LDAIMetrics, ManagedResult, RunnerResult +from ldai.tracker import LDAIMetricSummary from ldclient import Config, Context, LDClient from ldclient.integrations.test_data import TestData +def _make_summary(success: bool = True) -> LDAIMetricSummary: + summary = LDAIMetricSummary() + summary._success = success + return summary + + @pytest.fixture def td() -> TestData: td = TestData.data_source() @@ -53,30 +59,32 @@ class TestManagedAgentRun: @pytest.mark.asyncio async def test_run_delegates_to_agent_runner(self): - """Should delegate run() to the underlying AgentRunner.""" + """Should delegate run() to the underlying AgentRunner and return ManagedResult.""" mock_config = MagicMock(spec=AIAgentConfig) mock_tracker = MagicMock() mock_tracker.track_metrics_of_async = AsyncMock( - return_value=AgentResult( - output="Test response", - raw=None, + return_value=RunnerResult( + content="Test response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) + mock_tracker.get_summary = MagicMock(return_value=_make_summary(True)) mock_config.create_tracker = MagicMock(return_value=mock_tracker) mock_runner = MagicMock() mock_runner.run = AsyncMock( - return_value=AgentResult( - output="Test response", - raw=None, 
+ return_value=RunnerResult( + content="Test response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) agent = ManagedAgent(mock_config, mock_runner) result = await agent.run("Hello") - assert result.output == "Test response" + assert isinstance(result, ManagedResult) + assert result.content == "Test response" assert result.metrics.success is True mock_config.create_tracker.assert_called_once() mock_tracker.track_metrics_of_async.assert_called_once() @@ -87,12 +95,13 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self): mock_config = MagicMock(spec=AIAgentConfig) fresh_tracker = MagicMock() fresh_tracker.track_metrics_of_async = AsyncMock( - return_value=AgentResult( - output="Fresh tracker response", - raw=None, + return_value=RunnerResult( + content="Fresh tracker response", metrics=LDAIMetrics(success=True, usage=None), + raw=None, ) ) + fresh_tracker.get_summary = MagicMock(return_value=_make_summary(True)) mock_config.create_tracker = MagicMock(return_value=fresh_tracker) mock_runner = MagicMock() @@ -100,7 +109,8 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self): agent = ManagedAgent(mock_config, mock_runner) result = await agent.run("Hello") - assert result.output == "Fresh tracker response" + assert isinstance(result, ManagedResult) + assert result.content == "Fresh tracker response" mock_config.create_tracker.assert_called_once() fresh_tracker.track_metrics_of_async.assert_called_once() @@ -152,7 +162,7 @@ async def test_returns_managed_agent_when_runner_available(self, ldai_client: LD mock_runner = MagicMock() mock_runner.run = AsyncMock( - return_value=AgentResult(output="Hello!", raw=None, metrics=LDAIMetrics(success=True, usage=None)) + return_value=RunnerResult(content="Hello!", metrics=LDAIMetrics(success=True, usage=None), raw=None) ) original = rf.RunnerFactory.create_agent diff --git a/packages/sdk/server-ai/tests/test_managed_agent_graph.py b/packages/sdk/server-ai/tests/test_managed_agent_graph.py index 35be2766..05b0ed27 100644 --- a/packages/sdk/server-ai/tests/test_managed_agent_graph.py +++ b/packages/sdk/server-ai/tests/test_managed_agent_graph.py @@ -5,14 +5,16 @@ from ldclient import Config, Context, LDClient from ldclient.integrations.test_data import TestData -from ldai import LDAIClient, ManagedAgentGraph -from ldai.providers.types import LDAIMetrics +from ldai import LDAIClient, ManagedAgentGraph, ManagedGraphResult +from ldai.providers.types import AgentGraphRunnerResult, GraphMetrics, LDAIMetrics from ldai.providers import AgentGraphResult, AgentGraphRunner, ToolRegistry +from ldai.tracker import TokenUsage -# --- Test double --- +# --- Test doubles --- class StubAgentGraphRunner(AgentGraphRunner): + """Legacy runner that returns AgentGraphResult (old shape).""" def __init__(self, output: str = "stub output"): self._output = output @@ -24,14 +26,35 @@ async def run(self, input) -> AgentGraphResult: ) -# --- ManagedAgentGraph unit tests --- +class StubNewShapeRunner(AgentGraphRunner): + """New-shape runner that returns AgentGraphRunnerResult.""" + def __init__(self, content: str = "new shape output"): + self._content = content + + async def run(self, input) -> AgentGraphRunnerResult: + return AgentGraphRunnerResult( + content=self._content, + metrics=GraphMetrics( + success=True, + path=["root", "specialist"], + duration_ms=42, + usage=TokenUsage(total=10, input=5, output=5), + node_metrics={}, + ), + raw={"input": input}, + ) + + +# --- ManagedAgentGraph unit tests (legacy shape) --- @pytest.mark.asyncio async 
def test_managed_agent_graph_run_delegates_to_runner(): + """Legacy AgentGraphResult shape: content comes from output field.""" runner = StubAgentGraphRunner("hello world") managed = ManagedAgentGraph(runner) result = await managed.run("test input") - assert result.output == "hello world" + assert isinstance(result, ManagedGraphResult) + assert result.content == "hello world" assert result.metrics.success is True @@ -41,6 +64,56 @@ def test_managed_agent_graph_get_runner(): assert managed.get_agent_graph_runner() is runner +# --- ManagedAgentGraph unit tests (new AgentGraphRunnerResult shape) --- + +@pytest.mark.asyncio +async def test_managed_agent_graph_run_handles_new_shape(): + """New AgentGraphRunnerResult shape: content and GraphMetrics are surfaced.""" + runner = StubNewShapeRunner("final answer") + mock_graph = MagicMock() + mock_tracker = MagicMock() + mock_graph.create_tracker = MagicMock(return_value=mock_tracker) + + managed = ManagedAgentGraph(runner, graph=mock_graph) + result = await managed.run("test input") + + assert isinstance(result, ManagedGraphResult) + assert result.content == "final answer" + assert result.metrics.success is True + assert result.metrics.path == ["root", "specialist"] + assert result.metrics.duration_ms == 42 + assert result.metrics.usage is not None + assert result.metrics.usage.total == 10 + + +@pytest.mark.asyncio +async def test_managed_agent_graph_new_shape_drives_tracking(): + """New shape: managed layer calls tracker methods from result.metrics.""" + runner = StubNewShapeRunner() + mock_graph = MagicMock() + mock_tracker = MagicMock() + mock_graph.create_tracker = MagicMock(return_value=mock_tracker) + + managed = ManagedAgentGraph(runner, graph=mock_graph) + await managed.run("test input") + + mock_tracker.track_path.assert_called_once_with(["root", "specialist"]) + mock_tracker.track_duration.assert_called_once_with(42) + mock_tracker.track_invocation_success.assert_called_once() + mock_tracker.track_total_tokens.assert_called_once() + + +@pytest.mark.asyncio +async def test_managed_agent_graph_new_shape_no_graph_skips_tracking(): + """New shape without graph: no tracking called (graph not available).""" + runner = StubNewShapeRunner() + managed = ManagedAgentGraph(runner, graph=None) + # Should not raise even without a graph reference + result = await managed.run("test input") + assert result.content == "new shape output" + assert result.metrics.success is True + + # --- LDAIClient.create_agent_graph() integration tests --- @@ -172,7 +245,8 @@ async def test_create_agent_graph_run_produces_result(ldai_client: LDAIClient): assert managed is not None result = await managed.run("find restaurants") - assert result.output == "final answer" + assert isinstance(result, ManagedGraphResult) + assert result.content == "final answer" assert result.metrics.success is True diff --git a/packages/sdk/server-ai/tests/test_managed_model.py b/packages/sdk/server-ai/tests/test_managed_model.py index 36802a14..6d679552 100644 --- a/packages/sdk/server-ai/tests/test_managed_model.py +++ b/packages/sdk/server-ai/tests/test_managed_model.py @@ -2,48 +2,65 @@ import asyncio from typing import List -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, MagicMock import pytest from ldai.evaluator import Evaluator from ldai.managed_model import ManagedModel from ldai.models import AICompletionConfig, LDMessage, ModelConfig, ProviderConfig -from ldai.providers.types import JudgeResult, LDAIMetrics, ModelResponse -from ldai.tracker 
import LDAIConfigTracker +from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, RunnerResult +from ldai.tracker import LDAIConfigTracker, LDAIMetricSummary -def _make_model_response(content: str = 'response text') -> ModelResponse: - return ModelResponse( - message=LDMessage(role='assistant', content=content), +def _make_runner_result(content: str = 'response text') -> RunnerResult: + return RunnerResult( + content=content, metrics=LDAIMetrics(success=True, usage=None), ) -class TestManagedModelInvokeReturnsImmediately: - """invoke() must return before the evaluations task resolves.""" +def _make_summary() -> LDAIMetricSummary: + summary = LDAIMetricSummary() + summary._success = True + return summary + + +def _make_config_with_tracker(evaluator: Evaluator) -> tuple[AICompletionConfig, MagicMock]: + """Build an AICompletionConfig with a fully-mocked tracker.""" + mock_tracker = MagicMock(spec=LDAIConfigTracker) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_runner_result()) + mock_tracker.get_summary = MagicMock(return_value=_make_summary()) + config = AICompletionConfig( + key='test-config', + enabled=True, + create_tracker=MagicMock(return_value=mock_tracker), + model=ModelConfig('gpt-4'), + provider=ProviderConfig('openai'), + messages=[], + evaluator=evaluator, + ) + return config, mock_tracker - @pytest.mark.asyncio - async def test_invoke_returns_before_evaluations_resolve(self): - """invoke() should return a ModelResponse before evaluations complete.""" - # Set up a barrier so the evaluation coroutine doesn't complete until we release it - barrier = asyncio.Event() - async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]: - await barrier.wait() - return [] +class TestManagedModelRunReturnsImmediately: + """run() must return before the evaluations task resolves.""" + @pytest.mark.asyncio + async def test_run_returns_managed_result(self): + """run() should return a ManagedResult with content from the runner.""" evaluator = MagicMock(spec=Evaluator) evaluator.evaluate = MagicMock( - side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o)) + side_effect=lambda i, o: asyncio.create_task(_empty_eval()) ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result('hi')) mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_runner_result('hi')) + mock_tracker.get_summary = MagicMock(return_value=_make_summary()) config = AICompletionConfig( key='test-config', enabled=True, @@ -55,20 +72,46 @@ async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult] ) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') + + assert isinstance(result, ManagedResult) + assert result.content == 'hi' + assert isinstance(result.metrics, LDAIMetricSummary) + # Cleanup the still-pending evaluations task. 
+ if result.evaluations is not None: + await result.evaluations + + @pytest.mark.asyncio + async def test_run_returns_before_evaluations_resolve(self): + """run() should return a ManagedResult before evaluations complete.""" + barrier = asyncio.Event() + + async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]: + await barrier.wait() + return [] + + evaluator = MagicMock(spec=Evaluator) + evaluator.evaluate = MagicMock( + side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o)) + ) + + mock_runner = MagicMock() + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) + + config, _tracker = _make_config_with_tracker(evaluator) + model = ManagedModel(config, mock_runner) + result = await model.run('Hello') - # invoke() returned — evaluations task should still be pending - assert response is not None - assert response.evaluations is not None - assert not response.evaluations.done(), "evaluations task should still be pending" + assert result is not None + assert result.evaluations is not None + assert not result.evaluations.done(), "evaluations task should still be pending" - # Release the barrier and let it finish cleanly barrier.set() - await response.evaluations + await result.evaluations @pytest.mark.asyncio async def test_await_evaluations_collects_results(self): - """await response.evaluations should return the list of JudgeResult instances.""" + """await result.evaluations should return the list of JudgeResult instances.""" judge_result = JudgeResult( judge_config_key='judge-key', success=True, @@ -87,24 +130,13 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) + config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') - results = await response.evaluations # type: ignore[misc] + results = await result.evaluations # type: ignore[misc] assert results == [judge_result] @pytest.mark.asyncio @@ -128,30 +160,19 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, mock_tracker = _make_config_with_tracker(evaluator) mock_tracker.track_judge_result = MagicMock() - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) - model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') + result = await model.run('Hello') # Tracking should NOT have fired yet (before we await evaluations) 
mock_tracker.track_judge_result.assert_not_called() # Now await the evaluations task — tracking fires inside the chain - await response.evaluations # type: ignore[misc] + await result.evaluations # type: ignore[misc] mock_tracker.track_judge_result.assert_called_once_with(judge_result) @@ -174,25 +195,14 @@ async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult] ) mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) + config, mock_tracker = _make_config_with_tracker(evaluator) mock_tracker.track_judge_result = MagicMock() - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) - model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') - await response.evaluations # type: ignore[misc] + result = await model.run('Hello') + await result.evaluations # type: ignore[misc] mock_tracker.track_judge_result.assert_not_called() @@ -202,23 +212,15 @@ async def test_noop_evaluator_returns_empty_list(self): evaluator = Evaluator.noop() mock_runner = MagicMock() - mock_runner.invoke_model = AsyncMock(return_value=_make_model_response()) - - mock_tracker = MagicMock(spec=LDAIConfigTracker) - mock_tracker.track_metrics_of_async = AsyncMock(return_value=_make_model_response()) - - config = AICompletionConfig( - key='test-config', - enabled=True, - create_tracker=MagicMock(return_value=mock_tracker), - model=ModelConfig('gpt-4'), - provider=ProviderConfig('openai'), - messages=[], - evaluator=evaluator, - ) + mock_runner.invoke_model = AsyncMock(return_value=_make_runner_result()) + config, _tracker = _make_config_with_tracker(evaluator) model = ManagedModel(config, mock_runner) - response = await model.invoke('Hello') - results = await response.evaluations # type: ignore[misc] + result = await model.run('Hello') + results = await result.evaluations # type: ignore[misc] assert results == [] + + +async def _empty_eval() -> List[JudgeResult]: + return [] diff --git a/packages/sdk/server-ai/tests/test_runner_abcs.py b/packages/sdk/server-ai/tests/test_runner_abcs.py index d5136fd0..7e8087cd 100644 --- a/packages/sdk/server-ai/tests/test_runner_abcs.py +++ b/packages/sdk/server-ai/tests/test_runner_abcs.py @@ -1,17 +1,17 @@ import pytest -from ldai.providers import AgentGraphResult, AgentGraphRunner, AgentResult, AgentRunner, ToolRegistry -from ldai.providers.types import LDAIMetrics +from ldai.providers import AgentGraphResult, AgentGraphRunner, AgentRunner, ToolRegistry +from ldai.providers.types import LDAIMetrics, RunnerResult # --- Concrete test doubles --- class ConcreteAgentRunner: async def run(self, input): - return AgentResult( - output=f"agent response to: {input}", - raw={"raw": input}, + return RunnerResult( + content=f"agent response to: {input}", metrics=LDAIMetrics(success=True), + raw={"raw": input}, ) @@ -39,20 +39,20 @@ def test_agent_runner_structural_check_fails_when_run_missing(): @pytest.mark.asyncio -async def test_agent_runner_run_returns_agent_result(): +async def test_agent_runner_run_returns_runner_result(): runner = ConcreteAgentRunner() result = await runner.run("hello") - assert isinstance(result, 
AgentResult) - assert result.output == "agent response to: hello" + assert isinstance(result, RunnerResult) + assert result.content == "agent response to: hello" assert result.raw == {"raw": "hello"} assert result.metrics.success is True @pytest.mark.asyncio -async def test_agent_result_fields(): +async def test_runner_result_fields(): metrics = LDAIMetrics(success=True) - result = AgentResult(output="done", raw={"key": "val"}, metrics=metrics) - assert result.output == "done" + result = RunnerResult(content="done", metrics=metrics, raw={"key": "val"}) + assert result.content == "done" assert result.raw == {"key": "val"} assert result.metrics is metrics @@ -103,6 +103,6 @@ def test_top_level_exports(): import ldai assert hasattr(ldai, 'AgentRunner') assert hasattr(ldai, 'AgentGraphRunner') - assert hasattr(ldai, 'AgentResult') assert hasattr(ldai, 'AgentGraphResult') + assert hasattr(ldai, 'RunnerResult') assert hasattr(ldai, 'ToolRegistry') diff --git a/packages/sdk/server-ai/tests/test_tracker.py b/packages/sdk/server-ai/tests/test_tracker.py index c2ae2dde..4ed53441 100644 --- a/packages/sdk/server-ai/tests/test_tracker.py +++ b/packages/sdk/server-ai/tests/test_tracker.py @@ -909,3 +909,108 @@ def test_client_create_tracker_fails_on_invalid_json(): result = ai_client.create_tracker(bad_token, context) assert not result.is_success() assert "Invalid resumption token" in result.error + + +def test_ldai_metrics_to_dict_includes_tool_calls_and_duration_ms(): + metrics = LDAIMetrics( + success=True, + usage=TokenUsage(total=10, input=4, output=6), + tool_calls=["search", "lookup"], + duration_ms=123, + ) + d = metrics.to_dict() + assert d["success"] is True + assert d["usage"] == {"total": 10, "input": 4, "output": 6} + assert d["toolCalls"] == ["search", "lookup"] + assert d["durationMs"] == 123 + + +def test_ldai_metrics_to_dict_omits_optional_fields_when_none(): + metrics = LDAIMetrics(success=False) + d = metrics.to_dict() + assert d == {"success": False} + + +def test_track_metrics_of_uses_metrics_duration_ms_when_set(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, duration_ms=999) + + tracker.track_metrics_of(extract, fn) + assert tracker.get_summary().duration_ms == 999 + + +@pytest.mark.asyncio +async def test_track_metrics_of_async_uses_metrics_duration_ms_when_set(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + async def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, duration_ms=42) + + await tracker.track_metrics_of_async(extract, fn) + assert tracker.get_summary().duration_ms == 42 + + +def test_track_metrics_of_calls_track_tool_calls_when_present(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, tool_calls=["foo", "bar"]) + + tracker.track_metrics_of(extract, 
fn) + summary = tracker.get_summary() + assert summary.tool_calls == ["foo", "bar"] + # One $ld:ai:tool_call event per tool key. + tool_call_events = [ + c for c in client.track.mock_calls # type: ignore + if c.args[0] == "$ld:ai:tool_call" + ] + assert len(tool_call_events) == 2 + + +def test_track_metrics_of_skips_track_tool_calls_when_absent(client: LDClient): + context = Context.create("user-key") + tracker = LDAIConfigTracker( + ld_client=client, run_id="test-run-id", config_key="config-key", + variation_key="variation-key", version=3, model_name="m", + provider_name="p", context=context, + ) + + def fn(): + return "done" + + def extract(_r): + return LDAIMetrics(success=True, usage=None) + + tracker.track_metrics_of(extract, fn) + assert tracker.get_summary().tool_calls is None + tool_call_events = [ + c for c in client.track.mock_calls # type: ignore + if c.args[0] == "$ld:ai:tool_call" + ] + assert tool_call_events == []
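
To summarize the caller-facing migration these changes imply, here is a sketch only — the `ask` helper and its arguments are invented for illustration. Code that previously called invoke() and read ModelResponse.message.content now calls run() and reads ManagedResult.content, with the aggregated summary and the optional background evaluations task alongside.

from ldai.managed_model import ManagedModel
from ldai.providers.types import ManagedResult


async def ask(model: ManagedModel, prompt: str) -> str:
    result: ManagedResult = await model.run(prompt)

    # metrics is now the tracker's aggregated LDAIMetricSummary rather than a
    # per-call LDAIMetrics; duration_ms replaces the deprecated duration property.
    if result.metrics.success:
        print("duration_ms:", result.metrics.duration_ms)

    # Judge evaluations (when an evaluator is configured) run in a background
    # asyncio.Task; run() returns before they resolve, and awaiting is optional.
    if result.evaluations is not None:
        for judge_result in await result.evaluations:
            print(judge_result.metric_key, judge_result.score)

    return result.content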