From f8c6ebaa15f7acca962784f217ede1c4447f51dc Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 11 Mar 2026 17:40:06 -0500 Subject: [PATCH 01/11] fix: Remove evaluation metric key from schema which failed on some LLMs --- .../sdk/server-ai/src/ldai/judge/__init__.py | 37 ++------ .../ldai/judge/evaluation_schema_builder.py | 90 +++++++----------- packages/sdk/server-ai/tests/test_judge.py | 92 ++++++++++++++----- 3 files changed, 108 insertions(+), 111 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index 0ca402a..2557e4d 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -37,7 +37,7 @@ def __init__( self._ai_config = ai_config self._ai_config_tracker = ai_config_tracker self._ai_provider = ai_provider - self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key) + self._evaluation_response_structure = EvaluationSchemaBuilder.build() async def evaluate( self, @@ -77,10 +77,9 @@ async def evaluate( ) success = response.metrics.success - evals = self._parse_evaluation_response(response.data) - if self._ai_config.evaluation_metric_key not in evals: + if not evals: log.warn('Judge evaluation did not return the expected evaluation') success = False @@ -175,47 +174,27 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str: def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]: """ - Parses the structured evaluation response from the AI provider. - - :param data: The structured response data - :return: Dictionary of evaluation scores keyed by metric key + Parses the structured evaluation response. Expects {"evaluation": {"score": n, "reasoning": "..."}}. """ results: Dict[str, EvalScore] = {} - - if not data.get('evaluations') or not isinstance(data['evaluations'], dict): - log.warn('Invalid response: missing or invalid evaluations object') - return results - - evaluations = data['evaluations'] - metric_key = self._ai_config.evaluation_metric_key if not metric_key: log.warn('Evaluation metric key is missing') return results - evaluation = evaluations.get(metric_key) - - if not evaluation or not isinstance(evaluation, dict): - log.warn(f'Missing evaluation for metric key: {metric_key}') + evaluation = data.get('evaluation') if isinstance(data, dict) else None + if not isinstance(evaluation, dict): + log.warn('Invalid response: missing or invalid evaluation') return results score = evaluation.get('score') reasoning = evaluation.get('reasoning') - if not isinstance(score, (int, float)) or score < 0 or score > 1: - log.warn( - f'Invalid score evaluated for {metric_key}: {score}. ' - 'Score must be a number between 0 and 1 inclusive' - ) + log.warn(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive') return results - if not isinstance(reasoning, str): - log.warn( - f'Invalid reasoning evaluated for {metric_key}: {reasoning}. 
' - 'Reasoning must be a string' - ) + log.warn('Invalid reasoning: must be a string') return results results[metric_key] = EvalScore(score=float(score), reasoning=reasoning) - return results diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py index c69e0af..3616ac4 100644 --- a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py +++ b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py @@ -1,79 +1,51 @@ -"""Internal class for building dynamic evaluation response schemas.""" +"""Internal class for building evaluation response schemas.""" -from typing import Any, Dict, Optional +from typing import Any, Dict class EvaluationSchemaBuilder: """ - Internal class for building dynamic evaluation response schemas. + Internal class for building evaluation response schemas. Not exported - only used internally by Judge. + Schema is a fixed shape: one "evaluation" object with score and reasoning. + The judge config's evaluation_metric_key is only used when keying the result, + not in the schema. """ @staticmethod - def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]: + def build() -> Dict[str, Any]: """ - Build an evaluation response schema from evaluation metric key. + Build the evaluation response schema. No parameters; the schema is + always the same. The judge keys the parsed result by its config's + evaluation_metric_key. - :param evaluation_metric_key: Evaluation metric key, or None if not available - :return: Schema dictionary for structured output, or None if evaluation_metric_key is None - """ - if not evaluation_metric_key: - return None + In practice the model returns JSON like: + {"evaluation": {"score": 0.85, "reasoning": "The response is accurate."}} + :return: Schema dictionary for structured output + """ return { 'title': 'EvaluationResponse', - 'description': f"Response containing evaluation results for {evaluation_metric_key} metric", + 'description': 'Response containing an evaluation (score and reasoning).', 'type': 'object', 'properties': { - 'evaluations': { + 'evaluation': { 'type': 'object', - 'description': ( - f"Object containing evaluation results for " - f"{evaluation_metric_key} metric" - ), - 'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_key), - 'required': [evaluation_metric_key], - 'additionalProperties': False, - }, - }, - 'required': ['evaluations'], - 'additionalProperties': False, - } - - @staticmethod - def _build_key_properties(evaluation_metric_key: str) -> Dict[str, Any]: - """ - Build properties for a single evaluation metric key. - - :param evaluation_metric_key: Evaluation metric key - :return: Dictionary of properties for the key - """ - return { - evaluation_metric_key: EvaluationSchemaBuilder._build_key_schema(evaluation_metric_key) - } - - @staticmethod - def _build_key_schema(key: str) -> Dict[str, Any]: - """ - Build schema for a single evaluation metric key. 
- - :param key: Evaluation metric key - :return: Schema dictionary for the key - """ - return { - 'type': 'object', - 'properties': { - 'score': { - 'type': 'number', - 'minimum': 0, - 'maximum': 1, - 'description': f'Score between 0.0 and 1.0 for {key}', - }, - 'reasoning': { - 'type': 'string', - 'description': f'Reasoning behind the score for {key}', + 'description': 'The evaluation result.', + 'properties': { + 'score': { + 'type': 'number', + 'minimum': 0, + 'maximum': 1, + 'description': 'Score between 0.0 and 1.0.', + }, + 'reasoning': { + 'type': 'string', + 'description': 'Reasoning behind the score.', + }, + }, + 'required': ['score', 'reasoning'], }, }, - 'required': ['score', 'reasoning'], - 'additionalProperties': False, + 'required': ['evaluation'], } diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index d386b92..b1a1cba 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -109,7 +109,10 @@ def test_judge_initializes_with_evaluation_metric_key( assert judge._ai_config == judge_config_with_key assert judge._evaluation_response_structure is not None assert judge._evaluation_response_structure['title'] == 'EvaluationResponse' - assert '$ld:ai:judge:relevance' in judge._evaluation_response_structure['properties']['evaluations']['required'] + assert judge._evaluation_response_structure['required'] == ['evaluation'] + eval_schema = judge._evaluation_response_structure['properties']['evaluation'] + assert eval_schema['required'] == ['score', 'reasoning'] + assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties'] def test_judge_initializes_without_evaluation_metric_key( self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider @@ -180,6 +183,58 @@ async def test_evaluate_success_with_valid_response( assert result.evals['$ld:ai:judge:relevance'].score == 0.85 assert 'relevant' in result.evals['$ld:ai:judge:relevance'].reasoning.lower() + @pytest.mark.asyncio + async def test_evaluate_success_with_evaluation_response_shape( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric.""" + mock_response = StructuredResponse( + data={ + 'evaluation': { + 'score': 0.9, + 'reasoning': 'The response is accurate and complete.', + } + }, + raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}', + metrics=LDAIMetrics(success=True), + ) + mock_ai_provider.invoke_structured_model.return_value = mock_response + tracker.track_metrics_of = AsyncMock(return_value=mock_response) + + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + result = await judge.evaluate("What is feature flagging?", "Feature flagging is...") + + assert result is not None + assert result.success is True + assert '$ld:ai:judge:relevance' in result.evals + assert result.evals['$ld:ai:judge:relevance'].score == 0.9 + assert 'accurate' in result.evals['$ld:ai:judge:relevance'].reasoning.lower() + + @pytest.mark.asyncio + async def test_evaluate_success_with_evaluations_backward_compat( + self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider + ): + """Evaluate should accept legacy shape { evaluations: { score, reasoning } }.""" + mock_response = StructuredResponse( + data={ + 'evaluations': { + 'score': 0.7, + 'reasoning': 'Partially correct.', + } + }, + 
raw_response='{"evaluations": {"score": 0.7, "reasoning": "..."}}', + metrics=LDAIMetrics(success=True), + ) + mock_ai_provider.invoke_structured_model.return_value = mock_response + tracker.track_metrics_of = AsyncMock(return_value=mock_response) + + judge = Judge(judge_config_with_key, tracker, mock_ai_provider) + result = await judge.evaluate("input", "output") + + assert result is not None + assert result.success is True + assert result.evals['$ld:ai:judge:relevance'].score == 0.7 + @pytest.mark.asyncio async def test_evaluate_handles_missing_evaluation_in_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider @@ -345,30 +400,21 @@ class TestEvaluationSchemaBuilder: """Tests for EvaluationSchemaBuilder.""" def test_build_creates_correct_schema(self): - """Schema builder should create correct schema structure.""" - schema = EvaluationSchemaBuilder.build('$ld:ai:judge:relevance') - + """Schema builder should create fixed schema (evaluation with score + reasoning, no key param).""" + schema = EvaluationSchemaBuilder.build() + assert schema['title'] == 'EvaluationResponse' assert schema['type'] == 'object' - assert 'evaluations' in schema['properties'] - assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['required'] - assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['properties'] - - metric_schema = schema['properties']['evaluations']['properties']['$ld:ai:judge:relevance'] - assert metric_schema['type'] == 'object' - assert 'score' in metric_schema['properties'] - assert 'reasoning' in metric_schema['properties'] - assert metric_schema['properties']['score']['type'] == 'number' - assert metric_schema['properties']['score']['minimum'] == 0 - assert metric_schema['properties']['score']['maximum'] == 1 - - def test_build_key_properties_creates_single_key(self): - """_build_key_properties should create properties for a single key.""" - properties = EvaluationSchemaBuilder._build_key_properties('$ld:ai:judge:relevance') - - assert '$ld:ai:judge:relevance' in properties - assert len(properties) == 1 - assert properties['$ld:ai:judge:relevance']['type'] == 'object' + assert schema['required'] == ['evaluation'] + assert 'evaluation' in schema['properties'] + eval_schema = schema['properties']['evaluation'] + assert eval_schema['type'] == 'object' + assert eval_schema['required'] == ['score', 'reasoning'] + assert 'score' in eval_schema['properties'] + assert 'reasoning' in eval_schema['properties'] + assert eval_schema['properties']['score']['type'] == 'number' + assert eval_schema['properties']['score']['minimum'] == 0 + assert eval_schema['properties']['score']['maximum'] == 1 class TestJudgeConfigSerialization: From 49f5e2e8cf3929691464828801388c2d1ac74470 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 11 Mar 2026 17:43:36 -0500 Subject: [PATCH 02/11] additional properties is required for openai schemas --- .../sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py index 3616ac4..d19bd48 100644 --- a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py +++ b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py @@ -45,7 +45,9 @@ def build() -> Dict[str, Any]: }, }, 'required': ['score', 'reasoning'], + 'additionalProperties': False, }, }, 'required': ['evaluation'], + 
'additionalProperties': False, } From 916df2ae1d1a3fd6bdfcb55c604d77358bfcc01d Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Wed, 11 Mar 2026 17:50:09 -0500 Subject: [PATCH 03/11] fix tests --- packages/sdk/server-ai/tests/test_judge.py | 83 +++++----------------- 1 file changed, 17 insertions(+), 66 deletions(-) diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index b1a1cba..9ac3c64 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -114,15 +114,6 @@ def test_judge_initializes_with_evaluation_metric_key( assert eval_schema['required'] == ['score', 'reasoning'] assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties'] - def test_judge_initializes_without_evaluation_metric_key( - self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider - ): - """Judge should initialize but have None for evaluation_response_structure.""" - judge = Judge(judge_config_without_key, tracker, mock_ai_provider) - - assert judge._ai_config == judge_config_without_key - assert judge._evaluation_response_structure is None - class TestJudgeEvaluate: """Tests for Judge.evaluate() method.""" @@ -158,14 +149,12 @@ async def test_evaluate_success_with_valid_response( """Evaluate should return JudgeResponse with valid evaluation.""" mock_response = StructuredResponse( data={ - 'evaluations': { - '$ld:ai:judge:relevance': { - 'score': 0.85, - 'reasoning': 'The response is highly relevant to the input.' - } + 'evaluation': { + 'score': 0.85, + 'reasoning': 'The response is highly relevant to the input.' } }, - raw_response='{"evaluations": {...}}', + raw_response='{"evaluation": {"score": 0.85, "reasoning": "..."}}', metrics=LDAIMetrics(success=True) ) @@ -210,46 +199,14 @@ async def test_evaluate_success_with_evaluation_response_shape( assert result.evals['$ld:ai:judge:relevance'].score == 0.9 assert 'accurate' in result.evals['$ld:ai:judge:relevance'].reasoning.lower() - @pytest.mark.asyncio - async def test_evaluate_success_with_evaluations_backward_compat( - self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider - ): - """Evaluate should accept legacy shape { evaluations: { score, reasoning } }.""" - mock_response = StructuredResponse( - data={ - 'evaluations': { - 'score': 0.7, - 'reasoning': 'Partially correct.', - } - }, - raw_response='{"evaluations": {"score": 0.7, "reasoning": "..."}}', - metrics=LDAIMetrics(success=True), - ) - mock_ai_provider.invoke_structured_model.return_value = mock_response - tracker.track_metrics_of = AsyncMock(return_value=mock_response) - - judge = Judge(judge_config_with_key, tracker, mock_ai_provider) - result = await judge.evaluate("input", "output") - - assert result is not None - assert result.success is True - assert result.evals['$ld:ai:judge:relevance'].score == 0.7 - @pytest.mark.asyncio async def test_evaluate_handles_missing_evaluation_in_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider ): """Evaluate should handle missing evaluation in response.""" mock_response = StructuredResponse( - data={ - 'evaluations': { - 'wrong-key': { - 'score': 0.5, - 'reasoning': 'Some reasoning' - } - } - }, - raw_response='{"evaluations": {...}}', + data={}, + raw_response='{}', metrics=LDAIMetrics(success=True) ) @@ -271,14 +228,12 @@ async def test_evaluate_handles_invalid_score( """Evaluate should handle invalid score values.""" 
mock_response = StructuredResponse( data={ - 'evaluations': { - '$ld:ai:judge:relevance': { - 'score': 1.5, - 'reasoning': 'Some reasoning' - } + 'evaluation': { + 'score': 1.5, + 'reasoning': 'Some reasoning' } }, - raw_response='{"evaluations": {...}}', + raw_response='{"evaluation": {"score": 1.5, "reasoning": "..."}}', metrics=LDAIMetrics(success=True) ) @@ -300,13 +255,11 @@ async def test_evaluate_handles_missing_reasoning( """Evaluate should handle missing reasoning.""" mock_response = StructuredResponse( data={ - 'evaluations': { - '$ld:ai:judge:relevance': { - 'score': 0.8, - } + 'evaluation': { + 'score': 0.8, } }, - raw_response='{"evaluations": {...}}', + raw_response='{"evaluation": {"score": 0.8}}', metrics=LDAIMetrics(success=True) ) @@ -364,14 +317,12 @@ async def test_evaluate_messages_calls_evaluate( mock_response = StructuredResponse( data={ - 'evaluations': { - '$ld:ai:judge:relevance': { - 'score': 0.9, - 'reasoning': 'Very relevant' - } + 'evaluation': { + 'score': 0.9, + 'reasoning': 'Very relevant' } }, - raw_response='{"evaluations": {...}}', + raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}', metrics=LDAIMetrics(success=True) ) From 1d78f8ad8e1799ab7175668f7058e546e3837683 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 12 Mar 2026 13:33:20 -0500 Subject: [PATCH 04/11] include raw response and collect judge metric tokens --- .../src/ldai_langchain/langchain_provider.py | 61 ++++++++++--------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py index f4fa62d..5f921f6 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py @@ -83,9 +83,17 @@ async def invoke_structured_model( :param response_structure: Dictionary defining the output structure :return: StructuredResponse containing the structured data """ + structured_response = StructuredResponse( + data={}, + raw_response='', + metrics=LDAIMetrics( + success=False, + usage=TokenUsage(total=0, input=0, output=0), + ), + ) try: langchain_messages = LangChainProvider.convert_messages_to_langchain(messages) - structured_llm = self._llm.with_structured_output(response_structure) + structured_llm = self._llm.with_structured_output(response_structure, include_raw=True) response = await structured_llm.ainvoke(langchain_messages) if not isinstance(response, dict): @@ -93,34 +101,25 @@ async def invoke_structured_model( f'Structured output did not return a dict. 
' f'Got: {type(response)}' ) - return StructuredResponse( - data={}, - raw_response='', - metrics=LDAIMetrics( - success=False, - usage=TokenUsage(total=0, input=0, output=0), - ), - ) - - return StructuredResponse( - data=response, - raw_response=str(response), - metrics=LDAIMetrics( - success=True, - usage=TokenUsage(total=0, input=0, output=0), - ), - ) + return structured_response + + raw_response = response.get('raw') + if raw_response is not None: + if hasattr(raw_response, 'content'): + structured_response.raw_response = raw_response.content + structured_response.metrics = LangChainProvider.get_ai_metrics_from_response(raw_response) + + if response.get('parsing_error'): + log.warning(f'LangChain structured model invocation had a parsing error') + structured_response.metrics.success = False + return structured_response + + structured_response.metrics.success = True + structured_response.data = response.get('parsed') or {} + return structured_response except Exception as error: log.warning(f'LangChain structured model invocation failed: {error}') - - return StructuredResponse( - data={}, - raw_response='', - metrics=LDAIMetrics( - success=False, - usage=TokenUsage(total=0, input=0, output=0), - ), - ) + return structured_response def get_chat_model(self) -> BaseChatModel: """ @@ -169,7 +168,13 @@ def get_ai_metrics_from_response(response: BaseMessage) -> LDAIMetrics: """ # Extract token usage if available usage: Optional[TokenUsage] = None - if hasattr(response, 'response_metadata') and response.response_metadata: + if hasattr(response, 'usage_metadata') and response.usage_metadata: + usage = TokenUsage( + total=response.usage_metadata.get('total_tokens', 0), + input=response.usage_metadata.get('input_tokens', 0), + output=response.usage_metadata.get('output_tokens', 0), + ) + if not usage and hasattr(response, 'response_metadata') and response.response_metadata: token_usage = response.response_metadata.get('tokenUsage') or response.response_metadata.get('token_usage') if token_usage: usage = TokenUsage( From 75f75cf38810ba4f5a4b0fb40f418323ed65b7bc Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Fri, 13 Mar 2026 17:06:55 -0500 Subject: [PATCH 05/11] fix bedrock models not running properly in langchain --- .../src/ldai_langchain/langchain_provider.py | 17 +++++++++++------ .../tests/test_langchain_provider.py | 13 +++++++++++-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py index 5f921f6..cb4a01d 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py @@ -134,18 +134,18 @@ def map_provider(ld_provider_name: str) -> str: """ Map LaunchDarkly provider names to LangChain provider names. - This method enables seamless integration between LaunchDarkly's standardized - provider naming and LangChain's naming conventions. - :param ld_provider_name: LaunchDarkly provider name :return: LangChain-compatible provider name """ lowercased_name = ld_provider_name.lower() + # Bedrock is the only provider that uses "provider:model_family" (e.g. Bedrock:Anthropic). 
+ if lowercased_name.startswith('bedrock:'): + return 'bedrock_converse' mapping: Dict[str, str] = { 'gemini': 'google-genai', + 'bedrock': 'bedrock_converse', } - return mapping.get(lowercased_name, lowercased_name) @staticmethod @@ -232,10 +232,15 @@ def create_langchain_model(ai_config: AIConfigKind) -> BaseChatModel: model_name = model_dict.get('name', '') provider = provider_dict.get('name', '') - parameters = model_dict.get('parameters') or {} + parameters = dict(model_dict.get('parameters') or {}) + mapped_provider = LangChainProvider.map_provider(provider) + # Bedrock requires the foundation provider (e.g. Bedrock:Anthropic) passed in + # parameters separately from model_provider, which is used for LangChain routing. + if mapped_provider == 'bedrock_converse' and 'provider' not in parameters: + parameters['provider'] = provider return init_chat_model( model_name, - model_provider=LangChainProvider.map_provider(provider), + model_provider=mapped_provider, **parameters, ) diff --git a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py index 0c90a43..6bc6d98 100644 --- a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py @@ -130,6 +130,14 @@ def test_maps_gemini_to_google_genai(self): assert LangChainProvider.map_provider('Gemini') == 'google-genai' assert LangChainProvider.map_provider('GEMINI') == 'google-genai' + def test_maps_bedrock_and_model_families_to_bedrock_converse(self): + """Should map bedrock and bedrock:model_family to bedrock_converse.""" + assert LangChainProvider.map_provider('bedrock') == 'bedrock_converse' + assert LangChainProvider.map_provider('Bedrock:Anthropic') == 'bedrock_converse' + assert LangChainProvider.map_provider('bedrock:anthropic') == 'bedrock_converse' + assert LangChainProvider.map_provider('bedrock:amazon') == 'bedrock_converse' + assert LangChainProvider.map_provider('bedrock:cohere') == 'bedrock_converse' + def test_returns_provider_name_unchanged_for_unmapped_providers(self): """Should return provider name unchanged for unmapped providers.""" assert LangChainProvider.map_provider('openai') == 'openai' @@ -197,7 +205,8 @@ def mock_llm(self): @pytest.mark.asyncio async def test_returns_success_true_for_successful_invocation(self, mock_llm): """Should return success=True for successful invocation.""" - mock_response = {'result': 'structured data'} + parsed_data = {'result': 'structured data'} + mock_response = {'parsed': parsed_data, 'raw': None} mock_structured_llm = MagicMock() mock_structured_llm.ainvoke = AsyncMock(return_value=mock_response) mock_llm.with_structured_output = MagicMock(return_value=mock_structured_llm) @@ -208,7 +217,7 @@ async def test_returns_success_true_for_successful_invocation(self, mock_llm): result = await provider.invoke_structured_model(messages, response_structure) assert result.metrics.success is True - assert result.data == mock_response + assert result.data == parsed_data @pytest.mark.asyncio async def test_returns_success_false_when_structured_model_invocation_throws_error(self, mock_llm): From 1ed23cf91157a29c4ab5a4291924fe4d2c829bca Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Fri, 13 Mar 2026 22:54:55 -0500 Subject: [PATCH 06/11] fix lint issue --- .../src/ldai_langchain/langchain_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py index cb4a01d..32213aa 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py @@ -102,7 +102,7 @@ async def invoke_structured_model( f'Got: {type(response)}' ) return structured_response - + raw_response = response.get('raw') if raw_response is not None: if hasattr(raw_response, 'content'): From f6a92ccfdae5ff6176e366916e7a1d1714c657d9 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Mon, 16 Mar 2026 08:08:10 -0600 Subject: [PATCH 07/11] don't set success --- .../src/ldai_langchain/langchain_provider.py | 47 ++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py index 32213aa..2b89c4a 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py @@ -1,5 +1,6 @@ """LangChain implementation of AIProvider for LaunchDarkly AI SDK.""" +from tokenize import Token from typing import Any, Dict, List, Optional, Union from langchain_core.language_models.chat_models import BaseChatModel @@ -107,11 +108,10 @@ async def invoke_structured_model( if raw_response is not None: if hasattr(raw_response, 'content'): structured_response.raw_response = raw_response.content - structured_response.metrics = LangChainProvider.get_ai_metrics_from_response(raw_response) + structured_response.metrics.usage = LangChainProvider.get_ai_usage_from_response(raw_response) if response.get('parsing_error'): log.warning(f'LangChain structured model invocation had a parsing error') - structured_response.metrics.success = False return structured_response structured_response.metrics.success = True @@ -147,24 +147,14 @@ def map_provider(ld_provider_name: str) -> str: 'bedrock': 'bedrock_converse', } return mapping.get(lowercased_name, lowercased_name) - + @staticmethod - def get_ai_metrics_from_response(response: BaseMessage) -> LDAIMetrics: + def get_ai_usage_from_response(response: BaseMessage) -> TokenUsage: """ - Get AI metrics from a LangChain provider response. - - This method extracts token usage information and success status from LangChain responses - and returns a LaunchDarkly AIMetrics object. + Get token usage from a LangChain provider response. :param response: The response from the LangChain model - :return: LDAIMetrics with success status and token usage - - Example: - # Use with tracker.track_metrics_of for automatic tracking - response = await tracker.track_metrics_of( - lambda: llm.ainvoke(messages), - LangChainProvider.get_ai_metrics_from_response - ) + :return: TokenUsage with success status and token usage """ # Extract token usage if available usage: Optional[TokenUsage] = None @@ -183,6 +173,29 @@ def get_ai_metrics_from_response(response: BaseMessage) -> LDAIMetrics: output=token_usage.get('completionTokens', 0) or token_usage.get('completion_tokens', 0), ) + return usage + + @staticmethod + def get_ai_metrics_from_response(response: BaseMessage) -> LDAIMetrics: + """ + Get AI metrics from a LangChain provider response. 
+ + This method extracts token usage information and success status from LangChain responses + and returns a LaunchDarkly AIMetrics object. + + :param response: The response from the LangChain model + :return: LDAIMetrics with success status and token usage + + Example: + # Use with tracker.track_metrics_of for automatic tracking + response = await tracker.track_metrics_of( + lambda: llm.ainvoke(messages), + LangChainProvider.get_ai_metrics_from_response + ) + """ + # Extract token usage if available + usage = LangChainProvider.get_ai_usage_from_response(response) + return LDAIMetrics(success=True, usage=usage) @staticmethod @@ -238,7 +251,7 @@ def create_langchain_model(ai_config: AIConfigKind) -> BaseChatModel: # Bedrock requires the foundation provider (e.g. Bedrock:Anthropic) passed in # parameters separately from model_provider, which is used for LangChain routing. if mapped_provider == 'bedrock_converse' and 'provider' not in parameters: - parameters['provider'] = provider + parameters['provider'] = provider.removeprefix('bedrock:') return init_chat_model( model_name, model_provider=mapped_provider, From 554121bd80f51bf8d610b27a856e53b5ad21a49c Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Mon, 16 Mar 2026 12:43:26 -0700 Subject: [PATCH 08/11] address code review feedback --- .../src/ldai_langchain/langchain_provider.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py index 2b89c4a..cb4eb91 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py @@ -1,6 +1,5 @@ """LangChain implementation of AIProvider for LaunchDarkly AI SDK.""" -from tokenize import Token from typing import Any, Dict, List, Optional, Union from langchain_core.language_models.chat_models import BaseChatModel @@ -87,10 +86,7 @@ async def invoke_structured_model( structured_response = StructuredResponse( data={}, raw_response='', - metrics=LDAIMetrics( - success=False, - usage=TokenUsage(total=0, input=0, output=0), - ), + metrics=LDAIMetrics(success=False, usage=None), ) try: langchain_messages = LangChainProvider.convert_messages_to_langchain(messages) From f7f286a2a2010e9545e93ef1a96d938166ab2496 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Mon, 16 Mar 2026 12:55:16 -0700 Subject: [PATCH 09/11] lint fix --- .../src/ldai_langchain/langchain_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py index cb4eb91..702a6f0 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_provider.py @@ -143,7 +143,7 @@ def map_provider(ld_provider_name: str) -> str: 'bedrock': 'bedrock_converse', } return mapping.get(lowercased_name, lowercased_name) - + @staticmethod def get_ai_usage_from_response(response: BaseMessage) -> TokenUsage: """ From b4e31185640fcd26ce3128d3675637c0689a4f5e Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Mon, 16 Mar 2026 13:10:51 -0700 Subject: [PATCH 10/11] fix test --- .../server-ai-langchain/tests/test_langchain_provider.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) 
diff --git a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py index 6bc6d98..402faa6 100644 --- a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py @@ -235,8 +235,7 @@ async def test_returns_success_false_when_structured_model_invocation_throws_err assert result.metrics.success is False assert result.data == {} assert result.raw_response == '' - assert result.metrics.usage is not None - assert result.metrics.usage.total == 0 + assert result.metrics.usage is None class TestGetChatModel: From a303c3d173d90c1b04f0da69a170f031a0505e13 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Mon, 16 Mar 2026 13:37:45 -0700 Subject: [PATCH 11/11] simplify the structured output for judges further --- .../sdk/server-ai/src/ldai/judge/__init__.py | 9 ++- .../ldai/judge/evaluation_schema_builder.py | 32 ++++----- packages/sdk/server-ai/tests/test_judge.py | 70 +++++++------------ 3 files changed, 41 insertions(+), 70 deletions(-) diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index 2557e4d..280dc79 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -174,7 +174,7 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str: def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]: """ - Parses the structured evaluation response. Expects {"evaluation": {"score": n, "reasoning": "..."}}. + Parses the structured evaluation response. Expects {"score": n, "reasoning": "..."}. """ results: Dict[str, EvalScore] = {} metric_key = self._ai_config.evaluation_metric_key @@ -182,13 +182,12 @@ def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScor log.warn('Evaluation metric key is missing') return results - evaluation = data.get('evaluation') if isinstance(data, dict) else None - if not isinstance(evaluation, dict): + if not isinstance(data, dict): log.warn('Invalid response: missing or invalid evaluation') return results - score = evaluation.get('score') - reasoning = evaluation.get('reasoning') + score = data.get('score') + reasoning = data.get('reasoning') if not isinstance(score, (int, float)) or score < 0 or score > 1: log.warn(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive') return results diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py index d19bd48..3af679b 100644 --- a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py +++ b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py @@ -7,7 +7,7 @@ class EvaluationSchemaBuilder: """ Internal class for building evaluation response schemas. Not exported - only used internally by Judge. - Schema is a fixed shape: one "evaluation" object with score and reasoning. + Schema is a fixed shape: top-level score and reasoning. The judge config's evaluation_metric_key is only used when keying the result, not in the schema. """ @@ -20,7 +20,7 @@ def build() -> Dict[str, Any]: evaluation_metric_key. 
In practice the model returns JSON like: - {"evaluation": {"score": 0.85, "reasoning": "The response is accurate."}} + {"score": 0.85, "reasoning": "The response is accurate."} :return: Schema dictionary for structured output """ @@ -29,25 +29,17 @@ def build() -> Dict[str, Any]: 'description': 'Response containing an evaluation (score and reasoning).', 'type': 'object', 'properties': { - 'evaluation': { - 'type': 'object', - 'description': 'The evaluation result.', - 'properties': { - 'score': { - 'type': 'number', - 'minimum': 0, - 'maximum': 1, - 'description': 'Score between 0.0 and 1.0.', - }, - 'reasoning': { - 'type': 'string', - 'description': 'Reasoning behind the score.', - }, - }, - 'required': ['score', 'reasoning'], - 'additionalProperties': False, + 'score': { + 'type': 'number', + 'minimum': 0, + 'maximum': 1, + 'description': 'Score between 0.0 and 1.0.', + }, + 'reasoning': { + 'type': 'string', + 'description': 'Reasoning behind the score.', }, }, - 'required': ['evaluation'], + 'required': ['score', 'reasoning'], 'additionalProperties': False, } diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index 9ac3c64..7c0d378 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -109,10 +109,9 @@ def test_judge_initializes_with_evaluation_metric_key( assert judge._ai_config == judge_config_with_key assert judge._evaluation_response_structure is not None assert judge._evaluation_response_structure['title'] == 'EvaluationResponse' - assert judge._evaluation_response_structure['required'] == ['evaluation'] - eval_schema = judge._evaluation_response_structure['properties']['evaluation'] - assert eval_schema['required'] == ['score', 'reasoning'] - assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties'] + assert judge._evaluation_response_structure['required'] == ['score', 'reasoning'] + assert 'score' in judge._evaluation_response_structure['properties'] + assert 'reasoning' in judge._evaluation_response_structure['properties'] class TestJudgeEvaluate: @@ -149,12 +148,10 @@ async def test_evaluate_success_with_valid_response( """Evaluate should return JudgeResponse with valid evaluation.""" mock_response = StructuredResponse( data={ - 'evaluation': { - 'score': 0.85, - 'reasoning': 'The response is highly relevant to the input.' - } + 'score': 0.85, + 'reasoning': 'The response is highly relevant to the input.' 
}, - raw_response='{"evaluation": {"score": 0.85, "reasoning": "..."}}', + raw_response='{"score": 0.85, "reasoning": "..."}', metrics=LDAIMetrics(success=True) ) @@ -176,15 +173,13 @@ async def test_evaluate_success_with_valid_response( async def test_evaluate_success_with_evaluation_response_shape( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider ): - """Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric.""" + """Evaluate should accept shape { score, reasoning } and key by metric.""" mock_response = StructuredResponse( data={ - 'evaluation': { - 'score': 0.9, - 'reasoning': 'The response is accurate and complete.', - } + 'score': 0.9, + 'reasoning': 'The response is accurate and complete.', }, - raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}', + raw_response='{"score": 0.9, "reasoning": "..."}', metrics=LDAIMetrics(success=True), ) mock_ai_provider.invoke_structured_model.return_value = mock_response @@ -203,7 +198,7 @@ async def test_evaluate_success_with_evaluation_response_shape( async def test_evaluate_handles_missing_evaluation_in_response( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider ): - """Evaluate should handle missing evaluation in response.""" + """Evaluate should handle missing score/reasoning in response.""" mock_response = StructuredResponse( data={}, raw_response='{}', @@ -228,12 +223,10 @@ async def test_evaluate_handles_invalid_score( """Evaluate should handle invalid score values.""" mock_response = StructuredResponse( data={ - 'evaluation': { - 'score': 1.5, - 'reasoning': 'Some reasoning' - } + 'score': 1.5, + 'reasoning': 'Some reasoning' }, - raw_response='{"evaluation": {"score": 1.5, "reasoning": "..."}}', + raw_response='{"score": 1.5, "reasoning": "..."}', metrics=LDAIMetrics(success=True) ) @@ -254,12 +247,8 @@ async def test_evaluate_handles_missing_reasoning( ): """Evaluate should handle missing reasoning.""" mock_response = StructuredResponse( - data={ - 'evaluation': { - 'score': 0.8, - } - }, - raw_response='{"evaluation": {"score": 0.8}}', + data={'score': 0.8}, + raw_response='{"score": 0.8}', metrics=LDAIMetrics(success=True) ) @@ -316,13 +305,8 @@ async def test_evaluate_messages_calls_evaluate( from ldai.providers.types import ChatResponse mock_response = StructuredResponse( - data={ - 'evaluation': { - 'score': 0.9, - 'reasoning': 'Very relevant' - } - }, - raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}', + data={'score': 0.9, 'reasoning': 'Very relevant'}, + raw_response='{"score": 0.9, "reasoning": "..."}', metrics=LDAIMetrics(success=True) ) @@ -351,21 +335,17 @@ class TestEvaluationSchemaBuilder: """Tests for EvaluationSchemaBuilder.""" def test_build_creates_correct_schema(self): - """Schema builder should create fixed schema (evaluation with score + reasoning, no key param).""" + """Schema builder should create fixed schema (top-level score + reasoning, no key param).""" schema = EvaluationSchemaBuilder.build() assert schema['title'] == 'EvaluationResponse' assert schema['type'] == 'object' - assert schema['required'] == ['evaluation'] - assert 'evaluation' in schema['properties'] - eval_schema = schema['properties']['evaluation'] - assert eval_schema['type'] == 'object' - assert eval_schema['required'] == ['score', 'reasoning'] - assert 'score' in eval_schema['properties'] - assert 'reasoning' in eval_schema['properties'] - assert eval_schema['properties']['score']['type'] == 'number' - assert 
eval_schema['properties']['score']['minimum'] == 0 - assert eval_schema['properties']['score']['maximum'] == 1 + assert schema['required'] == ['score', 'reasoning'] + assert 'score' in schema['properties'] + assert 'reasoning' in schema['properties'] + assert schema['properties']['score']['type'] == 'number' + assert schema['properties']['score']['minimum'] == 0 + assert schema['properties']['score']['maximum'] == 1 class TestJudgeConfigSerialization:
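
Note for reviewers (not part of the patches): the net effect of this series on the judge's structured output is small enough to show in one place. The standalone Python sketch below reproduces the fixed, flat schema from evaluation_schema_builder.py as of PATCH 11 and mirrors the validation performed in _parse_evaluation_response; the names EVALUATION_SCHEMA and parse_flat_evaluation are illustrative only, and plain tuples stand in for the SDK's EvalScore type. The judge still keys the parsed result by its config's evaluation_metric_key; that key never appears in the schema, which is the point of the series.

# Standalone sketch of the judge's fixed structured-output contract.
from typing import Any, Dict, Optional, Tuple

# Exactly the schema built by EvaluationSchemaBuilder.build() after PATCH 11.
EVALUATION_SCHEMA: Dict[str, Any] = {
    'title': 'EvaluationResponse',
    'description': 'Response containing an evaluation (score and reasoning).',
    'type': 'object',
    'properties': {
        'score': {
            'type': 'number',
            'minimum': 0,
            'maximum': 1,
            'description': 'Score between 0.0 and 1.0.',
        },
        'reasoning': {
            'type': 'string',
            'description': 'Reasoning behind the score.',
        },
    },
    'required': ['score', 'reasoning'],
    'additionalProperties': False,  # required for OpenAI structured outputs (PATCH 02)
}


def parse_flat_evaluation(data: Dict[str, Any]) -> Optional[Tuple[float, str]]:
    """Validate the flat {"score": n, "reasoning": "..."} payload the way the judge does.

    Illustrative helper only; it mirrors _parse_evaluation_response but returns a
    (score, reasoning) tuple instead of an EvalScore keyed by evaluation_metric_key.
    """
    if not isinstance(data, dict):
        return None
    score = data.get('score')
    reasoning = data.get('reasoning')
    # Score must be numeric and within [0, 1] inclusive.
    if not isinstance(score, (int, float)) or not 0 <= score <= 1:
        return None
    # Reasoning must be a string.
    if not isinstance(reasoning, str):
        return None
    return float(score), reasoning


if __name__ == '__main__':
    # The model is expected to return top-level score and reasoning; the judge then
    # stores the result under its config's metric key, e.g. '$ld:ai:judge:relevance'.
    sample = {'score': 0.85, 'reasoning': 'The response is accurate.'}
    print(parse_flat_evaluation(sample))  # -> (0.85, 'The response is accurate.')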