diff --git a/samples/calculator/evaluations/eval-sets/eval-level-expected-output-llm-judge.json b/samples/calculator/evaluations/eval-sets/eval-level-expected-output-llm-judge.json new file mode 100644 index 000000000..6e8449de5 --- /dev/null +++ b/samples/calculator/evaluations/eval-sets/eval-level-expected-output-llm-judge.json @@ -0,0 +1,45 @@ +{ + "version": "1.0", + "id": "EvalLevelExpectedOutputLLMJudgeSet", + "name": "Evaluation-Level expectedOutput with LLM Judge", + "evaluatorRefs": [ + "ExactMatchEvaluator", + "LLMJudgeOutputEvaluator" + ], + "evaluations": [ + { + "id": "eval-level-llm-judge-add", + "name": "LLM Judge uses eval-level expectedOutput (addition)", + "inputs": { + "a": 3, + "b": 7, + "operator": "+" + }, + "expectedOutput": { + "result": 10.0 + }, + "expectedAgentBehavior": "The agent should correctly add the two numbers and return the result.", + "evaluationCriterias": { + "ExactMatchEvaluator": null, + "LLMJudgeOutputEvaluator": null + } + }, + { + "id": "eval-level-llm-judge-multiply", + "name": "LLM Judge uses eval-level expectedOutput (multiplication)", + "inputs": { + "a": 6, + "b": 8, + "operator": "*" + }, + "expectedOutput": { + "result": 48.0 + }, + "expectedAgentBehavior": "The agent should correctly multiply the two numbers and return the result.", + "evaluationCriterias": { + "ExactMatchEvaluator": null, + "LLMJudgeOutputEvaluator": null + } + } + ] +} diff --git a/samples/calculator/evaluations/eval-sets/eval-level-expected-output.json b/samples/calculator/evaluations/eval-sets/eval-level-expected-output.json new file mode 100644 index 000000000..3d4b2f159 --- /dev/null +++ b/samples/calculator/evaluations/eval-sets/eval-level-expected-output.json @@ -0,0 +1,100 @@ +{ + "version": "1.0", + "id": "EvalLevelExpectedOutputSet", + "name": "Evaluation-Level expectedOutput Tests", + "evaluatorRefs": [ + "ExactMatchEvaluator", + "JsonSimilarityEvaluator", + "ContainsEvaluator" + ], + "evaluations": [ + { + "id": "eval-level-null-criteria-add", + "name": "Eval-level expectedOutput with null criteria (addition)", + "inputs": { + "a": 2, + "b": 3, + "operator": "+" + }, + "expectedOutput": { + "result": 5.0 + }, + "evaluationCriterias": { + "ExactMatchEvaluator": null, + "JsonSimilarityEvaluator": null, + "ContainsEvaluator": { + "searchText": "5" + } + } + }, + { + "id": "eval-level-null-criteria-multiply", + "name": "Eval-level expectedOutput with null criteria (multiplication)", + "inputs": { + "a": 4, + "b": 5, + "operator": "*" + }, + "expectedOutput": { + "result": 20.0 + }, + "evaluationCriterias": { + "ExactMatchEvaluator": null, + "JsonSimilarityEvaluator": null, + "ContainsEvaluator": { + "searchText": "20" + } + } + }, + { + "id": "eval-level-per-evaluator-override", + "name": "Per-evaluator expectedOutput overrides eval-level", + "inputs": { + "a": 10, + "b": 5, + "operator": "-" + }, + "expectedOutput": { + "result": 5.0 + }, + "evaluationCriterias": { + "ExactMatchEvaluator": { + "expectedOutput": { + "result": 5.0 + } + }, + "JsonSimilarityEvaluator": { + "expectedOutput": { + "result": 5.0 + } + }, + "ContainsEvaluator": { + "searchText": "5" + } + } + }, + { + "id": "eval-level-mixed-null-and-explicit", + "name": "Mixed: some evaluators null, some explicit", + "inputs": { + "a": 7, + "b": 3, + "operator": "+" + }, + "expectedOutput": { + "result": 10.0 + }, + "evaluationCriterias": { + "ExactMatchEvaluator": null, + "JsonSimilarityEvaluator": { + "expectedOutput": { + "result": 10.0 + } + }, + "ContainsEvaluator": { + "searchText": "10" + } + } 
+ } + ] +} diff --git a/src/uipath/eval/models/evaluation_set.py b/src/uipath/eval/models/evaluation_set.py index 925aa7c58..89356f474 100644 --- a/src/uipath/eval/models/evaluation_set.py +++ b/src/uipath/eval/models/evaluation_set.py @@ -81,6 +81,9 @@ class EvaluationItem(BaseModel): id: str name: str inputs: dict[str, Any] + expected_output: dict[str, Any] | str | None = Field( + default=None, alias="expectedOutput" + ) evaluation_criterias: dict[str, dict[str, Any] | None] = Field( ..., alias="evaluationCriterias" ) diff --git a/src/uipath/eval/runtime/runtime.py b/src/uipath/eval/runtime/runtime.py index b8589dee3..4ed305f1d 100644 --- a/src/uipath/eval/runtime/runtime.py +++ b/src/uipath/eval/runtime/runtime.py @@ -46,6 +46,7 @@ from .._execution_context import ExecutionSpanCollector from ..evaluators.base_evaluator import GenericBaseEvaluator +from ..evaluators.output_evaluator import OutputEvaluationCriteria from ..mocks._cache_manager import CacheManager from ..mocks._input_mocker import ( generate_llm_input, @@ -549,6 +550,22 @@ async def _execute_eval( continue evaluation_criteria = eval_item.evaluation_criterias[evaluator.id] + # Inject eval-level expectedOutput for output-based evaluators + if eval_item.expected_output is not None and issubclass( + evaluator.evaluation_criteria_type, + OutputEvaluationCriteria, + ): + if evaluation_criteria is None: + evaluation_criteria = { + "expectedOutput": eval_item.expected_output + } + elif "expectedOutput" not in evaluation_criteria: + evaluation_criteria = { + **evaluation_criteria, + "expectedOutput": eval_item.expected_output, + } + # else: per-evaluator expectedOutput takes precedence + evaluation_result = await self.run_evaluator( evaluator=evaluator, execution_output=agent_execution_output, diff --git a/testcases/eval-level-expected-output/pyproject.toml b/testcases/eval-level-expected-output/pyproject.toml new file mode 100644 index 000000000..73286e014 --- /dev/null +++ b/testcases/eval-level-expected-output/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "eval-level-expected-output" +version = "0.0.1" +description = "Tests for evaluation-level expectedOutput on EvaluationItem" +authors = [{ name = "John Doe", email = "john.doe@myemail.com" }] +dependencies = [ + "uipath", +] +requires-python = ">=3.11" + +[tool.uv.sources] +uipath = { path = "../../", editable = true } diff --git a/testcases/eval-level-expected-output/run.sh b/testcases/eval-level-expected-output/run.sh new file mode 100755 index 000000000..e62959d23 --- /dev/null +++ b/testcases/eval-level-expected-output/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -e + +echo "Syncing dependencies..." +uv sync + +echo "Authenticating with UiPath..." +uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL" + +echo "Running eval-level expectedOutput evaluations (deterministic evaluators)..." +uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/eval-level-expected-output.json --no-report --output-file eval-level-expected-output.json + +echo "Running eval-level expectedOutput evaluations (LLM judge)..." +uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/eval-level-expected-output-llm-judge.json --no-report --output-file eval-level-expected-output-llm-judge.json + +echo "Test completed successfully!" 
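For reference, the precedence rule that the runtime.py hunk above applies when building criteria for output-based evaluators can be distilled into a small standalone sketch. The helper name and the simplified dict-based signature below are illustrative only (they are not part of this change); the real code additionally gates the merge on issubclass(evaluator.evaluation_criteria_type, OutputEvaluationCriteria), so evaluators such as ContainsEvaluator are left untouched.

from typing import Any


def resolve_expected_output(
    eval_level: dict[str, Any] | str | None,
    criteria: dict[str, Any] | None,
) -> dict[str, Any] | None:
    # Illustrative only: mirrors the merge in _execute_eval for evaluators whose
    # criteria type derives from OutputEvaluationCriteria.
    if eval_level is None:
        return criteria  # nothing to inject
    if criteria is None:
        return {"expectedOutput": eval_level}  # eval-level fills the gap
    if "expectedOutput" not in criteria:
        return {**criteria, "expectedOutput": eval_level}  # merge, keep other fields
    return criteria  # per-evaluator expectedOutput takes precedence


# The three situations exercised by the sample eval sets and the unit tests:
assert resolve_expected_output({"result": 5.0}, None) == {
    "expectedOutput": {"result": 5.0}
}
assert resolve_expected_output({"result": 4}, {"someOtherField": "value"}) == {
    "someOtherField": "value",
    "expectedOutput": {"result": 4},
}
assert resolve_expected_output(
    {"result": "wrong"}, {"expectedOutput": {"result": 4}}
) == {"expectedOutput": {"result": 4}}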
diff --git a/testcases/eval-level-expected-output/src/assert.py b/testcases/eval-level-expected-output/src/assert.py new file mode 100644 index 000000000..0ce7af9f1 --- /dev/null +++ b/testcases/eval-level-expected-output/src/assert.py @@ -0,0 +1,148 @@ +"""Assertions for eval-level-expected-output testcase. + +Validates that evaluation-level expectedOutput is correctly injected +into output-based evaluators (ExactMatch, JsonSimilarity, LLMJudge) +while non-output evaluators (Contains) remain unaffected. +""" + +import json +import os + +# Evaluators expected in the deterministic eval set +DETERMINISTIC_EVALUATORS = { + "ExactMatchEvaluator", + "JsonSimilarityEvaluator", + "ContainsEvaluator", +} + +# Evaluators expected in the LLM judge eval set +LLM_JUDGE_EVALUATORS = { + "ExactMatchEvaluator", + "LLMJudgeOutputEvaluator", +} + +# Evaluations in the deterministic eval set +DETERMINISTIC_EVALUATIONS = { + "Eval-level expectedOutput with null criteria (addition)", + "Eval-level expectedOutput with null criteria (multiplication)", + "Per-evaluator expectedOutput overrides eval-level", + "Mixed: some evaluators null, some explicit", +} + +# Evaluations in the LLM judge eval set +LLM_JUDGE_EVALUATIONS = { + "LLM Judge uses eval-level expectedOutput (addition)", + "LLM Judge uses eval-level expectedOutput (multiplication)", +} + + +def validate_output_file( + output_file: str, + expected_evaluations: set[str], + expected_evaluators: set[str], + min_score: float = 0.99, +) -> None: + """Validate an evaluation output file. + + Args: + output_file: Path to the evaluation output JSON file. + expected_evaluations: Set of evaluation names to expect. + expected_evaluators: Set of evaluator IDs/names to expect. + min_score: Minimum acceptable score for all evaluators. 
+ """ + assert os.path.isfile(output_file), f"Output file '{output_file}' not found" + print(f" Found output file: {output_file}") + + with open(output_file, "r", encoding="utf-8") as f: + output_data = json.load(f) + + assert "evaluationSetResults" in output_data, "Missing 'evaluationSetResults'" + + evaluation_results = output_data["evaluationSetResults"] + assert len(evaluation_results) > 0, "No evaluation results found" + print(f" Found {len(evaluation_results)} evaluation result(s)") + + failed_count = 0 + seen_evaluations: set[str] = set() + seen_evaluators: set[str] = set() + + for eval_result in evaluation_results: + eval_name = eval_result.get("evaluationName", "Unknown") + seen_evaluations.add(eval_name) + print(f"\n Validating: {eval_name}") + + eval_run_results = eval_result.get("evaluationRunResults", []) + assert len(eval_run_results) > 0, f"No run results for '{eval_name}'" + + for eval_run in eval_run_results: + evaluator_id = eval_run.get("evaluatorId", "Unknown") + evaluator_name = eval_run.get("evaluatorName", evaluator_id) + result = eval_run.get("result", {}) + score = result.get("score") + + seen_evaluators.add(evaluator_id) + + is_passing = False + if score is True: + is_passing = True + elif isinstance(score, (int, float)) and score >= min_score: + is_passing = True + + if is_passing: + display = f"{score:.2f}" if isinstance(score, float) else str(score) + print(f" {evaluator_name}: score={display} (pass)") + else: + print( + f" {evaluator_name}: score={score} " + f"(FAILED - expected >= {min_score})" + ) + failed_count += 1 + + # Verify all expected evaluations were seen + missing_evals = expected_evaluations - seen_evaluations + if missing_evals: + print(f"\n Missing evaluations: {missing_evals}") + failed_count += len(missing_evals) + + # Verify all expected evaluators were seen + missing_evaluators = expected_evaluators - seen_evaluators + if missing_evaluators: + print(f"\n Missing evaluators: {missing_evaluators}") + failed_count += len(missing_evaluators) + + print(f"\n{'=' * 60}") + print(f" Failed: {failed_count}") + print(f"{'=' * 60}") + + assert failed_count == 0, f"{failed_count} assertion(s) failed for {output_file}" + print(f"\n All assertions passed for {output_file}!") + + +def main() -> None: + """Main assertion logic.""" + # 1. Validate deterministic evaluators (ExactMatch, JsonSimilarity, Contains) + # All scores should be >= 0.99 since these are deterministic calculations + print("\n--- Deterministic Evaluators ---") + validate_output_file( + "eval-level-expected-output.json", + expected_evaluations=DETERMINISTIC_EVALUATIONS, + expected_evaluators=DETERMINISTIC_EVALUATORS, + min_score=0.99, + ) + + # 2. 
Validate LLM judge evaluators + # ExactMatch should score >= 0.99, LLM judge scores can vary + # but should be > 0 (semantically correct answers) + print("\n--- LLM Judge Evaluators ---") + validate_output_file( + "eval-level-expected-output-llm-judge.json", + expected_evaluations=LLM_JUDGE_EVALUATIONS, + expected_evaluators=LLM_JUDGE_EVALUATORS, + min_score=0.5, # LLM judge scores can vary, but should be well above 0 + ) + + print("\n All eval-level expectedOutput assertions passed!") + + +if __name__ == "__main__": + main() diff --git a/testcases/eval-level-expected-output/uipath.json b/testcases/eval-level-expected-output/uipath.json new file mode 100644 index 000000000..2b8e5b396 --- /dev/null +++ b/testcases/eval-level-expected-output/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "../../samples/calculator/main.py:main" + } +} diff --git a/tests/evaluators/test_eval_level_expected_output.py b/tests/evaluators/test_eval_level_expected_output.py new file mode 100644 index 000000000..1c2b8fd44 --- /dev/null +++ b/tests/evaluators/test_eval_level_expected_output.py @@ -0,0 +1,761 @@ +"""Tests for evaluation-level expectedOutput schema enhancement. + +Tests the new optional `expectedOutput` field on EvaluationItem and the +runtime criteria resolution logic that injects it into output-based evaluators. +""" + +import uuid +from typing import Any + +import pytest +from pytest_mock.plugin import MockerFixture + +from uipath.eval.evaluators.contains_evaluator import ( + ContainsEvaluationCriteria, + ContainsEvaluator, +) +from uipath.eval.evaluators.exact_match_evaluator import ExactMatchEvaluator +from uipath.eval.evaluators.json_similarity_evaluator import ( + JsonSimilarityEvaluator, +) +from uipath.eval.evaluators.llm_as_judge_evaluator import LLMJudgeJustification +from uipath.eval.evaluators.llm_judge_output_evaluator import ( + LLMJudgeOutputEvaluator, +) +from uipath.eval.evaluators.output_evaluator import OutputEvaluationCriteria +from uipath.eval.models import NumericEvaluationResult +from uipath.eval.models.evaluation_set import ( + EvaluationItem, + EvaluationSet, +) +from uipath.eval.models.models import AgentExecution + +# ───────────────────────────────────────────────────────────────── +# Model Tests +# ───────────────────────────────────────────────────────────────── + + +class TestEvaluationItemExpectedOutput: + """Test the new expectedOutput field on EvaluationItem.""" + + def test_evaluation_item_with_dict_expected_output(self) -> None: + """EvaluationItem with dict expectedOutput parses correctly.""" + item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "2+2"}, + "expectedOutput": {"result": 4}, + "evaluationCriterias": { + "exact-match": None, + }, + } + ) + assert item.expected_output == {"result": 4} + + def test_evaluation_item_with_string_expected_output(self) -> None: + """EvaluationItem with string expectedOutput parses correctly.""" + item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "hello"}, + "expectedOutput": "Hello World", + "evaluationCriterias": { + "exact-match": None, + }, + } + ) + assert item.expected_output == "Hello World" + + def test_evaluation_item_without_expected_output(self) -> None: + """EvaluationItem without expectedOutput still works (backward compat).""" + item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "2+2"}, + "evaluationCriterias": { + 
"exact-match": {"expectedOutput": {"result": 4}}, + }, + } + ) + assert item.expected_output is None + + def test_evaluation_item_with_null_expected_output(self) -> None: + """EvaluationItem with explicit null expectedOutput parses as None.""" + item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "2+2"}, + "expectedOutput": None, + "evaluationCriterias": { + "exact-match": {"expectedOutput": {"result": 4}}, + }, + } + ) + assert item.expected_output is None + + def test_evaluation_item_serialization_roundtrip_with_expected_output( + self, + ) -> None: + """Serialization roundtrip preserves expectedOutput.""" + original_data = { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "2+2"}, + "expectedOutput": {"result": 4}, + "evaluationCriterias": { + "exact-match": None, + }, + } + item = EvaluationItem.model_validate(original_data) + serialized = item.model_dump(by_alias=True, exclude_none=True) + + assert serialized["expectedOutput"] == {"result": 4} + + # Roundtrip + item2 = EvaluationItem.model_validate(serialized) + assert item2.expected_output == {"result": 4} + + def test_evaluation_item_serialization_omits_none_expected_output(self) -> None: + """Serialization omits expectedOutput when None and exclude_none=True.""" + item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "2+2"}, + "evaluationCriterias": {"exact-match": None}, + } + ) + serialized = item.model_dump(by_alias=True, exclude_none=True) + assert "expectedOutput" not in serialized + + def test_evaluation_set_with_evaluation_level_expected_output(self) -> None: + """EvaluationSet with evaluation-level expectedOutput parses correctly.""" + evaluation_set = EvaluationSet.model_validate( + { + "id": "set-1", + "name": "Test Set", + "version": "1.0", + "evaluatorConfigs": ["exact-match"], + "evaluations": [ + { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "2+2"}, + "expectedOutput": {"result": 4}, + "evaluationCriterias": { + "exact-match": None, + }, + } + ], + } + ) + assert evaluation_set.evaluations[0].expected_output == {"result": 4} + + def test_evaluation_item_with_python_field_name(self) -> None: + """EvaluationItem works with Python field name (populate_by_name=True).""" + item = EvaluationItem( + id="eval-1", + name="Test Evaluation", + inputs={"query": "2+2"}, + expected_output={"result": 4}, + evaluation_criterias={"exact-match": None}, + ) + assert item.expected_output == {"result": 4} + + +# ───────────────────────────────────────────────────────────────── +# Runtime Criteria Resolution Tests +# ───────────────────────────────────────────────────────────────── + + +class TestRuntimeCriteriaResolution: + """Test the runtime criteria merge logic for evaluation-level expectedOutput. + + These tests verify the merge logic directly against evaluators, + simulating what runtime.py does when building typed criteria. + """ + + def _build_criteria( + self, + evaluator: Any, + evaluation_item: EvaluationItem, + evaluator_id: str, + ) -> Any: + """Simulate the runtime's criteria resolution logic. + + This mirrors the logic in runtime.py _execute_eval(). 
+ """ + if evaluator_id not in evaluation_item.evaluation_criterias: + return None + + evaluation_criteria = evaluation_item.evaluation_criterias[evaluator_id] + + # Inject evaluation-level expectedOutput for output-based evaluators + if evaluation_item.expected_output is not None and issubclass( + evaluator.evaluation_criteria_type, + OutputEvaluationCriteria, + ): + if evaluation_criteria is None: + evaluation_criteria = { + "expectedOutput": evaluation_item.expected_output + } + elif "expectedOutput" not in evaluation_criteria: + evaluation_criteria = { + **evaluation_criteria, + "expectedOutput": evaluation_item.expected_output, + } + # else: per-evaluator expectedOutput takes precedence + + if evaluation_criteria: + return evaluator.evaluation_criteria_type(**evaluation_criteria) + return None + + @pytest.mark.asyncio + async def test_evaluation_level_used_when_criteria_is_null(self) -> None: + """When per-evaluator criteria is null, evaluation-level expectedOutput is injected.""" + evaluator_id = str(uuid.uuid4()) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": evaluator_id} + ) + evaluation_item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test", + "inputs": {"query": "2+2"}, + "expectedOutput": {"output": "Test output"}, + "evaluationCriterias": {evaluator_id: None}, + } + ) + + criteria = self._build_criteria(evaluator, evaluation_item, evaluator_id) + + assert criteria is not None + assert isinstance(criteria, OutputEvaluationCriteria) + assert criteria.expected_output == {"output": "Test output"} + + @pytest.mark.asyncio + async def test_per_evaluator_overrides_evaluation_level(self) -> None: + """Per-evaluator criteria expectedOutput overrides evaluation-level.""" + evaluator_id = str(uuid.uuid4()) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": evaluator_id} + ) + evaluation_item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test", + "inputs": {"query": "2+2"}, + "expectedOutput": {"result": "evaluation-level"}, + "evaluationCriterias": { + evaluator_id: {"expectedOutput": {"result": "per-evaluator"}} + }, + } + ) + + criteria = self._build_criteria(evaluator, evaluation_item, evaluator_id) + + assert criteria is not None + assert criteria.expected_output == {"result": "per-evaluator"} + + @pytest.mark.asyncio + async def test_evaluation_level_injected_when_criteria_lacks_expected_output( + self, + ) -> None: + """When criteria has other fields but no expectedOutput, evaluation-level is injected.""" + evaluator_id = str(uuid.uuid4()) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": evaluator_id} + ) + evaluation_item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test", + "inputs": {"query": "2+2"}, + "expectedOutput": {"result": 4}, + "evaluationCriterias": {evaluator_id: {"someOtherField": "value"}}, + } + ) + + criteria = self._build_criteria(evaluator, evaluation_item, evaluator_id) + + assert criteria is not None + assert criteria.expected_output == {"result": 4} + + @pytest.mark.asyncio + async def test_non_output_evaluator_unaffected_by_evaluation_level(self) -> None: + """Non-output evaluators (ContainsEvaluator) ignore evaluation-level expectedOutput.""" + evaluator_id = str(uuid.uuid4()) + evaluator = ContainsEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": evaluator_id} + ) + evaluation_item = EvaluationItem.model_validate( + { + "id": 
"eval-1", + "name": "Test", + "inputs": {"query": "hello"}, + "expectedOutput": {"result": "should be ignored"}, + "evaluationCriterias": {evaluator_id: {"searchText": "hello"}}, + } + ) + + criteria = self._build_criteria(evaluator, evaluation_item, evaluator_id) + + assert criteria is not None + assert isinstance(criteria, ContainsEvaluationCriteria) + assert criteria.search_text == "hello" + # expectedOutput was NOT injected + assert not hasattr(criteria, "expected_output") + + @pytest.mark.asyncio + async def test_no_evaluation_level_expected_output_no_injection(self) -> None: + """When evaluation-level expectedOutput is None, no injection happens.""" + evaluator_id = str(uuid.uuid4()) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": evaluator_id} + ) + evaluation_item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test", + "inputs": {"query": "2+2"}, + "evaluationCriterias": {evaluator_id: None}, + } + ) + + criteria = self._build_criteria(evaluator, evaluation_item, evaluator_id) + + # No evaluation-level, no criteria -> None (will fall to default or error) + assert criteria is None + + @pytest.mark.asyncio + async def test_evaluation_level_string_expected_output(self) -> None: + """String evaluation-level expectedOutput is injected correctly.""" + evaluator_id = str(uuid.uuid4()) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": evaluator_id} + ) + evaluation_item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test", + "inputs": {"query": "hello"}, + "expectedOutput": "Hello World", + "evaluationCriterias": {evaluator_id: None}, + } + ) + + criteria = self._build_criteria(evaluator, evaluation_item, evaluator_id) + + assert criteria is not None + assert criteria.expected_output == "Hello World" + + @pytest.mark.asyncio + async def test_evaluation_level_empty_dict_expected_output(self) -> None: + """Empty dict evaluation-level expectedOutput is still treated as present.""" + evaluator_id = str(uuid.uuid4()) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": evaluator_id} + ) + evaluation_item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Test", + "inputs": {"query": "2+2"}, + "expectedOutput": {}, + "evaluationCriterias": {evaluator_id: None}, + } + ) + + criteria = self._build_criteria(evaluator, evaluation_item, evaluator_id) + + assert criteria is not None + assert criteria.expected_output == {} + + +# ───────────────────────────────────────────────────────────────── +# Evaluator Integration Tests +# ───────────────────────────────────────────────────────────────── + + +class TestExactMatchWithEvaluationLevelExpectedOutput: + """Test ExactMatchEvaluator with evaluation-level expectedOutput.""" + + @pytest.mark.asyncio + async def test_exact_match_with_evaluation_level_expected_output(self) -> None: + """ExactMatchEvaluator uses evaluation-level expectedOutput when criteria is null.""" + execution = AgentExecution( + agent_input={"query": "2+2"}, + agent_output={"result": 4}, + agent_trace=[], + ) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": str(uuid.uuid4())} + ) + + # Simulate runtime injection: evaluation-level expectedOutput -> criteria + criteria = OutputEvaluationCriteria( + expected_output={"result": 4} # pyright: ignore[reportCallIssue] + ) + result = await evaluator.evaluate(execution, criteria) + + assert isinstance(result, 
NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_exact_match_per_evaluator_overrides_evaluation_level(self) -> None: + """Per-evaluator expectedOutput overrides evaluation-level.""" + execution = AgentExecution( + agent_input={"query": "2+2"}, + agent_output={"result": 4}, + agent_trace=[], + ) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": str(uuid.uuid4())} + ) + + # Per-evaluator has different expectedOutput (mismatch) + criteria = OutputEvaluationCriteria( + expected_output={"result": 5} # pyright: ignore[reportCallIssue] + ) + result = await evaluator.evaluate(execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + + +class TestJsonSimilarityWithEvaluationLevelExpectedOutput: + """Test JsonSimilarityEvaluator with evaluation-level expectedOutput.""" + + @pytest.mark.asyncio + async def test_json_similarity_with_evaluation_level_expected_output(self) -> None: + """JsonSimilarityEvaluator uses evaluation-level expectedOutput.""" + execution = AgentExecution( + agent_input={"input": "Test"}, + agent_output={"name": "John", "age": 30, "city": "NYC"}, + agent_trace=[], + ) + evaluator = JsonSimilarityEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": str(uuid.uuid4())} + ) + + criteria = OutputEvaluationCriteria( + expected_output={"name": "John", "age": 30, "city": "NYC"} # pyright: ignore[reportCallIssue] + ) + result = await evaluator.evaluate(execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + +class TestLLMJudgeWithEvaluationLevelExpectedOutput: + """Test LLMJudgeOutputEvaluator with evaluation-level expectedOutput.""" + + @pytest.mark.asyncio + async def test_llm_judge_output_with_evaluation_level_expected_output( + self, mocker: MockerFixture + ) -> None: + """LLMJudgeOutputEvaluator uses evaluation-level expectedOutput.""" + mock_tool_call = mocker.MagicMock() + mock_tool_call.id = "call_1" + mock_tool_call.name = "submit_evaluation" + mock_tool_call.arguments = { + "score": 90, + "justification": "Output matches expected", + } + + mock_response = mocker.MagicMock() + mock_response.choices = [ + mocker.MagicMock( + message=mocker.MagicMock(content=None, tool_calls=[mock_tool_call]) + ) + ] + + async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: + return mock_response + + config = { + "name": "LlmJudgeTest", + "prompt": "Rate: {{ActualOutput}} vs {{ExpectedOutput}}", + "model": "gpt-4o", + } + evaluator = LLMJudgeOutputEvaluator.model_validate( + { + "evaluatorConfig": config, + "llm_service": mock_chat_completions, + "id": str(uuid.uuid4()), + } + ) + + execution = AgentExecution( + agent_input={"query": "test"}, + agent_output={"result": "test output"}, + agent_trace=[], + ) + + # Criteria built from evaluation-level expectedOutput + criteria = OutputEvaluationCriteria( + expected_output={"result": "test output"} # pyright: ignore[reportCallIssue] + ) + result = await evaluator.evaluate(execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.9 + assert isinstance(result.details, LLMJudgeJustification) + + +# ───────────────────────────────────────────────────────────────── +# Legacy Migration Compatibility Tests +# ───────────────────────────────────────────────────────────────── + + +class TestLegacyMigrationCompatibility: + """Test that legacy migration path is unaffected by the new field.""" + + 
def test_legacy_evaluation_set_still_migrates(self) -> None: + """Legacy evaluation set without evaluation-level expectedOutput migrates correctly.""" + from uipath.eval.helpers import discriminate_eval_set + + legacy_data = { + "id": "set-1", + "fileName": "test.json", + "name": "Legacy Set", + "evaluatorRefs": ["exact-match"], + "evaluations": [ + { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "2+2"}, + "expectedOutput": {"result": 4}, + "expectedAgentBehavior": "", + "evalSetId": "set-1", + "createdAt": "2024-01-01T00:00:00Z", + "updatedAt": "2024-01-01T00:00:00Z", + } + ], + "batchSize": 10, + "timeoutMinutes": 20, + "createdAt": "2024-01-01T00:00:00Z", + "updatedAt": "2024-01-01T00:00:00Z", + } + + result = discriminate_eval_set(legacy_data) + + # Should parse as LegacyEvaluationSet (no version field) + from uipath.eval.models.evaluation_set import LegacyEvaluationSet + + assert isinstance(result, LegacyEvaluationSet) + + def test_v1_evaluation_set_with_evaluation_level_expected_output(self) -> None: + """v1.0 evaluation set with evaluation-level expectedOutput parses correctly.""" + from uipath.eval.helpers import discriminate_eval_set + + v1_data = { + "id": "set-1", + "name": "V1 Set", + "version": "1.0", + "evaluatorConfigs": ["exact-match"], + "evaluations": [ + { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "2+2"}, + "expectedOutput": {"result": 4}, + "evaluationCriterias": { + "exact-match": None, + }, + } + ], + } + + result = discriminate_eval_set(v1_data) + + assert isinstance(result, EvaluationSet) + assert result.evaluations[0].expected_output == {"result": 4} + + def test_v1_evaluation_set_without_evaluation_level_expected_output(self) -> None: + """v1.0 evaluation set without evaluation-level expectedOutput still works.""" + from uipath.eval.helpers import discriminate_eval_set + + v1_data = { + "id": "set-1", + "name": "V1 Set", + "version": "1.0", + "evaluatorConfigs": ["exact-match"], + "evaluations": [ + { + "id": "eval-1", + "name": "Test Evaluation", + "inputs": {"query": "2+2"}, + "evaluationCriterias": { + "exact-match": {"expectedOutput": {"result": 4}}, + }, + } + ], + } + + result = discriminate_eval_set(v1_data) + + assert isinstance(result, EvaluationSet) + assert result.evaluations[0].expected_output is None + + +# ───────────────────────────────────────────────────────────────── +# End-to-End Criteria Resolution Tests +# ───────────────────────────────────────────────────────────────── + + +class TestEndToEndCriteriaResolution: + """End-to-end tests that simulate the full runtime flow.""" + + @pytest.mark.asyncio + async def test_e2e_exact_match_null_criteria_with_evaluation_level(self) -> None: + """Full flow: null criteria + evaluation-level -> ExactMatch evaluator gets expectedOutput.""" + evaluator_id = str(uuid.uuid4()) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": evaluator_id} + ) + evaluation_item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Calculator Test", + "inputs": {"query": "2+2"}, + "expectedOutput": {"result": 4}, + "evaluationCriterias": {evaluator_id: None}, + } + ) + execution = AgentExecution( + agent_input={"query": "2+2"}, + agent_output={"result": 4}, + agent_trace=[], + ) + + # Simulate runtime merge + evaluation_criteria = evaluation_item.evaluation_criterias[evaluator_id] + if evaluation_item.expected_output is not None and issubclass( + evaluator.evaluation_criteria_type, OutputEvaluationCriteria + ): + if 
evaluation_criteria is None: + evaluation_criteria = { + "expectedOutput": evaluation_item.expected_output + } + elif "expectedOutput" not in evaluation_criteria: + evaluation_criteria = { + **evaluation_criteria, + "expectedOutput": evaluation_item.expected_output, + } + + assert evaluation_criteria is not None + typed_criteria = evaluator.evaluation_criteria_type(**evaluation_criteria) + result = await evaluator.evaluate(execution, typed_criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_e2e_mixed_evaluators_with_evaluation_level(self) -> None: + """Multiple evaluators: output-based gets evaluation-level, non-output ignores it.""" + exact_match_id = str(uuid.uuid4()) + contains_id = str(uuid.uuid4()) + + exact_match_evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "ExactMatch"}, "id": exact_match_id} + ) + contains_evaluator = ContainsEvaluator.model_validate( + {"evaluatorConfig": {"name": "Contains"}, "id": contains_id} + ) + + evaluation_item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Mixed Test", + "inputs": {"query": "hello"}, + "expectedOutput": "Hello World", + "evaluationCriterias": { + exact_match_id: None, # Will get evaluation-level expectedOutput + contains_id: {"searchText": "Hello"}, # Unaffected + }, + } + ) + execution = AgentExecution( + agent_input={"query": "hello"}, + agent_output="Hello World", + agent_trace=[], + ) + + # Process exact-match (output-based) + em_criteria = evaluation_item.evaluation_criterias[exact_match_id] + if evaluation_item.expected_output is not None and issubclass( + exact_match_evaluator.evaluation_criteria_type, + OutputEvaluationCriteria, + ): + if em_criteria is None: + em_criteria = {"expectedOutput": evaluation_item.expected_output} + + assert em_criteria is not None + em_typed = exact_match_evaluator.evaluation_criteria_type(**em_criteria) + em_result = await exact_match_evaluator.evaluate(execution, em_typed) + assert em_result.score == 1.0 + + # Process contains (non-output-based) + c_criteria = evaluation_item.evaluation_criterias[contains_id] + assert not issubclass( + contains_evaluator.evaluation_criteria_type, + OutputEvaluationCriteria, + ) + assert c_criteria is not None + c_typed = contains_evaluator.evaluation_criteria_type(**c_criteria) + c_result = await contains_evaluator.evaluate(execution, c_typed) + assert c_result.score == 1.0 + + @pytest.mark.asyncio + async def test_e2e_per_evaluator_override_with_evaluation_level(self) -> None: + """Per-evaluator criteria overrides evaluation-level in full flow.""" + evaluator_id = str(uuid.uuid4()) + evaluator = ExactMatchEvaluator.model_validate( + {"evaluatorConfig": {"name": "Test"}, "id": evaluator_id} + ) + evaluation_item = EvaluationItem.model_validate( + { + "id": "eval-1", + "name": "Override Test", + "inputs": {"query": "2+2"}, + "expectedOutput": {"result": "wrong"}, + "evaluationCriterias": { + evaluator_id: {"expectedOutput": {"result": 4}} + }, + } + ) + execution = AgentExecution( + agent_input={"query": "2+2"}, + agent_output={"result": 4}, + agent_trace=[], + ) + + # Simulate runtime merge + evaluation_criteria = evaluation_item.evaluation_criterias[evaluator_id] + if evaluation_item.expected_output is not None and issubclass( + evaluator.evaluation_criteria_type, OutputEvaluationCriteria + ): + if evaluation_criteria is None: + evaluation_criteria = { + "expectedOutput": evaluation_item.expected_output + } + elif "expectedOutput" 
not in evaluation_criteria: + evaluation_criteria = { + **evaluation_criteria, + "expectedOutput": evaluation_item.expected_output, + } + # else: per-evaluator wins (this case) + + assert evaluation_criteria is not None + typed_criteria = evaluator.evaluation_criteria_type(**evaluation_criteria) + result = await evaluator.evaluate(execution, typed_criteria) + + # Per-evaluator says {"result": 4}, agent output is {"result": 4} -> match + assert result.score == 1.0 + # Verify evaluation-level "wrong" was NOT used + assert typed_criteria.expected_output == {"result": 4}
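Taken together, the new field lets an evaluation declare its expected output once instead of repeating it per evaluator. A hedged before/after sketch using the EvaluationItem model added in this change (the evaluator ids are placeholders, not required names):

from uipath.eval.models.evaluation_set import EvaluationItem

# Before this change: expectedOutput had to live inside each evaluator's criteria.
old_style = EvaluationItem.model_validate(
    {
        "id": "eval-1",
        "name": "Addition",
        "inputs": {"a": 2, "b": 3, "operator": "+"},
        "evaluationCriterias": {
            "ExactMatchEvaluator": {"expectedOutput": {"result": 5.0}},
            "JsonSimilarityEvaluator": {"expectedOutput": {"result": 5.0}},
        },
    }
)

# With this change: declare it once at the evaluation level and pass null criteria;
# the runtime injects it into every output-based evaluator.
new_style = EvaluationItem.model_validate(
    {
        "id": "eval-1",
        "name": "Addition",
        "inputs": {"a": 2, "b": 3, "operator": "+"},
        "expectedOutput": {"result": 5.0},
        "evaluationCriterias": {
            "ExactMatchEvaluator": None,
            "JsonSimilarityEvaluator": None,
        },
    }
)

assert old_style.expected_output is None
assert new_style.expected_output == {"result": 5.0}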