45 changes: 45 additions & 0 deletions samples/calculator/evaluations/eval-sets/eval-level-expected-output-llm-judge.json
@@ -0,0 +1,45 @@
{
"version": "1.0",
"id": "EvalLevelExpectedOutputLLMJudgeSet",
"name": "Evaluation-Level expectedOutput with LLM Judge",
"evaluatorRefs": [
"ExactMatchEvaluator",
"LLMJudgeOutputEvaluator"
],
"evaluations": [
{
"id": "eval-level-llm-judge-add",
"name": "LLM Judge uses eval-level expectedOutput (addition)",
"inputs": {
"a": 3,
"b": 7,
"operator": "+"
},
"expectedOutput": {
"result": 10.0
},
"expectedAgentBehavior": "The agent should correctly add the two numbers and return the result.",
"evaluationCriterias": {
"ExactMatchEvaluator": null,
"LLMJudgeOutputEvaluator": null
}
},
{
"id": "eval-level-llm-judge-multiply",
"name": "LLM Judge uses eval-level expectedOutput (multiplication)",
"inputs": {
"a": 6,
"b": 8,
"operator": "*"
},
"expectedOutput": {
"result": 48.0
},
"expectedAgentBehavior": "The agent should correctly multiply the two numbers and return the result.",
"evaluationCriterias": {
"ExactMatchEvaluator": null,
"LLMJudgeOutputEvaluator": null
}
}
]
}
100 changes: 100 additions & 0 deletions samples/calculator/evaluations/eval-sets/eval-level-expected-output.json
@@ -0,0 +1,100 @@
{
"version": "1.0",
"id": "EvalLevelExpectedOutputSet",
"name": "Evaluation-Level expectedOutput Tests",
"evaluatorRefs": [
"ExactMatchEvaluator",
"JsonSimilarityEvaluator",
"ContainsEvaluator"
],
"evaluations": [
{
"id": "eval-level-null-criteria-add",
"name": "Eval-level expectedOutput with null criteria (addition)",
"inputs": {
"a": 2,
"b": 3,
"operator": "+"
},
"expectedOutput": {
"result": 5.0
},
"evaluationCriterias": {
"ExactMatchEvaluator": null,
"JsonSimilarityEvaluator": null,
"ContainsEvaluator": {
"searchText": "5"
}
}
},
{
"id": "eval-level-null-criteria-multiply",
"name": "Eval-level expectedOutput with null criteria (multiplication)",
"inputs": {
"a": 4,
"b": 5,
"operator": "*"
},
"expectedOutput": {
"result": 20.0
},
"evaluationCriterias": {
"ExactMatchEvaluator": null,
"JsonSimilarityEvaluator": null,
"ContainsEvaluator": {
"searchText": "20"
}
}
},
{
"id": "eval-level-per-evaluator-override",
"name": "Per-evaluator expectedOutput overrides eval-level",
"inputs": {
"a": 10,
"b": 5,
"operator": "-"
},
"expectedOutput": {
"result": 5.0
},
"evaluationCriterias": {
"ExactMatchEvaluator": {
"expectedOutput": {
"result": 5.0
}
},
"JsonSimilarityEvaluator": {
"expectedOutput": {
"result": 5.0
}
},
"ContainsEvaluator": {
"searchText": "5"
}
}
},
{
"id": "eval-level-mixed-null-and-explicit",
"name": "Mixed: some evaluators null, some explicit",
"inputs": {
"a": 7,
"b": 3,
"operator": "+"
},
"expectedOutput": {
"result": 10.0
},
"evaluationCriterias": {
"ExactMatchEvaluator": null,
"JsonSimilarityEvaluator": {
"expectedOutput": {
"result": 10.0
}
},
"ContainsEvaluator": {
"searchText": "10"
}
}
}
]
}
3 changes: 3 additions & 0 deletions src/uipath/eval/models/evaluation_set.py
@@ -81,6 +81,9 @@ class EvaluationItem(BaseModel):
id: str
name: str
inputs: dict[str, Any]
expected_output: dict[str, Any] | str | None = Field(
default=None, alias="expectedOutput"
)
evaluation_criterias: dict[str, dict[str, Any] | None] = Field(
..., alias="evaluationCriterias"
)
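The new field above relies on a Pydantic alias, so the `expectedOutput` key in the eval-set JSON maps onto `expected_output` on the model, and older eval sets that omit the key still validate. A minimal sketch of that behaviour, assuming Pydantic v2 and using a stripped-down stand-in model rather than the real `EvaluationItem`:

```python
from typing import Any

from pydantic import BaseModel, Field


class EvalItemSketch(BaseModel):
    """Stripped-down stand-in for EvaluationItem, for illustration only."""

    id: str
    expected_output: dict[str, Any] | str | None = Field(
        default=None, alias="expectedOutput"
    )


# The JSON key "expectedOutput" populates the aliased field.
item = EvalItemSketch.model_validate(
    {"id": "eval-level-null-criteria-add", "expectedOutput": {"result": 5.0}}
)
assert item.expected_output == {"result": 5.0}

# Eval items without the key still validate; the field defaults to None.
assert EvalItemSketch.model_validate({"id": "legacy"}).expected_output is None
```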
17 changes: 17 additions & 0 deletions src/uipath/eval/runtime/runtime.py
@@ -46,6 +46,7 @@

from .._execution_context import ExecutionSpanCollector
from ..evaluators.base_evaluator import GenericBaseEvaluator
from ..evaluators.output_evaluator import OutputEvaluationCriteria
from ..mocks._cache_manager import CacheManager
from ..mocks._input_mocker import (
generate_llm_input,
@@ -549,6 +550,22 @@ async def _execute_eval(
continue
evaluation_criteria = eval_item.evaluation_criterias[evaluator.id]

# Inject eval-level expectedOutput for output-based evaluators
if eval_item.expected_output is not None and issubclass(
evaluator.evaluation_criteria_type,
OutputEvaluationCriteria,
):
if evaluation_criteria is None:
evaluation_criteria = {
"expectedOutput": eval_item.expected_output
}
elif "expectedOutput" not in evaluation_criteria:
evaluation_criteria = {
**evaluation_criteria,
"expectedOutput": eval_item.expected_output,
}
# else: per-evaluator expectedOutput takes precedence

evaluation_result = await self.run_evaluator(
evaluator=evaluator,
execution_output=agent_execution_output,
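The hunk above only injects the eval-level value for evaluators whose criteria type derives from `OutputEvaluationCriteria`, and it never overwrites an `expectedOutput` the eval item already set per evaluator. A standalone sketch of those precedence rules (the `resolve_criteria` helper and its plain-dict signature are illustrative, not the runtime's actual interface):

```python
from typing import Any


def resolve_criteria(
    eval_level_expected: dict[str, Any] | str | None,
    per_evaluator_criteria: dict[str, Any] | None,
    is_output_evaluator: bool,
) -> dict[str, Any] | None:
    """Restates the injection rules from runtime.py for illustration."""
    if eval_level_expected is None or not is_output_evaluator:
        # Nothing to inject: keep whatever the eval item declared for this evaluator.
        return per_evaluator_criteria
    if per_evaluator_criteria is None:
        # Null criteria: the eval-level value becomes the whole criteria dict.
        return {"expectedOutput": eval_level_expected}
    if "expectedOutput" not in per_evaluator_criteria:
        # Merge without clobbering other per-evaluator settings (e.g. searchText).
        return {**per_evaluator_criteria, "expectedOutput": eval_level_expected}
    # An explicit per-evaluator expectedOutput takes precedence over the eval-level one.
    return per_evaluator_criteria


# Null criteria picks up the eval-level value ...
assert resolve_criteria({"result": 5.0}, None, True) == {"expectedOutput": {"result": 5.0}}
# ... an explicit per-evaluator value is left untouched ...
assert resolve_criteria({"result": 5.0}, {"expectedOutput": {"result": 9.9}}, True) == {
    "expectedOutput": {"result": 9.9}
}
# ... and non-output evaluators (e.g. ContainsEvaluator) never receive the injection.
assert resolve_criteria({"result": 5.0}, {"searchText": "5"}, False) == {"searchText": "5"}
```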
12 changes: 12 additions & 0 deletions testcases/eval-level-expected-output/pyproject.toml
@@ -0,0 +1,12 @@
[project]
name = "eval-level-expected-output"
version = "0.0.1"
description = "Tests for evaluation-level expectedOutput on EvaluationItem"
authors = [{ name = "John Doe", email = "john.doe@myemail.com" }]
dependencies = [
"uipath",
]
requires-python = ">=3.11"

[tool.uv.sources]
uipath = { path = "../../", editable = true }
16 changes: 16 additions & 0 deletions testcases/eval-level-expected-output/run.sh
@@ -0,0 +1,16 @@
#!/bin/bash
set -e

echo "Syncing dependencies..."
uv sync

echo "Authenticating with UiPath..."
uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL"

echo "Running eval-level expectedOutput evaluations (deterministic evaluators)..."
uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/eval-level-expected-output.json --no-report --output-file eval-level-expected-output.json

echo "Running eval-level expectedOutput evaluations (LLM judge)..."
uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/eval-level-expected-output-llm-judge.json --no-report --output-file eval-level-expected-output-llm-judge.json

echo "Test completed successfully!"
148 changes: 148 additions & 0 deletions testcases/eval-level-expected-output/src/assert.py
@@ -0,0 +1,148 @@
"""Assertions for eval-level-expected-output testcase.

Validates that evaluation-level expectedOutput is correctly injected
into output-based evaluators (ExactMatch, JsonSimilarity, LLMJudge)
while non-output evaluators (Contains) remain unaffected.
"""

import json
import os

# Evaluators expected in the deterministic eval set
DETERMINISTIC_EVALUATORS = {
"ExactMatchEvaluator",
"JsonSimilarityEvaluator",
"ContainsEvaluator",
}

# Evaluators expected in the LLM judge eval set
LLM_JUDGE_EVALUATORS = {
"ExactMatchEvaluator",
"LLMJudgeOutputEvaluator",
}

# Evaluations in the deterministic eval set
DETERMINISTIC_EVALUATIONS = {
"Eval-level expectedOutput with null criteria (addition)",
"Eval-level expectedOutput with null criteria (multiplication)",
"Per-evaluator expectedOutput overrides eval-level",
"Mixed: some evaluators null, some explicit",
}

# Evaluations in the LLM judge eval set
LLM_JUDGE_EVALUATIONS = {
"LLM Judge uses eval-level expectedOutput (addition)",
"LLM Judge uses eval-level expectedOutput (multiplication)",
}


def validate_output_file(
output_file: str,
expected_evaluations: set[str],
expected_evaluators: set[str],
min_score: float = 0.99,
) -> None:
"""Validate an evaluation output file.

Args:
output_file: Path to the evaluation output JSON file.
expected_evaluations: Set of evaluation names to expect.
expected_evaluators: Set of evaluator IDs/names to expect.
min_score: Minimum acceptable score for all evaluators.
"""
assert os.path.isfile(output_file), f"Output file '{output_file}' not found"
print(f" Found output file: {output_file}")

with open(output_file, "r", encoding="utf-8") as f:
output_data = json.load(f)

assert "evaluationSetResults" in output_data, "Missing 'evaluationSetResults'"

evaluation_results = output_data["evaluationSetResults"]
assert len(evaluation_results) > 0, "No evaluation results found"
print(f" Found {len(evaluation_results)} evaluation result(s)")

failed_count = 0
seen_evaluations: set[str] = set()
seen_evaluators: set[str] = set()

for eval_result in evaluation_results:
eval_name = eval_result.get("evaluationName", "Unknown")
seen_evaluations.add(eval_name)
print(f"\n Validating: {eval_name}")

eval_run_results = eval_result.get("evaluationRunResults", [])
assert len(eval_run_results) > 0, f"No run results for '{eval_name}'"

for eval_run in eval_run_results:
evaluator_id = eval_run.get("evaluatorId", "Unknown")
evaluator_name = eval_run.get("evaluatorName", evaluator_id)
result = eval_run.get("result", {})
score = result.get("score")

seen_evaluators.add(evaluator_id)

is_passing = False
if score is True:
is_passing = True
elif isinstance(score, (int, float)) and score >= min_score:
is_passing = True

if is_passing:
display = f"{score:.2f}" if isinstance(score, float) else str(score)
print(f" {evaluator_name}: score={display} (pass)")
else:
print(
f" {evaluator_name}: score={score} "
f"(FAILED - expected >= {min_score})"
)
failed_count += 1

# Verify all expected evaluations were seen
missing_evals = expected_evaluations - seen_evaluations
if missing_evals:
print(f"\n Missing evaluations: {missing_evals}")
failed_count += len(missing_evals)

# Verify all expected evaluators were seen
missing_evaluators = expected_evaluators - seen_evaluators
if missing_evaluators:
print(f"\n Missing evaluators: {missing_evaluators}")
failed_count += len(missing_evaluators)

print(f"\n{'=' * 60}")
print(f" Failed: {failed_count}")
print(f"{'=' * 60}")

assert failed_count == 0, f"{failed_count} assertion(s) failed for {output_file}"
print(f"\n All assertions passed for {output_file}!")


def main() -> None:
"""Main assertion logic."""
# 1. Validate deterministic evaluators (ExactMatch, JsonSimilarity, Contains)
# All scores should be >= 0.99 since these are deterministic calculations
print("\n--- Deterministic Evaluators ---")
validate_output_file(
"eval-level-expected-output.json",
expected_evaluations=DETERMINISTIC_EVALUATIONS,
expected_evaluators=DETERMINISTIC_EVALUATORS,
min_score=0.99,
)

# 2. Validate LLM judge evaluators
# ExactMatch should score >= 0.99, LLM judge scores can vary
# but should be > 0 (semantically correct answers)
print("\n--- LLM Judge Evaluators ---")
validate_output_file(
"eval-level-expected-output-llm-judge.json",
expected_evaluations=LLM_JUDGE_EVALUATIONS,
expected_evaluators=LLM_JUDGE_EVALUATORS,
min_score=0.5, # LLM judge scores can vary, but should be well above 0
)

print("\n All eval-level expectedOutput assertions passed!")


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions testcases/eval-level-expected-output/uipath.json
@@ -0,0 +1,5 @@
{
"functions": {
"main": "../../samples/calculator/main.py:main"
}
}