Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Release History

## 1.17.1 (Unreleased)

### Features Added

- Enabled `ToolCallAccuracyEvaluator` and `_ToolInputAccuracyEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). Both evaluators grade the agent's tool selection and input arguments — neither requires the (often redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. This is achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `_ToolCallSuccessEvaluator`, `GroundednessEvaluator`, and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because their rubrics consume the tool output body.
- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line.
- `_ToolCallSuccessEvaluator` now honors the per-call runtime `status` reported on `tool_call` / `tool_result` content blocks. When any call carries a `failed` or `incomplete` status, the evaluator short-circuits in Python and returns a deterministic `fail` result (score `0.0`, with the affected tools listed in `properties.failed_tools`) without invoking the LLM judge, because the runtime signal is authoritative. The `status` value is not forwarded to the LLM: the prompty rubric is consulted only on the success path (`status` is `completed` or absent) and judges those calls by payload alone. The `[TOOL_CALL]` / `[TOOL_RESULT]` wire format is unchanged, so existing recorded test fixtures and customers whose converters do not emit `status` are unaffected.

## 1.17.0 (2026-06-03)

### Breaking Changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator
from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
from ._model_configurations import (
AzureAIProject,
AzureOpenAIModelConfiguration,
Expand Down Expand Up @@ -135,6 +136,7 @@ def lazy_import():
"ToolCallAccuracyEvaluator",
"_ToolOutputUtilizationEvaluator",
"_ToolCallSuccessEvaluator",
"_ToolInputAccuracyEvaluator",
"AzureOpenAIGrader",
"AzureOpenAILabelGrader",
"AzureOpenAIStringCheckGrader",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
# Initialize input validator
self._validator = ToolCallsValidator(
error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
)

super().__init__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,30 @@ def __call__( # pylint: disable=docstring-missing-param
"""
return super().__call__(*args, **kwargs)

def _return_short_circuit_failure_result(self, failed_tools: List[str]) -> Dict[str, Union[str, float]]:
    """Return a deterministic fail result without invoking the LLM judge.

    Used when the runtime explicitly marks one or more tool calls as
    failed/incomplete via the ``status`` field on a ``tool_call`` or
    ``tool_result`` content block. The LLM call is skipped because the
    runtime signal is authoritative.

    :param failed_tools: Labels (tool names, or ``tool_call_id`` values when
        no name could be resolved) of the calls that reported a failure.
    :type failed_tools: List[str]
    :return: A result dictionary keyed by ``self._result_key`` with a zero
        score, a ``fail`` verdict, the explanatory reason, and the
        comma-separated failed labels under ``properties.failed_tools``.
    :rtype: Dict[str, Union[str, float]]
    """
    # Labels may be raw ``tool_call_id`` values, which are not guaranteed to
    # be strings; stringify them so ``join`` cannot raise ``TypeError``.
    failed_list = ",".join(str(tool) for tool in failed_tools)
    reason = (
        f"Tool call(s) [{failed_list}] reported a non-success runtime status "
        "(failed or incomplete). Short-circuited without invoking the LLM judge."
    )
    return {
        self._result_key: 0.0,
        f"{self._result_key}_score": 0.0,
        f"{self._result_key}_passed": False,
        f"{self._result_key}_result": "fail",
        f"{self._result_key}_reason": reason,
        # The evaluation itself ran to completion; it is the tool that failed.
        f"{self._result_key}_status": "completed",
        f"{self._result_key}_threshold": self._threshold,
        f"{self._result_key}_properties": {"failed_tools": failed_list},
    }

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # type: ignore[override]
"""Do Tool Call Success evaluation.
Expand Down Expand Up @@ -181,6 +205,16 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t

if isinstance(eval_input.get("response"), list):
eval_input["response"] = _preprocess_messages(eval_input["response"])
# Short-circuit: when the runtime explicitly marks any tool_call
# or tool_result with a non-success status (e.g. ``failed`` or
# ``incomplete``) there is no point asking the LLM judge to
# re-derive the failure from the payload -- the runtime signal
# is authoritative. Return a deterministic fail result and skip
# the LLM call entirely. The prompty rubric is now only
# consulted on the success path (status ``completed`` or absent).
failed_tools = _collect_failed_tool_calls(eval_input["response"])
if failed_tools:
return self._return_short_circuit_failure_result(failed_tools)
eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
# If response is a string, pass directly without reformatting
elif isinstance(eval_input["response"], str):
Expand Down Expand Up @@ -271,8 +305,88 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
return tool_definitions


# Runtime ``status`` values that mark a tool call as failed.
_FAILED_RUNTIME_STATUSES = frozenset({"failed", "incomplete"})


def _iter_typed_content_blocks(msg, block_type):
    """Yield the dict content blocks of *msg* whose ``type`` equals *block_type*."""
    blocks = msg.get("content")
    # String or otherwise malformed content has no structured blocks to inspect.
    if not isinstance(blocks, (list, tuple)):
        return
    for block in blocks:
        if isinstance(block, dict) and block.get("type") == block_type:
            yield block


def _has_failed_runtime_status(block):
    """Return True when *block* carries a ``status`` in ``_FAILED_RUNTIME_STATUSES``."""
    status = block.get("status")
    return isinstance(status, str) and status in _FAILED_RUNTIME_STATUSES


def _resolve_tool_call_identity(content):
    """Return ``(name, call_id)`` for a ``tool_call`` block in nested or flat layout."""
    tool_call = content.get("tool_call")
    if isinstance(tool_call, dict) and "function" in tool_call:
        # Nested layout: {"tool_call": {"id": ..., "function": {"name": ...}}}
        function = tool_call.get("function")
        name = function.get("name") if isinstance(function, dict) else None
        call_id = tool_call.get("id")
    else:
        # Flat layout: {"name": ..., "tool_call_id": ...}
        name = content.get("name")
        call_id = content.get("tool_call_id")
    return (name or ""), call_id


def _collect_failed_tool_calls(messages):
    """Return ordered, unique labels of tool calls whose runtime status indicates failure.

    A tool call is treated as a runtime failure when either its assistant
    ``tool_call`` content block or its matched tool ``tool_result`` content
    block carries a ``status`` field in ``{failed, incomplete}``. The check
    runs in Python so the LLM judge is only invoked on the success path
    (status ``completed`` or absent); failed/incomplete calls are short-
    circuited deterministically.

    When the failing block carries no resolvable function name, the
    ``tool_call_id`` is used as a stable identifier instead so the caller
    can still surface it in ``properties.failed_tools``. A failing block
    that has neither a name nor a ``tool_call_id`` cannot be attributed and
    is not reported.

    :param messages: Preprocessed response messages; anything other than a
        list yields an empty result.
    :return: Failure labels as strings, de-duplicated. Failures identified by
        ``tool_call_id`` come first (assistant blocks, then tool results, each
        in message order), followed by failures known only by name.
    :rtype: list
    """
    if not isinstance(messages, list):
        return []

    id_to_name = {}
    failed_ids = []
    failed_names_without_id = []

    # Pass 1: assistant tool_call blocks. Record every id -> name mapping so
    # that failures found on tool_result blocks can be reported by tool name.
    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        for content in _iter_typed_content_blocks(msg, "tool_call"):
            name, call_id = _resolve_tool_call_identity(content)
            if call_id is not None:
                id_to_name[call_id] = name
            if not _has_failed_runtime_status(content):
                continue
            if call_id is not None:
                failed_ids.append(call_id)
            elif name:
                failed_names_without_id.append(name)

    # Pass 2: tool_result blocks, attributed through the message-level
    # tool_call_id.
    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "tool":
            continue
        call_id = msg.get("tool_call_id")
        if call_id is None:
            continue
        if any(_has_failed_runtime_status(c) for c in _iter_typed_content_blocks(msg, "tool_result")):
            failed_ids.append(call_id)

    # Prefer the tool name; fall back to the id. Labels are stringified so the
    # caller can safely join them even when ids are not strings.
    labels = [str(id_to_name.get(call_id) or call_id) for call_id in failed_ids]
    labels.extend(str(name) for name in failed_names_without_id)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [label for label in dict.fromkeys(labels) if label]


def _get_tool_calls_results(agent_response_msgs):
"""Extract formatted agent tool calls and results from response."""
"""Extract formatted agent tool calls and results from response.

The output uses the original ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line
format only; runtime ``status`` is no longer forwarded to the LLM judge.
Failed/incomplete tool calls are short-circuited in Python by
:func:`_collect_failed_tool_calls` before this formatter runs, so by the
time the LLM sees the response every remaining call has either no
status or a ``completed`` status -- the rubric judges those by payload
alone.
"""
agent_response_text = []
tool_results = {}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ EXPECTED OUTPUT
{
"reason": "None of the results indicate an error",
"properties": {
"failed_tools": "",
"failed_tools": ""
},
"score": 1,
"status": "completed"
Expand All @@ -111,7 +111,7 @@ EXPECTED OUTPUT
{
"reason": "None of the results indicate an error",
"properties": {
"failed_tools": "",
"failed_tools": ""
},
"score": 1,
"status": "completed"
Expand All @@ -128,7 +128,7 @@ EXPECTED OUTPUT
{
"reason": "Although the returned value 7 is not the square root of 4, this is a business mistake in the tool. The tool did not return a result indicating a technical error",
"properties": {
"failed_tools": "",
"failed_tools": ""
},
"score": 1,
"status": "completed"
Expand All @@ -146,7 +146,7 @@ EXPECTED OUTPUT
{
"reason": "The tool returned a semicolon separated list of names. Although the description in the definition says it should return comma-separated list , this formatting mistake is a business mistake of the tool, not a technical failure. The tool did not return an error",
"properties": {
"failed_tools": "",
"failed_tools": ""
},
"score": 1,
"status": "completed"
Expand All @@ -165,7 +165,7 @@ EXPECTED OUTPUT
{
"reason": "The tool returned empty response which is accepted given that this tool functionality does not include returning data to the caller",
"properties": {
"failed_tools": "",
"failed_tools": ""
},
"score": 1,
"status": "completed"
Expand All @@ -186,7 +186,7 @@ EXPECTED OUTPUT
{
"reason": "The tool returned empty response , however , given the tool definition , it should never return empty response because there should be weather info at any given point in time. An empty response here is considered a technical failure. The conclusion is the get_weather_info failed",
"properties": {
"failed_tools": "get_weather_info",
"failed_tools": "get_weather_info"
},
"score": 0,
"status": "completed"
Expand All @@ -203,7 +203,7 @@ EXPECTED OUTPUT
{
"reason": "The tool returned a string indicating that it failed",
"properties": {
"failed_tools": "get_current_user_Info",
"failed_tools": "get_current_user_Info"
},
"score": 0,
"status": "completed"
Expand All @@ -220,7 +220,7 @@ EXPECTED OUTPUT
{
"reason": "The tool returned an object with empty fields and a string indicating that it failed",
"properties": {
"failed_tools": "get_current_user_Info",
"failed_tools": "get_current_user_Info"
},
"score": 0,
"status": "completed"
Expand All @@ -237,7 +237,7 @@ EXPECTED OUTPUT
{
"reason": "The call for GetWeatherInfo returned an object containing single property 'temp' that is an empty string. This means the call to GetWeatherInfo returned empty result while weather info should be available at any time",
"properties": {
"failed_tools": "GetWeatherInfo",
"failed_tools": "GetWeatherInfo"
},
"score": 0,
"status": "completed"
Expand All @@ -254,7 +254,7 @@ EXPECTED OUTPUT
{
"reason": "the returned result indicates that the call to get_day_of_week timed out",
"properties": {
"failed_tools": "get_day_of_week",
"failed_tools": "get_day_of_week"
},
"score": 0,
"status": "completed"
Expand All @@ -272,7 +272,7 @@ EXPECTED OUTPUT
{
"reason": "null indicates an empty result which cannot be an accepted output of the tool given the tool definition since any given date represents a day of week",
"properties": {
"failed_tools": "get_day_of_week",
"failed_tools": "get_day_of_week"
},
"score": 0,
"status": "completed"
Expand All @@ -290,7 +290,7 @@ EXPECTED OUTPUT
{
"reason": "Empty object cannot be an accepted output of the tool given the tool definition since any given date should represent a day of week",
"properties": {
"failed_tools": "get_day_of_week",
"failed_tools": "get_day_of_week"
},
"score": 0,
"status": "completed"
Expand All @@ -309,7 +309,7 @@ EXPECTED OUTPUT
{
"reason": "GetWeatherInfo returned an empty response while it should return the weather info and BookTicket returned an error.Both tools failed.",
"properties": {
"failed_tools": "GetWeatherInfo,BookTicket",
"failed_tools": "GetWeatherInfo,BookTicket"
},
"score": 0,
"status": "completed"
Expand All @@ -328,7 +328,7 @@ EXPECTED OUTPUT
{
"reason": "Although GetWeatherInfo succeeded, BookTicket returned an error. The final result is failure because one of the tool calls has failed",
"properties": {
"failed_tools": "BookTicket",
"failed_tools": "BookTicket"
},
"score": 0,
"status": "completed"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def __init__(
self._validator = ToolDefinitionsValidator(
error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
optional_tool_definitions=False,
check_for_unsupported_tools=True,
check_for_unsupported_tools=False,
)

super().__init__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
# ---------------------------------------------------------
# represents upcoming version

VERSION = "1.17.0"
VERSION = "1.17.1"
Loading
Loading