From e132532d8a990d5c932264af7d5781f0e8d181ed Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Thu, 11 Jun 2026 10:39:46 -0700 Subject: [PATCH 1/4] Enable ToolCallAccuracy / ToolInputAccuracy on restricted-tool conversations and add [STATUS] pass-through for ToolCallSuccess Three evaluators in azure-ai-evaluation previously rejected any conversation containing a built-in restricted tool (bing_grounding, bing_custom_search, azure_ai_search, azure_fabric, sharepoint_grounding). Two of those evaluators -- ToolCallAccuracyEvaluator and _ToolInputAccuracyEvaluator -- only judge the agent's tool selection and input arguments and do not need the (redacted) tool output body, so the rejection was overly conservative. This change enables both on restricted-tool conversations. _ToolCallSuccessEvaluator continues to reject them because its rubric inspects the tool output body, but it gains a new mechanism -- [STATUS] pass-through -- so the LLM judge can correctly recognize runtime-reported failures on conversations that *do* reach it. Changes ------- ToolCallAccuracy / ToolInputAccuracy: - Set check_for_unsupported_tools=False on the input validator in _tool_call_accuracy.py and _tool_input_accuracy.py. The underlying ToolDefinitionsValidator / ToolCallsValidator classes are unchanged; GroundednessEvaluator and ToolOutputUtilizationEvaluator still reject restricted tools because they require the tool output body. - Export _ToolInputAccuracyEvaluator from the azure.ai.evaluation top-level namespace, matching its three sibling tool evaluators (ToolCallAccuracyEvaluator, _ToolCallSuccessEvaluator, _ToolOutputUtilizationEvaluator). Consumers (notably the Foundry evaluations service catalog) can now import it directly instead of reaching into the private _evaluators._tool_input_accuracy submodule. 
ToolCallSuccess -- [STATUS] pass-through: - Added _format_status_suffix helper and wired it into _get_tool_calls_results so every [TOOL_CALL] / [TOOL_RESULT] line carries a [STATUS] suffix when the source content block has a status field. Back-compat preserved: empty/None/non-string status emits the empty string, so output is byte-identical to the prior format when status is absent. - Prompty: added an ERROR-CASES bullet that names [STATUS] failed and [STATUS] incomplete as authoritative failure signals that override bland payload appearance, with two illustrative examples (bland-payload+failed-status and completed-status+error-payload). The bullet matches the Responses-API tool-call status enum (in_progress | completed | incomplete | failed) -- only 'failed' and 'incomplete' are listed as primary values because no current emitter (Responses API, Threads/v1 Agents, ACA trace converter, tool-server gRPC) produces error/cancelled/canceled on a tool_call block. The _format_status_suffix helper remains permissive (any non-empty string) for forward-compat; only the rubric wording is narrowed. - Prompty: added an explicit clause that [STATUS] is optional and that [STATUS] completed does not by itself imply success -- payload-based rules still apply. - Prompty: fixed invalid trailing commas in every few-shot EXAMPLE OUTPUT. Each example had a trailing comma after the only failed_tools field of properties, producing invalid JSON. Under gpt-4o + response_format=json_object this caused the model to disambiguate the trailing comma by nesting score/status inside properties (a syntactically-valid alternative), which broke the SDK's top-level score extractor and silently flipped passing evaluations to fail. Validated end-to-end on a SharePoint-grounded transcript: with the commas stripped, gpt-4o reliably emits the canonical shape with score/status as siblings of properties, and pass/fail rows are classified correctly. 
Tests: - New test_unsupported_tools_validation.py (26 tests): 15 parametrized cases (3 evaluators x 5 restricted tools) -- 10 asserting validate_eval_input returns True for ToolCallAccuracy / ToolInputAccuracy response= payloads and 5 asserting _ToolCallSuccessEvaluator still raises EvaluationException --, 1 mixed-tools case, 10 regression cases asserting the underlying validators still reject restricted tools when check_for_unsupported_tools=True. - Replaced test_tool_call_success_evaluator.py with status-passthrough coverage (12 tests on _format_status_suffix and _get_tool_calls_results topologies). - One test was flipped from test_tool_call_success_accepts_restricted_tool to test_tool_call_success_still_rejects_restricted_tool in test_unsupported_tools_validation.py, with the module docstring scope narrowed to TCA/TIA only. Versioning: - Bumped _version.py 1.17.0 -> 1.17.1. - Added 1.17.1 (Unreleased) section to CHANGELOG.md under Features Added covering TCA/TIA enablement on restricted-tool conversations and TCS [STATUS] pass-through. All 38 impacted unit tests pass. --- .../azure-ai-evaluation/CHANGELOG.md | 8 + .../azure/ai/evaluation/__init__.py | 2 + .../_tool_call_accuracy.py | 2 +- .../_tool_call_success/_tool_call_success.py | 35 ++- .../tool_call_success.prompty | 85 ++++++-- .../_tool_input_accuracy.py | 2 +- .../azure/ai/evaluation/_version.py | 2 +- .../test_tool_call_success_evaluator.py | 202 ++++++++++++++++++ .../test_unsupported_tools_validation.py | 192 +++++++++++++++++ 9 files changed, 510 insertions(+), 20 deletions(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 576ed70a4396..2783023835c7 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -1,5 +1,13 @@ # Release History +## 1.17.1 (Unreleased) + +### Features 
Added + +- Enabled `ToolCallAccuracyEvaluator` and `_ToolInputAccuracyEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). Both evaluators grade the agent's tool selection and input arguments — neither requires the (often redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `_ToolCallSuccessEvaluator`, `GroundednessEvaluator`, and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because their rubrics consume the tool output body. +- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line. +- `_ToolCallSuccessEvaluator` now forwards the per-call runtime `status` (e.g. `completed`, `failed`, `incomplete`) to the LLM rubric as a `[STATUS] <status>` annotation appended to each emitted `[TOOL_CALL]` / `[TOOL_RESULT]` line. The prompty rubric is updated to treat the `[STATUS] failed` and `[STATUS] incomplete` annotations as a strong, authoritative failure signal that overrides a bland or otherwise-passing-looking payload, while still falling back to payload-only judgment when `status` is absent. Output is byte-identical to the previous wire format when no `status` field is populated, so existing recorded test fixtures and customers whose converters do not emit `status` are unaffected. 
+ ## 1.17.0 (2026-06-03) ### Breaking Changes diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 6703b2ca111f..f32ade1e90e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -34,6 +34,7 @@ from ._evaluators._document_retrieval import DocumentRetrievalEvaluator from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator from ._model_configurations import ( AzureAIProject, AzureOpenAIModelConfiguration, @@ -135,6 +136,7 @@ def lazy_import(): "ToolCallAccuracyEvaluator", "_ToolOutputUtilizationEvaluator", "_ToolCallSuccessEvaluator", + "_ToolInputAccuracyEvaluator", "AzureOpenAIGrader", "AzureOpenAILabelGrader", "AzureOpenAIStringCheckGrader", diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3a2ccb1ace85..f5057f09e947 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, # Initialize input validator self._validator = ToolCallsValidator( error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py 
b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 44e0876bad68..a66af1b48141 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -271,8 +271,35 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None): return tool_definitions +def _format_status_suffix(status): + """Build the trailing ``[STATUS] <status>`` annotation for a content block. + + Returns the empty string when ``status`` is absent or not a non-empty + string, so callers can unconditionally concatenate the return value + without affecting back-compat output. + + :param status: The raw ``status`` field from a ``tool_call`` or + ``tool_result`` content block. + :type status: Any + :return: ``" [STATUS] <status>"`` when ``status`` is a non-empty string, + otherwise ``""``. + :rtype: str + """ + if isinstance(status, str) and status: + return f" [STATUS] {status}" + return "" + + def _get_tool_calls_results(agent_response_msgs): - """Extract formatted agent tool calls and results from response.""" + """Extract formatted agent tool calls and results from response. + + Each emitted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line is suffixed with + ``[STATUS] <status>`` when the source content block carries a ``status`` + field. The prompty rubric uses this annotation as a strong failure signal + (see ``tool_call_success.prompty``). When ``status`` is absent the suffix + is omitted and the rubric falls back to payload-only judgment, so the + formatted output is byte-identical to the pre-pass-through wire format. 
+ """ agent_response_text = [] tool_results = {} @@ -283,7 +310,8 @@ def _get_tool_calls_results(agent_response_msgs): for content in msg.get("content", []): if content.get("type") == "tool_result": result = content.get("tool_result") - tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}" + status_suffix = _format_status_suffix(content.get("status")) + tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}{status_suffix}" # Second pass: parse assistant messages and tool calls for msg in agent_response_msgs: @@ -302,7 +330,8 @@ def _get_tool_calls_results(agent_response_msgs): func_name = content.get("name", "") args = content.get("arguments", {}) args_str = ", ".join(f'{k}="{v}"' for k, v in args.items()) - call_line = f"[TOOL_CALL] {func_name}({args_str})" + status_suffix = _format_status_suffix(content.get("status")) + call_line = f"[TOOL_CALL] {func_name}({args_str}){status_suffix}" agent_response_text.append(call_line) if tool_call_id in tool_results: agent_response_text.append(tool_results[tool_call_id]) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty index d7df87a2004d..23a88552ed9a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty @@ -53,6 +53,7 @@ B. Examine tool result and definition for the tool being called to check whether 1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it: ERROR-CASES: =========== + - The tool call or tool result line is annotated with **`[STATUS] failed`** or **`[STATUS] incomplete`**. 
These annotations indicate the tool call did not produce a usable result -- either because the runtime explicitly marked the call `failed` (an exception in the tool, the API surface returned an error response) or because the call was interrupted before completion (e.g. host timeout, parent-response cancellation surfaced as `incomplete`). They are strong, authoritative failure signals and override any contradictory appearance of the result payload. - The tool call resulted in an error or exception - The tool call failed to run or failed to return - The tool call returned a result that indicates an error or failure @@ -60,6 +61,7 @@ B. Examine tool result and definition for the tool being called to check whether - The tool timed-out or returned a result that indicate a time-out - The tool result does not make sense, from technical perspective, not business perspective, given the definition of that tool, if the definition is present 2. If none of the error cases apply to the tool result , it is considered **succeeded** even if the tool result itself indicates a business mistake + 3. The `[STATUS]` annotation is **optional**. When it is absent on a tool call, judge that call by the payload-based rules above (back-compat with runtimes that do not emit a status field). When it is present and indicates success (e.g. `[STATUS] completed`), it does not by itself make a call succeed -- still apply the payload-based rules, because a runtime can report `completed` while the tool itself returned an error payload. C. If one or more tool result are **failed** , then you the **evaluation process** has **failed**, otherwise , the **evaluation process** has **succeeded** D. 
You are required to return your **output** in the following format: { @@ -94,7 +96,7 @@ EXPECTED OUTPUT { "reason": "None of the results indicate an error", "properties": { - "failed_tools": "", + "failed_tools": "" }, "score": 1, "status": "completed" @@ -111,7 +113,7 @@ EXPECTED OUTPUT { "reason": "None of the results indicate an error", "properties": { - "failed_tools": "", + "failed_tools": "" }, "score": 1, "status": "completed" @@ -128,7 +130,7 @@ EXPECTED OUTPUT { "reason": "Although the returned value 7 is not the square root of 4, this is a business mistake in the tool. The tool did not return a result indicating a technical error", "properties": { - "failed_tools": "", + "failed_tools": "" }, "score": 1, "status": "completed" @@ -146,7 +148,7 @@ EXPECTED OUTPUT { "reason": "The tool returned a semicolon separated list of names. Although the description in the definition says it should return comma-separated list , this formatting mistake is a business mistake of the tool, not a technical failure. The tool did not return an error", "properties": { - "failed_tools": "", + "failed_tools": "" }, "score": 1, "status": "completed" @@ -165,7 +167,7 @@ EXPECTED OUTPUT { "reason": "The tool returned empty response which is accepted given that this tool functionality does not include returning data to the caller", "properties": { - "failed_tools": "", + "failed_tools": "" }, "score": 1, "status": "completed" @@ -186,7 +188,7 @@ EXPECTED OUTPUT { "reason": "The tool returned empty response , however , given the tool definition , it should never return empty response because there should be weather info at any given point in time. An empty response here is considered a technical failure. 
The conclusion is the get_weather_info failed", "properties": { - "failed_tools": "get_weather_info", + "failed_tools": "get_weather_info" }, "score": 0, "status": "completed" @@ -203,7 +205,7 @@ EXPECTED OUTPUT { "reason": "The tool returned a string indicating that it failed", "properties": { - "failed_tools": "get_current_user_Info", + "failed_tools": "get_current_user_Info" }, "score": 0, "status": "completed" @@ -220,7 +222,7 @@ EXPECTED OUTPUT { "reason": "The tool returned an object with empty fields and a string indicating that it failed", "properties": { - "failed_tools": "get_current_user_Info", + "failed_tools": "get_current_user_Info" }, "score": 0, "status": "completed" @@ -237,7 +239,7 @@ EXPECTED OUTPUT { "reason": "The call for GetWeatherInfo returned an object containing single property 'temp' that is an empty string. This means the call to GetWeatherInfo returned empty result while weather info should be available at any time", "properties": { - "failed_tools": "GetWeatherInfo", + "failed_tools": "GetWeatherInfo" }, "score": 0, "status": "completed" @@ -254,7 +256,7 @@ EXPECTED OUTPUT { "reason": "the returned result indicates that the call to get_day_of_week timed out", "properties": { - "failed_tools": "get_day_of_week", + "failed_tools": "get_day_of_week" }, "score": 0, "status": "completed" @@ -272,7 +274,7 @@ EXPECTED OUTPUT { "reason": "null indicates an empty result which cannot be an accepted output of the tool given the tool definition since any given date represents a day of week", "properties": { - "failed_tools": "get_day_of_week", + "failed_tools": "get_day_of_week" }, "score": 0, "status": "completed" @@ -290,7 +292,7 @@ EXPECTED OUTPUT { "reason": "Empty object cannot be an accepted output of the tool given the tool definition since any given date should represent a day of week", "properties": { - "failed_tools": "get_day_of_week", + "failed_tools": "get_day_of_week" }, "score": 0, "status": "completed" @@ -309,7 +311,7 @@ EXPECTED 
OUTPUT { "reason": "GetWeatherInfo returned an empty response while it should return the weather info and BookTicket returned an error.Both tools failed.", "properties": { - "failed_tools": "GetWeatherInfo,BookTicket", + "failed_tools": "GetWeatherInfo,BookTicket" }, "score": 0, "status": "completed" @@ -328,7 +330,62 @@ EXPECTED OUTPUT { "reason": "Although GetWeatherInfo succeeded, BookTicket returned an error. The final result is failure because one of the tool calls has failed", "properties": { - "failed_tools": "BookTicket", + "failed_tools": "BookTicket" + }, + "score": 0, + "status": "completed" +} + + +### Example - Failed (status annotation overrides bland payload) + +[TOOL_CALLS] +[TOOL_CALL] send_email(to:"alice@example.com" , body:"hi") [STATUS] failed +[TOOL_RESULT] {} [STATUS] failed + +EXPECTED OUTPUT +{ + "reason": "send_email is annotated with [STATUS] failed on both the call and the result, which is an authoritative failure signal from the runtime even though the result body {} is otherwise inconclusive", + "properties": { + "failed_tools": "send_email" + }, + "score": 0, + "status": "completed" +} + + +### Example - Failed (status completed but payload still indicates an error) + +[TOOL_CALLS] +[TOOL_CALL] get_current_user_info() [STATUS] completed +[TOOL_RESULT] {"UserName":"", "UserEmail":"", "Message":"failed to get current user information"} [STATUS] completed + +EXPECTED OUTPUT +{ + "reason": "The runtime reported [STATUS] completed but the result payload still indicates failure with empty fields and an explicit error message. 
Payload-based rules still apply when [STATUS] is completed -- this call is failed", + "properties": { + "failed_tools": "get_current_user_info" + }, + "score": 0, + "status": "completed" +} + + +### Example - Failed (parallel calls in one turn, one annotated failed) + +[TOOL_CALLS] +[TOOL_CALL] fetch_weather(city:"Seattle") [STATUS] completed +[TOOL_RESULT] {"temp": 62} [STATUS] completed +[TOOL_CALL] send_email(to:"x@example.com") [STATUS] failed +[TOOL_RESULT] {} [STATUS] failed +[TOOL_CALL] lookup_user(id:"u42") [STATUS] completed +[TOOL_RESULT] {"user_id": "u42"} [STATUS] completed + +EXPECTED OUTPUT +{ + "reason": "send_email is annotated with [STATUS] failed; the other two parallel calls succeeded but a single failed call is sufficient to fail the overall evaluation", + "properties": { + "failed_tools": "send_email" }, "score": 0, "status": "completed" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 7ebc20c7e130..198fefde02d1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -92,7 +92,7 @@ def __init__( self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py index bae6c9895046..9dc1249dff60 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py @@ -3,4 +3,4 @@ # 
--------------------------------------------------------- # represents upcoming version -VERSION = "1.17.0" +VERSION = "1.17.1" diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py new file mode 100644 index 000000000000..6d0a558921f3 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py @@ -0,0 +1,202 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""Tests for ToolCallSuccess runtime status pass-through to the LLM rubric. + +The evaluator's source-side preprocessing emits ``[STATUS] <status>`` annotations +on each formatted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line whenever the source +content block carries a ``status`` field. The prompty rubric is taught to treat +these annotations as a strong (authoritative) failure signal when the status is +``failed`` or ``incomplete``, and to fall back to +payload-only judgment when ``status`` is absent. + +These tests cover the source-side preprocessing only (the ``[STATUS]`` string +emission). End-to-end rubric behavior is covered by the existing behavior +suites that exercise the full evaluator with a mocked LLM. 
+""" + +import pytest + +from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import ( + _format_status_suffix, + _get_tool_calls_results, +) + + +# region helpers + + +def _assistant_tool_call(tool_call_id, name, arguments, status=None): + """Build an assistant message carrying a single tool_call content block.""" + block = { + "type": "tool_call", + "tool_call_id": tool_call_id, + "name": name, + "arguments": arguments, + } + if status is not None: + block["status"] = status + return {"role": "assistant", "content": [block]} + + +def _tool_result(tool_call_id, result, status=None): + """Build a tool message carrying a single tool_result content block.""" + block = { + "type": "tool_result", + "tool_call_id": tool_call_id, + "tool_result": result, + } + if status is not None: + block["status"] = status + return { + "role": "tool", + "tool_call_id": tool_call_id, + "content": [block], + } + + +def _assistant_parallel_tool_calls(blocks): + """Build a single assistant message that emits multiple tool_call blocks in one turn. + + ``blocks`` is a list of ``(tool_call_id, name, arguments, status)`` tuples. + This is the modern Responses-API topology for parallel function-call + invocation: multiple ``tool_call`` content blocks under one assistant + message, in contrast to one assistant message per call. 
+ """ + content = [] + for tool_call_id, name, arguments, status in blocks: + block = { + "type": "tool_call", + "tool_call_id": tool_call_id, + "name": name, + "arguments": arguments, + } + if status is not None: + block["status"] = status + content.append(block) + return {"role": "assistant", "content": content} + + +# endregion + + +@pytest.mark.unittest +class TestFormatStatusSuffix: + """Unit tests for the ``_format_status_suffix`` helper.""" + + def test_known_failure_status_emits_suffix(self): + """A known-failure status string produces a ``[STATUS] `` suffix.""" + assert _format_status_suffix("failed") == " [STATUS] failed" + + def test_completed_status_emits_suffix(self): + """A success status string also emits a suffix (the rubric distinguishes the two).""" + assert _format_status_suffix("completed") == " [STATUS] completed" + + def test_arbitrary_status_string_emits_suffix(self): + """Any non-empty string status emits a suffix; the rubric judges semantics, not Python.""" + assert _format_status_suffix("rate_limited") == " [STATUS] rate_limited" + + def test_none_status_emits_empty(self): + """Absent status (``None``) emits the empty string for back-compat.""" + assert _format_status_suffix(None) == "" + + def test_empty_string_status_emits_empty(self): + """Empty string status emits the empty string (treated same as absent).""" + assert _format_status_suffix("") == "" + + def test_non_string_status_emits_empty(self): + """Non-string statuses (int, dict, list) are ignored rather than raised on.""" + assert _format_status_suffix(42) == "" + assert _format_status_suffix({"x": 1}) == "" + assert _format_status_suffix(["failed"]) == "" + + +@pytest.mark.unittest +class TestGetToolCallsResultsStatusPassthrough: + """Integration tests for ``[STATUS]`` annotation emission via ``_get_tool_calls_results``.""" + + def test_status_on_tool_call_is_appended_to_tool_call_line(self): + """When ``status`` is set on a tool_call block, the ``[TOOL_CALL]`` line carries the 
annotation.""" + msgs = [ + _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}, status="failed"), + _tool_result("c1", ""), + ] + lines = _get_tool_calls_results(msgs) + assert lines[0] == '[TOOL_CALL] send_email(to="x@example.com") [STATUS] failed' + # Tool result has no status -> no suffix. + assert lines[1] == "[TOOL_RESULT] " + + def test_status_on_tool_result_is_appended_to_tool_result_line(self): + """When ``status`` is set on a tool_result block, the ``[TOOL_RESULT]`` line carries the annotation.""" + msgs = [ + _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}), + _tool_result("c1", "", status="error"), + ] + lines = _get_tool_calls_results(msgs) + assert lines[0] == '[TOOL_CALL] send_email(to="x@example.com")' + assert lines[1] == "[TOOL_RESULT] [STATUS] error" + + def test_completed_status_is_passed_through_too(self): + """``[STATUS] completed`` is emitted alongside failure statuses; the rubric decides semantics.""" + msgs = [ + _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}, status="completed"), + _tool_result("c1", "Sunny, 72F.", status="completed"), + ] + lines = _get_tool_calls_results(msgs) + assert lines[0] == '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed' + assert lines[1] == "[TOOL_RESULT] Sunny, 72F. [STATUS] completed" + + def test_absent_status_produces_no_suffix_back_compat(self): + """When ``status`` is absent on every block, output matches the pre-status-pass-through format exactly.""" + msgs = [ + _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}), + _tool_result("c1", "Sunny, 72F."), + ] + lines = _get_tool_calls_results(msgs) + assert lines == [ + '[TOOL_CALL] fetch_weather(city="Seattle")', + "[TOOL_RESULT] Sunny, 72F.", + ] + + def test_parallel_tool_calls_in_one_assistant_message_each_get_their_own_status(self): + """Multiple ``tool_call`` blocks in one assistant message each emit their own ``[STATUS]`` annotation. 
+ + This is the modern Responses-API topology and exercises that the + formatter walks into the content list rather than only processing the + first block per message. + """ + msgs = [ + _assistant_parallel_tool_calls([ + ("c1", "fetch_weather", {"city": "Seattle"}, "completed"), + ("c2", "send_email", {"to": "x@example.com"}, "failed"), + ("c3", "lookup_user", {"id": "u42"}, "completed"), + ]), + _tool_result("c1", "Sunny, 72F.", status="completed"), + _tool_result("c2", "", status="failed"), + _tool_result("c3", {"user_id": "u42"}, status="completed"), + ] + lines = _get_tool_calls_results(msgs) + assert lines == [ + '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed', + "[TOOL_RESULT] Sunny, 72F. [STATUS] completed", + '[TOOL_CALL] send_email(to="x@example.com") [STATUS] failed', + "[TOOL_RESULT] [STATUS] failed", + '[TOOL_CALL] lookup_user(id="u42") [STATUS] completed', + "[TOOL_RESULT] {'user_id': 'u42'} [STATUS] completed", + ] + + def test_mixed_status_present_and_absent_across_calls(self): + """A response with status on some calls and not others produces a mixed-suffix output.""" + msgs = [ + _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}, status="completed"), + _tool_result("c1", "Sunny, 72F."), + _assistant_tool_call("c2", "send_email", {"to": "x@example.com"}), + _tool_result("c2", "", status="failed"), + ] + lines = _get_tool_calls_results(msgs) + assert lines == [ + '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed', + "[TOOL_RESULT] Sunny, 72F.", + '[TOOL_CALL] send_email(to="x@example.com")', + "[TOOL_RESULT] [STATUS] failed", + ] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py new file mode 100644 index 000000000000..2250e343a3d2 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py @@ -0,0 +1,192 @@ +# 
--------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +""" +Regression tests for the change that lets two tool evaluators +(``ToolCallAccuracy`` and ``_ToolInputAccuracy``) accept conversations +containing restricted built-in tools. + +These two evaluators previously rejected any conversation containing tools +in ``ConversationValidator.UNSUPPORTED_TOOLS`` (e.g. ``bing_grounding``, +``azure_ai_search``). Because neither grade requires the (often redacted) +tool output body, the rejection has been lifted by setting +``check_for_unsupported_tools=False`` on each evaluator's input validator. + +``_ToolCallSuccess`` is intentionally **not** part of this enablement: its +rubric still depends on the tool output body to judge success, so it keeps +``check_for_unsupported_tools=True`` and continues to reject restricted +tools. Coverage for that contract lives in this file alongside the TCA/TIA +acceptance tests so the two stay in lockstep. + +The tests below exercise the validator directly so they do not need the +prompty flow or a real model deployment. They also confirm that the +underlying validator class still rejects restricted tools when +``check_for_unsupported_tools=True``, so the behavior change is limited +to the per-evaluator wiring. 
+""" + +import pytest + +from azure.ai.evaluation import ToolCallAccuracyEvaluator +from azure.ai.evaluation._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator +from azure.ai.evaluation._evaluators._common._validators import ( + ToolCallsValidator, + ToolDefinitionsValidator, +) +from azure.ai.evaluation._exceptions import ErrorTarget, EvaluationException + + +RESTRICTED_TOOL_NAMES = [ + "bing_grounding", + "bing_custom_search", + "azure_ai_search", + "azure_fabric", + "sharepoint_grounding", +] + + +def _restricted_response(tool_name: str): + return [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": tool_name, + "arguments": {"query": "anything"}, + } + ], + } + ] + + +def _restricted_tool_definition(tool_name: str): + return { + "name": tool_name, + "description": f"Built-in {tool_name} tool.", + "parameters": { + "type": "object", + "properties": {"query": {"type": "string"}}, + }, + } + + +@pytest.mark.usefixtures("mock_model_config") +@pytest.mark.unittest +class TestRestrictedToolValidationLifted: + """Validator no longer rejects restricted tools for TCA/TIA; TCS intentionally still rejects them.""" + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + # Should not raise EvaluationException; flag flip made this path legal. 
+ assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_input_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_success_still_rejects_restricted_tool(self, mock_model_config, tool_name): + """TCS keeps the restricted-tool block (its rubric depends on the tool output body).""" + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + eval_input = { + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + with pytest.raises(EvaluationException) as exc_info: + evaluator._validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value) + + def test_mixed_function_and_restricted_tool_accepted(self, mock_model_config): + """Conversation containing both a function call and a restricted tool call validates cleanly.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Find stock price and weather.", + "response": [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_func", + "name": "get_weather", + "arguments": {"location": "Paris"}, + }, + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": "bing_grounding", + "arguments": {"query": "MSFT stock price"}, + }, + ], + } + ], + "tool_definitions": [ + { + "name": "get_weather", + "type": "function", + "description": "Weather lookup.", + "parameters": { + "type": "object", + "properties": {"location": {"type": 
"string"}}, + "required": ["location"], + }, + }, + _restricted_tool_definition("bing_grounding"), + ], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + +@pytest.mark.unittest +class TestUnderlyingValidatorUnchanged: + """The validator class itself still rejects restricted tools when the flag is on. + + Ensures the behavior change is limited to per-evaluator wiring; the validator + keeps its option to enforce the restricted-tool block for other consumers + (e.g. GroundednessEvaluator). + """ + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_calls_validator_still_rejects_when_flag_enabled(self, tool_name): + validator = ToolCallsValidator( + error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + check_for_unsupported_tools=True, + ) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value) + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_definitions_validator_still_rejects_when_flag_enabled(self, tool_name): + validator = ToolDefinitionsValidator( + error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, + requires_query=False, + check_for_unsupported_tools=True, + ) + eval_input = { + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value) From 6eb07a0a1890f07dd26f44fb36105dad0a2baa34 Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Mon, 15 Jun 2026 12:35:16 -0700 Subject: [PATCH 2/4] ToolCallSuccess: move runtime-status short-circuit from prompt into Python Failed/incomplete tool_call or tool_result blocks now 
return a deterministic fail result without invoking the LLM judge; the prompty rubric is consulted only on the success path. Drops [STATUS] suffix from the formatted LLM input (back-compat with pre-pass-through wire format). Adds _collect_failed_tool_calls helper and _return_short_circuit_failure_result method; removes _format_status_suffix; rewrites tests. --- .../_tool_call_success/_tool_call_success.py | 135 +++++++-- .../tool_call_success.prompty | 57 ---- .../test_tool_call_success_evaluator.py | 268 ++++++++++++------ 3 files changed, 296 insertions(+), 164 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index a66af1b48141..a2988431a605 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -141,6 +141,32 @@ def __call__( # pylint: disable=docstring-missing-param """ return super().__call__(*args, **kwargs) + def _return_short_circuit_failure_result( + self, failed_tools: List[str] + ) -> Dict[str, Union[str, float]]: + """Return a deterministic fail result without invoking the LLM judge. + + Used when the runtime explicitly marks one or more tool calls as + failed/incomplete via the ``status`` field on a ``tool_call`` or + ``tool_result`` content block. The LLM call is skipped because the + runtime signal is authoritative. + """ + failed_list = ",".join(failed_tools) + reason = ( + f"Tool call(s) [{failed_list}] reported a non-success runtime status " + "(failed or incomplete). Short-circuited without invoking the LLM judge." 
+ ) + return { + self._result_key: 0.0, + f"{self._result_key}_score": 0.0, + f"{self._result_key}_passed": False, + f"{self._result_key}_result": "fail", + f"{self._result_key}_reason": reason, + f"{self._result_key}_status": "completed", + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_properties": {"failed_tools": failed_list}, + } + @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # type: ignore[override] """Do Tool Call Success evaluation. @@ -181,6 +207,16 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) + # Short-circuit: when the runtime explicitly marks any tool_call + # or tool_result with a non-success status (e.g. ``failed`` or + # ``incomplete``) there is no point asking the LLM judge to + # re-derive the failure from the payload -- the runtime signal + # is authoritative. Return a deterministic fail result and skip + # the LLM call entirely. The prompty rubric is now only + # consulted on the success path (status ``completed`` or absent). + failed_tools = _collect_failed_tool_calls(eval_input["response"]) + if failed_tools: + return self._return_short_circuit_failure_result(failed_tools) eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger) # If response is a string, pass directly without reformatting elif isinstance(eval_input["response"], str): @@ -271,34 +307,87 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None): return tool_definitions -def _format_status_suffix(status): - """Build the trailing ``[STATUS] `` annotation for a content block. +_FAILED_RUNTIME_STATUSES = frozenset({"failed", "incomplete"}) + + +def _collect_failed_tool_calls(messages): + """Return ordered, unique tool names whose runtime status indicates failure. 
- Returns the empty string when ``status`` is absent or not a non-empty - string, so callers can unconditionally concatenate the return value - without affecting back-compat output. + A tool call is treated as a runtime failure when either its assistant + ``tool_call`` content block or its matched tool ``tool_result`` content + block carries a ``status`` field in ``{failed, incomplete}``. The check + runs in Python so the LLM judge is only invoked on the success path + (status ``completed`` or absent); failed/incomplete calls are short- + circuited deterministically. - :param status: The raw ``status`` field from a ``tool_call`` or - ``tool_result`` content block. - :type status: Any - :return: ``" [STATUS] "`` when ``status`` is a non-empty string, - otherwise ``""``. - :rtype: str + When the failing block carries no resolvable function name, the + ``tool_call_id`` is used as a stable identifier instead so the caller + can still surface it in ``properties.failed_tools``. """ - if isinstance(status, str) and status: - return f" [STATUS] {status}" - return "" + if not isinstance(messages, list): + return [] + + id_to_name = {} + failed_ids = [] + failed_names_without_id = [] + + for msg in messages: + if not isinstance(msg, dict) or msg.get("role") != "assistant": + continue + for content in msg.get("content", []) or []: + if not isinstance(content, dict) or content.get("type") != "tool_call": + continue + if "tool_call" in content and "function" in content.get("tool_call", {}): + tc = content["tool_call"] + name = tc.get("function", {}).get("name", "") or "" + tcid = tc.get("id") + else: + name = content.get("name", "") or "" + tcid = content.get("tool_call_id") + if tcid is not None: + id_to_name[tcid] = name + status = content.get("status") + if isinstance(status, str) and status in _FAILED_RUNTIME_STATUSES: + if tcid is not None: + failed_ids.append(tcid) + elif name: + failed_names_without_id.append(name) + + for msg in messages: + if not isinstance(msg, dict) 
or msg.get("role") != "tool": + continue + tcid = msg.get("tool_call_id") + for content in msg.get("content", []) or []: + if not isinstance(content, dict) or content.get("type") != "tool_result": + continue + status = content.get("status") + if isinstance(status, str) and status in _FAILED_RUNTIME_STATUSES and tcid is not None: + failed_ids.append(tcid) + + ordered = [] + seen = set() + for tcid in failed_ids: + label = id_to_name.get(tcid) or tcid + if label and label not in seen: + seen.add(label) + ordered.append(label) + for name in failed_names_without_id: + if name and name not in seen: + seen.add(name) + ordered.append(name) + return ordered def _get_tool_calls_results(agent_response_msgs): """Extract formatted agent tool calls and results from response. - Each emitted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line is suffixed with - ``[STATUS] `` when the source content block carries a ``status`` - field. The prompty rubric uses this annotation as a strong failure signal - (see ``tool_call_success.prompty``). When ``status`` is absent the suffix - is omitted and the rubric falls back to payload-only judgment, so the - formatted output is byte-identical to the pre-pass-through wire format. + The output uses the original ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line + format only; runtime ``status`` is no longer forwarded to the LLM judge. + Failed/incomplete tool calls are short-circuited in Python by + :func:`_collect_failed_tool_calls` before this formatter runs, so by the + time the LLM sees the response every remaining call has either no + status or a ``completed`` status -- the rubric judges those by payload + alone. 
""" agent_response_text = [] tool_results = {} @@ -310,8 +399,7 @@ def _get_tool_calls_results(agent_response_msgs): for content in msg.get("content", []): if content.get("type") == "tool_result": result = content.get("tool_result") - status_suffix = _format_status_suffix(content.get("status")) - tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}{status_suffix}" + tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}" # Second pass: parse assistant messages and tool calls for msg in agent_response_msgs: @@ -330,8 +418,7 @@ def _get_tool_calls_results(agent_response_msgs): func_name = content.get("name", "") args = content.get("arguments", {}) args_str = ", ".join(f'{k}="{v}"' for k, v in args.items()) - status_suffix = _format_status_suffix(content.get("status")) - call_line = f"[TOOL_CALL] {func_name}({args_str}){status_suffix}" + call_line = f"[TOOL_CALL] {func_name}({args_str})" agent_response_text.append(call_line) if tool_call_id in tool_results: agent_response_text.append(tool_results[tool_call_id]) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty index 23a88552ed9a..f9af98623073 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty @@ -53,7 +53,6 @@ B. Examine tool result and definition for the tool being called to check whether 1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it: ERROR-CASES: =========== - - The tool call or tool result line is annotated with **`[STATUS] failed`** or **`[STATUS] incomplete`**. 
These annotations indicate the tool call did not produce a usable result -- either because the runtime explicitly marked the call `failed` (an exception in the tool, the API surface returned an error response) or because the call was interrupted before completion (e.g. host timeout, parent-response cancellation surfaced as `incomplete`). They are strong, authoritative failure signals and override any contradictory appearance of the result payload. - The tool call resulted in an error or exception - The tool call failed to run or failed to return - The tool call returned a result that indicates an error or failure @@ -61,7 +60,6 @@ B. Examine tool result and definition for the tool being called to check whether - The tool timed-out or returned a result that indicate a time-out - The tool result does not make sense, from technical perspective, not business perspective, given the definition of that tool, if the definition is present 2. If none of the error cases apply to the tool result , it is considered **succeeded** even if the tool result itself indicates a business mistake - 3. The `[STATUS]` annotation is **optional**. When it is absent on a tool call, judge that call by the payload-based rules above (back-compat with runtimes that do not emit a status field). When it is present and indicates success (e.g. `[STATUS] completed`), it does not by itself make a call succeed -- still apply the payload-based rules, because a runtime can report `completed` while the tool itself returned an error payload. C. If one or more tool result are **failed** , then you the **evaluation process** has **failed**, otherwise , the **evaluation process** has **succeeded** D. 
You are required to return your **output** in the following format: { @@ -337,61 +335,6 @@ EXPECTED OUTPUT } -### Example - Failed (status annotation overrides bland payload) - -[TOOL_CALLS] -[TOOL_CALL] send_email(to:"alice@example.com" , body:"hi") [STATUS] failed -[TOOL_RESULT] {} [STATUS] failed - -EXPECTED OUTPUT -{ - "reason": "send_email is annotated with [STATUS] failed on both the call and the result, which is an authoritative failure signal from the runtime even though the result body {} is otherwise inconclusive", - "properties": { - "failed_tools": "send_email" - }, - "score": 0, - "status": "completed" -} - - -### Example - Failed (status completed but payload still indicates an error) - -[TOOL_CALLS] -[TOOL_CALL] get_current_user_info() [STATUS] completed -[TOOL_RESULT] {"UserName":"", "UserEmail":"", "Message":"failed to get current user information"} [STATUS] completed - -EXPECTED OUTPUT -{ - "reason": "The runtime reported [STATUS] completed but the result payload still indicates failure with empty fields and an explicit error message. 
Payload-based rules still apply when [STATUS] is completed -- this call is failed", - "properties": { - "failed_tools": "get_current_user_info" - }, - "score": 0, - "status": "completed" -} - - -### Example - Failed (parallel calls in one turn, one annotated failed) - -[TOOL_CALLS] -[TOOL_CALL] fetch_weather(city:"Seattle") [STATUS] completed -[TOOL_RESULT] {"temp": 62} [STATUS] completed -[TOOL_CALL] send_email(to:"x@example.com") [STATUS] failed -[TOOL_RESULT] {} [STATUS] failed -[TOOL_CALL] lookup_user(id:"u42") [STATUS] completed -[TOOL_RESULT] {"user_id": "u42"} [STATUS] completed - -EXPECTED OUTPUT -{ - "reason": "send_email is annotated with [STATUS] failed; the other two parallel calls succeeded but a single failed call is sufficient to fail the overall evaluation", - "properties": { - "failed_tools": "send_email" - }, - "score": 0, - "status": "completed" -} - - Now given the **INPUT** you received generate the output # Output diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py index 6d0a558921f3..288181c87e7a 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py @@ -1,24 +1,27 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -"""Tests for ToolCallSuccess runtime status pass-through to the LLM rubric. - -The evaluator's source-side preprocessing emits ``[STATUS] `` annotations -on each formatted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line whenever the source -content block carries a ``status`` field. 
The prompty rubric is taught to treat -these annotations as a strong (authoritative) failure signal when the status is -in {failed, error, incomplete, cancelled, canceled}, and to fall back to -payload-only judgment when ``status`` is absent. - -These tests cover the source-side preprocessing only (the ``[STATUS]`` string -emission). End-to-end rubric behavior is covered by the existing behavior -suites that exercise the full evaluator with a mocked LLM. +"""Tests for ToolCallSuccess Python-side short-circuit on runtime status. + +The evaluator's preprocessing inspects every assistant ``tool_call`` and tool +``tool_result`` content block. When any of them carries a ``status`` field in +``{failed, incomplete}`` the evaluator returns a deterministic fail result +without invoking the LLM judge. The LLM rubric is consulted only on the +success path (status ``completed`` or absent). + +These tests cover the two new pieces of behavior: + +1. ``_collect_failed_tool_calls`` correctly identifies failed tool names + across the supported content shapes. +2. ``_get_tool_calls_results`` no longer forwards ``[STATUS]`` annotations + to the formatted LLM input (back-compat with the pre-pass-through wire + format). 
""" import pytest from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import ( - _format_status_suffix, + _collect_failed_tool_calls, _get_tool_calls_results, ) @@ -81,73 +84,194 @@ def _assistant_parallel_tool_calls(blocks): @pytest.mark.unittest -class TestFormatStatusSuffix: - """Unit tests for the ``_format_status_suffix`` helper.""" +class TestCollectFailedToolCalls: + """Unit tests for the ``_collect_failed_tool_calls`` helper.""" + + def test_no_status_anywhere_returns_empty(self): + msgs = [ + _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}), + _tool_result("c1", "Sunny, 72F."), + ] + assert _collect_failed_tool_calls(msgs) == [] + + def test_all_completed_returns_empty(self): + msgs = [ + _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}, status="completed"), + _tool_result("c1", "Sunny, 72F.", status="completed"), + ] + assert _collect_failed_tool_calls(msgs) == [] + + def test_failed_status_on_tool_call_block(self): + msgs = [ + _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}, status="failed"), + _tool_result("c1", ""), + ] + assert _collect_failed_tool_calls(msgs) == ["send_email"] + + def test_failed_status_on_tool_result_block(self): + msgs = [ + _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}), + _tool_result("c1", "", status="failed"), + ] + assert _collect_failed_tool_calls(msgs) == ["send_email"] + + def test_incomplete_status_is_treated_as_failure(self): + msgs = [ + _assistant_tool_call("c1", "long_running_query", {}, status="incomplete"), + ] + assert _collect_failed_tool_calls(msgs) == ["long_running_query"] + + def test_failed_on_both_call_and_result_dedupes_to_single_entry(self): + msgs = [ + _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}, status="failed"), + _tool_result("c1", "", status="failed"), + ] + assert _collect_failed_tool_calls(msgs) == ["send_email"] - def test_known_failure_status_emits_suffix(self): - """A known-failure 
status string produces a ``[STATUS] `` suffix.""" - assert _format_status_suffix("failed") == " [STATUS] failed" + def test_unknown_runtime_status_is_ignored(self): + # Only "failed" and "incomplete" trigger the short-circuit; anything else + # (including "error", "cancelled", "rate_limited", ...) falls through to + # the LLM rubric for payload-based judgment, preserving back-compat with + # runtimes that emit non-standardized status values. + msgs = [ + _assistant_tool_call("c1", "send_email", {}, status="error"), + _tool_result("c1", "", status="cancelled"), + ] + assert _collect_failed_tool_calls(msgs) == [] + + def test_parallel_calls_one_failed_returns_only_the_failed_name(self): + msgs = [ + _assistant_parallel_tool_calls([ + ("c1", "fetch_weather", {"city": "Seattle"}, "completed"), + ("c2", "send_email", {"to": "x@example.com"}, "failed"), + ("c3", "lookup_user", {"id": "u42"}, "completed"), + ]), + _tool_result("c1", "Sunny, 72F.", status="completed"), + _tool_result("c2", "", status="failed"), + _tool_result("c3", {"user_id": "u42"}, status="completed"), + ] + assert _collect_failed_tool_calls(msgs) == ["send_email"] - def test_completed_status_emits_suffix(self): - """A success status string also emits a suffix (the rubric distinguishes the two).""" - assert _format_status_suffix("completed") == " [STATUS] completed" + def test_multiple_distinct_failures_dedupe_across_passes(self): + msgs = [ + _assistant_parallel_tool_calls([ + ("c1", "send_email", {"to": "x"}, "failed"), + ("c2", "fetch_weather", {"city": "Seattle"}, None), + ("c3", "lookup_user", {"id": "u42"}, "incomplete"), + ]), + _tool_result("c2", "Sunny", status="failed"), + # c1's tool_result also fails -- must not double-list send_email. 
+ _tool_result("c1", "", status="failed"), + ] + result = _collect_failed_tool_calls(msgs) + assert set(result) == {"send_email", "fetch_weather", "lookup_user"} + # send_email and lookup_user are recorded during the assistant pass + # before fetch_weather appears in the tool pass. + assert result.index("send_email") < result.index("fetch_weather") + assert result.index("lookup_user") < result.index("fetch_weather") + + def test_failed_call_without_id_falls_back_to_name(self): + msgs = [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "name": "anon_tool", + "arguments": {}, + "status": "failed", + } + ], + } + ] + assert _collect_failed_tool_calls(msgs) == ["anon_tool"] - def test_arbitrary_status_string_emits_suffix(self): - """Any non-empty string status emits a suffix; the rubric judges semantics, not Python.""" - assert _format_status_suffix("rate_limited") == " [STATUS] rate_limited" + def test_failed_tool_result_without_assistant_call_uses_id_as_label(self): + msgs = [ + _tool_result("c1", "", status="failed"), + ] + assert _collect_failed_tool_calls(msgs) == ["c1"] - def test_none_status_emits_empty(self): - """Absent status (``None``) emits the empty string for back-compat.""" - assert _format_status_suffix(None) == "" + def test_nested_function_shape_failed_status(self): + # The "tool_call.function.name" shape is what _normalize_function_call_types + # produces from OpenAI Responses-API function_call blocks. 
+ msgs = [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call": { + "id": "c1", + "function": { + "name": "send_email", + "arguments": {"to": "x"}, + }, + }, + "status": "failed", + } + ], + } + ] + assert _collect_failed_tool_calls(msgs) == ["send_email"] - def test_empty_string_status_emits_empty(self): - """Empty string status emits the empty string (treated same as absent).""" - assert _format_status_suffix("") == "" + def test_non_list_input_returns_empty(self): + assert _collect_failed_tool_calls(None) == [] + assert _collect_failed_tool_calls({}) == [] + assert _collect_failed_tool_calls("not a list") == [] - def test_non_string_status_emits_empty(self): - """Non-string statuses (int, dict, list) are ignored rather than raised on.""" - assert _format_status_suffix(42) == "" - assert _format_status_suffix({"x": 1}) == "" - assert _format_status_suffix(["failed"]) == "" + def test_malformed_content_blocks_are_skipped_silently(self): + msgs = [ + {"role": "assistant", "content": [None, "string", {"type": "text"}]}, + {"role": "tool", "tool_call_id": "c1", "content": [None]}, + ] + assert _collect_failed_tool_calls(msgs) == [] @pytest.mark.unittest -class TestGetToolCallsResultsStatusPassthrough: - """Integration tests for ``[STATUS]`` annotation emission via ``_get_tool_calls_results``.""" +class TestGetToolCallsResultsNoStatusForward: + """``_get_tool_calls_results`` must not forward ``[STATUS]`` to the LLM input. - def test_status_on_tool_call_is_appended_to_tool_call_line(self): - """When ``status`` is set on a tool_call block, the ``[TOOL_CALL]`` line carries the annotation.""" + Runtime status drives the Python short-circuit; the LLM rubric is only + invoked on the success path and so the formatted output is byte-identical + to the pre-status-pass-through wire format regardless of whether the + source blocks carry a ``status`` field. 
+ """ + + def test_status_on_tool_call_is_not_appended(self): msgs = [ _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}, status="failed"), _tool_result("c1", ""), ] lines = _get_tool_calls_results(msgs) - assert lines[0] == '[TOOL_CALL] send_email(to="x@example.com") [STATUS] failed' - # Tool result has no status -> no suffix. - assert lines[1] == "[TOOL_RESULT] " + assert lines == [ + '[TOOL_CALL] send_email(to="x@example.com")', + "[TOOL_RESULT] ", + ] - def test_status_on_tool_result_is_appended_to_tool_result_line(self): - """When ``status`` is set on a tool_result block, the ``[TOOL_RESULT]`` line carries the annotation.""" + def test_status_on_tool_result_is_not_appended(self): msgs = [ _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}), - _tool_result("c1", "", status="error"), + _tool_result("c1", "", status="failed"), ] lines = _get_tool_calls_results(msgs) - assert lines[0] == '[TOOL_CALL] send_email(to="x@example.com")' - assert lines[1] == "[TOOL_RESULT] [STATUS] error" + assert lines == [ + '[TOOL_CALL] send_email(to="x@example.com")', + "[TOOL_RESULT] ", + ] - def test_completed_status_is_passed_through_too(self): - """``[STATUS] completed`` is emitted alongside failure statuses; the rubric decides semantics.""" + def test_completed_status_is_not_appended(self): msgs = [ _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}, status="completed"), _tool_result("c1", "Sunny, 72F.", status="completed"), ] lines = _get_tool_calls_results(msgs) - assert lines[0] == '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed' - assert lines[1] == "[TOOL_RESULT] Sunny, 72F. 
[STATUS] completed" + assert lines == [ + '[TOOL_CALL] fetch_weather(city="Seattle")', + "[TOOL_RESULT] Sunny, 72F.", + ] - def test_absent_status_produces_no_suffix_back_compat(self): - """When ``status`` is absent on every block, output matches the pre-status-pass-through format exactly.""" + def test_absent_status_back_compat_unchanged(self): msgs = [ _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}), _tool_result("c1", "Sunny, 72F."), @@ -158,45 +282,23 @@ def test_absent_status_produces_no_suffix_back_compat(self): "[TOOL_RESULT] Sunny, 72F.", ] - def test_parallel_tool_calls_in_one_assistant_message_each_get_their_own_status(self): - """Multiple ``tool_call`` blocks in one assistant message each emit their own ``[STATUS]`` annotation. - - This is the modern Responses-API topology and exercises that the - formatter walks into the content list rather than only processing the - first block per message. - """ + def test_parallel_tool_calls_in_one_message_no_status_in_output(self): msgs = [ _assistant_parallel_tool_calls([ ("c1", "fetch_weather", {"city": "Seattle"}, "completed"), - ("c2", "send_email", {"to": "x@example.com"}, "failed"), - ("c3", "lookup_user", {"id": "u42"}, "completed"), + ("c2", "send_email", {"to": "x@example.com"}, "completed"), + ("c3", "lookup_user", {"id": "u42"}, "completed"), ]), _tool_result("c1", "Sunny, 72F.", status="completed"), - _tool_result("c2", "", status="failed"), + _tool_result("c2", "ok", status="completed"), _tool_result("c3", {"user_id": "u42"}, status="completed"), ] lines = _get_tool_calls_results(msgs) assert lines == [ - '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed', - "[TOOL_RESULT] Sunny, 72F. 
[STATUS] completed", - '[TOOL_CALL] send_email(to="x@example.com") [STATUS] failed', - "[TOOL_RESULT] [STATUS] failed", - '[TOOL_CALL] lookup_user(id="u42") [STATUS] completed', - "[TOOL_RESULT] {'user_id': 'u42'} [STATUS] completed", - ] - - def test_mixed_status_present_and_absent_across_calls(self): - """A response with status on some calls and not others produces a mixed-suffix output.""" - msgs = [ - _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}, status="completed"), - _tool_result("c1", "Sunny, 72F."), - _assistant_tool_call("c2", "send_email", {"to": "x@example.com"}), - _tool_result("c2", "", status="failed"), - ] - lines = _get_tool_calls_results(msgs) - assert lines == [ - '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed', + '[TOOL_CALL] fetch_weather(city="Seattle")', "[TOOL_RESULT] Sunny, 72F.", '[TOOL_CALL] send_email(to="x@example.com")', - "[TOOL_RESULT] [STATUS] failed", + "[TOOL_RESULT] ok", + '[TOOL_CALL] lookup_user(id="u42")', + "[TOOL_RESULT] {'user_id': 'u42'}", ] From bc783a6b6156ccdafc0ed622975a109ff2d84d36 Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Mon, 15 Jun 2026 13:24:50 -0700 Subject: [PATCH 3/4] ToolCallSuccess: rename tcid -> call_id to satisfy cspell --- .../_tool_call_success/_tool_call_success.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index a2988431a605..8520b61d7d25 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -340,34 +340,34 @@ def _collect_failed_tool_calls(messages): if "tool_call" in content and "function" in content.get("tool_call", {}): 
tc = content["tool_call"] name = tc.get("function", {}).get("name", "") or "" - tcid = tc.get("id") + call_id = tc.get("id") else: name = content.get("name", "") or "" - tcid = content.get("tool_call_id") - if tcid is not None: - id_to_name[tcid] = name + call_id = content.get("tool_call_id") + if call_id is not None: + id_to_name[call_id] = name status = content.get("status") if isinstance(status, str) and status in _FAILED_RUNTIME_STATUSES: - if tcid is not None: - failed_ids.append(tcid) + if call_id is not None: + failed_ids.append(call_id) elif name: failed_names_without_id.append(name) for msg in messages: if not isinstance(msg, dict) or msg.get("role") != "tool": continue - tcid = msg.get("tool_call_id") + call_id = msg.get("tool_call_id") for content in msg.get("content", []) or []: if not isinstance(content, dict) or content.get("type") != "tool_result": continue status = content.get("status") - if isinstance(status, str) and status in _FAILED_RUNTIME_STATUSES and tcid is not None: - failed_ids.append(tcid) + if isinstance(status, str) and status in _FAILED_RUNTIME_STATUSES and call_id is not None: + failed_ids.append(call_id) ordered = [] seen = set() - for tcid in failed_ids: - label = id_to_name.get(tcid) or tcid + for call_id in failed_ids: + label = id_to_name.get(call_id) or call_id if label and label not in seen: seen.add(label) ordered.append(label) From 470db6b11d86547fffd3702e1cef4103edfd3f03 Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Mon, 15 Jun 2026 14:32:38 -0700 Subject: [PATCH 4/4] ToolCallSuccess: apply black formatting --- .../_tool_call_success/_tool_call_success.py | 4 +-- .../test_tool_call_success_evaluator.py | 36 +++++++++++-------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 
8520b61d7d25..c366e81bd3b9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -141,9 +141,7 @@ def __call__( # pylint: disable=docstring-missing-param """ return super().__call__(*args, **kwargs) - def _return_short_circuit_failure_result( - self, failed_tools: List[str] - ) -> Dict[str, Union[str, float]]: + def _return_short_circuit_failure_result(self, failed_tools: List[str]) -> Dict[str, Union[str, float]]: """Return a deterministic fail result without invoking the LLM judge. Used when the runtime explicitly marks one or more tool calls as diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py index 288181c87e7a..6993d87337b4 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py @@ -141,11 +141,13 @@ def test_unknown_runtime_status_is_ignored(self): def test_parallel_calls_one_failed_returns_only_the_failed_name(self): msgs = [ - _assistant_parallel_tool_calls([ - ("c1", "fetch_weather", {"city": "Seattle"}, "completed"), - ("c2", "send_email", {"to": "x@example.com"}, "failed"), - ("c3", "lookup_user", {"id": "u42"}, "completed"), - ]), + _assistant_parallel_tool_calls( + [ + ("c1", "fetch_weather", {"city": "Seattle"}, "completed"), + ("c2", "send_email", {"to": "x@example.com"}, "failed"), + ("c3", "lookup_user", {"id": "u42"}, "completed"), + ] + ), _tool_result("c1", "Sunny, 72F.", status="completed"), _tool_result("c2", "", status="failed"), _tool_result("c3", {"user_id": "u42"}, status="completed"), @@ -154,11 +156,13 @@ def test_parallel_calls_one_failed_returns_only_the_failed_name(self): def 
test_multiple_distinct_failures_dedupe_across_passes(self): msgs = [ - _assistant_parallel_tool_calls([ - ("c1", "send_email", {"to": "x"}, "failed"), - ("c2", "fetch_weather", {"city": "Seattle"}, None), - ("c3", "lookup_user", {"id": "u42"}, "incomplete"), - ]), + _assistant_parallel_tool_calls( + [ + ("c1", "send_email", {"to": "x"}, "failed"), + ("c2", "fetch_weather", {"city": "Seattle"}, None), + ("c3", "lookup_user", {"id": "u42"}, "incomplete"), + ] + ), _tool_result("c2", "Sunny", status="failed"), # c1's tool_result also fails -- must not double-list send_email. _tool_result("c1", "", status="failed"), @@ -284,11 +288,13 @@ def test_absent_status_back_compat_unchanged(self): def test_parallel_tool_calls_in_one_message_no_status_in_output(self): msgs = [ - _assistant_parallel_tool_calls([ - ("c1", "fetch_weather", {"city": "Seattle"}, "completed"), - ("c2", "send_email", {"to": "x@example.com"}, "completed"), - ("c3", "lookup_user", {"id": "u42"}, "completed"), - ]), + _assistant_parallel_tool_calls( + [ + ("c1", "fetch_weather", {"city": "Seattle"}, "completed"), + ("c2", "send_email", {"to": "x@example.com"}, "completed"), + ("c3", "lookup_user", {"id": "u42"}, "completed"), + ] + ), _tool_result("c1", "Sunny, 72F.", status="completed"), _tool_result("c2", "ok", status="completed"), _tool_result("c3", {"user_id": "u42"}, status="completed"),