From a7d0c2b134acd998593dd8fae6b2f37f8ffc407a Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Mon, 6 Apr 2026 19:26:31 +0200 Subject: [PATCH 1/3] changes --- .../azure/ai/evaluation/__init__.py | 2 + .../_evaluators/_quality_grader/__init__.py | 9 + .../_quality_grader/_quality_grader.py | 332 ++++++++++++++++++ .../quality_grader_groundedness.prompty | 236 +++++++++++++ .../quality_grader_response_quality.prompty | 246 +++++++++++++ .../azure/ai/evaluation/_exceptions.py | 1 + 6 files changed, 826 insertions(+) create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/__init__.py create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/_quality_grader.py create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_groundedness.prompty create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_response_quality.prompty diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 6703b2ca111f..42a0734de1e8 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -31,6 +31,7 @@ from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator from ._evaluators._ungrounded_attributes import UngroundedAttributesEvaluator from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator +from ._evaluators._quality_grader import QualityGraderEvaluator from ._evaluators._document_retrieval import DocumentRetrievalEvaluator from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator @@ -141,6 +142,7 @@ def lazy_import(): "AzureOpenAITextSimilarityGrader", "AzureOpenAIScoreModelGrader", 
"AzureOpenAIPythonGrader", + "QualityGraderEvaluator", ] __all__.extend([p for p in _patch_all if p not in __all__]) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/__init__.py new file mode 100644 index 000000000000..21905cd96512 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/__init__.py @@ -0,0 +1,9 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +from ._quality_grader import QualityGraderEvaluator + +__all__ = [ + "QualityGraderEvaluator", +] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/_quality_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/_quality_grader.py new file mode 100644 index 000000000000..a7cf6ed46516 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/_quality_grader.py @@ -0,0 +1,332 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import json
import logging
import os
from typing import Dict, List, Optional, Union

from typing_extensions import overload, override

# The in-package legacy prompty runtime is the default; the promptflow
# runtime can be opted into via the AI_EVALS_USE_PF_PROMPTY env var.
if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
    from promptflow.core._flow import AsyncPrompty
else:
    from azure.ai.evaluation._legacy.prompty import AsyncPrompty

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
from azure.ai.evaluation._model_configurations import Conversation

try:
    from ..._user_agent import UserAgentSingleton
except ImportError:

    # Minimal stand-in used when the package-level user-agent helper
    # cannot be imported (e.g. outside the full package layout).
    class UserAgentSingleton:
        @property
        def value(self) -> str:
            return "None"


from ..._common.utils import construct_prompty_model_config, validate_model_config

logger = logging.getLogger(__name__)

# Expected values / thresholds for the stage-1 (response quality) checks.
# The two boolean constants document the expected abstention/relevance values
# asserted in _do_eval.
_RESPONSE_QUALITY_ABSTENTION_EXPECTED = False
_RESPONSE_QUALITY_RELEVANCE_EXPECTED = True
_RESPONSE_QUALITY_ANSWER_COMPLETENESS_THRESHOLD = 1.5

# Thresholds for the stage-2 (groundedness) checks.
_GROUNDEDNESS_THRESHOLD = 3.5
_CONTEXT_COVERAGE_THRESHOLD = 1.5


class QualityGraderEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    """Evaluates overall response quality using a two-stage grading pipeline.

    Stage 1 (Response Quality): Evaluates the response for relevance, abstention, and answer completeness.
    The response must satisfy:
    - abstention must be false
    - relevance must be true
    - answerCompleteness must be greater than 1.5

    Stage 2 (Groundedness, only if context is provided): Evaluates whether the response is grounded in the
    provided context and covers the key information. The response must satisfy:
    - groundedness must be greater than 3.5
    - contextCoverage must be greater than 1.5

    If all checks pass, the evaluator returns "pass".
    Otherwise, it returns "fail" with failure reasons.

    :param model_config: Configuration for the Azure OpenAI model.
    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param credential: The credential for authenticating to Azure AI service.
    :type credential: ~azure.core.credentials.TokenCredential
    :keyword is_reasoning_model: If True, updates config parameters for reasoning models. Defaults to False.
    :paramtype is_reasoning_model: bool
    """

    # File names of the two prompty definitions shipped next to this module.
    _RESPONSE_QUALITY_PROMPTY = "quality_grader_response_quality.prompty"
    _GROUNDEDNESS_PROMPTY = "quality_grader_groundedness.prompty"
    _RESULT_KEY = "quality_grader"
    _OPTIONAL_PARAMS = ["context"]

    id = "azureai://built-in/evaluators/quality_grader"

    @override
    def __init__(self, model_config, *, credential=None, **kwargs):
        current_dir = os.path.dirname(__file__)
        response_quality_prompty_path = os.path.join(current_dir, self._RESPONSE_QUALITY_PROMPTY)

        self._higher_is_better = True
        self._model_config = model_config
        self._credential = credential

        # The base class owns the stage-1 (response quality) prompty flow,
        # exposed as self._flow.
        super().__init__(
            model_config=model_config,
            prompty_file=response_quality_prompty_path,
            result_key=self._RESULT_KEY,
            threshold=1,
            credential=credential,
            _higher_is_better=self._higher_is_better,
            **kwargs,
        )

        # Load the second prompty flow for the stage-2 groundedness evaluation.
        groundedness_prompty_path = os.path.join(current_dir, self._GROUNDEDNESS_PROMPTY)
        subclass_name = self.__class__.__name__
        user_agent = f"{UserAgentSingleton().value} (type=evaluator subtype={subclass_name})"
        prompty_model_config = construct_prompty_model_config(
            validate_model_config(model_config),
            self._DEFAULT_OPEN_API_VERSION,
            user_agent,
        )
        self._groundedness_flow = AsyncPrompty.load(
            source=groundedness_prompty_path,
            model=prompty_model_config,
            token_credential=credential,
            is_reasoning_model=kwargs.get("is_reasoning_model", False),
        )

    @overload
    def __call__(
        self,
        *,
        query: str,
        response: str,
        context: Optional[str] = None,
    ) -> Dict[str, Union[str, float]]:
        """Evaluate quality for a given query, response, and optional context.

        :keyword query: The query to be evaluated.
        :paramtype query: str
        :keyword response: The response to be evaluated.
        :paramtype response: str
        :keyword context: The context (retrieved documents) to evaluate groundedness against. Optional.
        :paramtype context: Optional[str]
        :return: The quality grader result.
        :rtype: Dict[str, Union[str, float]]
        """

    @overload
    def __call__(
        self,
        *,
        conversation: Conversation,
    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
        """Evaluate quality for a conversation.

        :keyword conversation: The conversation to evaluate.
        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
        :return: The quality grader result.
        :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]
        """

    @override
    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs)

    @staticmethod
    def _append_threshold_failure(
        failure_reasons: List[str],
        metric_name: str,
        value: object,
        threshold: float,
    ) -> None:
        """Append a failure reason when *value* is missing, non-numeric, or not above *threshold*.

        A missing value (None or the string "null") and any other non-numeric
        value are treated as failures so malformed grader output fails closed
        instead of silently passing.

        :param failure_reasons: Mutable list of failure messages to append to.
        :param metric_name: Metric name used in the failure message.
        :param value: The raw value parsed from the grader's JSON output.
        :param threshold: The value must be strictly greater than this to pass.
        """
        # bool is a subclass of int; exclude it so a stray true/false does not
        # sneak through the numeric comparison.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            if value <= threshold:
                failure_reasons.append(f"{metric_name} is {value} (must be > {threshold})")
        elif value is None or value == "null":
            failure_reasons.append(f"{metric_name} is null (must be > {threshold})")
        else:
            # Unexpected non-numeric grader output (e.g. a string score).
            failure_reasons.append(f"{metric_name} is {value!r} (not a number; must be > {threshold})")

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict]]:  # type: ignore[override]
        """Run the two-stage quality grading pipeline.

        Stage 1: Call the response quality prompt and check thresholds.
        Stage 2 (if context provided): Call the groundedness prompt and check thresholds.

        :param eval_input: Dict with "query", "response", and optionally "context".
        :return: Standardized result dict produced by :meth:`_build_result`.
        """
        query = eval_input.get("query", "")
        response = eval_input.get("response", "")
        context = eval_input.get("context", None)

        total_prompt_tokens = 0
        total_completion_tokens = 0
        total_tokens = 0
        model_id = ""

        # --- Stage 1: Response Quality ---
        stage1_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, question=query, response=response)
        stage1_parsed = self._parse_prompty_json_output(stage1_output)
        if stage1_output:
            total_prompt_tokens += stage1_output.get("input_token_count", 0)
            total_completion_tokens += stage1_output.get("output_token_count", 0)
            total_tokens += stage1_output.get("total_token_count", 0)
            model_id = stage1_output.get("model_id", "")

        failure_reasons: List[str] = []
        abstention = stage1_parsed.get("abstention")
        relevance = stage1_parsed.get("relevance")

        # Expected values documented by the module-level constants: the
        # assistant must not abstain and the response must be relevant.
        if abstention is True:
            failure_reasons.append("abstention is true (expected false)")
        if relevance is not True:
            failure_reasons.append(f"relevance is {relevance} (expected true)")
        self._append_threshold_failure(
            failure_reasons,
            "answerCompleteness",
            stage1_parsed.get("answerCompleteness"),
            _RESPONSE_QUALITY_ANSWER_COMPLETENESS_THRESHOLD,
        )

        if failure_reasons:
            # Stage 1 failed: do not spend a second LLM call on groundedness.
            return self._build_result(
                passed=False,
                failure_reasons=failure_reasons,
                stage1_parsed=stage1_parsed,
                stage2_parsed=None,
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_tokens,
                model_id=model_id,
            )

        # --- Stage 2: Groundedness (only when a non-empty context is provided) ---
        stage2_parsed = None
        if isinstance(context, str) and context.strip():
            stage2_output = await self._groundedness_flow(
                timeout=self._LLM_CALL_TIMEOUT, question=query, response=response, context=context
            )
            stage2_parsed = self._parse_prompty_json_output(stage2_output)
            if stage2_output:
                total_prompt_tokens += stage2_output.get("input_token_count", 0)
                total_completion_tokens += stage2_output.get("output_token_count", 0)
                total_tokens += stage2_output.get("total_token_count", 0)

            self._append_threshold_failure(
                failure_reasons,
                "groundedness",
                stage2_parsed.get("groundedness"),
                _GROUNDEDNESS_THRESHOLD,
            )
            self._append_threshold_failure(
                failure_reasons,
                "contextCoverage",
                stage2_parsed.get("contextCoverage"),
                _CONTEXT_COVERAGE_THRESHOLD,
            )

        return self._build_result(
            passed=not failure_reasons,
            failure_reasons=failure_reasons,
            stage1_parsed=stage1_parsed,
            stage2_parsed=stage2_parsed,
            prompt_tokens=total_prompt_tokens,
            completion_tokens=total_completion_tokens,
            total_tokens=total_tokens,
            model_id=model_id,
        )

    @staticmethod
    def _parse_prompty_json_output(prompty_output: Optional[Dict]) -> Dict:
        """Parse the JSON output from a prompty flow call.

        :param prompty_output: The raw output dict from the prompty flow.
        :return: Parsed JSON dict from the LLM output, or {} if absent/unparseable.
        """
        if not prompty_output:
            return {}
        llm_output = prompty_output.get("llm_output", "")
        if not llm_output:
            return {}
        # Some runtimes return the already-parsed dict instead of a JSON string.
        if isinstance(llm_output, dict):
            return llm_output
        try:
            return json.loads(llm_output)
        except (json.JSONDecodeError, TypeError):
            logger.warning("Failed to parse LLM output as JSON: %s", llm_output)
            return {}

    def _build_result(
        self,
        *,
        passed: bool,
        failure_reasons: List[str],
        stage1_parsed: Optional[Dict],
        stage2_parsed: Optional[Dict],
        prompt_tokens: int,
        completion_tokens: int,
        total_tokens: int,
        model_id: str,
    ) -> Dict[str, Union[str, float, Dict]]:
        """Build the standardized result dictionary.

        :param passed: Whether the evaluation passed.
        :param failure_reasons: List of reasons for failure (empty if passed).
        :param stage1_parsed: Parsed output from stage 1 (response quality).
        :param stage2_parsed: Parsed output from stage 2 (groundedness), or None if not run.
        :param prompt_tokens: Total prompt tokens used.
        :param completion_tokens: Total completion tokens used.
        :param total_tokens: Total tokens used.
        :param model_id: The model ID used.
        :return: Standardized result dict keyed by the evaluator's result key.
        """
        score = 1.0 if passed else 0.0
        result_label = self._PASS_RESULT if passed else self._FAIL_RESULT
        reason = "All quality checks passed." if passed else "; ".join(failure_reasons)

        details = {}
        if stage1_parsed:
            details["abstention"] = stage1_parsed.get("abstention")
            details["relevance"] = stage1_parsed.get("relevance")
            details["answerCompleteness"] = stage1_parsed.get("answerCompleteness")
            details["queryType"] = stage1_parsed.get("queryType")
            details["conversationIncomplete"] = stage1_parsed.get("conversationIncomplete")
            details["judgeConfidence"] = stage1_parsed.get("judgeConfidence")
            details["stage1_explanation"] = stage1_parsed.get("explanation", {})

        if stage2_parsed:
            details["groundedness"] = stage2_parsed.get("groundedness")
            details["contextCoverage"] = stage2_parsed.get("contextCoverage")
            details["documentUtility"] = stage2_parsed.get("documentUtility")
            details["missingContextParts"] = stage2_parsed.get("missingContextParts", [])
            details["unsupportedClaims"] = stage2_parsed.get("unsupportedClaims", [])
            details["stage2_explanation"] = stage2_parsed.get("explanation", {})

        return {
            self._result_key: score,
            f"{self._result_key}_result": result_label,
            f"{self._result_key}_reason": reason,
            f"{self._result_key}_threshold": self._threshold,
            f"{self._result_key}_details": details,
            f"{self._result_key}_prompt_tokens": prompt_tokens,
            f"{self._result_key}_completion_tokens": completion_tokens,
            f"{self._result_key}_total_tokens": total_tokens,
            f"{self._result_key}_model": model_id,
        }
+model: + api: chat + parameters: + temperature: 0.0 + max_tokens: 3000 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: json_object + +inputs: + question: + type: string + response: + type: string + context: + type: string + +--- +system: +You are an expert evaluator of Retrieval-Augmented Generation (RAG) systems. +Your task is to evaluate an assistant response using ONLY the provided retrieved documents (context). +You must be strict about topic, entity, and semantic alignment between: + - the User Question + - the Retrieved Documents + - the Assistant Response + +**Task:** Evaluate the quality of response(s) of an AI assistant in terms of "groundedness". +-- +You must evaluate THREE independent dimensions: + +**Definitions:** +1. **documentUtility**: + Determines whether the retrieved documents can be used to answer the user's question. + When measuring documentUtility the actual response should be ignored. Only the question and the retrieved documents are relevant. + - **A:** All key points of the question are explicitly supported by the retrieved documents (traceable evidence for each). + - **B:** Some (but not all) key points are supported by the retrieved documents. + - **C:** Retrieved documents cannot be used to answer the question at all and do not contain relevant information. + This includes: + - Topic mismatch (documents discuss a different subject) + - Entity mismatch (wrong product, disease, organization, system, etc.) + - Scope mismatch (wrong geography, timeframe, version, or audience) + - Generic or best‑practice information pasted onto a specific question + +**Granularity / Entity / Scope rule:** + If the documents do not clearly match the entity, topic, or scope of the question, + that aspect MUST be considered not covered. + +2. **groundedness:(1 / 2 / 3 / 4 / 5 / null)** + Measures evaluates whether the FACTUAL CLAIMS made in the assistant response are directly supported by the provided context. 
+ groundedness measures FAITHFULNESS to the documents. It does NOT measure whether the response answers the user question. + A grounded response aligns with the context, uses the information it provides, and does not introduce contradictions or unsupported claims. + +3. **ContextCoverage (0 / 1 / 2 / 3)**: + Measures whether the assistant response includes the key information from the retrieved context + that is REQUIRED to answer the user question. + + - **3:** All CRITICAL context parts required to answer the question are included. + - **2:** Most CRITICAL context parts are included. Missing only non-critical or supporting parts does NOT reduce this score. + - **1:** Some but not all CRITICAL context parts are included. + - **0:** None of the CRITICAL required context parts are included. + + CRITICAL context parts are those without which the question cannot be correctly answered. + SUPPORTING parts add detail but are not strictly required. + +-- +**Instructions:** + +For each assistant response in the chat history: + +- **groundedness:** + - Score from 1 to 5 or null: + - 1: Contradicts the documents or invents facts. + - 2: Most claims lack evidence or rely on unrelated context. Also claims are based on misinterpretation of the documents OR reuse document content for a different topic. + - 3: One or more major claims are weakly supported, inferred, or underspecified. + - 4: All major claims supported; only minor non‑essential claims lack evidence. + - 5: Every factual claim is explicitly supported by the context (traceable evidence). + - null: If the response does not provide any information so groundedness cannot be judged. + + **Additional groundedness Guidelines:** + - groundedness > 3 is considered grounded and good enough for the user, whereas 3 or below is not good enough. + - False information: groundedness measures whether false information is provided based on the given context. It does not measure whether the response answers the query. 
No information is considered as null. + - Direct alignment: Responses directly supported by the context should be highly rated (4 or 5). + - Contradictions: Responses contradicting the context should be rated lower (1 or 2). + - Partial use of context: Responses that partially use the context but add unclear or unsupported details should be rated 3 or lower. + - Missing context: If the response says it cannot provide an answer without providing information from the context, the groundedness is null. + - Clarification requests: If the response is a clarification question relevant to the query, consider groundedness as null. + - Conclusions from missing information: A response cannot assume the answer is "no" just because no evidence is found. Such assumptions should lead to low groundedness (3 or lower). + - Topic transference (critical RAG failure): If the assistant takes documents about ONE topic and applies them to a DIFFERENT question topic, this is a critical error. + - Misinterpretation of context: If the response misinterprets the context, groundedness should be low (2 or lower). + - groundedness does not measure completeness. + - If documentUtility is C, groundedness cannot be high. + +**groundedness examples:** + 1) Context: "The flu typically causes fever, cough, and fatigue." + Question: "What are the symptoms of the flu?" + Assistant Response: "The symptoms of the flu are fever, cough, and fatigue." + groundedness: 5 + + 2) Context: "The flu typically causes fever, cough, and fatigue." + Question: "What are the symptoms of the flu?" + Assistant Response: "The flu causes cough." + groundedness: 5 + + 3) Context: "The flu typically causes fever, cough, and fatigue." + Question: "What are the symptoms of the flu?" + Assistant Response: "The flu causes fever, cough, fatigue and muscle pain." + groundedness: 3 + + 4) Context: "" + Question: "What are the symptoms of the flu?" + Assistant Response: "I am sorry, I can't help with that." 
+ groundedness: null + + 5) Context: "" + Question: "What are the symptoms of the flu?" + Assistant Response: "I assume there are no symptoms." + groundedness: 2 + + 6) Context: "The flu typically causes fever, cough, and fatigue." + Question: "What are the symptoms of covid?" + Assistant Response: "The symptoms of covid are fever, cough, and fatigue." + groundedness: 2 + + 7) Context: "The apple watch product cost 50$." + Question: "How much does the apple watch cost?" + Assistant Response: "The apple watch cost 60$." + groundedness: 1 + + 8) Context: "If you are not pleased with your purchase, you can return it within 30 days." + Question: "Can I return the product?" + Assistant Response: "You can return the product within 30 days." + groundedness: 5 + + 9) Context: Same as above. + Question: "Can I return the product within 60 days?" + Assistant Response: "You can return the product within 30 days." + groundedness: 5 + + 10) Context: Same. + Question: "Can I return the product?" + Assistant Response: "You can return the product within 60 days." + groundedness: 2 + +**ContextCoverage examples:** + 1) Context: "The product can be returned within 30 days with a receipt. Refunds are issued to the original payment method." + Question: "Can I return the product and get a refund?" + Assistant Response: "You can return the product within 30 days and receive a refund to the original payment method." + ContextCoverage: 3 (All critical parts included: return window + refund method) + + 2) Context: "The product can be returned within 30 days with a receipt. Refunds are issued to the original payment method." + Question: "Can I return the product and get a refund?" + Assistant Response: "You can return the product within 30 days." + ContextCoverage: 2 (Refund information is a critical missing part) + + 3) Context: "The product can be returned within 30 days with a receipt. Refunds are issued to the original payment method." 
+ Question: "Can I return the product and get a refund?" + Assistant Response: "Please contact customer support for more information." + ContextCoverage: 0 (No context parts included) + + 4) Context: "Most products can be returned within 30 days with a receipt besides phones which are non-returnable. Refunds are issued to the original payment method. " + Question: "Can I return the product and get a refund?" + Assistant Response: "You can return most products within 30 days and receive a refund." + ContextCoverage: 1 (non-returnable phones is important info) + +-- +**Critical Error Warning:** +If the assistant response uses context unrelated to the question topic, this is a critical error. + - DocumentUtility = C + - Groundedness ≤ 3 + +You MUST: +1) Identify the MAJOR factual claims made in the response. +2) Check each claim against the documents. +3) List any claims that are not explicitly supported. All claims should be concise. +4) Missing context parts that exists in the context and should have been included but did not. Up to 7 words per part. + + +**Outputs:** +Provide the evaluation in JSON format: + +{ + "explanation": { + "groundedness": "", + "documentUtility": "" + }, + "index": , + "groundedness": <1 to 5 or null>, + "contextCoverage": <0/1/2/3>, + "missingContextParts": [ + { "part": "", "importance": "critical | supporting" } + ], + "unsupportedClaims": ["Concise Claims that are not found in the documents"], + "documentUtility": "", + "judgeConfidence": "" +} + +**IMPORTANT JSON RULES:** +- NEVER include comments (// or /* */) in the JSON output - this will break JSON parsing +- If there are NO missingContextParts, use ONLY an empty array: "missingContextParts": [] +- If there are NO unsupportedClaims, use ONLY an empty array: "unsupportedClaims": [] + + + +Explanation Logic: +* All explanations should be clear concise and simple. 2-3 sentences max. 8th-grade reading level. 
+* For groundedness, explain the key reasons for the score, focusing on unsupported claims or contradictions. +* For contextCoverage, list the critical missing parts if any. +* For documentUtility, explain why the documents are or are not useful for answering the question. +* Do NOT introduce new facts in explanation. +* Avoid the word 'helpful'. +* Don't use data science jargon. Use plain language. +* Make sure information isn't repeated. +* Present the issues in numbered items up to 3. +* Give a short title for each issue. +* Add a relevant emoji to the left of the title. +* First bullet should be focused on the main issue of the metric. +* Add extra bullets only if there are more relevant issues. +* Each bullet should be 1-2 sentences long. +* Each bullet should be in a separate line. + +user: + +Inputs: +## RetrievedDocuments +{{context}} + +## User Question +{{question}} + +## Assistant Response +{{response}} diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_response_quality.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_response_quality.prompty new file mode 100644 index 000000000000..44a05697387c --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_response_quality.prompty @@ -0,0 +1,246 @@ +--- +name: QualityGraderResponseQuality +description: Evaluates response quality in terms of relevance, abstention, and answer completeness +model: + api: chat + parameters: + temperature: 0.0 + max_tokens: 3000 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: json_object + +inputs: + question: + type: string + response: + type: string + +--- +system: +You are an expert evaluator of Chatbot agent. + Your task is to evaluate an assistant response using ONLY the the question and the agent response. 
+ Task: Evaluate the quality of response(s) of an AI assistant in terms of "relevance", "abstention", and "answerCompleteness". Provide explanations and scores for each metric. +In addition, the assistant should also classify the type of the question. +-- + +Definitions: +1. abstention: + Evaluates whether the assistant refuses or isn't able to provide help. Abstention is marked as "true" if the assistant explicitly declines without attempting to assist, "false" if it tries to provide meaningful help, and "null" if there is no question to respond to. + Many times the response may say it could not find the answer in the provided context/documents but provide partial information that is somehow relevant. In most cases like these (e.g., partial answers based on incomplete information), it should be marked as abstention = true. + +2. relevance: + Measures whether the response addresses the user's question meaningfully. A relevant response directly relates to the user's stated question or request without straying off-topic. + +3. queryType: + The type of the question. It can be one of the following: "Yes/No", "Single-Detail", "Multiple-Detail", "How-To", "Comparison", "Opinion/Advice", "Summary", "Explanation", "Analysis", "Clarifications", "Aggregation", "None". + +4. answerCompleteness (0/1/2/3): + Measures to what extent the response answers all aspects of the user's question, ignoring whether the information is grounded or factually correct. It focuses only on whether the response attempts to address each part of the query (i.e., whether the response relates to all query aspects). + - 0 = No coverage: The response does not answer the question at all or only responds with inability to help (e.g., "I searched but could not find the information", "I am sorry, I don't know") without providing any meaningful answer to any part of the question. 
+   - 1 = Low partial coverage: The response addresses only a small subset of the important aspects of the user's question, or in a very shallow way. Most key aspects remain unanswered.
+   - 2 = High partial coverage: The response addresses a majority of the important aspects of the question but still misses one or more clearly important aspects.
+   - 3 = Full coverage: The response addresses all important aspects of the user's question in a way that attempts to address each part. The answer can still be factually wrong or hallucinated, but as long as it attempts to answer all parts of the question, answerCompleteness = 3.
+   - If there is no meaningful question to answer (e.g., only a greeting like "hello" with no request and no response that tries to answer a question), answerCompleteness should be "null".
+
+   Notes:
+   - answerCompleteness does NOT measure correctness, groundedness, or faithfulness to documents.
+   - An answer like "I searched for the information but I could not find it" is NOT considered complete and should have answerCompleteness = 0 (even if it is relevant and abstention = true).
+   - A hallucinated answer that tries to answer all aspects of the question can still receive answerCompleteness = 3, as long as it covers all query aspects.
+
+5. conversationIncomplete (true/false):
+   This is effectively the noJudge property under a different name.
+   Determines whether the assistant response should be evaluated as a task outcome.
+   Set conversationIncomplete = "true" when the assistant has not yet attempted to answer the user's task, and judging success or failure would be premature.
+   Set conversationIncomplete = "false" when the assistant response can and should be judged for task quality.
+
+6. judgeConfidence (low | medium | high):
+   - low: when you are uncertain about your evaluations.
+   - medium: when you are somewhat certain but there is still some ambiguity.
+   - high: when you are very certain about your evaluations.
+ +-- +Instructions: + +For each assistant response in the chat history: + +- abstention: + - Label each response as "true", "false", or "null" based on whether the assistant unnecessarily refuses to provide help. + Additional abstention Guidelines: + - Missing Documents: If the response says it does not have any information for the given question or it could not find it in the retrieved documents, abstention = true. + - Clarification Requests: If the response is a clarification question relevant to the query, consider it abstention = "null". + - False Information: If the response provides information whether it is true or false, abstention = false. + + abstention examples: + 1) - Question: "What was the revenue last quarter?" + - Assistant Response: "I'm sorry, I can't help with that." + - abstention: true + 2) - Question: "What is the maximum value of a gift that the company may provide for an employee's marriage?" + - Assistant Response: "The documents provided do not include information about the maximum value of a gift that the company may provide for an employee's marriage, but the company provides many gifts to employees." + - abstention: true + 3) - Question: "How do I fix my windows issue?" + - Assistant Response: "I am sorry, I don't have information for this. You can reach out to customer support." + - abstention: true + 4) - Query: "Can I find pigs on the moon?" + - Assistant Response: "Based on the retrieved documents the answer must be no." + - abstention: false + +- relevance: + - Label each response as "true", "false", or "null" based on whether it addresses the user's question meaningfully. + Additional relevance Guidelines: + - False Information: If the response is considered as an abstention with no further information, relevance = "null". + - Clarification Requests: If the response is a clarification question relevant to the query, relevance = "true". 
+ - Ignore Context: relevance score is determined by question and response regardless of any external context or retrieved documents. + - General Question / Too Specific Answer: If the question is general and the answer is too specific without providing a general answer, relevance = "false". + + relevance examples: + 1) - Question: "How much does it cost?" + - Assistant Response: "The cost of the product is $100." + - relevance: true + 2) - Question: "How much does it cost?" + - Assistant Response: "The product is available in different colors." + - relevance: false + 3) - Question: "How much does it cost?" + - Assistant Response: "I am sorry, I can't help with that." + - relevance: null + 4) - Question: "How much does it cost?" + - Assistant Response: "In Sweden it costs 5$." + - relevance: false + 5) - Question: "How much does it cost?" + - Assistant Response: "The cost of the product is $100. In Sweden it costs 95$." + - relevance: true + +- answerCompleteness (0/1/2/3 or "null"): + - Label each response with one of: 0, 1, 2, 3, or "null". + Additional answerCompleteness Guidelines: + - Focus on whether all aspects of the user's question are answered, independent of correctness or grounding. + - If the question has multiple parts (e.g., "What is the price and what is the warranty period?"): + - If the response answers both parts in reasonable detail, answerCompleteness = 3. + - If the response answers both parts but one of them only very briefly or partially, answerCompleteness = 2. + - If the response answers only one of them, answerCompleteness = 1. + - If the response answers none and only states inability to help (e.g., "I searched but could not find the information"), answerCompleteness = 0. + - If the question is a single, simple request and the response attempts to answer it directly and reasonably, answerCompleteness = 3. + - If the response is only a clarification request and does not answer the question at all, answerCompleteness = 0. 
+ - If there is no real question to answer (e.g., only a greeting with no informational request), answerCompleteness = "null". + + answerCompleteness examples: + 1) - Question: "What is the price and what is the warranty period?" + - Assistant Response: "The price is $100, and the warranty period is 2 years." + - answerCompleteness: 3 + 2) - Question: "What is the price and what is the warranty period?" + - Assistant Response: "The price is $100. The warranty details are not clear, but it is usually around 1–2 years." + - answerCompleteness: 2 + 3) - Question: "What is the price and what is the warranty period?" + - Assistant Response: "The price is $100." + - answerCompleteness: 1 + 4) - Question: "What is the price and what is the warranty period?" + - Assistant Response: "I searched for the information but I could not find it." + - answerCompleteness: 0 + 5) - Question: "What are the symptoms of the flu?" + - Assistant Response: "The symptoms of the flu are fever, cough, and fatigue." + - answerCompleteness: 3 + 6) - Question: "hello" + - Assistant Response: "Hello! How can I help you today?" 
+ - answerCompleteness: "null" (no specific informational question was asked) + +- queryType: + Choose exactly one from the following categories: + "Yes/No" (Typically answered by yes or no) + "Single-Detail" (Requests one piece of factual info) + "Multiple-Detail" (Seeks multiple pieces of info or a list) + "How-To" (Asks for instructions or steps to do something) + "Comparison" (Asks to compare or contrast two or more items) + "Opinion/Advice" (Seeks subjective input or recommendation) + "Summary" (Requests a condensed explanation or overview) + "Explanation" (Asks for reasons, causes, or a "why") + "Analysis" (Requires a detailed examination or breakdown) + "Clarifications" (Seeks additional info or context) + "Aggregation" (Asks for a collection or summary of info) + "None" (If none of the above categories apply) + +Additional Guidelines for conversationIncomplete: + conversationIncomplete is about task evaluability, not conversation flow. + A conversation can still be ongoing and judgeable. + A turn can be early and not judgeable. + if conversationIncomplete is set to "true", relevance, abstention, and answerCompleteness must be set to "null". + +* When conversationIncomplete = "true" + There are only TWO allowed cases: + 1. No Real Task Exists (Greeting / Closing) + The user did not ask for information, action, or problem-solving. + Exception: + If the user message is a greeting or farewell but the assistant introduces a new topic, provides unrelated information, or asks an unrelated question, then the response is still judgeable. + In such cases set: + conversationIncomplete = "false" + Relevance = "false" + + Examples: + User: "hi" → Assistant: "Hello! How can I help?" + User: "bye" → Assistant: "Goodbye!" + Labeling: + conversationIncomplete = "true" + + 2. Valid Task Setup Step (Relevant Clarification) + The assistant asks for necessary information required to complete the user's request. 
+ The assistant is progressing correctly but has not yet produced an answer. + Examples: + User: "I want to open an account" + Assistant: "What is your ID number?" + + User: "Help with my order" + Assistant: "Can you share your order number?" + Labeling: + conversationIncomplete = "true" + The task has not been answered yet, but this may lead to correct behavior. + +* When conversationIncomplete = "false" + Everything else must be judged. + This includes: + A. The assistant attempted an answer + Even if wrong, incomplete, partial, or hallucinated. + B. The assistant refuses + Refusals are still judged (Abstention = true). + C. The assistant asks an irrelevant follow-up + This is incorrect behavior, not a setup step. + D. The user did not ask a question but the assistant response is way off-topic. + + Example: + User: "Can you help me to open an account?" + Assistant: "What is the weather at your place?" + Labeling: + conversationIncomplete = "false" + This is a failure, not a clarification. + + +All explanations should be clear, concise, and simple. 2-3 sentences max. 8th-grade reading level. 
+ +**Outputs:** +Provide the evaluation in JSON format: + +{ + "explanation": { + "relevance": "", + "abstention": "", + "queryType": "", + "answerCompleteness": "" + }, + "relevance": <true/false/null>, + "abstention": <true/false/null>, + "queryType": "", + "answerCompleteness": <0/1/2/3 or null>, + "conversationIncomplete": <true/false>, + "judgeConfidence": "" +} + +**IMPORTANT JSON RULES:** +- NEVER include comments (// or /* */) in the JSON output - this will break JSON parsing + +user: + +## User Question +{{question}} + +## Assistant Response +{{response}} diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py index 9890ce98756f..976c172ace43 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py @@ -104,6 +104,7 @@ class ErrorTarget(Enum): AOAI_GRADER = "AoaiGrader" CONVERSATION_HISTORY_PARSING = "_get_conversation_history" TOOL_OUTPUT_UTILIZATION_EVALUATOR = "ToolOutputUtilizationEvaluator" + QUALITY_GRADER_EVALUATOR = "QualityGraderEvaluator" class EvaluationException(AzureError): From 5a7fccc01949017620eea5175d4139641a3e1d41 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Wed, 8 Apr 2026 14:37:19 +0200 Subject: [PATCH 2/3] remove unused vars --- .../evaluation/_evaluators/_quality_grader/_quality_grader.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/_quality_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/_quality_grader.py index a7cf6ed46516..d89ef8ce7aa3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/_quality_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/_quality_grader.py @@ -32,8 +32,6 @@ def value(self) -> str: logger = logging.getLogger(__name__) # Thresholds 
for response quality checks (first prompt) -_RESPONSE_QUALITY_ABSTENTION_EXPECTED = False -_RESPONSE_QUALITY_RELEVANCE_EXPECTED = True _RESPONSE_QUALITY_ANSWER_COMPLETENESS_THRESHOLD = 1.5 # Thresholds for groundedness checks (second prompt) From 8af4d620a23bdeb4d3d57cb8a1dd8a1941ab0c56 Mon Sep 17 00:00:00 2001 From: ahibrahim Date: Wed, 8 Apr 2026 14:39:12 +0200 Subject: [PATCH 3/3] inc max tokens --- .../_quality_grader/quality_grader_groundedness.prompty | 2 +- .../_quality_grader/quality_grader_response_quality.prompty | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_groundedness.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_groundedness.prompty index 5d0458ed0d60..6d7212cb5bc9 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_groundedness.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_groundedness.prompty @@ -5,7 +5,7 @@ model: api: chat parameters: temperature: 0.0 - max_tokens: 3000 + max_tokens: 5000 top_p: 1.0 presence_penalty: 0 frequency_penalty: 0 diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_response_quality.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_response_quality.prompty index 44a05697387c..62dab318b9e4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_response_quality.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_quality_grader/quality_grader_response_quality.prompty @@ -5,7 +5,7 @@ model: api: chat parameters: temperature: 0.0 - max_tokens: 3000 + max_tokens: 5000 top_p: 1.0 presence_penalty: 0 frequency_penalty: 0