From 2cf862757f42cafa671f4dddedbf4c04f188edc0 Mon Sep 17 00:00:00 2001 From: KoushikReddy Date: Sat, 11 Apr 2026 13:02:16 -0700 Subject: [PATCH 1/6] fix(flows): prevent double-execution of FunctionTools returning None in thread pool When RunConfig.tool_thread_pool_config is enabled, _call_tool_in_thread_pool used None as a sentinel to distinguish "FunctionTool ran in thread pool" from "non-FunctionTool sync tool, needs async fallback". Because None is also a valid return value from any FunctionTool whose underlying function has no explicit return statement (implicit None), the sentinel check failed and execution fell through to tool.run_async(), invoking the function a second time silently. Replace the None sentinel with a dedicated _SYNC_TOOL_RESULT_UNSET object so that a legitimate None result from a FunctionTool is correctly returned on the first execution, without triggering the async fallback path. Fixes #5284 --- src/google/adk/flows/llm_flows/functions.py | 7 ++ .../llm_flows/test_functions_thread_pool.py | 64 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/src/google/adk/flows/llm_flows/functions.py b/src/google/adk/flows/llm_flows/functions.py index ea3b3e76c3..e47ca816e6 100644 --- a/src/google/adk/flows/llm_flows/functions.py +++ b/src/google/adk/flows/llm_flows/functions.py @@ -67,6 +67,13 @@ _TOOL_THREAD_POOLS: dict[int, ThreadPoolExecutor] = {} _TOOL_THREAD_POOL_LOCK = threading.Lock() +# Sentinel object used to distinguish a FunctionTool that legitimately returns +# None from a non-FunctionTool sync tool that skips thread-pool execution. +# Using None as a sentinel would cause tools whose underlying function has no +# explicit return statement (implicit None) to fall through to the async +# fallback path and execute a second time. +_SYNC_TOOL_RESULT_UNSET = object() + def _is_live_request_queue_annotation(param: inspect.Parameter) -> bool: """Check whether a parameter is annotated as LiveRequestQueue. diff --git a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py index 5ffd0f26d6..bf89ce43b6 100644 --- a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py +++ b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py @@ -278,6 +278,70 @@ def blocking_sleep() -> dict: event_loop_ticks >= 5 ), f'Event loop should have ticked at least 5 times, got {event_loop_ticks}' + @pytest.mark.asyncio + async def test_sync_tool_returning_none_executes_exactly_once(self): + """Test that a sync FunctionTool returning None is not double-executed. + + Regression test for https://github.com/google/adk-python/issues/5284. + A FunctionTool whose underlying function returns None (e.g. side-effect- + only tools with no explicit return statement) must execute exactly once. + Previously the None return was mistaken for the internal sentinel used to + signal 'non-FunctionTool, fall back to run_async', causing a second + invocation via tool.run_async(). + """ + call_count = 0 + + def side_effect_only() -> None: + nonlocal call_count + call_count += 1 + # No return statement — implicit None. + + tool = FunctionTool(side_effect_only) + model = testing_utils.MockModel.create(responses=[]) + agent = Agent(name='test_agent', model=model, tools=[tool]) + invocation_context = await testing_utils.create_invocation_context( + agent=agent, user_content='' + ) + tool_context = ToolContext( + invocation_context=invocation_context, + function_call_id='test_id', + ) + + result = await _call_tool_in_thread_pool(tool, {}, tool_context) + + assert result is None + assert call_count == 1, ( + f'Tool function executed {call_count} time(s); expected exactly 1.' + ) + + @pytest.mark.asyncio + async def test_sync_tool_returning_explicit_none_executes_exactly_once(self): + """Test that explicit None return from FunctionTool is not double-executed.""" + call_count = 0 + + def explicit_none_return() -> None: + nonlocal call_count + call_count += 1 + return None # Explicit None return. + + tool = FunctionTool(explicit_none_return) + model = testing_utils.MockModel.create(responses=[]) + agent = Agent(name='test_agent', model=model, tools=[tool]) + invocation_context = await testing_utils.create_invocation_context( + agent=agent, user_content='' + ) + tool_context = ToolContext( + invocation_context=invocation_context, + function_call_id='test_id', + ) + + result = await _call_tool_in_thread_pool(tool, {}, tool_context) + + assert result is None + assert call_count == 1, ( + f'Tool function executed {call_count} time(s); expected exactly 1.' + ) + @pytest.mark.asyncio async def test_sync_tool_exception_propagates(self): """Test that exceptions from sync tools propagate correctly.""" From 01f178ecdd8309c939b73585ffb0dc171b28f9f1 Mon Sep 17 00:00:00 2001 From: KoushikReddy Date: Sat, 11 Apr 2026 15:35:36 -0700 Subject: [PATCH 2/6] test(flows): consolidate None tests and add falsy-value parametrize cases Per reviewer feedback: collapse the two near-identical None tests into a single @pytest.mark.parametrize test, and add falsy-but-not-None cases (0, '', {}, False) to prove the sentinel is identity-based and does not mishandle any falsy return value from a FunctionTool. --- .../llm_flows/test_functions_thread_pool.py | 62 +++++++------------ 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py index bf89ce43b6..1dfc1673a6 100644 --- a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py +++ b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py @@ -279,52 +279,38 @@ def blocking_sleep() -> dict: ), f'Event loop should have ticked at least 5 times, got {event_loop_ticks}' @pytest.mark.asyncio - async def test_sync_tool_returning_none_executes_exactly_once(self): - """Test that a sync FunctionTool returning None is not double-executed. + @pytest.mark.parametrize( + 'return_value,use_implicit_return', + [ + (None, True), # implicit None (no return statement) + (None, False), # explicit `return None` + (0, False), # falsy int + ('', False), # falsy str + ({}, False), # falsy dict + (False, False), # falsy bool + ], + ) + async def test_sync_tool_falsy_return_executes_exactly_once( + self, return_value, use_implicit_return + ): + """FunctionTools returning None or other falsy values must execute exactly once. Regression test for https://github.com/google/adk-python/issues/5284. - A FunctionTool whose underlying function returns None (e.g. side-effect- - only tools with no explicit return statement) must execute exactly once. - Previously the None return was mistaken for the internal sentinel used to + Previously, a None return was mistaken for the internal sentinel used to signal 'non-FunctionTool, fall back to run_async', causing a second - invocation via tool.run_async(). + invocation. The fix uses an identity-based sentinel so that None and other + falsy values (0, '', {}, False) are treated as valid results. """ call_count = 0 - def side_effect_only() -> None: + def sync_func(): nonlocal call_count call_count += 1 - # No return statement — implicit None. + if not use_implicit_return: + return return_value + # implicit None — no return statement - tool = FunctionTool(side_effect_only) - model = testing_utils.MockModel.create(responses=[]) - agent = Agent(name='test_agent', model=model, tools=[tool]) - invocation_context = await testing_utils.create_invocation_context( - agent=agent, user_content='' - ) - tool_context = ToolContext( - invocation_context=invocation_context, - function_call_id='test_id', - ) - - result = await _call_tool_in_thread_pool(tool, {}, tool_context) - - assert result is None - assert call_count == 1, ( - f'Tool function executed {call_count} time(s); expected exactly 1.' - ) - - @pytest.mark.asyncio - async def test_sync_tool_returning_explicit_none_executes_exactly_once(self): - """Test that explicit None return from FunctionTool is not double-executed.""" - call_count = 0 - - def explicit_none_return() -> None: - nonlocal call_count - call_count += 1 - return None # Explicit None return. - - tool = FunctionTool(explicit_none_return) + tool = FunctionTool(sync_func) model = testing_utils.MockModel.create(responses=[]) agent = Agent(name='test_agent', model=model, tools=[tool]) invocation_context = await testing_utils.create_invocation_context( @@ -337,7 +323,7 @@ def explicit_none_return() -> None: result = await _call_tool_in_thread_pool(tool, {}, tool_context) - assert result is None + assert result == return_value assert call_count == 1, ( f'Tool function executed {call_count} time(s); expected exactly 1.' ) From beec80d28f3bc94c56615aad91b7320f7f830e8f Mon Sep 17 00:00:00 2001 From: KoushikReddy Date: Mon, 13 Apr 2026 14:18:50 -0700 Subject: [PATCH 3/6] style: apply pyink formatting to thread pool test file --- .../llm_flows/test_functions_thread_pool.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py index 1dfc1673a6..b09fa799b7 100644 --- a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py +++ b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py @@ -282,12 +282,12 @@ def blocking_sleep() -> dict: @pytest.mark.parametrize( 'return_value,use_implicit_return', [ - (None, True), # implicit None (no return statement) + (None, True), # implicit None (no return statement) (None, False), # explicit `return None` - (0, False), # falsy int - ('', False), # falsy str - ({}, False), # falsy dict - (False, False), # falsy bool + (0, False), # falsy int + ('', False), # falsy str + ({}, False), # falsy dict + (False, False), # falsy bool ], ) async def test_sync_tool_falsy_return_executes_exactly_once( @@ -324,9 +324,9 @@ def sync_func(): result = await _call_tool_in_thread_pool(tool, {}, tool_context) assert result == return_value - assert call_count == 1, ( - f'Tool function executed {call_count} time(s); expected exactly 1.' - ) + assert ( + call_count == 1 + ), f'Tool function executed {call_count} time(s); expected exactly 1.' @pytest.mark.asyncio async def test_sync_tool_exception_propagates(self): From 2e3072b8457282b06f760e4ebd79db7d4665ec2c Mon Sep 17 00:00:00 2001 From: KoushikReddy Date: Mon, 20 Apr 2026 12:25:38 -0700 Subject: [PATCH 4/6] fix(eval): include function-call events in invocation_events when skip_summarization is set EvaluationGenerator.convert_events_to_eval_invocations builds invocation_events by excluding the final_event from intermediate steps. However, is_final_response() returns True for any event with skip_summarization=True, even when that event contains function calls (e.g. tools using skip_summarization to bypass LLM summarization). Such events were incorrectly excluded from invocation_events, causing get_all_tool_calls() to return an empty list and tool_trajectory_avg_score to always be 0.0 despite matching tool calls. Fix: keep an event in invocation_events even if it is the final_event when it contains function calls. Fixes #5410 --- .../adk/evaluation/evaluation_generator.py | 2 +- .../evaluation/test_evaluation_generator.py | 47 ++++++++++++++ .../evaluation/test_trajectory_evaluator.py | 61 +++++++++++++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index f8fb6795aa..4e0159c355 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -315,7 +315,7 @@ def convert_events_to_eval_invocations( invocation_events = [ InvocationEvent(author=e.author, content=e.content) for e in events_to_add - if e is not final_event + if e is not final_event or e.get_function_calls() ] invocations.append( Invocation( diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py index a4aa8691fd..122d3b7c97 100644 --- a/tests/unittests/evaluation/test_evaluation_generator.py +++ b/tests/unittests/evaluation/test_evaluation_generator.py @@ -16,12 +16,14 @@ from google.adk.evaluation.app_details import AgentDetails from google.adk.evaluation.app_details import AppDetails +from google.adk.evaluation.eval_case import get_all_tool_calls from google.adk.evaluation.evaluation_generator import EvaluationGenerator from google.adk.evaluation.request_intercepter_plugin import _RequestIntercepterPlugin from google.adk.evaluation.simulation.user_simulator import NextUserMessage from google.adk.evaluation.simulation.user_simulator import Status as UserSimulatorStatus from google.adk.evaluation.simulation.user_simulator import UserSimulator from google.adk.events.event import Event +from google.adk.events.event_actions import EventActions from google.adk.models.llm_request import LlmRequest from google.genai import types import pytest @@ -479,3 +481,48 @@ async def mock_generate_inferences_side_effect( mock_generate_inferences.assert_called_once() called_with_content = mock_generate_inferences.call_args.args[3] assert called_with_content.parts[0].text == "message 1" + + +def test_convert_events_preserves_tool_calls_when_skip_summarization(): + """Regression test for #5410. + + When an event has skip_summarization=True, is_final_response() returns True + even if the event contains function calls. Previously such an event was + treated as final_event and excluded from invocation_events, causing + get_all_tool_calls() to return an empty list and tool_trajectory_avg_score + to always be 0.0 despite matching tool calls. + """ + events = [ + Event( + invocation_id="inv1", + author="user", + content=types.Content( + parts=[types.Part(text="run a query")], role="user" + ), + timestamp=1000.0, + ), + Event( + invocation_id="inv1", + author="agent", + content=types.Content( + parts=[ + types.Part( + function_call=types.FunctionCall( + id="call_01", + name="execute_sql", + args={"project_id": "my-proj", "query": "SELECT 1"}, + ) + ) + ] + ), + actions=EventActions(skip_summarization=True), + ), + ] + + invocations = EvaluationGenerator.convert_events_to_eval_invocations(events) + assert len(invocations) == 1 + + tool_calls = get_all_tool_calls(invocations[0].intermediate_data) + assert len(tool_calls) == 1 + assert tool_calls[0].name == "execute_sql" + assert tool_calls[0].args == {"project_id": "my-proj", "query": "SELECT 1"} diff --git a/tests/unittests/evaluation/test_trajectory_evaluator.py b/tests/unittests/evaluation/test_trajectory_evaluator.py index 0fa3fa5a73..8a3dae02a7 100644 --- a/tests/unittests/evaluation/test_trajectory_evaluator.py +++ b/tests/unittests/evaluation/test_trajectory_evaluator.py @@ -16,6 +16,8 @@ from google.adk.evaluation.eval_case import IntermediateData from google.adk.evaluation.eval_case import Invocation +from google.adk.evaluation.eval_case import InvocationEvent +from google.adk.evaluation.eval_case import InvocationEvents from google.adk.evaluation.eval_metrics import EvalMetric from google.adk.evaluation.eval_metrics import PrebuiltMetrics from google.adk.evaluation.eval_metrics import ToolTrajectoryCriterion @@ -462,3 +464,62 @@ def test_evaluate_invocations_no_invocations(evaluator: TrajectoryEvaluator): assert result.overall_score is None assert result.overall_eval_status == EvalStatus.NOT_EVALUATED assert not result.per_invocation_results + + +def _make_invocation_events( + *tool_calls: genai_types.FunctionCall, +) -> Invocation: + """Returns an Invocation using InvocationEvents intermediate_data format.""" + return Invocation( + user_content=_USER_CONTENT, + intermediate_data=InvocationEvents( + invocation_events=[ + InvocationEvent( + author="agent", + content=genai_types.Content( + parts=[genai_types.Part(function_call=tc)] + ), + ) + for tc in tool_calls + ] + ), + ) + + +def test_evaluate_invocations_invocation_events_format_exact_match( + evaluator: TrajectoryEvaluator, +): + """InvocationEvents intermediate_data format should score 1.0 on exact match. + + Regression test for #5410: tool_trajectory_avg_score returned 0.0 even when + tool name and args were identical because function-call events with + skip_summarization=True were incorrectly excluded from invocation_events. + """ + tool_call = genai_types.FunctionCall( + id="toolu_01", name="execute_sql", args={"query": "SELECT 1"} + ) + expected_tool_call = genai_types.FunctionCall( + name="execute_sql", args={"query": "SELECT 1"} + ) + actual = _make_invocation_events(tool_call) + expected = _make_invocation_events(expected_tool_call) + + result = evaluator.evaluate_invocations([actual], [expected]) + assert result.overall_score == 1.0 + assert result.overall_eval_status == EvalStatus.PASSED + + +def test_evaluate_invocations_invocation_events_format_mismatch( + evaluator: TrajectoryEvaluator, +): + """InvocationEvents format should score 0.0 when tool calls differ.""" + actual = _make_invocation_events( + genai_types.FunctionCall(name="tool_a", args={"x": "1"}) + ) + expected = _make_invocation_events( + genai_types.FunctionCall(name="tool_b", args={"x": "1"}) + ) + + result = evaluator.evaluate_invocations([actual], [expected]) + assert result.overall_score == 0.0 + assert result.overall_eval_status == EvalStatus.FAILED From f84d75bf01d35754bdab71531a29282124feece9 Mon Sep 17 00:00:00 2001 From: KoushikReddy Date: Mon, 20 Apr 2026 15:37:07 -0700 Subject: [PATCH 5/6] fix(eval): add type annotation and guard to resolve mypy union-attr error --- src/google/adk/evaluation/evaluation_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 4e0159c355..734a17f4b2 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -280,7 +280,7 @@ def convert_events_to_eval_invocations( invocations = [] for invocation_id, events in events_by_invocation_id.items(): final_response = None - final_event = None + final_event: Optional[Event] = None user_content = Content(parts=[]) invocation_timestamp = 0 app_details = None @@ -315,7 +315,7 @@ def convert_events_to_eval_invocations( invocation_events = [ InvocationEvent(author=e.author, content=e.content) for e in events_to_add - if e is not final_event or e.get_function_calls() + if final_event is None or e is not final_event or e.get_function_calls() ] invocations.append( Invocation( From 11ba2e1996125b4b1ce3bafab63e8ee0236b9110 Mon Sep 17 00:00:00 2001 From: KoushikReddy Date: Mon, 20 Apr 2026 15:45:53 -0700 Subject: [PATCH 6/6] style: apply pyink formatting to evaluation_generator.py --- src/google/adk/evaluation/evaluation_generator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 734a17f4b2..fafd1d28c6 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -315,7 +315,9 @@ def convert_events_to_eval_invocations( invocation_events = [ InvocationEvent(author=e.author, content=e.content) for e in events_to_add - if final_event is None or e is not final_event or e.get_function_calls() + if final_event is None + or e is not final_event + or e.get_function_calls() ] invocations.append( Invocation(