From 2cf862757f42cafa671f4dddedbf4c04f188edc0 Mon Sep 17 00:00:00 2001
From: KoushikReddy <koushik.7rdd@gmail.com>
Date: Sat, 11 Apr 2026 13:02:16 -0700
Subject: [PATCH 1/6] fix(flows): prevent double-execution of FunctionTools
 returning None in thread pool

When RunConfig.tool_thread_pool_config is enabled, _call_tool_in_thread_pool
used None as a sentinel to distinguish "FunctionTool ran in thread pool" from
"non-FunctionTool sync tool, needs async fallback". Because None is also a
valid return value from any FunctionTool whose underlying function has no
explicit return statement (implicit None), the sentinel check failed and
execution fell through to tool.run_async(), invoking the function a second
time silently.

Replace the None sentinel with a dedicated _SYNC_TOOL_RESULT_UNSET object so
that a legitimate None result from a FunctionTool is correctly returned on the
first execution, without triggering the async fallback path.

Fixes #5284
---
 src/google/adk/flows/llm_flows/functions.py   |  7 ++
 .../llm_flows/test_functions_thread_pool.py   | 64 +++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/src/google/adk/flows/llm_flows/functions.py b/src/google/adk/flows/llm_flows/functions.py
index ea3b3e76c3..e47ca816e6 100644
--- a/src/google/adk/flows/llm_flows/functions.py
+++ b/src/google/adk/flows/llm_flows/functions.py
@@ -67,6 +67,13 @@
 _TOOL_THREAD_POOLS: dict[int, ThreadPoolExecutor] = {}
 _TOOL_THREAD_POOL_LOCK = threading.Lock()
 
+# Sentinel object used to distinguish a FunctionTool that legitimately returns
+# None from a non-FunctionTool sync tool that skips thread-pool execution.
+# Using None as a sentinel would cause tools whose underlying function has no
+# explicit return statement (implicit None) to fall through to the async
+# fallback path and execute a second time.
+_SYNC_TOOL_RESULT_UNSET = object()
+
 
 def _is_live_request_queue_annotation(param: inspect.Parameter) -> bool:
   """Check whether a parameter is annotated as LiveRequestQueue.
diff --git a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py
index 5ffd0f26d6..bf89ce43b6 100644
--- a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py
+++ b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py
@@ -278,6 +278,70 @@ def blocking_sleep() -> dict:
         event_loop_ticks >= 5
     ), f'Event loop should have ticked at least 5 times, got {event_loop_ticks}'
 
+  @pytest.mark.asyncio
+  async def test_sync_tool_returning_none_executes_exactly_once(self):
+    """Test that a sync FunctionTool returning None is not double-executed.
+
+    Regression test for https://github.com/google/adk-python/issues/5284.
+    A FunctionTool whose underlying function returns None (e.g. side-effect-
+    only tools with no explicit return statement) must execute exactly once.
+    Previously the None return was mistaken for the internal sentinel used to
+    signal 'non-FunctionTool, fall back to run_async', causing a second
+    invocation via tool.run_async().
+    """
+    call_count = 0
+
+    def side_effect_only() -> None:
+      nonlocal call_count
+      call_count += 1
+      # No return statement — implicit None.
+
+    tool = FunctionTool(side_effect_only)
+    model = testing_utils.MockModel.create(responses=[])
+    agent = Agent(name='test_agent', model=model, tools=[tool])
+    invocation_context = await testing_utils.create_invocation_context(
+        agent=agent, user_content=''
+    )
+    tool_context = ToolContext(
+        invocation_context=invocation_context,
+        function_call_id='test_id',
+    )
+
+    result = await _call_tool_in_thread_pool(tool, {}, tool_context)
+
+    assert result is None
+    assert call_count == 1, (
+        f'Tool function executed {call_count} time(s); expected exactly 1.'
+    )
+
+  @pytest.mark.asyncio
+  async def test_sync_tool_returning_explicit_none_executes_exactly_once(self):
+    """Test that explicit None return from FunctionTool is not double-executed."""
+    call_count = 0
+
+    def explicit_none_return() -> None:
+      nonlocal call_count
+      call_count += 1
+      return None  # Explicit None return.
+
+    tool = FunctionTool(explicit_none_return)
+    model = testing_utils.MockModel.create(responses=[])
+    agent = Agent(name='test_agent', model=model, tools=[tool])
+    invocation_context = await testing_utils.create_invocation_context(
+        agent=agent, user_content=''
+    )
+    tool_context = ToolContext(
+        invocation_context=invocation_context,
+        function_call_id='test_id',
+    )
+
+    result = await _call_tool_in_thread_pool(tool, {}, tool_context)
+
+    assert result is None
+    assert call_count == 1, (
+        f'Tool function executed {call_count} time(s); expected exactly 1.'
+    )
+
   @pytest.mark.asyncio
   async def test_sync_tool_exception_propagates(self):
     """Test that exceptions from sync tools propagate correctly."""

From 01f178ecdd8309c939b73585ffb0dc171b28f9f1 Mon Sep 17 00:00:00 2001
From: KoushikReddy <koushik.7rdd@gmail.com>
Date: Sat, 11 Apr 2026 15:35:36 -0700
Subject: [PATCH 2/6] test(flows): consolidate None tests and add falsy-value
 parametrize cases

Per reviewer feedback: collapse the two near-identical None tests into a
single @pytest.mark.parametrize test, and add falsy-but-not-None cases
(0, '', {}, False) to prove the sentinel is identity-based and does not
mishandle any falsy return value from a FunctionTool.
---
 .../llm_flows/test_functions_thread_pool.py   | 62 +++++++------------
 1 file changed, 24 insertions(+), 38 deletions(-)

diff --git a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py
index bf89ce43b6..1dfc1673a6 100644
--- a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py
+++ b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py
@@ -279,52 +279,38 @@ def blocking_sleep() -> dict:
     ), f'Event loop should have ticked at least 5 times, got {event_loop_ticks}'
 
   @pytest.mark.asyncio
-  async def test_sync_tool_returning_none_executes_exactly_once(self):
-    """Test that a sync FunctionTool returning None is not double-executed.
+  @pytest.mark.parametrize(
+      'return_value,use_implicit_return',
+      [
+          (None, True),   # implicit None (no return statement)
+          (None, False),  # explicit `return None`
+          (0, False),     # falsy int
+          ('', False),    # falsy str
+          ({}, False),    # falsy dict
+          (False, False), # falsy bool
+      ],
+  )
+  async def test_sync_tool_falsy_return_executes_exactly_once(
+      self, return_value, use_implicit_return
+  ):
+    """FunctionTools returning None or other falsy values must execute exactly once.
 
     Regression test for https://github.com/google/adk-python/issues/5284.
-    A FunctionTool whose underlying function returns None (e.g. side-effect-
-    only tools with no explicit return statement) must execute exactly once.
-    Previously the None return was mistaken for the internal sentinel used to
+    Previously, a None return was mistaken for the internal sentinel used to
     signal 'non-FunctionTool, fall back to run_async', causing a second
-    invocation via tool.run_async().
+    invocation. The fix uses an identity-based sentinel so that None and other
+    falsy values (0, '', {}, False) are treated as valid results.
     """
     call_count = 0
 
-    def side_effect_only() -> None:
+    def sync_func():
       nonlocal call_count
       call_count += 1
-      # No return statement — implicit None.
+      if not use_implicit_return:
+        return return_value
+      # implicit None — no return statement
 
-    tool = FunctionTool(side_effect_only)
-    model = testing_utils.MockModel.create(responses=[])
-    agent = Agent(name='test_agent', model=model, tools=[tool])
-    invocation_context = await testing_utils.create_invocation_context(
-        agent=agent, user_content=''
-    )
-    tool_context = ToolContext(
-        invocation_context=invocation_context,
-        function_call_id='test_id',
-    )
-
-    result = await _call_tool_in_thread_pool(tool, {}, tool_context)
-
-    assert result is None
-    assert call_count == 1, (
-        f'Tool function executed {call_count} time(s); expected exactly 1.'
-    )
-
-  @pytest.mark.asyncio
-  async def test_sync_tool_returning_explicit_none_executes_exactly_once(self):
-    """Test that explicit None return from FunctionTool is not double-executed."""
-    call_count = 0
-
-    def explicit_none_return() -> None:
-      nonlocal call_count
-      call_count += 1
-      return None  # Explicit None return.
-
-    tool = FunctionTool(explicit_none_return)
+    tool = FunctionTool(sync_func)
     model = testing_utils.MockModel.create(responses=[])
     agent = Agent(name='test_agent', model=model, tools=[tool])
     invocation_context = await testing_utils.create_invocation_context(
@@ -337,7 +323,7 @@ def explicit_none_return() -> None:
 
     result = await _call_tool_in_thread_pool(tool, {}, tool_context)
 
-    assert result is None
+    assert result == return_value
     assert call_count == 1, (
         f'Tool function executed {call_count} time(s); expected exactly 1.'
     )

From beec80d28f3bc94c56615aad91b7320f7f830e8f Mon Sep 17 00:00:00 2001
From: KoushikReddy <koushik.7rdd@gmail.com>
Date: Mon, 13 Apr 2026 14:18:50 -0700
Subject: [PATCH 3/6] style: apply pyink formatting to thread pool test file

---
 .../llm_flows/test_functions_thread_pool.py      | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py
index 1dfc1673a6..b09fa799b7 100644
--- a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py
+++ b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py
@@ -282,12 +282,12 @@ def blocking_sleep() -> dict:
   @pytest.mark.parametrize(
       'return_value,use_implicit_return',
       [
-          (None, True),   # implicit None (no return statement)
+          (None, True),  # implicit None (no return statement)
           (None, False),  # explicit `return None`
-          (0, False),     # falsy int
-          ('', False),    # falsy str
-          ({}, False),    # falsy dict
-          (False, False), # falsy bool
+          (0, False),  # falsy int
+          ('', False),  # falsy str
+          ({}, False),  # falsy dict
+          (False, False),  # falsy bool
       ],
   )
   async def test_sync_tool_falsy_return_executes_exactly_once(
@@ -324,9 +324,9 @@ def sync_func():
     result = await _call_tool_in_thread_pool(tool, {}, tool_context)
 
     assert result == return_value
-    assert call_count == 1, (
-        f'Tool function executed {call_count} time(s); expected exactly 1.'
-    )
+    assert (
+        call_count == 1
+    ), f'Tool function executed {call_count} time(s); expected exactly 1.'
 
   @pytest.mark.asyncio
   async def test_sync_tool_exception_propagates(self):

From 2e3072b8457282b06f760e4ebd79db7d4665ec2c Mon Sep 17 00:00:00 2001
From: KoushikReddy <koushik.7rdd@gmail.com>
Date: Mon, 20 Apr 2026 12:25:38 -0700
Subject: [PATCH 4/6] fix(eval): include function-call events in
 invocation_events when skip_summarization is set

EvaluationGenerator.convert_events_to_eval_invocations builds invocation_events
by excluding the final_event from intermediate steps. However, is_final_response()
returns True for any event with skip_summarization=True, even when that event
contains function calls (e.g. tools using skip_summarization to bypass LLM
summarization). Such events were incorrectly excluded from invocation_events,
causing get_all_tool_calls() to return an empty list and
tool_trajectory_avg_score to always be 0.0 despite matching tool calls.

Fix: keep an event in invocation_events even if it is the final_event when
it contains function calls.

Fixes #5410
---
 .../adk/evaluation/evaluation_generator.py    |  2 +-
 .../evaluation/test_evaluation_generator.py   | 47 ++++++++++++++
 .../evaluation/test_trajectory_evaluator.py   | 61 +++++++++++++++++++
 3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
index f8fb6795aa..4e0159c355 100644
--- a/src/google/adk/evaluation/evaluation_generator.py
+++ b/src/google/adk/evaluation/evaluation_generator.py
@@ -315,7 +315,7 @@ def convert_events_to_eval_invocations(
       invocation_events = [
           InvocationEvent(author=e.author, content=e.content)
           for e in events_to_add
-          if e is not final_event
+          if e is not final_event or e.get_function_calls()
       ]
       invocations.append(
           Invocation(
diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py
index a4aa8691fd..122d3b7c97 100644
--- a/tests/unittests/evaluation/test_evaluation_generator.py
+++ b/tests/unittests/evaluation/test_evaluation_generator.py
@@ -16,12 +16,14 @@
 
 from google.adk.evaluation.app_details import AgentDetails
 from google.adk.evaluation.app_details import AppDetails
+from google.adk.evaluation.eval_case import get_all_tool_calls
 from google.adk.evaluation.evaluation_generator import EvaluationGenerator
 from google.adk.evaluation.request_intercepter_plugin import _RequestIntercepterPlugin
 from google.adk.evaluation.simulation.user_simulator import NextUserMessage
 from google.adk.evaluation.simulation.user_simulator import Status as UserSimulatorStatus
 from google.adk.evaluation.simulation.user_simulator import UserSimulator
 from google.adk.events.event import Event
+from google.adk.events.event_actions import EventActions
 from google.adk.models.llm_request import LlmRequest
 from google.genai import types
 import pytest
@@ -479,3 +481,48 @@ async def mock_generate_inferences_side_effect(
     mock_generate_inferences.assert_called_once()
     called_with_content = mock_generate_inferences.call_args.args[3]
     assert called_with_content.parts[0].text == "message 1"
+
+
+def test_convert_events_preserves_tool_calls_when_skip_summarization():
+  """Regression test for #5410.
+
+  When an event has skip_summarization=True, is_final_response() returns True
+  even if the event contains function calls.  Previously such an event was
+  treated as final_event and excluded from invocation_events, causing
+  get_all_tool_calls() to return an empty list and tool_trajectory_avg_score
+  to always be 0.0 despite matching tool calls.
+  """
+  events = [
+      Event(
+          invocation_id="inv1",
+          author="user",
+          content=types.Content(
+              parts=[types.Part(text="run a query")], role="user"
+          ),
+          timestamp=1000.0,
+      ),
+      Event(
+          invocation_id="inv1",
+          author="agent",
+          content=types.Content(
+              parts=[
+                  types.Part(
+                      function_call=types.FunctionCall(
+                          id="call_01",
+                          name="execute_sql",
+                          args={"project_id": "my-proj", "query": "SELECT 1"},
+                      )
+                  )
+              ]
+          ),
+          actions=EventActions(skip_summarization=True),
+      ),
+  ]
+
+  invocations = EvaluationGenerator.convert_events_to_eval_invocations(events)
+  assert len(invocations) == 1
+
+  tool_calls = get_all_tool_calls(invocations[0].intermediate_data)
+  assert len(tool_calls) == 1
+  assert tool_calls[0].name == "execute_sql"
+  assert tool_calls[0].args == {"project_id": "my-proj", "query": "SELECT 1"}
diff --git a/tests/unittests/evaluation/test_trajectory_evaluator.py b/tests/unittests/evaluation/test_trajectory_evaluator.py
index 0fa3fa5a73..8a3dae02a7 100644
--- a/tests/unittests/evaluation/test_trajectory_evaluator.py
+++ b/tests/unittests/evaluation/test_trajectory_evaluator.py
@@ -16,6 +16,8 @@
 
 from google.adk.evaluation.eval_case import IntermediateData
 from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_case import InvocationEvent
+from google.adk.evaluation.eval_case import InvocationEvents
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import PrebuiltMetrics
 from google.adk.evaluation.eval_metrics import ToolTrajectoryCriterion
@@ -462,3 +464,62 @@ def test_evaluate_invocations_no_invocations(evaluator: TrajectoryEvaluator):
   assert result.overall_score is None
   assert result.overall_eval_status == EvalStatus.NOT_EVALUATED
   assert not result.per_invocation_results
+
+
+def _make_invocation_events(
+    *tool_calls: genai_types.FunctionCall,
+) -> Invocation:
+  """Returns an Invocation using InvocationEvents intermediate_data format."""
+  return Invocation(
+      user_content=_USER_CONTENT,
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="agent",
+                  content=genai_types.Content(
+                      parts=[genai_types.Part(function_call=tc)]
+                  ),
+              )
+              for tc in tool_calls
+          ]
+      ),
+  )
+
+
+def test_evaluate_invocations_invocation_events_format_exact_match(
+    evaluator: TrajectoryEvaluator,
+):
+  """InvocationEvents intermediate_data format should score 1.0 on exact match.
+
+  Regression test for #5410: tool_trajectory_avg_score returned 0.0 even when
+  tool name and args were identical because function-call events with
+  skip_summarization=True were incorrectly excluded from invocation_events.
+  """
+  tool_call = genai_types.FunctionCall(
+      id="toolu_01", name="execute_sql", args={"query": "SELECT 1"}
+  )
+  expected_tool_call = genai_types.FunctionCall(
+      name="execute_sql", args={"query": "SELECT 1"}
+  )
+  actual = _make_invocation_events(tool_call)
+  expected = _make_invocation_events(expected_tool_call)
+
+  result = evaluator.evaluate_invocations([actual], [expected])
+  assert result.overall_score == 1.0
+  assert result.overall_eval_status == EvalStatus.PASSED
+
+
+def test_evaluate_invocations_invocation_events_format_mismatch(
+    evaluator: TrajectoryEvaluator,
+):
+  """InvocationEvents format should score 0.0 when tool calls differ."""
+  actual = _make_invocation_events(
+      genai_types.FunctionCall(name="tool_a", args={"x": "1"})
+  )
+  expected = _make_invocation_events(
+      genai_types.FunctionCall(name="tool_b", args={"x": "1"})
+  )
+
+  result = evaluator.evaluate_invocations([actual], [expected])
+  assert result.overall_score == 0.0
+  assert result.overall_eval_status == EvalStatus.FAILED

From f84d75bf01d35754bdab71531a29282124feece9 Mon Sep 17 00:00:00 2001
From: KoushikReddy <koushik.7rdd@gmail.com>
Date: Mon, 20 Apr 2026 15:37:07 -0700
Subject: [PATCH 5/6] fix(eval): add type annotation and guard to resolve mypy
 union-attr error

---
 src/google/adk/evaluation/evaluation_generator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
index 4e0159c355..734a17f4b2 100644
--- a/src/google/adk/evaluation/evaluation_generator.py
+++ b/src/google/adk/evaluation/evaluation_generator.py
@@ -280,7 +280,7 @@ def convert_events_to_eval_invocations(
     invocations = []
     for invocation_id, events in events_by_invocation_id.items():
       final_response = None
-      final_event = None
+      final_event: Optional[Event] = None
       user_content = Content(parts=[])
       invocation_timestamp = 0
       app_details = None
@@ -315,7 +315,7 @@ def convert_events_to_eval_invocations(
       invocation_events = [
           InvocationEvent(author=e.author, content=e.content)
           for e in events_to_add
-          if e is not final_event or e.get_function_calls()
+          if final_event is None or e is not final_event or e.get_function_calls()
       ]
       invocations.append(
           Invocation(

From 11ba2e1996125b4b1ce3bafab63e8ee0236b9110 Mon Sep 17 00:00:00 2001
From: KoushikReddy <koushik.7rdd@gmail.com>
Date: Mon, 20 Apr 2026 15:45:53 -0700
Subject: [PATCH 6/6] style: apply pyink formatting to evaluation_generator.py

---
 src/google/adk/evaluation/evaluation_generator.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
index 734a17f4b2..fafd1d28c6 100644
--- a/src/google/adk/evaluation/evaluation_generator.py
+++ b/src/google/adk/evaluation/evaluation_generator.py
@@ -315,7 +315,9 @@ def convert_events_to_eval_invocations(
       invocation_events = [
           InvocationEvent(author=e.author, content=e.content)
           for e in events_to_add
-          if final_event is None or e is not final_event or e.get_function_calls()
+          if final_event is None
+          or e is not final_event
+          or e.get_function_calls()
       ]
       invocations.append(
           Invocation(