From a9394cc4d168fe4ff37e360e1432000971e47e69 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 7 May 2026 06:01:16 +0000 Subject: [PATCH 1/8] ci: implement multi-model evaluation support by replacing static run configs with dynamic discovery and adding Claude/Gemini-specific configurations. --- cloudbuild.yaml | 25 +++++++---- evals/claude_code_model.yaml | 18 ++++++++ evals/claude_run_config.yaml | 41 +++++++++++++++++++ evals/gemini_cli_model.yaml | 33 +++++++++++++++ ...run_config.yaml => gemini_run_config.yaml} | 3 +- evals/substitute_env.py | 27 +++++++----- 6 files changed, 126 insertions(+), 21 deletions(-) create mode 100644 evals/claude_code_model.yaml create mode 100644 evals/claude_run_config.yaml create mode 100644 evals/gemini_cli_model.yaml rename evals/{run_config.yaml => gemini_run_config.yaml} (96%) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 921717b..0036625 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -45,8 +45,8 @@ steps: PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title') # Check if execution labels are present using exact matching via jq - if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then - echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution." + if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals" or .name == "ci:run-evals-gemini" or .name == "ci:run-evals-claude")' pr_data.json > /dev/null; then + echo "PR does not have required labels. Skipping execution." exit 0 fi echo "Execution label detected. Processing release version context..." @@ -72,7 +72,6 @@ steps: export GOOGLE_CLOUD_PROJECT=$PROJECT_ID export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT - # Set environment variables for extension export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE @@ -84,18 +83,28 @@ steps: # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD - # Combine CI metadata with run config - cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml + # Combine CI metadata with all available run configs + for config in /workspace/evals/*run_config.yaml; do + if [ -f "$config" ]; then + echo "Appending CI metadata to $config" + cat /workspace/evals/ci_metadata.yaml >> "$config" + fi + done - # Substitute environment variables in model_config.yaml + # Substitute environment variables in all configs python3 /workspace/evals/substitute_env.py cd /evalbench export PYTHONPATH=./evalbench:./evalbench/evalproto export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - echo "Launching Standalone Evaluation..." - python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml + # Run evaluations for all available run configs + for config in /workspace/evals/*run_config.yaml; do + if [ -f "$config" ]; then + echo "Launching Evaluation for config: $config" + python3 evalbench/evalbench.py --experiment_config="$config" + fi + done availableSecrets: diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml new file mode 100644 index 0000000..3009a2e --- /dev/null +++ b/evals/claude_code_model.yaml @@ -0,0 +1,18 @@ +claude_code_version: "@anthropic-ai/claude-code@2.1.85" +generator: claude_code +model: "claude-opus-4-6" # Use "claude-opus-4-20250514" for direct API + +use_vertex: true +vertex_project_id: "${GOOGLE_CLOUD_PROJECT}" +vertex_region: "us-central1" + +env: + GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" + +setup: + mcp_servers: + "cloud-sql": + httpUrl: "https://sqladmin.googleapis.com/mcp" + authProviderType: google_credentials + headers: + X-Goog-User-Project: "${GOOGLE_CLOUD_PROJECT}" diff --git a/evals/claude_run_config.yaml b/evals/claude_run_config.yaml new file mode 100644 index 0000000..7bc10f4 --- /dev/null +++ b/evals/claude_run_config.yaml @@ -0,0 +1,41 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dataset_config: /workspace/evals/dataset.json +dataset_format: gemini-cli-format + +orchestrator: agent +model_config: /workspace/evals/claude_code_model.yaml +simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml + +scorers: + # Qualitative (Judge-based) + goal_completion: + model_config: /workspace/evals/gemini_2.5_pro_model.yaml + behavioral_metrics: + model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_best_practices: + model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_dir: /workspace/cloud-sql-postgresql/skills + + # Performance + turn_count: {} + end_to_end_latency: {} + tool_call_latency: {} + token_consumption: {} + skills_trajectory: {} + +reporting: + bigquery: + gcp_project_id: "${EVAL_REPORTING_PROJECT}" diff --git a/evals/gemini_cli_model.yaml b/evals/gemini_cli_model.yaml new file mode 100644 index 0000000..2973cb4 --- /dev/null +++ b/evals/gemini_cli_model.yaml @@ -0,0 +1,33 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +gemini_cli_version: "@google/gemini-cli@latest" +generator: gemini_cli +env: + GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" + GOOGLE_CLOUD_LOCATION: "global" + GOOGLE_GENAI_USE_VERTEXAI: "true" + GEMINI_CLI_TRUST_WORKSPACE: "true" +setup: + extensions: + # Points to the symlink created in cloudbuild.yaml to match the extension ID + "/workspace/cloud-sql-postgresql": + settings: + CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}" + CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}" + CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}" + CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}" + CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}" + CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}' + CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}" diff --git a/evals/run_config.yaml b/evals/gemini_run_config.yaml similarity index 96% rename from evals/run_config.yaml rename to evals/gemini_run_config.yaml index 600bddd..719ed45 100644 --- a/evals/run_config.yaml +++ b/evals/gemini_run_config.yaml @@ -16,7 +16,7 @@ dataset_config: /workspace/evals/dataset.json dataset_format: gemini-cli-format orchestrator: geminicli -model_config: /workspace/evals/model_config.yaml +model_config: /workspace/evals/gemini_cli_model.yaml simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml scorers: @@ -39,4 +39,3 @@ scorers: reporting: bigquery: gcp_project_id: "${EVAL_REPORTING_PROJECT}" - diff --git a/evals/substitute_env.py b/evals/substitute_env.py index cbe1a3a..f04200b 100644 --- a/evals/substitute_env.py +++ b/evals/substitute_env.py @@ -1,18 +1,23 @@ import os import re +import glob def main(): - yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json'] - for yaml_path in yaml_paths: - if os.path.exists(yaml_path): - with open(yaml_path, 'r') as f: - content = f.read() - content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content) - with open(yaml_path, 'w') as f: - f.write(content) - print(f"Successfully substituted environment variables in {yaml_path}") - else: - print(f"File not found: {yaml_path}") + # Find all .yaml and .json files in /workspace/evals + paths = glob.glob('/workspace/evals/**/*.yaml', recursive=True) + glob.glob('/workspace/evals/**/*.json', recursive=True) + + for path in paths: + if os.path.isfile(path): + try: + with open(path, 'r') as f: + content = f.read() + # Substitute ${VAR} with environment variables + content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content) + with open(path, 'w') as f: + f.write(content) + print(f"Successfully substituted environment variables in {path}") + except Exception as e: + print(f"Error processing {path}: {e}") if __name__ == '__main__': main() \ No newline at end of file From 9aa8b4f50da052a965c208dcffde58465d678f19 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 7 May 2026 06:12:41 +0000 Subject: [PATCH 2/8] ci: simplify PR label checks and update Claude model version in evals config --- cloudbuild.yaml | 4 ++-- evals/claude_code_model.yaml | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 0036625..cceb631 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -45,8 +45,8 @@ steps: PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title') # Check if execution labels are present using exact matching via jq - if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals" or .name == "ci:run-evals-gemini" or .name == "ci:run-evals-claude")' pr_data.json > /dev/null; then - echo "PR does not have required labels. Skipping execution." + if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then + echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution." exit 0 fi echo "Execution label detected. Processing release version context..." diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml index 3009a2e..231e62e 100644 --- a/evals/claude_code_model.yaml +++ b/evals/claude_code_model.yaml @@ -1,6 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + claude_code_version: "@anthropic-ai/claude-code@2.1.85" generator: claude_code -model: "claude-opus-4-6" # Use "claude-opus-4-20250514" for direct API +model: "claude-opus-4-1@20250805" # Suggested by Vertex deployment error use_vertex: true vertex_project_id: "${GOOGLE_CLOUD_PROJECT}" From 82735353742351af125e0905f1b30fd066f45207 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 7 May 2026 06:36:36 +0000 Subject: [PATCH 3/8] ci: update model, region, and environment configuration for Cloud SQL evaluation --- evals/claude_code_model.yaml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml index 231e62e..0d2027e 100644 --- a/evals/claude_code_model.yaml +++ b/evals/claude_code_model.yaml @@ -14,19 +14,21 @@ claude_code_version: "@anthropic-ai/claude-code@2.1.85" generator: claude_code -model: "claude-opus-4-1@20250805" # Suggested by Vertex deployment error +model: "claude-opus-4-6" use_vertex: true vertex_project_id: "${GOOGLE_CLOUD_PROJECT}" -vertex_region: "us-central1" +vertex_region: "us-east5" env: GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" + CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}" + CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}" + CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}" + CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}" + CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}" + CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}' + CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}" setup: - mcp_servers: - "cloud-sql": - httpUrl: "https://sqladmin.googleapis.com/mcp" - authProviderType: google_credentials - headers: - X-Goog-User-Project: "${GOOGLE_CLOUD_PROJECT}" + skills_dir: "/workspace/cloud-sql-postgresql" From bbb044b2d0962cc847ff9134fd8a8765582fb3a1 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 7 May 2026 07:39:47 +0000 Subject: [PATCH 4/8] ci: update evaluation model to claude-opus-4-1 and set CLOUD_ML_REGION env var --- evals/claude_code_model.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml index 0d2027e..8654e15 100644 --- a/evals/claude_code_model.yaml +++ b/evals/claude_code_model.yaml @@ -14,13 +14,14 @@ claude_code_version: "@anthropic-ai/claude-code@2.1.85" generator: claude_code -model: "claude-opus-4-6" +model: "claude-opus-4-1@20250805" use_vertex: true vertex_project_id: "${GOOGLE_CLOUD_PROJECT}" vertex_region: "us-east5" env: + CLOUD_ML_REGION: "us-east5" GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}" CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}" From abae0d3e56118a8cd60a1cecb9352b5b73e30bff Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 7 May 2026 07:51:03 +0000 Subject: [PATCH 5/8] ci: update claude-opus model version to 4-6 in eval configuration --- evals/claude_code_model.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml index 8654e15..d0ec594 100644 --- a/evals/claude_code_model.yaml +++ b/evals/claude_code_model.yaml @@ -14,7 +14,7 @@ claude_code_version: "@anthropic-ai/claude-code@2.1.85" generator: claude_code -model: "claude-opus-4-1@20250805" +model: "claude-opus-4-6" use_vertex: true vertex_project_id: "${GOOGLE_CLOUD_PROJECT}" From 5d5fc1c701673e43442c76597e2880627732df3d Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 7 May 2026 11:45:30 +0000 Subject: [PATCH 6/8] ci: update evaluation configuration with skill tagging, model upgrade to 4-7, and restructured run settings --- evals/claude_code_model.yaml | 9 ++++++--- evals/claude_run_config.yaml | 10 +++++++++- evals/dataset.json | 12 ++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml index d0ec594..cce3f00 100644 --- a/evals/claude_code_model.yaml +++ b/evals/claude_code_model.yaml @@ -14,15 +14,18 @@ claude_code_version: "@anthropic-ai/claude-code@2.1.85" generator: claude_code -model: "claude-opus-4-6" +model: "claude-opus-4-7" use_vertex: true vertex_project_id: "${GOOGLE_CLOUD_PROJECT}" -vertex_region: "us-east5" +vertex_region: "global" env: - CLOUD_ML_REGION: "us-east5" + # Global environment variables + CLOUD_ML_REGION: "global" GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" + + # Cloud SQL PostgreSQL extension configuration CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}" CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}" CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}" diff --git a/evals/claude_run_config.yaml b/evals/claude_run_config.yaml index 7bc10f4..5e88265 100644 --- a/evals/claude_run_config.yaml +++ b/evals/claude_run_config.yaml @@ -12,13 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Dataset Related Configs dataset_config: /workspace/evals/dataset.json -dataset_format: gemini-cli-format +dataset_format: agent-format +# Orchestrator Configuration orchestrator: agent model_config: /workspace/evals/claude_code_model.yaml simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml +# Runner Related Configs +runners: + agent_runners: 1 + +# Scorer Related Configs scorers: # Qualitative (Judge-based) goal_completion: @@ -36,6 +43,7 @@ scorers: token_consumption: {} skills_trajectory: {} +# Reporting Related Configs reporting: bigquery: gcp_project_id: "${EVAL_REPORTING_PROJECT}" diff --git a/evals/dataset.json b/evals/dataset.json index 654015f..72d0878 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -8,6 +8,9 @@ "list_instances", "get_instance" ], + "expected_skills": [ + "cloud-sql-postgres-admin" + ], "env": { "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, @@ -22,6 +25,9 @@ "list_schemas", "list_tables" ], + "expected_skills": [ + "cloud-sql-postgres-data" + ], "env": { "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, @@ -36,6 +42,9 @@ "list_active_queries", "list_locks" ], + "expected_skills": [ + "cloud-sql-postgres-monitor" + ], "env": { "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, @@ -50,6 +59,9 @@ "get_system_metrics", "list_database_stats" ], + "expected_skills": [ + "cloud-sql-postgres-monitor" + ], "env": { "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, From 9dfca58730385e7587112dda8bb830d05b55a8c2 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 7 May 2026 11:50:54 +0000 Subject: [PATCH 7/8] ci: upgrade claude-code version to 2.1.119 --- evals/claude_code_model.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml index cce3f00..d84c40a 100644 --- a/evals/claude_code_model.yaml +++ b/evals/claude_code_model.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -claude_code_version: "@anthropic-ai/claude-code@2.1.85" +claude_code_version: "@anthropic-ai/claude-code@2.1.119" generator: claude_code model: "claude-opus-4-7" From 9b88419c54bbc63deb7a85450246d76dfd0a1f91 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Thu, 7 May 2026 12:26:38 +0000 Subject: [PATCH 8/8] ci: split shared dataset into model-specific configurations and update references --- evals/claude_dataset.json | 72 +++++++++++++++++++++ evals/claude_run_config.yaml | 2 +- evals/{dataset.json => gemini_dataset.json} | 2 +- evals/gemini_run_config.yaml | 2 +- 4 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 evals/claude_dataset.json rename evals/{dataset.json => gemini_dataset.json} (99%) diff --git a/evals/claude_dataset.json b/evals/claude_dataset.json new file mode 100644 index 0000000..acc8a19 --- /dev/null +++ b/evals/claude_dataset.json @@ -0,0 +1,72 @@ +{ + "scenarios": [ + { + "id": "cloud-sql-debug-instance", + "starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.", + "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.", + "expected_trajectory": [ + "list_instances.js", + "get_instance.js" + ], + "expected_skills": [ + "cloud-sql-postgres-admin" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "cloud-sql-schema-tables-explore", + "starting_prompt": "I want to understand the structure of my database.", + "conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.", + "expected_trajectory": [ + "list_schemas.js", + "list_tables.js" + ], + "expected_skills": [ + "cloud-sql-postgres-data" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "cloud-sql-performance-check", + "starting_prompt": "Our database performance seems degraded.", + "conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.", + "expected_trajectory": [ + "list_active_queries.js", + "list_locks.js" + ], + "expected_skills": [ + "cloud-sql-postgres-monitor" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "cloud-sql-metrics-cpu-investigation", + "starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.", + "conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.", + "expected_trajectory": [ + "get_system_metrics.js", + "list_database_stats.js" + ], + "expected_skills": [ + "cloud-sql-postgres-monitor" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" + }, + "kind": "tools", + "max_turns": 3 + } + ] +} diff --git a/evals/claude_run_config.yaml b/evals/claude_run_config.yaml index 5e88265..5afd7c0 100644 --- a/evals/claude_run_config.yaml +++ b/evals/claude_run_config.yaml @@ -13,7 +13,7 @@ # limitations under the License. # Dataset Related Configs -dataset_config: /workspace/evals/dataset.json +dataset_config: /workspace/evals/claude_dataset.json dataset_format: agent-format # Orchestrator Configuration diff --git a/evals/dataset.json b/evals/gemini_dataset.json similarity index 99% rename from evals/dataset.json rename to evals/gemini_dataset.json index 72d0878..7ceead7 100644 --- a/evals/dataset.json +++ b/evals/gemini_dataset.json @@ -69,4 +69,4 @@ "max_turns": 3 } ] -} \ No newline at end of file +} diff --git a/evals/gemini_run_config.yaml b/evals/gemini_run_config.yaml index 719ed45..bc0c7ab 100644 --- a/evals/gemini_run_config.yaml +++ b/evals/gemini_run_config.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -dataset_config: /workspace/evals/dataset.json +dataset_config: /workspace/evals/gemini_dataset.json dataset_format: gemini-cli-format orchestrator: geminicli