From a9394cc4d168fe4ff37e360e1432000971e47e69 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 7 May 2026 06:01:16 +0000
Subject: [PATCH 1/8] ci: implement multi-model evaluation support by replacing
 static run configs with dynamic discovery and adding Claude/Gemini-specific
 configurations.

---
 cloudbuild.yaml                               | 25 +++++++----
 evals/claude_code_model.yaml                  | 18 ++++++++
 evals/claude_run_config.yaml                  | 41 +++++++++++++++++++
 evals/gemini_cli_model.yaml                   | 33 +++++++++++++++
 ...run_config.yaml => gemini_run_config.yaml} |  3 +-
 evals/substitute_env.py                       | 27 +++++++-----
 6 files changed, 126 insertions(+), 21 deletions(-)
 create mode 100644 evals/claude_code_model.yaml
 create mode 100644 evals/claude_run_config.yaml
 create mode 100644 evals/gemini_cli_model.yaml
 rename evals/{run_config.yaml => gemini_run_config.yaml} (96%)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 921717b..0036625 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -45,8 +45,8 @@ steps:
         PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
 
         # Check if execution labels are present using exact matching via jq
-        if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then
-          echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
+        if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals" or .name == "ci:run-evals-gemini" or .name == "ci:run-evals-claude")' pr_data.json > /dev/null; then
+          echo "PR does not have required labels. Skipping execution."
           exit 0
         fi
         echo "Execution label detected. Processing release version context..."
@@ -72,7 +72,6 @@ steps:
         export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
         export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
 
-
         # Set environment variables for extension
         export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
         export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
@@ -84,18 +83,28 @@ steps:
         # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
         export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD
 
-        # Combine CI metadata with run config
-        cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
+        # Append CI metadata to every *run_config.yaml in /workspace/evals
+        for config in /workspace/evals/*run_config.yaml; do
+          if [ -f "$config" ]; then
+            echo "Appending CI metadata to $config"
+            cat /workspace/evals/ci_metadata.yaml >> "$config"
+          fi
+        done
 
-        # Substitute environment variables in model_config.yaml
+        # Substitute environment variables in every YAML and JSON file under /workspace/evals
         python3 /workspace/evals/substitute_env.py
 
         cd /evalbench
         export PYTHONPATH=./evalbench:./evalbench/evalproto
         export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 
-        echo "Launching Standalone Evaluation..."
-        python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
+        # Run evaluations for all available run configs
+        for config in /workspace/evals/*run_config.yaml; do
+          if [ -f "$config" ]; then
+            echo "Launching Evaluation for config: $config"
+            python3 evalbench/evalbench.py --experiment_config="$config"
+          fi
+        done
 
 
 availableSecrets:
diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml
new file mode 100644
index 0000000..3009a2e
--- /dev/null
+++ b/evals/claude_code_model.yaml
@@ -0,0 +1,18 @@
+claude_code_version: "@anthropic-ai/claude-code@2.1.85"
+generator: claude_code
+model: "claude-opus-4-6" # Use "claude-opus-4-20250514" for direct API
+
+use_vertex: true
+vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
+vertex_region: "us-central1"
+
+env:
+  GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
+
+setup:
+  mcp_servers:
+    "cloud-sql":
+      httpUrl: "https://sqladmin.googleapis.com/mcp"
+      authProviderType: google_credentials
+      headers:
+        X-Goog-User-Project: "${GOOGLE_CLOUD_PROJECT}"
diff --git a/evals/claude_run_config.yaml b/evals/claude_run_config.yaml
new file mode 100644
index 0000000..7bc10f4
--- /dev/null
+++ b/evals/claude_run_config.yaml
@@ -0,0 +1,41 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dataset_config: /workspace/evals/dataset.json
+dataset_format: gemini-cli-format
+
+orchestrator: agent
+model_config: /workspace/evals/claude_code_model.yaml
+simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+
+scorers:
+  # Qualitative (Judge-based)
+  goal_completion:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  behavioral_metrics:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  skills_best_practices:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+    skills_dir: /workspace/cloud-sql-postgresql/skills
+
+  # Performance
+  turn_count: {}
+  end_to_end_latency: {}
+  tool_call_latency: {}
+  token_consumption: {}
+  skills_trajectory: {}
+
+reporting:
+  bigquery:
+    gcp_project_id: "${EVAL_REPORTING_PROJECT}"
diff --git a/evals/gemini_cli_model.yaml b/evals/gemini_cli_model.yaml
new file mode 100644
index 0000000..2973cb4
--- /dev/null
+++ b/evals/gemini_cli_model.yaml
@@ -0,0 +1,33 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+gemini_cli_version: "@google/gemini-cli@latest"
+generator: gemini_cli
+env:
+  GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
+  GOOGLE_CLOUD_LOCATION: "global"
+  GOOGLE_GENAI_USE_VERTEXAI: "true"
+  GEMINI_CLI_TRUST_WORKSPACE: "true"
+setup:
+  extensions:
+    # Points to the symlink created in cloudbuild.yaml to match the extension ID
+    "/workspace/cloud-sql-postgresql":
+      settings:
+        CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
+        CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
+        CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
+        CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
+        CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
+        CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
+        CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
diff --git a/evals/run_config.yaml b/evals/gemini_run_config.yaml
similarity index 96%
rename from evals/run_config.yaml
rename to evals/gemini_run_config.yaml
index 600bddd..719ed45 100644
--- a/evals/run_config.yaml
+++ b/evals/gemini_run_config.yaml
@@ -16,7 +16,7 @@ dataset_config: /workspace/evals/dataset.json
 dataset_format: gemini-cli-format
 
 orchestrator: geminicli
-model_config: /workspace/evals/model_config.yaml
+model_config: /workspace/evals/gemini_cli_model.yaml
 simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
 
 scorers:
@@ -39,4 +39,3 @@ scorers:
 reporting:
   bigquery:
     gcp_project_id: "${EVAL_REPORTING_PROJECT}"
-
diff --git a/evals/substitute_env.py b/evals/substitute_env.py
index cbe1a3a..f04200b 100644
--- a/evals/substitute_env.py
+++ b/evals/substitute_env.py
@@ -1,18 +1,23 @@
 import os
 import re
+import glob
 
 def main():
-    yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json']
-    for yaml_path in yaml_paths:
-        if os.path.exists(yaml_path):
-            with open(yaml_path, 'r') as f:
-                content = f.read()
-            content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
-            with open(yaml_path, 'w') as f:
-                f.write(content)
-            print(f"Successfully substituted environment variables in {yaml_path}")
-        else:
-            print(f"File not found: {yaml_path}")
+    # Recursively collect every .yaml and .json file under /workspace/evals
+    paths = glob.glob('/workspace/evals/**/*.yaml', recursive=True) + glob.glob('/workspace/evals/**/*.json', recursive=True)
+    
+    for path in paths:
+        if os.path.isfile(path):
+            try:
+                with open(path, 'r') as f:
+                    content = f.read()
+                # Replace each ${VAR} with its environment value; unset variables are left unchanged
+                content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
+                with open(path, 'w') as f:
+                    f.write(content)
+                print(f"Successfully substituted environment variables in {path}")
+            except Exception as e:
+                print(f"Error processing {path}: {e}")
 
 if __name__ == '__main__':
     main()
\ No newline at end of file

From 9aa8b4f50da052a965c208dcffde58465d678f19 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 7 May 2026 06:12:41 +0000
Subject: [PATCH 2/8] ci: simplify PR label checks and update Claude model
 version in evals config

---
 cloudbuild.yaml              |  4 ++--
 evals/claude_code_model.yaml | 16 +++++++++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 0036625..cceb631 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -45,8 +45,8 @@ steps:
         PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
 
         # Check if execution labels are present using exact matching via jq
-        if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals" or .name == "ci:run-evals-gemini" or .name == "ci:run-evals-claude")' pr_data.json > /dev/null; then
-          echo "PR does not have required labels. Skipping execution."
+        if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then
+          echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
           exit 0
         fi
         echo "Execution label detected. Processing release version context..."
diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml
index 3009a2e..231e62e 100644
--- a/evals/claude_code_model.yaml
+++ b/evals/claude_code_model.yaml
@@ -1,6 +1,20 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 claude_code_version: "@anthropic-ai/claude-code@2.1.85"
 generator: claude_code
-model: "claude-opus-4-6" # Use "claude-opus-4-20250514" for direct API
+model: "claude-opus-4-1@20250805" # Suggested by Vertex deployment error
 
 use_vertex: true
 vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"

From 82735353742351af125e0905f1b30fd066f45207 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 7 May 2026 06:36:36 +0000
Subject: [PATCH 3/8] ci: update model, region, and environment configuration
 for Cloud SQL evaluation

---
 evals/claude_code_model.yaml | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml
index 231e62e..0d2027e 100644
--- a/evals/claude_code_model.yaml
+++ b/evals/claude_code_model.yaml
@@ -14,19 +14,21 @@
 
 claude_code_version: "@anthropic-ai/claude-code@2.1.85"
 generator: claude_code
-model: "claude-opus-4-1@20250805" # Suggested by Vertex deployment error
+model: "claude-opus-4-6"
 
 use_vertex: true
 vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
-vertex_region: "us-central1"
+vertex_region: "us-east5"
 
 env:
   GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
+  CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
+  CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
+  CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
+  CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
+  CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
+  CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
+  CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
 
 setup:
-  mcp_servers:
-    "cloud-sql":
-      httpUrl: "https://sqladmin.googleapis.com/mcp"
-      authProviderType: google_credentials
-      headers:
-        X-Goog-User-Project: "${GOOGLE_CLOUD_PROJECT}"
+  skills_dir: "/workspace/cloud-sql-postgresql"

From bbb044b2d0962cc847ff9134fd8a8765582fb3a1 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 7 May 2026 07:39:47 +0000
Subject: [PATCH 4/8] ci: update evaluation model to claude-opus-4-1 and set
 CLOUD_ML_REGION env var

---
 evals/claude_code_model.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml
index 0d2027e..8654e15 100644
--- a/evals/claude_code_model.yaml
+++ b/evals/claude_code_model.yaml
@@ -14,13 +14,14 @@
 
 claude_code_version: "@anthropic-ai/claude-code@2.1.85"
 generator: claude_code
-model: "claude-opus-4-6"
+model: "claude-opus-4-1@20250805"
 
 use_vertex: true
 vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
 vertex_region: "us-east5"
 
 env:
+  CLOUD_ML_REGION: "us-east5"
   GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
   CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
   CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"

From abae0d3e56118a8cd60a1cecb9352b5b73e30bff Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 7 May 2026 07:51:03 +0000
Subject: [PATCH 5/8] ci: update claude-opus model version to 4-6 in eval
 configuration

---
 evals/claude_code_model.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml
index 8654e15..d0ec594 100644
--- a/evals/claude_code_model.yaml
+++ b/evals/claude_code_model.yaml
@@ -14,7 +14,7 @@
 
 claude_code_version: "@anthropic-ai/claude-code@2.1.85"
 generator: claude_code
-model: "claude-opus-4-1@20250805"
+model: "claude-opus-4-6"
 
 use_vertex: true
 vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"

From 5d5fc1c701673e43442c76597e2880627732df3d Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 7 May 2026 11:45:30 +0000
Subject: [PATCH 6/8] ci: update evaluation configuration with skill tagging,
 model upgrade to 4-7, and restructured run settings

---
 evals/claude_code_model.yaml |  9 ++++++---
 evals/claude_run_config.yaml | 10 +++++++++-
 evals/dataset.json           | 12 ++++++++++++
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml
index d0ec594..cce3f00 100644
--- a/evals/claude_code_model.yaml
+++ b/evals/claude_code_model.yaml
@@ -14,15 +14,18 @@
 
 claude_code_version: "@anthropic-ai/claude-code@2.1.85"
 generator: claude_code
-model: "claude-opus-4-6"
+model: "claude-opus-4-7"
 
 use_vertex: true
 vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
-vertex_region: "us-east5"
+vertex_region: "global"
 
 env:
-  CLOUD_ML_REGION: "us-east5"
+  # General Google Cloud settings (region and project)
+  CLOUD_ML_REGION: "global"
   GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
+
+  # Cloud SQL PostgreSQL extension configuration
   CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
   CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
   CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
diff --git a/evals/claude_run_config.yaml b/evals/claude_run_config.yaml
index 7bc10f4..5e88265 100644
--- a/evals/claude_run_config.yaml
+++ b/evals/claude_run_config.yaml
@@ -12,13 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Dataset Related Configs
 dataset_config: /workspace/evals/dataset.json
-dataset_format: gemini-cli-format
+dataset_format: agent-format
 
+# Orchestrator Configuration
 orchestrator: agent
 model_config: /workspace/evals/claude_code_model.yaml
 simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
 
+# Runner Related Configs
+runners:
+  agent_runners: 1
+
+# Scorer Related Configs
 scorers:
   # Qualitative (Judge-based)
   goal_completion:
@@ -36,6 +43,7 @@ scorers:
   token_consumption: {}
   skills_trajectory: {}
 
+# Reporting Related Configs
 reporting:
   bigquery:
     gcp_project_id: "${EVAL_REPORTING_PROJECT}"
diff --git a/evals/dataset.json b/evals/dataset.json
index 654015f..72d0878 100644
--- a/evals/dataset.json
+++ b/evals/dataset.json
@@ -8,6 +8,9 @@
         "list_instances",
         "get_instance"
       ],
+      "expected_skills": [
+        "cloud-sql-postgres-admin"
+      ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
@@ -22,6 +25,9 @@
         "list_schemas",
         "list_tables"
       ],
+      "expected_skills": [
+        "cloud-sql-postgres-data"
+      ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
@@ -36,6 +42,9 @@
         "list_active_queries",
         "list_locks"
       ],
+      "expected_skills": [
+        "cloud-sql-postgres-monitor"
+      ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
@@ -50,6 +59,9 @@
         "get_system_metrics",
         "list_database_stats"
       ],
+      "expected_skills": [
+        "cloud-sql-postgres-monitor"
+      ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },

From 9dfca58730385e7587112dda8bb830d05b55a8c2 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 7 May 2026 11:50:54 +0000
Subject: [PATCH 7/8] ci: upgrade claude-code version to 2.1.119

---
 evals/claude_code_model.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml
index cce3f00..d84c40a 100644
--- a/evals/claude_code_model.yaml
+++ b/evals/claude_code_model.yaml
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-claude_code_version: "@anthropic-ai/claude-code@2.1.85"
+claude_code_version: "@anthropic-ai/claude-code@2.1.119"
 generator: claude_code
 model: "claude-opus-4-7"
 

From 9b88419c54bbc63deb7a85450246d76dfd0a1f91 Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Thu, 7 May 2026 12:26:38 +0000
Subject: [PATCH 8/8] ci: split shared dataset into model-specific
 configurations and update references

---
 evals/claude_dataset.json                   | 72 +++++++++++++++++++++
 evals/claude_run_config.yaml                |  2 +-
 evals/{dataset.json => gemini_dataset.json} |  2 +-
 evals/gemini_run_config.yaml                |  2 +-
 4 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100644 evals/claude_dataset.json
 rename evals/{dataset.json => gemini_dataset.json} (99%)

diff --git a/evals/claude_dataset.json b/evals/claude_dataset.json
new file mode 100644
index 0000000..acc8a19
--- /dev/null
+++ b/evals/claude_dataset.json
@@ -0,0 +1,72 @@
+{
+  "scenarios": [
+    {
+      "id": "cloud-sql-debug-instance",
+      "starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.",
+      "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.",
+      "expected_trajectory": [
+        "list_instances.js",
+        "get_instance.js"
+      ],
+      "expected_skills": [
+        "cloud-sql-postgres-admin"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    },
+    {
+      "id": "cloud-sql-schema-tables-explore",
+      "starting_prompt": "I want to understand the structure of my database.",
+      "conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.",
+      "expected_trajectory": [
+        "list_schemas.js",
+        "list_tables.js"
+      ],
+      "expected_skills": [
+        "cloud-sql-postgres-data"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    },
+    {
+      "id": "cloud-sql-performance-check",
+      "starting_prompt": "Our database performance seems degraded.",
+      "conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.",
+      "expected_trajectory": [
+        "list_active_queries.js",
+        "list_locks.js"
+      ],
+      "expected_skills": [
+        "cloud-sql-postgres-monitor"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    },
+    {
+      "id": "cloud-sql-metrics-cpu-investigation",
+      "starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.",
+      "conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
+      "expected_trajectory": [
+        "get_system_metrics.js",
+        "list_database_stats.js"
+      ],
+      "expected_skills": [
+        "cloud-sql-postgres-monitor"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    }
+  ]
+}
diff --git a/evals/claude_run_config.yaml b/evals/claude_run_config.yaml
index 5e88265..5afd7c0 100644
--- a/evals/claude_run_config.yaml
+++ b/evals/claude_run_config.yaml
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 # Dataset Related Configs
-dataset_config: /workspace/evals/dataset.json
+dataset_config: /workspace/evals/claude_dataset.json
 dataset_format: agent-format
 
 # Orchestrator Configuration
diff --git a/evals/dataset.json b/evals/gemini_dataset.json
similarity index 99%
rename from evals/dataset.json
rename to evals/gemini_dataset.json
index 72d0878..7ceead7 100644
--- a/evals/dataset.json
+++ b/evals/gemini_dataset.json
@@ -69,4 +69,4 @@
       "max_turns": 3
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evals/gemini_run_config.yaml b/evals/gemini_run_config.yaml
index 719ed45..bc0c7ab 100644
--- a/evals/gemini_run_config.yaml
+++ b/evals/gemini_run_config.yaml
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-dataset_config: /workspace/evals/dataset.json
+dataset_config: /workspace/evals/gemini_dataset.json
 dataset_format: gemini-cli-format
 
 orchestrator: geminicli