From 99b967ad62074080959bb08ffd384162c1c6922e Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Fri, 27 Feb 2026 11:40:01 -0800 Subject: [PATCH 01/22] Add benchmark infrastructure: scripts, Java changes, and config Add benchmark shell scripts for VM provisioning, setup, execution, monitoring, diagnostics capture, and dashboard generation. Update BenchmarkConfig, BenchmarkOrchestrator, and TenantWorkloadConfig to support multi-tenant benchmark orchestration with per-tenant configuration overrides. Add .gitignore entries for benchmark artifacts and Copilot skills. Add test-setup and test-results directory scaffolding with READMEs and a sample tenants.json template (no real credentials). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 3 + sdk/cosmos/.gitignore | 2 + sdk/cosmos/azure-cosmos-benchmark/.gitignore | 24 +++ .../scripts/capture-diagnostics.sh | 72 ++++++++ .../scripts/generate-dashboard.py | 157 ++++++++++++++++++ .../azure-cosmos-benchmark/scripts/monitor.sh | 46 +++++ .../scripts/provision-benchmark-vm.sh | 123 ++++++++++++++ .../scripts/run-benchmark.sh | 92 ++++++++++ .../scripts/setup-benchmark-vm.sh | 36 ++++ .../scripts/setup-result-storage.sh | 33 ++++ .../scripts/trigger-benchmark.sh | 100 +++++++++++ .../cosmos/benchmark/BenchmarkConfig.java | 69 ++------ .../benchmark/BenchmarkOrchestrator.java | 145 ++++++++-------- .../benchmark/TenantWorkloadConfig.java | 65 +++----- .../test-results/BENCHMARK_RESULTS.md | 154 +++++++++++++++++ .../test-results/README.md | 39 +++++ .../test-setup/README.md | 39 +++++ .../test-setup/tenants-sample.json | 51 ++++++ 18 files changed, 1085 insertions(+), 165 deletions(-) create mode 100644 sdk/cosmos/azure-cosmos-benchmark/.gitignore create mode 100644 sdk/cosmos/azure-cosmos-benchmark/scripts/capture-diagnostics.sh create mode 100644 sdk/cosmos/azure-cosmos-benchmark/scripts/generate-dashboard.py create mode 100644 sdk/cosmos/azure-cosmos-benchmark/scripts/monitor.sh create 
mode 100644 sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh create mode 100644 sdk/cosmos/azure-cosmos-benchmark/scripts/run-benchmark.sh create mode 100644 sdk/cosmos/azure-cosmos-benchmark/scripts/setup-benchmark-vm.sh create mode 100644 sdk/cosmos/azure-cosmos-benchmark/scripts/setup-result-storage.sh create mode 100644 sdk/cosmos/azure-cosmos-benchmark/scripts/trigger-benchmark.sh create mode 100644 sdk/cosmos/azure-cosmos-benchmark/test-results/BENCHMARK_RESULTS.md create mode 100644 sdk/cosmos/azure-cosmos-benchmark/test-results/README.md create mode 100644 sdk/cosmos/azure-cosmos-benchmark/test-setup/README.md create mode 100644 sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants-sample.json diff --git a/.gitignore b/.gitignore index 997c0e0648a2..bd8efd5068c2 100644 --- a/.gitignore +++ b/.gitignore @@ -126,3 +126,6 @@ stress-test-addons* # Temp typespec files TempTypeSpecFiles/ + +# Copilot skills (local agent config) +.github/skills/ diff --git a/sdk/cosmos/.gitignore b/sdk/cosmos/.gitignore index 1ea74182f6bb..3e6db8ffea6a 100644 --- a/sdk/cosmos/.gitignore +++ b/sdk/cosmos/.gitignore @@ -2,3 +2,5 @@ metastore_db/* spark-warehouse/* + +multi-tenancy-analysis.md diff --git a/sdk/cosmos/azure-cosmos-benchmark/.gitignore b/sdk/cosmos/azure-cosmos-benchmark/.gitignore new file mode 100644 index 000000000000..ffe22f016492 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/.gitignore @@ -0,0 +1,24 @@ +# Test setup (contains secrets and tenant configs) +test-setup/ +!test-setup/README.md +!test-setup/tenants-sample.json + +# Test results (downloaded from VM) +test-results/ +!test-results/README.md + +# VM connection files +.vm-ip +.vm-user +.vm-key + +# Results directory +results/ + +# Scripts +scripts/ + +# Benchmark documentation +BENCHMARK_RESULTS.md +IMPLEMENTATION_GUIDE.md +MULTI_TENANCY_TEST_PLAN.md diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/capture-diagnostics.sh 
b/sdk/cosmos/azure-cosmos-benchmark/scripts/capture-diagnostics.sh new file mode 100644 index 000000000000..db3959442489 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/scripts/capture-diagnostics.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# capture-diagnostics.sh — Capture thread/heap dumps of a running benchmark +# +# Usage: +# ./capture-diagnostics.sh [--threads] [--heap] [--jfr ] [--output-dir ] [--all] + +set -euo pipefail + +CAPTURE_THREADS=false +CAPTURE_HEAP=false +JFR_DURATION="" +OUTPUT_DIR="./diagnostics/$(date +%Y%m%dT%H%M%S)" + +while [[ $# -gt 0 ]]; do + case $1 in + --threads) CAPTURE_THREADS=true ;; + --heap) CAPTURE_HEAP=true ;; + --jfr) JFR_DURATION="$2"; shift ;; + --output-dir) OUTPUT_DIR="$2"; shift ;; + --all) CAPTURE_THREADS=true; CAPTURE_HEAP=true; JFR_DURATION="60" ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac + shift +done + +mkdir -p "$OUTPUT_DIR" + +# Find the benchmark JVM PID +BENCH_PID=$(jps -l 2>/dev/null | grep -E 'azure-cosmos-benchmark|MultiTenancyBenchmark' | awk '{print $1}') +if [[ -z "$BENCH_PID" ]]; then + echo "ERROR: No running benchmark JVM found." + echo "Running Java processes:" + jps -l 2>/dev/null || echo "(jps not available)" + exit 1 +fi +echo "Found benchmark PID: $BENCH_PID" + +# Thread dump +if [[ "$CAPTURE_THREADS" == "true" ]]; then + THREAD_FILE="$OUTPUT_DIR/thread-dump-$(date +%H%M%S).txt" + echo "Capturing thread dump..." + jstack "$BENCH_PID" > "$THREAD_FILE" 2>&1 + echo " Thread dump: $THREAD_FILE ($(wc -l < "$THREAD_FILE") lines)" + echo "" >> "$THREAD_FILE" + echo "=== Thread Name Prefix Summary ===" >> "$THREAD_FILE" + grep '"' "$THREAD_FILE" | sed 's/"\([^"]*\)".*/\1/' | sed 's/-[0-9]*$//' | sort | uniq -c | sort -rn >> "$THREAD_FILE" +fi + +# Heap dump +if [[ "$CAPTURE_HEAP" == "true" ]]; then + HEAP_FILE="$OUTPUT_DIR/heap-dump-$(date +%H%M%S).hprof" + echo "Capturing heap dump (this may take a minute)..." 
+ jmap -dump:live,format=b,file="$HEAP_FILE" "$BENCH_PID" + HEAP_SIZE_MB=$(du -m "$HEAP_FILE" | awk '{print $1}') + echo " Heap dump: $HEAP_FILE (${HEAP_SIZE_MB} MB)" +fi + +# JFR recording +if [[ -n "$JFR_DURATION" ]]; then + JFR_FILE="$OUTPUT_DIR/recording-$(date +%H%M%S).jfr" + echo "Starting JFR recording for ${JFR_DURATION}s..." + jcmd "$BENCH_PID" JFR.start duration="${JFR_DURATION}s" filename="$JFR_FILE" settings=profile + echo " JFR will be saved to: $JFR_FILE (after ${JFR_DURATION}s)" +fi + +# Quick process stats +echo "" +echo "=== Process Stats ===" +echo "Threads: $(ls /proc/$BENCH_PID/task 2>/dev/null | wc -l || echo 'N/A')" +echo "FDs: $(ls /proc/$BENCH_PID/fd 2>/dev/null | wc -l || echo 'N/A')" +echo "" +echo "Diagnostics saved to: $OUTPUT_DIR" diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/generate-dashboard.py b/sdk/cosmos/azure-cosmos-benchmark/scripts/generate-dashboard.py new file mode 100644 index 000000000000..41ac591345f8 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/scripts/generate-dashboard.py @@ -0,0 +1,157 @@ +import csv, re, os, sys +from datetime import datetime, timezone +import plotly.graph_objects as go +from plotly.subplots import make_subplots + +metrics_dir = sys.argv[1] +log_file = sys.argv[2] +output_html = sys.argv[3] +# Optional: external monitor CSV +monitor_csv = sys.argv[4] if len(sys.argv) > 4 else None + +# Parse lifecycle events +lifecycle = [] +with open(log_file, 'r', encoding='utf-8', errors='ignore') as f: + for line in f: + m = re.search(r'\[LIFECYCLE\]\s+(\S+).*timestamp=(\S+)', line) + if m: + event = m.group(1) + ts_str = m.group(2) + try: + dt = datetime.fromisoformat(ts_str.replace('Z', '+00:00')) + lifecycle.append((dt.isoformat(), event)) + except: + pass + +def read_metric(filename, value_col=1): + filepath = os.path.join(metrics_dir, filename) + if not os.path.exists(filepath): + return [], [] + times, vals = [], [] + seen = set() + with open(filepath, 'r') as f: + reader = csv.reader(f) + 
next(reader) + for row in reader: + t = int(row[0]) + if t in seen: + continue + seen.add(t) + dt = datetime.fromtimestamp(t, tz=timezone.utc) + try: + v = float(row[value_col]) + except: + continue + times.append(dt.isoformat()) + vals.append(v) + return times, vals + +def read_monitor_csv(filepath): + if not filepath or not os.path.exists(filepath): + return {} + cols = {} + with open(filepath, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + for key in row: + if key not in cols: + cols[key] = [] + cols[key].append(row[key]) + # Convert timestamps + if 'timestamp' in cols: + cols['time'] = cols['timestamp'] + elif 'epoch' in cols: + cols['time'] = [datetime.fromtimestamp(int(e), tz=timezone.utc).isoformat() for e in cols['epoch']] + # Convert numeric cols + for key in ['cpu_pct', 'rss_mb', 'vsz_mb', 'threads', 'fd_count', 'tcp_established', 'tcp_time_wait', 'tcp_close_wait']: + if key in cols: + cols[key] = [float(v) for v in cols[key]] + return cols + +# Load in-process metrics +heap_t, heap_v = read_metric('memory.heap.used.csv') +heap_v_mb = [v / (1024*1024) for v in heap_v] +thread_t, thread_v = read_metric('threads.count.csv') +success_t, success_v = read_metric('#Successful Operations.csv', value_col=1) +gc_t, gc_v = read_metric('gc.G1-Young-Generation.count.csv') + +# Load external monitor +mon = read_monitor_csv(monitor_csv) +has_monitor = 'time' in mon and len(mon['time']) > 0 + +# Determine layout +if has_monitor: + rows = 7 + titles = ('Heap Memory (MB) [JVM]', 'RSS Memory (MB) [OS]', 'CPU % [OS]', + 'Thread Count [JVM]', 'File Descriptors [OS]', 'TCP Connections [OS]', + 'Successful Ops (cumulative)') + heights = [0.16, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14] +else: + rows = 4 + titles = ('Heap Memory (MB)', 'Thread Count', 'Successful Ops (cumulative)', 'GC Count (G1 Young)') + heights = [0.3, 0.25, 0.25, 0.2] + +fig = make_subplots(rows=rows, cols=1, shared_xaxes=True, vertical_spacing=0.04, + subplot_titles=titles, row_heights=heights) + 
+if has_monitor: + # Row 1: Heap (JVM) + fig.add_trace(go.Scatter(x=heap_t, y=heap_v_mb, mode='lines', name='Heap Used', + line=dict(color='#2196F3', width=1), fill='tozeroy', fillcolor='rgba(33,150,243,0.1)'), row=1, col=1) + # Row 2: RSS (OS) + fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('rss_mb', []), mode='lines', name='RSS', + line=dict(color='#E91E63', width=2)), row=2, col=1) + # Row 3: CPU (OS) + fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('cpu_pct', []), mode='lines', name='CPU %', + line=dict(color='#F44336', width=1.5), fill='tozeroy', fillcolor='rgba(244,67,54,0.1)'), row=3, col=1) + # Row 4: Threads (JVM) + fig.add_trace(go.Scatter(x=thread_t, y=thread_v, mode='lines', name='Threads (JVM)', + line=dict(color='#FF9800', width=2)), row=4, col=1) + fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('threads', []), mode='lines', name='Threads (OS)', + line=dict(color='#FF9800', width=1, dash='dot')), row=4, col=1) + # Row 5: FDs (OS) + fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('fd_count', []), mode='lines', name='File Descriptors', + line=dict(color='#795548', width=2)), row=5, col=1) + # Row 6: TCP (OS) + fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('tcp_established', []), mode='lines', name='TCP ESTAB', + line=dict(color='#009688', width=2)), row=6, col=1) + fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('tcp_time_wait', []), mode='lines', name='TCP TIME_WAIT', + line=dict(color='#CDDC39', width=1)), row=6, col=1) + fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('tcp_close_wait', []), mode='lines', name='TCP CLOSE_WAIT', + line=dict(color='#FF5722', width=1)), row=6, col=1) + # Row 7: Success ops + fig.add_trace(go.Scatter(x=success_t, y=success_v, mode='lines', name='Success Ops', + line=dict(color='#4CAF50', width=2)), row=7, col=1) +else: + fig.add_trace(go.Scatter(x=heap_t, y=heap_v_mb, mode='lines', name='Heap Used MB', + line=dict(color='#2196F3', width=1), fill='tozeroy', fillcolor='rgba(33,150,243,0.1)'), row=1, 
col=1) + fig.add_trace(go.Scatter(x=thread_t, y=thread_v, mode='lines', name='Threads', + line=dict(color='#FF9800', width=2)), row=2, col=1) + fig.add_trace(go.Scatter(x=success_t, y=success_v, mode='lines', name='Success Ops', + line=dict(color='#4CAF50', width=2)), row=3, col=1) + fig.add_trace(go.Scatter(x=gc_t, y=gc_v, mode='lines', name='GC Count', + line=dict(color='#9C27B0', width=1.5)), row=4, col=1) + +# Lifecycle vertical lines +event_colors = {'CYCLE_START': 'green', 'POST_CREATE': 'blue', 'POST_WORKLOAD': 'orange', + 'POST_CLOSE': 'red', 'POST_SETTLE': 'gray', 'COMPLETE': 'black', 'PRE_CREATE': 'lightgray'} +shapes = [] +annotations = [] +for dt_str, event in lifecycle: + color = event_colors.get(event, 'gray') + shapes.append(dict(type='line', x0=dt_str, x1=dt_str, y0=0, y1=1, yref='paper', + line=dict(color=color, width=1, dash='dot'))) + short = event.replace('POST_', '').replace('CYCLE_', '') + annotations.append(dict(x=dt_str, y=1.02, yref='paper', text=short, + showarrow=False, font=dict(size=7, color=color), textangle=-45)) + +monitor_label = ' + OS Monitor' if has_monitor else '' +fig.update_layout( + shapes=shapes, annotations=annotations, + title=f'CHURN Benchmark Dashboard{monitor_label}', + height=250 * rows, showlegend=True, hovermode='x unified', template='plotly_white', + legend=dict(orientation='h', y=-0.03) +) + +fig.write_html(output_html, include_plotlyjs=True) +print(f'Dashboard ({rows} panels, monitor={"yes" if has_monitor else "no"}): {output_html}') diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/monitor.sh b/sdk/cosmos/azure-cosmos-benchmark/scripts/monitor.sh new file mode 100644 index 000000000000..be1b1bf854aa --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/scripts/monitor.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# monitor.sh External resource monitor for Java benchmark process +# +# Usage: +# ./monitor.sh [interval_sec] [output_dir] +# ./monitor.sh $(pgrep -f MultiTenancyBenchmark) 10 ./results/run1 +# +# Writes 
monitor.csv with columns: +# timestamp, threads, fds, rss_kb, cpu_pct, heap_used_kb, heap_max_kb, gc_count, gc_time_ms + +set -euo pipefail + +PID=${1:?Usage: monitor.sh [interval] [output_dir]} +INTERVAL=${2:-10} +OUTDIR=${3:-.} + +mkdir -p "$OUTDIR" +OUTFILE="$OUTDIR/monitor.csv" + +echo "timestamp,threads,fds,rss_kb,cpu_pct,heap_used_kb,heap_max_kb,gc_count,gc_time_ms" > "$OUTFILE" +echo "Monitoring PID=$PID every ${INTERVAL}s -> $OUTFILE" + +while kill -0 "$PID" 2>/dev/null; do + TS=$(date -u +%Y-%m-%dT%H:%M:%SZ) + THREADS=$(ls /proc/$PID/task 2>/dev/null | wc -l || echo 0) + FDS=$(ls /proc/$PID/fd 2>/dev/null | wc -l || echo 0) + RSS=$(ps -p $PID -o rss= 2>/dev/null | tr -d ' ' || echo 0) + CPU=$(ps -p $PID -o %cpu= 2>/dev/null | tr -d ' ' || echo 0) + + # Parse jstat -gc output for heap usage + JSTAT=$(jstat -gc $PID 2>/dev/null | tail -1 || echo "") + if [[ -n "$JSTAT" ]]; then + # Columns: S0C S1C S0U S1U EC EU OC OU MC MU CCSC CCSU YGC YGCT FGC FGCT CGC CGCT GCT + HEAP_USED=$(echo "$JSTAT" | awk '{printf "%.0f", ($4+$6+$8)}') # S1U+EU+OU in KB + HEAP_MAX=$(echo "$JSTAT" | awk '{printf "%.0f", ($1+$2+$5+$7)}') # S0C+S1C+EC+OC in KB + GC_COUNT=$(echo "$JSTAT" | awk '{printf "%.0f", ($13+$15+$17)}') # YGC+FGC+CGC + GC_TIME=$(echo "$JSTAT" | awk '{printf "%.0f", $NF * 1000}') # GCT in ms + else + HEAP_USED=0; HEAP_MAX=0; GC_COUNT=0; GC_TIME=0 + fi + + echo "$TS,$THREADS,$FDS,$RSS,$CPU,$HEAP_USED,$HEAP_MAX,$GC_COUNT,$GC_TIME" >> "$OUTFILE" + sleep "$INTERVAL" +done + +echo "Process $PID exited. Monitor stopped. 
Results in $OUTFILE" diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh b/sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh new file mode 100644 index 000000000000..8b9a425848c1 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh @@ -0,0 +1,123 @@ +#!/bin/bash +# provision-benchmark-vm.sh — Create a new Azure VM or connect to an existing one +# See §6.5 of the test plan for full details. +# +# Usage: +# ./provision-benchmark-vm.sh --new --location eastus [--create-key] [--ssh-key ] [options] +# ./provision-benchmark-vm.sh --existing --ip --user --key +# ./provision-benchmark-vm.sh --existing --rg --vm-name --key + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +MODE="" +LOCATION="eastus" +RG="rg-cosmos-benchmark" +VM_NAME="vm-benchmark-01" +VM_SIZE="Standard_D16s_v5" +VM_IP="" +SSH_USER="benchuser" +SSH_PRIVATE_KEY="" +SSH_PUBLIC_KEY="" +CREATE_KEY=false +CREATE_KEY_PATH="" +DISK_SIZE=256 +SETUP_AFTER_CREATE=true + +while [[ $# -gt 0 ]]; do + case $1 in + --new) MODE="new" ;; + --existing) MODE="existing" ;; + --location) LOCATION="$2"; shift ;; + --rg) RG="$2"; shift ;; + --vm-name) VM_NAME="$2"; shift ;; + --size) VM_SIZE="$2"; shift ;; + --ip) VM_IP="$2"; shift ;; + --user) SSH_USER="$2"; shift ;; + --key) SSH_PRIVATE_KEY="$2"; shift ;; + --ssh-key) SSH_PUBLIC_KEY="$2"; shift ;; + --create-key) + CREATE_KEY=true + if [[ $# -gt 1 && ! "$2" =~ ^-- ]]; then + CREATE_KEY_PATH="$2"; shift + fi + ;; + --disk-size) DISK_SIZE="$2"; shift ;; + --skip-setup) SETUP_AFTER_CREATE=false ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac + shift +done + +ssh_cmd() { + local cmd="ssh" + [[ -n "$SSH_PRIVATE_KEY" ]] && cmd="$cmd -i $SSH_PRIVATE_KEY" + echo "$cmd" +} + +generate_ssh_key() { + local key_path="$1" + mkdir -p "$(dirname "$key_path")" + if [[ -f "$key_path" ]]; then + echo " SSH key already exists at $key_path — reusing." 
+ else + echo " Generating new SSH key pair: $key_path" + ssh-keygen -t rsa -b 4096 -f "$key_path" -N "" -C "cosmos-benchmark-${VM_NAME}" -q + fi + chmod 600 "$key_path" + chmod 644 "${key_path}.pub" + SSH_PRIVATE_KEY="$key_path" + SSH_PUBLIC_KEY="${key_path}.pub" +} + +if [[ "$MODE" == "new" ]]; then + echo "=== Creating VM: $VM_NAME in $RG ($LOCATION) ===" + az group create --name "$RG" --location "$LOCATION" 2>/dev/null || true + + SSH_KEY_ARGS="" + if [[ "$CREATE_KEY" == "true" ]]; then + [[ -z "$CREATE_KEY_PATH" ]] && CREATE_KEY_PATH="$HOME/.ssh/cosmos-bench-${VM_NAME}" + generate_ssh_key "$CREATE_KEY_PATH" + SSH_KEY_ARGS="--ssh-key-value $SSH_PUBLIC_KEY" + elif [[ -n "$SSH_PUBLIC_KEY" ]]; then + SSH_KEY_ARGS="--ssh-key-value $SSH_PUBLIC_KEY" + [[ -z "$SSH_PRIVATE_KEY" && "$SSH_PUBLIC_KEY" == *.pub ]] && SSH_PRIVATE_KEY="${SSH_PUBLIC_KEY%.pub}" + else + SSH_KEY_ARGS="--generate-ssh-keys" + [[ -z "$SSH_PRIVATE_KEY" ]] && SSH_PRIVATE_KEY="$HOME/.ssh/id_rsa" + fi + + az vm create --resource-group "$RG" --name "$VM_NAME" --image Ubuntu2204 \ + --size "$VM_SIZE" --accelerated-networking true --admin-username "$SSH_USER" \ + $SSH_KEY_ARGS --authentication-type ssh --os-disk-size-gb "$DISK_SIZE" --storage-sku Premium_LRS + + az vm open-port --resource-group "$RG" --name "$VM_NAME" --port 22 + VM_IP=$(az vm show -g "$RG" -n "$VM_NAME" -d --query publicIps -o tsv) + echo "VM created. 
IP: $VM_IP" + + if [[ "$SETUP_AFTER_CREATE" == "true" ]]; then + echo "=== Running setup script ===" + $(ssh_cmd) -o StrictHostKeyChecking=no "${SSH_USER}@${VM_IP}" 'bash -s' < "$SCRIPT_DIR/setup-benchmark-vm.sh" + fi + +elif [[ "$MODE" == "existing" ]]; then + [[ -z "$SSH_PRIVATE_KEY" && -f "$HOME/.ssh/id_rsa" ]] && SSH_PRIVATE_KEY="$HOME/.ssh/id_rsa" + [[ -z "$SSH_PRIVATE_KEY" ]] && { echo "ERROR: --key required"; exit 1; } + [[ -z "$VM_IP" ]] && VM_IP=$(az vm show -g "$RG" -n "$VM_NAME" -d --query publicIps -o tsv) + echo "=== Connecting to ${SSH_USER}@${VM_IP} ===" + $(ssh_cmd) -o ConnectTimeout=10 "${SSH_USER}@${VM_IP}" 'echo "VM reachable. JDK: $(java -version 2>&1 | head -1)"' +else + echo "Usage: $0 --new --location | --existing --ip --key " + exit 1 +fi + +echo "$VM_IP" > .vm-ip +echo "$SSH_USER" > .vm-user +echo "$SSH_PRIVATE_KEY" > .vm-key + +# Also save to test-setup/ for organized access +mkdir -p test-setup +echo "VM_IP=$VM_IP" > test-setup/vm-config.env +echo "VM_USER=$SSH_USER" >> test-setup/vm-config.env +echo "VM_KEY_PATH=$SSH_PRIVATE_KEY" >> test-setup/vm-config.env +echo "=== Ready: $(ssh_cmd) ${SSH_USER}@${VM_IP} ===" diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/run-benchmark.sh b/sdk/cosmos/azure-cosmos-benchmark/scripts/run-benchmark.sh new file mode 100644 index 000000000000..67414bd8046e --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/scripts/run-benchmark.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# run-benchmark.sh Run a multi-tenancy benchmark with external resource monitoring +# +# Usage: +# ./run-benchmark.sh [output-dir] [extra-args...] + +set -euo pipefail + +SCENARIO=${1:-SCALING} +TENANTS_FILE=${2:-tenants.json} +if [[ ! 
-f "$TENANTS_FILE" && -f "sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants.json" ]]; then + TENANTS_FILE="sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants.json" +fi + +OUTPUT_DIR=${3:-./results/$(date +%Y%m%dT%H%M%S)-${SCENARIO}} +shift 3 2>/dev/null || true +EXTRA_ARGS="$*" + +mkdir -p "$OUTPUT_DIR" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +MODULE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" + +# Git metadata +BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") +COMMIT_ID=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown") +cat > "${OUTPUT_DIR}/git-info.json" </dev/null | head -1 || true) +if [[ -z "$BENCHMARK_JAR" ]]; then + echo "ERROR: Benchmark JAR not found. Build first." + exit 1 +fi + +JVM_OPTS="-Xmx8g -Xms8g -XX:+UseG1GC -XX:MaxDirectMemorySize=2g \ + -Xlog:gc*:file=${OUTPUT_DIR}/gc.log:time,uptime,level \ + -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=${OUTPUT_DIR}/" + +# Start benchmark process (not piped, so we get the real PID) +java $JVM_OPTS \ + -cp "$BENCHMARK_JAR" \ + com.azure.cosmos.benchmark.BenchmarkOrchestrator \ + --tenantsFile "$TENANTS_FILE" \ + --scenario "$SCENARIO" \ + --outputDir "$OUTPUT_DIR" \ + $EXTRA_ARGS \ + > >(tee "${OUTPUT_DIR}/benchmark.log") 2>&1 & +JAVA_PID=$! + +echo " Java PID: $JAVA_PID" + +# Start monitor alongside (auto-stops when Java PID exits) +MONITOR_PID="" +if [[ -f "$SCRIPT_DIR/monitor.sh" ]]; then + echo " Monitor: ${OUTPUT_DIR}/monitor.csv" + bash "$SCRIPT_DIR/monitor.sh" "$JAVA_PID" 60 "$OUTPUT_DIR" & + MONITOR_PID=$! +else + echo " Monitor: skipped (monitor.sh not found)" +fi + +# Cleanup function: stop monitor when benchmark exits +cleanup() { + if [[ -n "$MONITOR_PID" ]] && kill -0 "$MONITOR_PID" 2>/dev/null; then + kill "$MONITOR_PID" 2>/dev/null || true + wait "$MONITOR_PID" 2>/dev/null || true + fi +} +trap cleanup EXIT + +# Wait for benchmark to finish +wait "$JAVA_PID" +BENCH_EXIT=$? 
+ +# Collect final system snapshot +ss -s > "${OUTPUT_DIR}/ss-summary.txt" 2>/dev/null || true + +echo "Results in: $OUTPUT_DIR" +exit $BENCH_EXIT diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/setup-benchmark-vm.sh b/sdk/cosmos/azure-cosmos-benchmark/scripts/setup-benchmark-vm.sh new file mode 100644 index 000000000000..371cb9bc72df --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/scripts/setup-benchmark-vm.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# setup-benchmark-vm.sh — Run once after VM creation to install dependencies +# See §6.3 of the test plan. + +set -euo pipefail + +echo "=== Setting up benchmark VM ===" + +# JDK + networking tools +sudo apt-get update && sudo apt-get install -y openjdk-21-jdk git net-tools iproute2 sysstat procps tmux + +# Maven 3.9+ (Ubuntu 22.04 apt provides 3.6.3 which is too old for SDK plugins) +wget -q https://dlcdn.apache.org/maven/maven-3/3.9.12/binaries/apache-maven-3.9.12-bin.tar.gz -O /tmp/maven.tar.gz +sudo tar -xzf /tmp/maven.tar.gz -C /opt/ +sudo ln -sf /opt/apache-maven-3.9.12/bin/mvn /usr/local/bin/mvn +export PATH=/opt/apache-maven-3.9.12/bin:$PATH + +# Async-profiler +wget -qO /tmp/async-profiler.tar.gz \ + https://github.com/async-profiler/async-profiler/releases/download/v3.0/async-profiler-3.0-linux-x64.tar.gz +sudo tar -xzf /tmp/async-profiler.tar.gz -C /opt/ +echo 'export PATH=$PATH:/opt/async-profiler-3.0-linux-x64/bin' >> ~/.bashrc + +# Clone SDK +git clone https://github.com/Azure/azure-sdk-for-java.git ~/azure-sdk-for-java +cd ~/azure-sdk-for-java + +# Build benchmark module (must run from sdk/cosmos) +cd sdk/cosmos +mvn -e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true -Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true -pl ,azure-cosmos -am clean install +mvn -e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true -Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true -pl ,azure-cosmos-test clean install +mvn -e -DskipTests -Dgpg.skip 
-Dmaven.javadoc.skip=true -Dcodesnippet.skip=true -Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true -pl ,azure-cosmos-encryption clean install +mvn -e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true -Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true -pl ,azure-cosmos-benchmark clean package -P package-assembly + +echo "=== Setup complete ===" +echo "Next: Set APPLICATIONINSIGHTS_CONNECTION_STRING and copy tenants.json from test-setup/ to the VM" diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/setup-result-storage.sh b/sdk/cosmos/azure-cosmos-benchmark/scripts/setup-result-storage.sh new file mode 100644 index 000000000000..69faaa231418 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/scripts/setup-result-storage.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# setup-result-storage.sh — Create Cosmos DB containers for benchmark results +# See §9.2.3 of the test plan. + +set -euo pipefail + +RESOURCE_GROUP="${1:-rg-cosmos-benchmark}" +ACCOUNT_NAME="${2:-cosmos-bench-results}" +LOCATION="${3:-eastus}" + +echo "=== Creating result storage ===" +echo " RG: $RESOURCE_GROUP" +echo " Account: $ACCOUNT_NAME" +echo " Location: $LOCATION" + +az cosmosdb create --name "$ACCOUNT_NAME" --resource-group "$RESOURCE_GROUP" \ + --default-consistency-level Session --locations regionName="$LOCATION" failoverPriority=0 + +az cosmosdb sql database create --account-name "$ACCOUNT_NAME" \ + --resource-group "$RESOURCE_GROUP" --name benchresults + +az cosmosdb sql container create --account-name "$ACCOUNT_NAME" \ + --resource-group "$RESOURCE_GROUP" --database-name benchresults \ + --name runs --partition-key-path /scenario --throughput 400 + +az cosmosdb sql container create --account-name "$ACCOUNT_NAME" \ + --resource-group "$RESOURCE_GROUP" --database-name benchresults \ + --name snapshots --partition-key-path /testRunId --throughput 400 --default-ttl 2592000 + +echo "" +echo "=== Result storage ready ===" +echo " export 
RESULT_COSMOS_ENDPOINT=$(az cosmosdb show -n $ACCOUNT_NAME -g $RESOURCE_GROUP --query documentEndpoint -o tsv)" +echo " export RESULT_COSMOS_KEY=$(az cosmosdb keys list -n $ACCOUNT_NAME -g $RESOURCE_GROUP --query primaryMasterKey -o tsv)" diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/trigger-benchmark.sh b/sdk/cosmos/azure-cosmos-benchmark/scripts/trigger-benchmark.sh new file mode 100644 index 000000000000..48660ee16c87 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/scripts/trigger-benchmark.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# trigger-benchmark.sh — Checkout a branch/PR, build, and run benchmark +# +# Usage: +# ./trigger-benchmark.sh --branch --scenario --tenants [options] +# ./trigger-benchmark.sh --pr --scenario --tenants [options] +# ./trigger-benchmark.sh --compare --scenario --tenants + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SDK_DIR="${SDK_DIR:-$(cd "$SCRIPT_DIR/../../.." && pwd)}" +BRANCH="" +PR_NUMBER="" +COMPARE_A="" +COMPARE_B="" +SCENARIO="SCALING" +TENANTS_FILE="tenants.json" +RESULT_SINK="CSV" +SKIP_BUILD=false +EXTRA_ARGS="" + +while [[ $# -gt 0 ]]; do + case $1 in + --branch) BRANCH="$2"; shift ;; + --pr) PR_NUMBER="$2"; shift ;; + --compare) COMPARE_A="$2"; COMPARE_B="$3"; shift 2 ;; + --scenario) SCENARIO="$2"; shift ;; + --tenants) TENANTS_FILE="$2"; shift ;; + --result-sink) RESULT_SINK="$2"; shift ;; + --sdk-dir) SDK_DIR="$2"; shift ;; + --skip-build) SKIP_BUILD=true ;; + *) EXTRA_ARGS="$EXTRA_ARGS $1" ;; + esac + shift +done + +build_and_run() { + local ref="$1" + local label="$2" + + echo "" + echo "════════════════════════════════════════════════════════" + echo " Building and running: $label ($ref)" + echo "════════════════════════════════════════════════════════" + + cd "$SDK_DIR" + + # Checkout + if [[ "$ref" =~ ^[0-9]+$ ]]; then + echo "Fetching PR #${ref}..." 
+ git fetch origin pull/${ref}/head:pr-${ref} + git checkout pr-${ref} + else + echo "Checking out branch: $ref" + git fetch origin "$ref" + git checkout "$ref" + git pull origin "$ref" 2>/dev/null || true + fi + + COMMIT_ID=$(git rev-parse --short HEAD) + BRANCH_NAME=$(git rev-parse --abbrev-ref HEAD) + echo " Commit: $COMMIT_ID" + + # Build + if [[ "$SKIP_BUILD" == "false" ]]; then + echo "Building azure-cosmos + benchmark module..." + mvn install -pl sdk/cosmos/azure-cosmos -am -DskipTests -q + mvn package -pl sdk/cosmos/azure-cosmos-benchmark -DskipTests -q + echo "Build complete." + fi + + # Run + local OUTPUT_DIR="./results/$(date +%Y%m%dT%H%M%S)-${label}-${SCENARIO}" + "$SCRIPT_DIR/run-benchmark.sh" "$SCENARIO" "$TENANTS_FILE" "$OUTPUT_DIR" \ + --branch "$BRANCH_NAME" \ + ${PR_NUMBER:+--pr "$PR_NUMBER"} \ + --result-sink "$RESULT_SINK" \ + $EXTRA_ARGS + + echo " Results: $OUTPUT_DIR" + echo "$OUTPUT_DIR" >> .last-benchmark-runs +} + +if [[ -n "$COMPARE_A" && -n "$COMPARE_B" ]]; then + build_and_run "$COMPARE_A" "before" + build_and_run "$COMPARE_B" "after" + echo "" + echo "════════════════════════════════════════════════════════" + echo " Both runs complete. 
Compare results:" + tail -2 .last-benchmark-runs + echo "════════════════════════════════════════════════════════" +elif [[ -n "$PR_NUMBER" ]]; then + build_and_run "$PR_NUMBER" "pr-${PR_NUMBER}" +elif [[ -n "$BRANCH" ]]; then + build_and_run "$BRANCH" "$BRANCH" +else + echo "Usage: $0 --branch | --pr | --compare " + exit 1 +fi diff --git a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkConfig.java b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkConfig.java index a1e564e2ef85..484bb3b6128a 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkConfig.java +++ b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkConfig.java @@ -3,8 +3,6 @@ package com.azure.cosmos.benchmark; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,6 +35,8 @@ public class BenchmarkConfig { // -- Reporting -- private String reportingDirectory; + private String graphiteEndpoint; + private int graphiteEndpointPort = 2003; private int printingInterval = 10; private String resultUploadEndpoint; private String resultUploadKey; @@ -48,11 +48,6 @@ public class BenchmarkConfig { private String branchName = ""; private String commitId = ""; - // -- JVM-global system properties (apply to all tenants, set once at startup) -- - private boolean isPartitionLevelCircuitBreakerEnabled = true; - private boolean isPerPartitionAutomaticFailoverRequired = true; - private int minConnectionPoolSizePerEndpoint = 0; - // -- Tenants (each carries its full effective config) -- private List tenantWorkloads = Collections.emptyList(); @@ -70,13 +65,8 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce config.suppressCleanup = cfg.isSuppressCleanup(); if (config.cycles > 1) { - long configuredSettleTimeMs = 
cfg.getSettleTimeMs(); - // Only apply the default settle time when the configuration uses the sentinel -1. - // An explicit value (including 0 to disable settling) should be respected. - config.settleTimeMs = (configuredSettleTimeMs == -1) - ? DEFAULT_SETTLE_TIME_MS - : configuredSettleTimeMs; - config.suppressCleanup = true; // suppress container/database cleanup + config.settleTimeMs = Math.max(DEFAULT_SETTLE_TIME_MS, cfg.getSettleTimeMs()); + config.suppressCleanup = true; // suppress container/database cleanup } config.gcBetweenCycles = cfg.isGcBetweenCycles(); @@ -85,6 +75,10 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce // Reporting config.reportingDirectory = cfg.getReportingDirectory() != null ? cfg.getReportingDirectory().getPath() : null; + if (cfg.getGraphiteEndpoint() != null) { + config.graphiteEndpoint = cfg.getGraphiteEndpoint(); + config.graphiteEndpointPort = cfg.getGraphiteEndpointPort(); + } config.printingInterval = cfg.getPrintingInterval(); config.resultUploadEndpoint = cfg.getServiceEndpointForRunResultsUploadAccount(); config.resultUploadKey = cfg.getMasterKeyForRunResultsUploadAccount(); @@ -103,18 +97,10 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce logger.info("Loading tenant configs from {}. 
" + "Workload parameters from tenants.json will take priority over CLI args.", tenantsFile); config.tenantWorkloads = TenantWorkloadConfig.parseTenantsFile(new File(tenantsFile)); - - // Extract JVM-global system properties from globalDefaults - config.loadGlobalSystemPropertiesFromTenantsFile(new File(tenantsFile)); } else { // Single tenant from CLI args - use fromConfiguration() to copy ALL fields config.tenantWorkloads = Collections.singletonList( TenantWorkloadConfig.fromConfiguration(cfg)); - - // JVM-global system properties from CLI - config.isPartitionLevelCircuitBreakerEnabled = cfg.isPartitionLevelCircuitBreakerEnabled(); - config.isPerPartitionAutomaticFailoverRequired = cfg.isPerPartitionAutomaticFailoverRequired(); - config.minConnectionPoolSizePerEndpoint = cfg.getMinConnectionPoolSizePerEndpoint(); } return config; @@ -129,6 +115,8 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce public boolean isEnableJvmStats() { return enableJvmStats; } public String getReportingDirectory() { return reportingDirectory; } + public String getGraphiteEndpoint() { return graphiteEndpoint; } + public int getGraphiteEndpointPort() { return graphiteEndpointPort; } public int getPrintingInterval() { return printingInterval; } public String getResultUploadEndpoint() { return resultUploadEndpoint; } public String getResultUploadKey() { return resultUploadKey; } @@ -139,47 +127,14 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce public String getBranchName() { return branchName; } public String getCommitId() { return commitId; } - public boolean isPartitionLevelCircuitBreakerEnabled() { return isPartitionLevelCircuitBreakerEnabled; } - public boolean isPerPartitionAutomaticFailoverRequired() { return isPerPartitionAutomaticFailoverRequired; } - public int getMinConnectionPoolSizePerEndpoint() { return minConnectionPoolSizePerEndpoint; } - public List getTenantWorkloads() { return tenantWorkloads; } 
@Override public String toString() { return String.format( "BenchmarkConfig{cycles=%d, settleTimeMs=%d, suppressCleanup=%s, " + - "gcBetweenCycles=%s, tenants=%d, reportingDirectory=%s, " + - "circuitBreaker=%s, ppaf=%s, minConnPoolSize=%d}", + "gcBetweenCycles=%s, tenants=%d, reportingDirectory=%s}", cycles, settleTimeMs, suppressCleanup, gcBetweenCycles, - tenantWorkloads.size(), reportingDirectory, - isPartitionLevelCircuitBreakerEnabled, isPerPartitionAutomaticFailoverRequired, - minConnectionPoolSizePerEndpoint); - } - - /** - * Reads JVM-global system properties from the globalDefaults section of a tenants.json file. - * These properties are JVM-wide and cannot vary per tenant. - */ - private void loadGlobalSystemPropertiesFromTenantsFile(File tenantsFile) throws IOException { - ObjectMapper mapper = new ObjectMapper(); - JsonNode root = mapper.readTree(tenantsFile); - JsonNode defaults = root.get("globalDefaults"); - if (defaults == null || !defaults.isObject()) { - return; - } - - if (defaults.has("isPartitionLevelCircuitBreakerEnabled")) { - isPartitionLevelCircuitBreakerEnabled = - Boolean.parseBoolean(defaults.get("isPartitionLevelCircuitBreakerEnabled").asText()); - } - if (defaults.has("isPerPartitionAutomaticFailoverRequired")) { - isPerPartitionAutomaticFailoverRequired = - Boolean.parseBoolean(defaults.get("isPerPartitionAutomaticFailoverRequired").asText()); - } - if (defaults.has("minConnectionPoolSizePerEndpoint")) { - minConnectionPoolSizePerEndpoint = - Integer.parseInt(defaults.get("minConnectionPoolSizePerEndpoint").asText()); - } + tenantWorkloads.size(), reportingDirectory); } } diff --git a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java index e8f840f0c481..f266fedc11d8 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java 
+++ b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java @@ -6,6 +6,9 @@ import com.codahale.metrics.ConsoleReporter; import com.codahale.metrics.CsvReporter; import com.codahale.metrics.ScheduledReporter; +import com.codahale.metrics.graphite.Graphite; +import com.codahale.metrics.graphite.GraphiteReporter; +import com.codahale.metrics.MetricFilter; import com.azure.cosmos.CosmosClient; import com.azure.cosmos.CosmosClientBuilder; import com.codahale.metrics.MetricRegistry; @@ -17,6 +20,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.net.InetSocketAddress; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -29,7 +33,6 @@ import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; /** * Benchmark orchestrator. Sets up infrastructure (metrics, reporters, system properties), @@ -54,7 +57,7 @@ public void run(BenchmarkConfig config) throws Exception { return; } - setGlobalSystemProperties(config); + setGlobalSystemProperties(config.getTenantWorkloads().get(0)); // Set up shared metric registry MetricRegistry registry = new MetricRegistry(); @@ -69,9 +72,20 @@ public void run(BenchmarkConfig config) throws Exception { // Prepare all tenants (inject shared state, set defaults) prepareTenants(config); - // Reporter selection: CSV > Console + // Reporter selection: Graphite > CSV > Console (same pattern as original AsyncBenchmark) ScheduledReporter reporter; - if (config.getReportingDirectory() != null) { + if (config.getGraphiteEndpoint() != null) { + Graphite graphite = new Graphite(new InetSocketAddress( + config.getGraphiteEndpoint(), + config.getGraphiteEndpointPort())); + reporter = GraphiteReporter.forRegistry(registry) + .convertDurationsTo(TimeUnit.MILLISECONDS) + .convertRatesTo(TimeUnit.SECONDS) + .filter(MetricFilter.ALL) + .build(graphite); + 
logger.info("Graphite reporter started -> {}:{}", + config.getGraphiteEndpoint(), config.getGraphiteEndpointPort()); + } else if (config.getReportingDirectory() != null) { Path metricsDir = Paths.get(config.getReportingDirectory(), "metrics"); Files.createDirectories(metricsDir); reporter = CsvReporter.forRegistry(registry) @@ -153,65 +167,41 @@ private void runLifecycleLoop(BenchmarkConfig config, MetricRegistry registry, logger.info("Starting benchmark: {} cycles x {} tenants", totalCycles, tenants.size()); long startTime = System.currentTimeMillis(); - AtomicInteger threadCounter = new AtomicInteger(0); - ExecutorService executor = Executors.newFixedThreadPool(tenants.size(), r -> { - Thread t = new Thread(r, "tenant-worker-" + threadCounter.getAndIncrement()); - t.setDaemon(false); - return t; - }); - - try { - for (int cycle = 1; cycle <= totalCycles; cycle++) { - reporter.report(); - logger.info("[LIFECYCLE] CYCLE_START cycle={} timestamp={}", cycle, Instant.now()); - - // 1. Create clients - List> benchmarks = createBenchmarks(config, registry); - reporter.report(); - logger.info("[LIFECYCLE] POST_CREATE cycle={} clients={} timestamp={}", - cycle, benchmarks.size(), Instant.now()); - - // 2. Run workload in parallel - runWorkload(benchmarks, cycle, executor); - reporter.report(); - logger.info("[LIFECYCLE] POST_WORKLOAD cycle={} timestamp={}", cycle, Instant.now()); - - // 3. Close all clients - shutdownBenchmarks(benchmarks, cycle); - reporter.report(); - logger.info("[LIFECYCLE] POST_CLOSE cycle={} timestamp={}", cycle, Instant.now()); - - // 4. 
Settle - if (config.getSettleTimeMs() > 0) { - logger.info(" Settling for {}ms...", config.getSettleTimeMs()); - long halfSettle = config.getSettleTimeMs() / 2; - Thread.sleep(halfSettle); - if (config.isGcBetweenCycles()) { - System.gc(); - } - Thread.sleep(config.getSettleTimeMs() - halfSettle); - if (config.isGcBetweenCycles()) { - System.gc(); - } + for (int cycle = 1; cycle <= totalCycles; cycle++) { + reporter.report(); + logger.info("[LIFECYCLE] CYCLE_START cycle={} timestamp={}", cycle, Instant.now()); + + // 1. Create clients + List> benchmarks = createBenchmarks(config, registry); + reporter.report(); + logger.info("[LIFECYCLE] POST_CREATE cycle={} clients={} timestamp={}", + cycle, benchmarks.size(), Instant.now()); + + // 2. Run workload in parallel + runWorkload(benchmarks, cycle); + reporter.report(); + logger.info("[LIFECYCLE] POST_WORKLOAD cycle={} timestamp={}", cycle, Instant.now()); + + // 3. Close all clients + shutdownBenchmarks(benchmarks, cycle); + reporter.report(); + logger.info("[LIFECYCLE] POST_CLOSE cycle={} timestamp={}", cycle, Instant.now()); + + // 4. 
Settle + if (config.getSettleTimeMs() > 0) { + logger.info(" Settling for {}ms...", config.getSettleTimeMs()); + long halfSettle = config.getSettleTimeMs() / 2; + Thread.sleep(halfSettle); + if (config.isGcBetweenCycles()) { + System.gc(); } - reporter.report(); - logger.info("[LIFECYCLE] POST_SETTLE cycle={} timestamp={}", cycle, Instant.now()); - } - } finally { - executor.shutdown(); - try { - if (!executor.awaitTermination(60, TimeUnit.SECONDS)) { - logger.warn("Executor did not terminate within the timeout"); - executor.shutdownNow(); - if (!executor.awaitTermination(60, TimeUnit.SECONDS)) { - logger.error("Executor did not terminate after shutdownNow"); - } + Thread.sleep(config.getSettleTimeMs() - halfSettle); + if (config.isGcBetweenCycles()) { + System.gc(); } - } catch (InterruptedException e) { - logger.warn("Interrupted while awaiting executor termination", e); - executor.shutdownNow(); - Thread.currentThread().interrupt(); } + reporter.report(); + logger.info("[LIFECYCLE] POST_SETTLE cycle={} timestamp={}", cycle, Instant.now()); } long durationSec = (System.currentTimeMillis() - startTime) / 1000; @@ -227,7 +217,12 @@ private List> createBenchmarks(BenchmarkConfig config, MetricR return benchmarks; } - private void runWorkload(List> benchmarks, int cycle, ExecutorService executor) throws Exception { + private void runWorkload(List> benchmarks, int cycle) throws Exception { + ExecutorService executor = Executors.newFixedThreadPool(benchmarks.size(), r -> { + Thread t = new Thread(r, "tenant-worker"); + t.setDaemon(false); + return t; + }); List> futures = new ArrayList<>(); final int currentCycle = cycle; for (AsyncBenchmark benchmark : benchmarks) { @@ -242,6 +237,7 @@ private void runWorkload(List> benchmarks, int cycle, Executor for (Future f : futures) { f.get(); } + executor.shutdown(); } private void shutdownBenchmarks(List> benchmarks, int cycle) { @@ -334,6 +330,15 @@ private MeterRegistry buildCosmosMicrometerRegistry() { return 
tempCfg.getAzureMonitorMeterRegistry(); } + String graphiteAddress = System.getProperty("azure.cosmos.monitoring.graphite.serviceAddress", + StringUtils.defaultString( + com.google.common.base.Strings.emptyToNull( + System.getenv("GRAPHITE_SERVICE_ADDRESS")), null)); + if (graphiteAddress != null) { + Configuration tempCfg = new Configuration(); + return tempCfg.getGraphiteMeterRegistry(); + } + return null; } @@ -350,8 +355,10 @@ private void clearGlobalSystemProperties() { System.clearProperty("COSMOS.MIN_CONNECTION_POOL_SIZE_PER_ENDPOINT"); } - private void setGlobalSystemProperties(BenchmarkConfig config) { - if (config.isPartitionLevelCircuitBreakerEnabled()) { + private void setGlobalSystemProperties(TenantWorkloadConfig firstTenant) { + String circuitBreakerEnabled = firstTenant.getIsPartitionLevelCircuitBreakerEnabled(); + if (circuitBreakerEnabled == null) circuitBreakerEnabled = "true"; + if (Boolean.parseBoolean(circuitBreakerEnabled)) { System.setProperty("COSMOS.PARTITION_LEVEL_CIRCUIT_BREAKER_CONFIG", "{\"isPartitionLevelCircuitBreakerEnabled\": true, " + "\"circuitBreakerType\": \"CONSECUTIVE_EXCEPTION_COUNT_BASED\"," @@ -361,21 +368,21 @@ private void setGlobalSystemProperties(BenchmarkConfig config) { System.setProperty("COSMOS.ALLOWED_PARTITION_UNAVAILABILITY_DURATION_IN_SECONDS", "30"); } - if (config.isPerPartitionAutomaticFailoverRequired()) { + String ppafEnabled = firstTenant.getIsPerPartitionAutomaticFailoverRequired(); + if (ppafEnabled == null) ppafEnabled = "true"; + if (Boolean.parseBoolean(ppafEnabled)) { System.setProperty("COSMOS.IS_PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED", "true"); System.setProperty("COSMOS.IS_SESSION_TOKEN_FALSE_PROGRESS_MERGE_ENABLED", "true"); System.setProperty("COSMOS.E2E_TIMEOUT_ERROR_HIT_THRESHOLD_FOR_PPAF", "5"); System.setProperty("COSMOS.E2E_TIMEOUT_ERROR_HIT_TIME_WINDOW_IN_SECONDS_FOR_PPAF", "120"); } - if (config.getMinConnectionPoolSizePerEndpoint() >= 1) { + if 
(firstTenant.getMinConnectionPoolSizePerEndpoint() >= 1) { System.setProperty("COSMOS.MIN_CONNECTION_POOL_SIZE_PER_ENDPOINT", - String.valueOf(config.getMinConnectionPoolSizePerEndpoint())); + String.valueOf(firstTenant.getMinConnectionPoolSizePerEndpoint())); } logger.info("Global system properties set (circuit breaker: {}, PPAF: {}, minConnPoolSize: {})", - config.isPartitionLevelCircuitBreakerEnabled(), - config.isPerPartitionAutomaticFailoverRequired(), - config.getMinConnectionPoolSizePerEndpoint()); + circuitBreakerEnabled, ppafEnabled, firstTenant.getMinConnectionPoolSizePerEndpoint()); } } diff --git a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/TenantWorkloadConfig.java b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/TenantWorkloadConfig.java index c28c2da81173..48fc05a993e6 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/TenantWorkloadConfig.java +++ b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/TenantWorkloadConfig.java @@ -122,12 +122,6 @@ public class TenantWorkloadConfig { @JsonProperty("nonPointOperationLatencyThresholdMs") private Integer nonPointOperationLatencyThresholdMs; - /** - * Per-client flag: controls region-scoped session capturing on the CosmosClientBuilder. - * Unlike JVM-global system properties (circuit breaker, PPAF, minConnectionPoolSize), - * this is set per-client via {@code CosmosClientBuilderAccessor.setRegionScopedSessionCapturingEnabled} - * and can genuinely differ per tenant. 
- */ @JsonProperty("isRegionScopedSessionContainerEnabled") private Boolean isRegionScopedSessionContainerEnabled; @@ -152,6 +146,9 @@ public class TenantWorkloadConfig { @JsonProperty("aggressiveWarmupDuration") private String aggressiveWarmupDuration; + @JsonProperty("minConnectionPoolSizePerEndpoint") + private Integer minConnectionPoolSizePerEndpoint; + // ======== Connection params ======== @JsonProperty("connectionMode") @@ -179,6 +176,14 @@ public class TenantWorkloadConfig { /** Cosmos SDK micrometer registry (set by orchestrator, not from JSON). */ private transient MeterRegistry cosmosMicrometerRegistry; + // ======== System property flags ======== + + @JsonProperty("isPartitionLevelCircuitBreakerEnabled") + private String isPartitionLevelCircuitBreakerEnabled; + + @JsonProperty("isPerPartitionAutomaticFailoverRequired") + private String isPerPartitionAutomaticFailoverRequired; + public TenantWorkloadConfig() {} @@ -265,6 +270,10 @@ public Duration getAggressiveWarmupDuration() { return Duration.parse(aggressiveWarmupDuration); } + public int getMinConnectionPoolSizePerEndpoint() { + return minConnectionPoolSizePerEndpoint != null ? minConnectionPoolSizePerEndpoint : 0; + } + public ConnectionMode getConnectionMode() { if (connectionMode == null) return ConnectionMode.DIRECT; return ConnectionMode.valueOf(connectionMode.toUpperCase()); @@ -291,6 +300,9 @@ public List getPreferredRegionsList() { public boolean isSuppressCleanup() { return suppressCleanup; } public MeterRegistry getCosmosMicrometerRegistry() { return cosmosMicrometerRegistry; } + public String getIsPartitionLevelCircuitBreakerEnabled() { return isPartitionLevelCircuitBreakerEnabled; } + public String getIsPerPartitionAutomaticFailoverRequired() { return isPerPartitionAutomaticFailoverRequired; } + /** * Builds a TokenCredential for managed identity authentication. 
*/ @@ -413,6 +425,8 @@ private void applyField(String key, String value, boolean overwrite) { if (overwrite || proactiveConnectionRegionsCount == null) proactiveConnectionRegionsCount = Integer.parseInt(value); break; case "aggressiveWarmupDuration": if (overwrite || aggressiveWarmupDuration == null) aggressiveWarmupDuration = value; break; + case "minConnectionPoolSizePerEndpoint": + if (overwrite || minConnectionPoolSizePerEndpoint == null) minConnectionPoolSizePerEndpoint = Integer.parseInt(value); break; case "connectionMode": if (overwrite || connectionMode == null) connectionMode = value; break; case "consistencyLevel": @@ -423,12 +437,10 @@ private void applyField(String key, String value, boolean overwrite) { if (overwrite || preferredRegionsList == null) preferredRegionsList = value; break; case "manageDatabase": if (overwrite || manageDatabase == null) manageDatabase = Boolean.parseBoolean(value); break; - // JVM-global properties (minConnectionPoolSizePerEndpoint, isPartitionLevelCircuitBreakerEnabled, - // isPerPartitionAutomaticFailoverRequired) are handled in BenchmarkConfig, not per-tenant. - case "minConnectionPoolSizePerEndpoint": case "isPartitionLevelCircuitBreakerEnabled": + if (overwrite || isPartitionLevelCircuitBreakerEnabled == null) isPartitionLevelCircuitBreakerEnabled = value; break; case "isPerPartitionAutomaticFailoverRequired": - break; + if (overwrite || isPerPartitionAutomaticFailoverRequired == null) isPerPartitionAutomaticFailoverRequired = value; break; default: logger.debug("Unknown config key '{}' (value: {})", key, value); break; @@ -500,14 +512,15 @@ public static TenantWorkloadConfig fromConfiguration(Configuration cfg) { if (cfg.getAggressiveWarmupDuration() != null) { t.aggressiveWarmupDuration = cfg.getAggressiveWarmupDuration().toString(); } + t.minConnectionPoolSizePerEndpoint = cfg.getMinConnectionPoolSizePerEndpoint(); // Connection t.preferredRegionsList = cfg.getPreferredRegionsList() != null ? 
String.join(",", cfg.getPreferredRegionsList()) : null; - // Note: JVM-global system properties (isPartitionLevelCircuitBreakerEnabled, - // isPerPartitionAutomaticFailoverRequired, minConnectionPoolSizePerEndpoint) - // are handled in BenchmarkConfig, not per-tenant. + // System property flags + t.isPartitionLevelCircuitBreakerEnabled = String.valueOf(cfg.isPartitionLevelCircuitBreakerEnabled()); + t.isPerPartitionAutomaticFailoverRequired = String.valueOf(cfg.isPerPartitionAutomaticFailoverRequired()); return t; } @@ -539,7 +552,6 @@ public static List parseTenantsFile(File tenantsFile) thro for (JsonNode tenantNode : tenantsNode) { TenantWorkloadConfig tenant = OBJECT_MAPPER.treeToValue(tenantNode, TenantWorkloadConfig.class); tenant.applyMap(globalDefaults, false); - validateTenantConfig(tenant); tenants.add(tenant); } } @@ -547,29 +559,4 @@ public static List parseTenantsFile(File tenantsFile) thro logger.info("Parsed {} tenants from {}", tenants.size(), tenantsFile.getName()); return tenants; } - - private static void validateTenantConfig(TenantWorkloadConfig tenant) { - List missing = new ArrayList<>(); - if (isNullOrEmpty(tenant.getServiceEndpoint())) { - missing.add("serviceEndpoint"); - } - if (isNullOrEmpty(tenant.getDatabaseId())) { - missing.add("databaseId"); - } - if (isNullOrEmpty(tenant.getContainerId())) { - missing.add("containerId"); - } - if (!tenant.isManagedIdentityRequired() - && isNullOrEmpty(tenant.getMasterKey())) { - missing.add("masterKey (required when isManagedIdentityRequired is not true)"); - } - if (!missing.isEmpty()) { - throw new IllegalArgumentException( - "Tenant '" + tenant.getId() + "' is missing required configuration: " + missing); - } - } - - private static boolean isNullOrEmpty(String value) { - return value == null || value.isEmpty(); - } } diff --git a/sdk/cosmos/azure-cosmos-benchmark/test-results/BENCHMARK_RESULTS.md b/sdk/cosmos/azure-cosmos-benchmark/test-results/BENCHMARK_RESULTS.md new file mode 100644 index 
000000000000..baaeae94e16c --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/test-results/BENCHMARK_RESULTS.md @@ -0,0 +1,154 @@ +# Multi-Tenancy Benchmark Results Tracker + +## Summary + +| Run ID | Date (UTC) | Scenario | Branch | Commit | Threads | Verdict | +|--------|-----------|----------|--------|--------|---------|---------| +| R6 | 2026-02-20 20:01 | CHURN 3-cycle (90s settle) | fix-a1-telemetry-close | 8bc3caa168e | 0 growth | PASS - no leak | + +--- + +## R6 — Multi-cycle CHURN with 90s settle (2026-02-20) — CONCLUSIVE + +- **Branch**: `fix-a1-telemetry-close` (A1/A2 fix reverted, benchmark enhancements kept) +- **Commit**: `8bc3caa168e` +- **VM**: `benchuser@4.154.169.45` +- **Results dir**: `results/20260220T200151-CHURN-90s-settle/` +- **Settle time**: 90s per cycle (> 60s BoundedElastic evictor TTL) + +### Per-Cycle Results + +| Cycle | Threads | Thread delta | Heap (MB) | FDs | +|-------|---------|-------------|-----------|-----| +| 1 | 63 | +57 | 28 | 58 | +| 2 | 63 | +57 | 38 | 57 | +| 3 | 63 | +57 | 38 | 57 | + +### Leak Verdict + +| Metric | Value | Result | +|--------|-------|--------| +| Thread growth (cycle 1 to 3) | **0** | PASS | +| `transport-response-bounded-elastic` after settle | **0** (fully evicted) | PASS | +| Heap stable across cycles 2-3 | 38 MB = 38 MB | PASS | +| FDs stable | 57 = 57 | PASS | + +### Conclusion + +**No thread leak exists.** The thread growth seen in R4/R5 (5s settle) was entirely due to +BoundedElastic evictor TTL timing. With 90s settle (> 60s TTL), all idle workers are +reclaimed and thread count is perfectly stable at 63 across all cycles. + +The `transport-response-bounded-elastic` threads that showed growth in R4 (8 to 17) and R5 +(8 to 37) do NOT appear at all after 90s settle -- they were all temporary workers that +the evictor correctly cleaned up. 
+ +--- + + + + +--- + +## Findings & Action Items + +### F1: reactor-http-epoll Threads (Shared LoopResources) + +16 `reactor-http-epoll` threads appear after any Cosmos client usage and persist for the JVM lifetime. +NOT a leak -- Reactor Netty's global default event-loop pool (`LoopResources.DEFAULT`). + +- Default thread count: `Runtime.getRuntime().availableProcessors()` (16 on D16s_v5 VM) +- Shared by ALL `ReactorNettyClient` instances (Gateway HTTP client, IMDS metadata client) +- Cosmos SDK does NOT customize `LoopResources` -- relies on reactor-netty defaults +- Configurable via system property: `reactor.netty.ioWorkerCount=` + +**Multi-tenancy concern**: 100+ clients sharing 16 event-loop threads could cause contention. + +| Action | Description | Complexity | +|--------|-------------|------------| +| A23 | Benchmark event-loop contention at 100+ clients with varying `ioWorkerCount` | Low | +| A24 | Consider exposing `ioWorkerCount` as a Cosmos SDK config for multi-tenant scenarios | Low | + +### F2: ClientTelemetry Cleanup Opportunities + +`ClientTelemetry` has significant dead code. Still instantiated per-client, creating an +IMDS `HttpClient` pool each time even though metadata is cached statically after first call. + +**Active code (6)**: constructor, `init()`, `recordValue()`, `getClientTelemetryConfig()`, +`isClientMetricsEnabled()`, `getMachineId()` + +**Dead code (7)**: `close()` (was no-op with wrong log message), `blockingGetOrLoadMachineId()`, +`DEFAULT_CLIENT_TELEMETRY_ENABLED`, 4x `TCP_NEW_CHANNEL_LATENCY_*` constants + +**IMDS HttpClient should be static**: `metadataHttpClient` is per-instance but +`azureVmMetaDataSingleton` is static -- only first call does HTTP. All subsequent instances +create unused `ConnectionProvider` objects. 
+ +| Action | Description | Complexity | +|--------|-------------|------------| +| A25 | Make IMDS `metadataHttpClient` a static singleton (lazy-init) | Low | +| A26 | Remove dead code: `blockingGetOrLoadMachineId()`, `TCP_NEW_CHANNEL_LATENCY_*` constants | Low | +| A27 | Fix `close()` log message (says "GlobalEndpointManager closed" -- copy-paste error) | Trivial | + +### F3: transport-response-bounded-elastic Thread Growth + +Only thread group that grows across CHURN cycles (8 after 1 cycle, 17 after 5 with fix, 37 without). + +Reactor `BoundedElasticScheduler` creates workers on-demand, evicts after 60s TTL. +CHURN cycle settle time (5s) is shorter than evictor TTL, so workers accumulate temporarily. + +**CONFIRMED NOT A LEAK (R6)**: With 90s settle (> 60s TTL), thread count is perfectly +stable at 63 across all cycles. All bounded-elastic workers are properly evicted. +The growth seen in R4/R5 was purely timing -- 5s settle was too short for the 60s evictor. + +| Action | Description | Status | +|--------|-------------|--------| +| A28 | Increase CHURN settle time to 60s+ and verify bounded-elastic threads stabilize | DONE (R6) -- CONFIRMED | +| A29 | Investigate if any bounded-elastic schedulers should be disposed on client close | NOT NEEDED -- no leak | + +### F4: A1/A2 Fix Impact Reassessment + +**Original hypothesis**: `ClientTelemetry.close()` no-op leaked IMDS pool threads and +GlobalEndpointManager scheduler threads per client lifecycle. + +**Revised understanding**: + +| Resource | Actual behavior | +|----------|----------------| +| IMDS HttpClient pool | Created per-instance but unused after first client (metadata cached statically). Leaked `ConnectionProvider` wastes minor memory, no FDs or threads. | +| GlobalEndpointManager scheduler | Already properly closed via `RxDocumentClientImpl.close()`. Never broken. | +| reactor-http-epoll threads | Shared `LoopResources.DEFAULT` singleton -- same count (16) fix vs no-fix. 
| +| Thread count diff (80 vs 100) | Entirely in `transport-response-bounded-elastic` (17 vs 37). Likely timing-related. | + +**Conclusion**: A1/A2 fix is correct for code hygiene but has minimal observable impact on +threads, heap, or FDs in CHURN testing. Real multi-tenancy concerns are: +- Event-loop contention at scale (A23) +- Unnecessary per-instance IMDS client creation (A25) +- Bounded-elastic growth during rapid churn (A28/A29) diff --git a/sdk/cosmos/azure-cosmos-benchmark/test-results/README.md b/sdk/cosmos/azure-cosmos-benchmark/test-results/README.md new file mode 100644 index 000000000000..9dee7631785c --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/test-results/README.md @@ -0,0 +1,39 @@ +# Test Results + +This directory stores benchmark results downloaded from the VM. + +Each run is stored in a subdirectory named by run ID: +``` +test-results/ + R2-20260220T055015-CHURN/ + resource_snapshots.csv + benchmark.log + gc.log + git-info.json + thread-dumps/ + heap-dumps/ + R3-20260220T064730-CHURN-prefix/ + ... +``` + +## Downloading Results from VM + +```bash +# Download a specific run: +scp -r benchuser@:~/azure-sdk-for-java/results/ test-results/ + +# Download all results: +scp -r benchuser@:~/azure-sdk-for-java/results/* test-results/ +``` + +## Analyzing Results + +Use the multi-tenancy-benchmark-analyze skill or run: +```bash +# Compare heap dumps: +python3 .github/skills/multi-tenancy-benchmark-heapdump/references/parse_hprof.py \ + --diff test-results//heap-dumps/heap-PRE_CLOSE-*.hprof \ + test-results//heap-dumps/heap-POST_CLOSE-*.hprof --top 20 +``` + +All files in this directory are gitignored. 
diff --git a/sdk/cosmos/azure-cosmos-benchmark/test-setup/README.md b/sdk/cosmos/azure-cosmos-benchmark/test-setup/README.md new file mode 100644 index 000000000000..2081b0ad515c --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/test-setup/README.md @@ -0,0 +1,39 @@ +# Test Setup Files + +This directory contains configuration files needed to run multi-tenancy benchmarks. + +## Files + +| File | Purpose | Checked in? | +|------|---------|-------------| +| `tenants-sample.json` | Template for tenant configuration | Yes (reference) | +| `tenants.json` | Actual tenant config with endpoints and keys | No (gitignored) | +| `clientHostAndKey.txt` | Raw credentials CSV used to generate tenants.json | No (gitignored) | +| `vm-config.env` | VM connection info (IP, user, key path) | No (gitignored) | + +## Setup Steps + +1. Copy `tenants-sample.json` to `tenants.json` +2. Fill in your Cosmos DB account endpoints and master keys +3. Optionally create `clientHostAndKey.txt` in CSV format: + ``` + ,, + ``` + Then use the benchmark setup skill to auto-generate `tenants.json` from it. 
+ +## VM Connection + +After provisioning a benchmark VM, save connection info: +```bash +# Created by provision-benchmark-vm.sh +VM_IP= +VM_USER=benchuser +VM_KEY_PATH=~/.ssh/id_rsa +``` + +## Deploying to VM + +Copy setup files to the VM: +```bash +scp -i $VM_KEY_PATH test-setup/tenants.json $VM_USER@$VM_IP:~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/tenants.json +``` diff --git a/sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants-sample.json b/sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants-sample.json new file mode 100644 index 000000000000..ee3b44d065cb --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants-sample.json @@ -0,0 +1,51 @@ +{ + "globalDefaults": { + "connectionMode": "GATEWAY", + "consistencyLevel": "SESSION", + "concurrency": "20", + "numberOfOperations": "100000", + "operation": "ReadThroughput", + "numberOfPreCreatedDocuments": "1000", + "connectionSharingAcrossClientsEnabled": "false", + "maxConnectionPoolSize": "1000", + "applicationName": "mt-bench" + }, + "tenants": [ + { + "id": "tenant-0", + "serviceEndpoint": "https://account0.documents.azure.com:443/", + "masterKey": "key0==", + "databaseId": "benchdb", + "containerId": "benchcol", + "overrides": { + "concurrency": "50" + } + }, + { + "id": "tenant-1", + "serviceEndpoint": "https://account1.documents.azure.com:443/", + "masterKey": "key1==", + "databaseId": "benchdb", + "containerId": "benchcol" + }, + { + "id": "tenant-2-managed-identity", + "serviceEndpoint": "https://account2.documents.azure.com:443/", + "databaseId": "benchdb", + "containerId": "benchcol", + "overrides": { + "isManagedIdentityRequired": "true", + "aadManagedIdentityClientId": "client-id-for-tenant-2", + "aadTenantId": "tenant-id-for-tenant-2" + } + } + ], + "tenantTemplate": { + "enabled": false, + "count": 100, + "endpointPattern": "https://account{i}.documents.azure.com:443/", + "keyEnvVarPattern": "COSMOS_KEY_{i}", + "databaseId": "benchdb", + "containerIdPattern": 
"benchcol-{i}" + } +} From 4158424d705e23a3764a29ee89f88377d4126127 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Fri, 27 Feb 2026 11:40:25 -0800 Subject: [PATCH 02/22] Add cosmos-benchmark agent with 5 workflow-aligned skills Agent routing file dispatches to 5 skills covering the full benchmark/DR drill lifecycle: - provision: Cosmos DB accounts, App Insights, Azure VMs - setup: JDK/Maven install, repo clone, config generation, build - run: CHURN preset execution, multi-VM parallel, App Insights config - analyze: CSV metrics, run comparison, heap/thread dumps, Kusto export - status: resource health, run overview, App Insights verification Also includes skill-creator utility for authoring new skills. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/agents/cosmos-benchmark.agent.md | 50 +++ .../skills/cosmos-benchmark-analyze/SKILL.md | 196 ++++++++++ .../references/kusto-schema.md | 113 ++++++ .../references/parse_hprof.py | 234 ++++++++++++ .../references/thresholds.md | 49 +++ .../cosmos-benchmark-provision/SKILL.md | 215 +++++++++++ .../references/vm-sizing.md | 22 ++ .github/skills/cosmos-benchmark-run/SKILL.md | 140 +++++++ .../references/presets.md | 29 ++ .../references/scenarios.md | 75 ++++ .../skills/cosmos-benchmark-setup/SKILL.md | 178 +++++++++ .../skills/cosmos-benchmark-status/SKILL.md | 61 +++ .github/skills/skill-creator/LICENSE.txt | 202 ++++++++++ .github/skills/skill-creator/SKILL.md | 357 ++++++++++++++++++ .../references/output-patterns.md | 82 ++++ .../skill-creator/references/workflows.md | 28 ++ .../skill-creator/scripts/init_skill.py | 303 +++++++++++++++ .../skill-creator/scripts/package_skill.py | 110 ++++++ .../skill-creator/scripts/quick_validate.py | 103 +++++ 19 files changed, 2547 insertions(+) create mode 100644 .github/agents/cosmos-benchmark.agent.md create mode 100644 .github/skills/cosmos-benchmark-analyze/SKILL.md create mode 100644 
.github/skills/cosmos-benchmark-analyze/references/kusto-schema.md create mode 100644 .github/skills/cosmos-benchmark-analyze/references/parse_hprof.py create mode 100644 .github/skills/cosmos-benchmark-analyze/references/thresholds.md create mode 100644 .github/skills/cosmos-benchmark-provision/SKILL.md create mode 100644 .github/skills/cosmos-benchmark-provision/references/vm-sizing.md create mode 100644 .github/skills/cosmos-benchmark-run/SKILL.md create mode 100644 .github/skills/cosmos-benchmark-run/references/presets.md create mode 100644 .github/skills/cosmos-benchmark-run/references/scenarios.md create mode 100644 .github/skills/cosmos-benchmark-setup/SKILL.md create mode 100644 .github/skills/cosmos-benchmark-status/SKILL.md create mode 100644 .github/skills/skill-creator/LICENSE.txt create mode 100644 .github/skills/skill-creator/SKILL.md create mode 100644 .github/skills/skill-creator/references/output-patterns.md create mode 100644 .github/skills/skill-creator/references/workflows.md create mode 100644 .github/skills/skill-creator/scripts/init_skill.py create mode 100644 .github/skills/skill-creator/scripts/package_skill.py create mode 100644 .github/skills/skill-creator/scripts/quick_validate.py diff --git a/.github/agents/cosmos-benchmark.agent.md b/.github/agents/cosmos-benchmark.agent.md new file mode 100644 index 000000000000..24e7ec17880e --- /dev/null +++ b/.github/agents/cosmos-benchmark.agent.md @@ -0,0 +1,50 @@ +--- +name: Cosmos Benchmark +description: Cosmos DB benchmark agent — provision infrastructure, set up environments, run benchmarks, and analyze results. Supports both single-tenant and multi-tenant configurations. Use for benchmark/DR drill workflows. +tools: ['readFile', 'listDir', 'runInTerminal', 'search', 'grep', 'fileSearch', 'agent'] +argument-hint: "provision accounts, setup VM, run benchmark, analyze results, or check status" +--- + +# Cosmos Benchmark Agent + +You are a Cosmos DB benchmark specialist. 
You help with the full benchmark/DR drill lifecycle: provisioning infrastructure, setting up environments, running benchmarks, and analyzing results. + +## Routing + +Determine user intent and follow the matching workflow: + +| User wants to... | Skill to load | +|---|---| +| Provision Azure resources (Cosmos accounts, App Insights, VMs) | Read `.github/skills/cosmos-benchmark-provision/SKILL.md` | +| Set up environment (install tools, clone repo, generate config, build) | Read `.github/skills/cosmos-benchmark-setup/SKILL.md` | +| Run a benchmark (checkout branch/PR, select scenario, execute) | Read `.github/skills/cosmos-benchmark-run/SKILL.md` | +| Analyze results (CSV, compare runs, heap/thread dumps, reports, Kusto) | Read `.github/skills/cosmos-benchmark-analyze/SKILL.md` | +| Check status (resources, runs, VM, build, config overview) | Read `.github/skills/cosmos-benchmark-status/SKILL.md` | + +When a skill references files in its `references/` directory, read them from the skill's directory (e.g., `.github/skills/cosmos-benchmark-analyze/references/thresholds.md`). + +## Subagent Usage + +For complex multi-step workflows, use subagents to keep context clean: + +- **Analyze after run**: Spawn a subagent to analyze results so run context doesn't pollute analysis. +- **Parallel analysis**: Spawn parallel subagents for multiple result directories. +- **Provision + setup**: For full DR drill setup, spawn sequential subagents for provision → setup. + +## Benchmark Modes + +The framework supports two modes — the choice is purely configuration: + +- **Single-tenant**: Pass connection details directly via CLI flags +- **Multi-tenant**: Pass `-tenantsFile tenants.json` with multiple account configurations + +Both use the same JAR, orchestrator, and monitoring infrastructure. 
+ +## Workflow Chaining + +After completing one task, suggest the natural next step: + +- After **provision** → suggest **setup** +- After **setup** → suggest **run** +- After **run** → suggest **analyze** +- After **analyze** (if baseline exists) → suggest comparing with previous run diff --git a/.github/skills/cosmos-benchmark-analyze/SKILL.md b/.github/skills/cosmos-benchmark-analyze/SKILL.md new file mode 100644 index 000000000000..f1f1afaaf79f --- /dev/null +++ b/.github/skills/cosmos-benchmark-analyze/SKILL.md @@ -0,0 +1,196 @@ +--- +name: cosmos-benchmark-analyze +description: Analyze Cosmos DB benchmark results — download from VM, parse CSV metrics, compare runs, analyze heap/thread dumps, generate markdown reports, export to Kusto, and query Application Insights. Triggers on "analyze results", "compare runs", "leak check", "did it pass", "heap dump", "thread dump", "generate report", "export to kusto", "regression check", monitor.csv, or result directories. +--- + +# Analyze Benchmark Results + +Comprehensive post-run analysis: download results, CSV metrics, run comparison, heap/thread dumps, reports, and Kusto export. + +## 1. Download Results from VM + +Auto-detect VM connection: +```bash +VM_IP=$(cat .vm-ip); VM_USER=$(cat .vm-user); VM_KEY=$(cat .vm-key) +``` + +Download a run's results: +```bash +scp -i $VM_KEY -r $VM_USER@$VM_IP:~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/ \ + ./results/ +``` + +Each run lives in its own directory. Never overwrite previous results — this enables baseline tracking. + +## 2. CSV Metrics Analysis + +### Workflow + +1. Find `monitor.csv` in the result directory. +2. Parse CSV columns (see `references/thresholds.md` for column definitions). +3. Cross-reference with lifecycle events from the benchmark log file (pattern: `[LIFECYCLE] timestamp=`). +4. 
Extract snapshots: + - **Baseline** = first row after `PRE_CREATE` + - **Peak** = row with highest `heap_used_kb` + - **Final** = last row (after `POST_CLOSE` + settle) +5. Compute: `thread_delta`, `heap_ratio` +6. Apply thresholds from `references/thresholds.md`. + +### Output Format + +``` +📊 Benchmark Results: +Branch: Commit: Scenario: + +HEAP: Baseline=MB Peak=MB After close=MB +THREADS: Baseline= Peak= After close= +FDs: Peak= +GC: Count= Time=ms + +✅/🔴 Thread leak: delta= (threshold: ≤2) +✅/🔴 Memory leak: ratio= (threshold: ≤1.1) + +Overall: ✅ PASSED / 🔴 FAILED +``` + +## 3. Compare Two Runs + +1. Read `monitor.csv` from both directories. +2. Extract baseline, peak, final for each. +3. Read `git-info.json` from each for branch/commit. +4. Compute deltas. + +### Output + +``` +📊 Comparing: + Before: (branch: , commit: ) + After: (branch: , commit: ) + +| Metric | Before | After | Delta | Status | +|---------------------------|--------|--------|---------|--------| +| Threads after close | 218 | 19 | -199 | ✅ | +| Heap after close (MB) | 342 | 134 | -208 | ✅ | +| Heap ratio | 2.67 | 1.05 | -1.62 | ✅ | +| Peak FDs | 4428 | 4312 | -116 | ✅ | +| GC count | 847 | 812 | -35 | ✅ | + +Overall: ✅ Fix validated / 🔴 Regression detected +``` + +Status: ✅ improved, 🟡 marginal (<10%), 🔴 regressed (>10% worse) + +## 4. 
Heap Dump Analysis + +### Locate heap dumps + +``` +results//heap-dumps/heap-PRE_CLOSE-*.hprof +results//heap-dumps/heap-POST_CLOSE-*.hprof +``` + +### Option A: HeapDumpAnalyzer (built into benchmark JAR) + +```bash +java -cp azure-cosmos-benchmark-*-jar-with-dependencies.jar \ + com.azure.cosmos.benchmark.HeapDumpAnalyzer +``` + +### Option B: Python hprof parser (lightweight, no deps) + +```bash +python3 references/parse_hprof.py --top 30 +python3 references/parse_hprof.py --diff --top 20 +``` + +### Option C: YourKit (detailed, requires license) + +If YourKit is installed on the VM: + +```bash +# Open snapshot in YourKit CLI +/bin/profiler.sh -export -snapshot= -csv -outdir= +``` + +Or analyze interactively via YourKit GUI by downloading the .hprof files locally. + +### Interpret results + +Look for classes with more instances/bytes after close — these indicate objects not released during `CosmosAsyncClient.close()`. Common suspects: Reactor schedulers, Netty connection pools, background threads, unbounded caches. + +## 5. 
Thread Dump Analysis

### Capture thread dump during benchmark

```bash
# On the VM, while benchmark is running:
jcmd <pid> Thread.print > results/<run-id>/thread-dump-$(date +%s).txt

# Or using jstack:
jstack <pid> > results/<run-id>/thread-dump-$(date +%s).txt
```

Capture multiple dumps at intervals to identify stuck or leaked threads:
```bash
for i in 1 2 3; do jcmd <pid> Thread.print > results/<run-id>/thread-dump-$i.txt; sleep 30; done
```

### Analyze thread dumps

Look for:
- **Thread count growth**: Compare total thread counts across dumps
- **Stuck threads**: Same thread in same stack across multiple dumps
- **Leaked pools**: Thread pool threads that should have been shut down after client close
- **Daemon vs non-daemon**: Non-daemon threads prevent JVM exit

Key thread name patterns in Cosmos SDK:
- `cosmos-parallel-*` — SDK parallel scheduler
- `reactor-http-*` — Reactor Netty event loop
- `boundedElastic-*` — Reactor bounded elastic pool
- `globalEndpointManager-*` — Cosmos endpoint refresh

## 6. Generate Markdown Report

Produce a markdown report with embedded charts (as inline base64 images or Plotly HTML):

### Using generate-dashboard.py

```bash
python3 scripts/generate-dashboard.py \
    <result-dir>/metrics \
    <result-dir>/benchmark.log \
    <result-dir>/report.html \
    <result-dir>/monitor.csv
```

Arguments: `<metrics-dir> <log-file> <output-html> [monitor.csv]`

Then reference the HTML dashboard in the markdown report or generate a standalone markdown file with metrics tables and verdicts.

## 7. Export to Kusto

See `references/kusto-schema.md` for:
- Table schema (`BenchmarkResults`, `BenchmarkSummary`)
- CSV enrichment commands (add run metadata to monitor.csv)
- `.ingest` commands for Azure Data Explorer
- Sample queries (latest runs, compare runs, trend over time)

## 8.
Application Insights Queries + +If the benchmark was run with `APPLICATIONINSIGHTS_CONNECTION_STRING`, query metrics: + +```bash +az monitor app-insights query \ + --app \ + --resource-group rg-cosmos-benchmark \ + --analytics-query "" +``` + +Placeholder: user will provide specific KQL queries for latency percentiles, error rates, and throughput over time. + +## References + +- **Pass/fail thresholds and CSV columns**: `references/thresholds.md` +- **Python hprof parser**: `references/parse_hprof.py` +- **Kusto table schema & ingestion**: `references/kusto-schema.md` +- **Dashboard generator**: `sdk/cosmos/azure-cosmos-benchmark/scripts/generate-dashboard.py` diff --git a/.github/skills/cosmos-benchmark-analyze/references/kusto-schema.md b/.github/skills/cosmos-benchmark-analyze/references/kusto-schema.md new file mode 100644 index 000000000000..540f15392a38 --- /dev/null +++ b/.github/skills/cosmos-benchmark-analyze/references/kusto-schema.md @@ -0,0 +1,113 @@ +# Kusto Table Schema for Benchmark Results + +## Table: BenchmarkResults + +```kql +.create table BenchmarkResults ( + RunId: string, + Timestamp: datetime, + Branch: string, + CommitSha: string, + Scenario: string, + TenantCount: int, + Operation: string, + ConnectionMode: string, + // Monitor metrics (from monitor.csv) + Threads: int, + FileDescriptors: int, + RssKb: long, + CpuPct: real, + HeapUsedKb: long, + HeapMaxKb: long, + GcCount: int, + GcTimeMs: long, + // Computed metrics + Phase: string, + ThreadDelta: int, + HeapRatio: real, + // Verdict + Passed: bool, + FailReason: string +) +``` + +## Ingestion from CSV + +### Step 1: Prepare CSV for ingestion + +Add run metadata columns to monitor.csv: + +```bash +# On the VM or locally after downloading results: +RUN_ID="" +BRANCH=$(jq -r .branch results//git-info.json) +COMMIT=$(jq -r .commitId results//git-info.json) + +awk -v run="$RUN_ID" -v branch="$BRANCH" -v commit="$COMMIT" \ + 'NR>1 {print run","$0","branch","commit}' \ + results//monitor.csv > 
results//monitor-enriched.csv +``` + +### Step 2: Ingest into Kusto + +```kql +.ingest into table BenchmarkResults ( + h'https://.blob.core.windows.net//monitor-enriched.csv' +) with (format='csv', ignoreFirstRecord=true) +``` + +Or inline from local file via Kusto Explorer / Azure Data Explorer web UI. + +### Step 3: Query examples + +```kql +// Latest runs summary +BenchmarkResults +| summarize MaxThreads=max(Threads), MaxHeapMB=max(HeapUsedKb)/1024, + FinalThreads=arg_max(Timestamp, Threads), FinalHeapKb=arg_max(Timestamp, HeapUsedKb) + by RunId, Branch, Scenario +| order by Timestamp desc + +// Compare two runs +let baseline = "20260226-CHURN-main-baseline"; +let fix = "20260226-CHURN-fix-leak"; +BenchmarkResults +| where RunId in (baseline, fix) +| summarize MaxThreads=max(Threads), MaxHeapMB=max(HeapUsedKb)/1024 by RunId +| order by RunId + +// Trend over time (multiple runs) +BenchmarkResults +| where Scenario == "CHURN" +| summarize FinalThreads=arg_max(Timestamp, Threads) by RunId, Branch +| order by Timestamp asc +| render timechart +``` + +## Table: BenchmarkSummary + +Aggregated per-run summary (one row per run): + +```kql +.create table BenchmarkSummary ( + RunId: string, + Timestamp: datetime, + Branch: string, + CommitSha: string, + Scenario: string, + TenantCount: int, + BaselineThreads: int, + PeakThreads: int, + FinalThreads: int, + ThreadDelta: int, + BaselineHeapKb: long, + PeakHeapKb: long, + FinalHeapKb: long, + HeapRatio: real, + PeakFDs: int, + TotalGcCount: int, + TotalGcTimeMs: long, + Passed: bool, + FailReasons: string +) +``` diff --git a/.github/skills/cosmos-benchmark-analyze/references/parse_hprof.py b/.github/skills/cosmos-benchmark-analyze/references/parse_hprof.py new file mode 100644 index 000000000000..f58fd86be534 --- /dev/null +++ b/.github/skills/cosmos-benchmark-analyze/references/parse_hprof.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +""" +Lightweight .hprof histogram parser for benchmark heap dump analysis. 
"""Streaming .hprof class-histogram parser for benchmark heap-dump analysis.

Does NOT load the entire heap into memory -- streams through the binary format.

Usage:
    python3 parse_hprof.py <heap.hprof> [--top N]
    python3 parse_hprof.py --diff <pre.hprof> <post.hprof> [--top N]
"""

import struct
import sys
import argparse
from collections import defaultdict

# Top-level HPROF record tags.
HPROF_UTF8 = 0x01
HPROF_LOAD_CLASS = 0x02
HPROF_HEAP_DUMP = 0x0C
HPROF_HEAP_DUMP_SEG = 0x1C
HPROF_HEAP_DUMP_END = 0x2C

# Heap-dump sub-record tags.
HPROF_GC_CLASS_DUMP = 0x20
HPROF_GC_INSTANCE_DUMP = 0x21
HPROF_GC_OBJ_ARRAY_DUMP = 0x22
HPROF_GC_PRIM_ARRAY_DUMP = 0x23

# JVM basic-type tag -> value size in bytes. Type 2 (object reference) is
# intentionally absent: its width is the dump's identifier size, not a
# constant (see _value_size).
TYPE_SIZES = {4: 1, 5: 2, 6: 4, 7: 8, 8: 1, 9: 2, 10: 4, 11: 8}

# Extra payload bytes that follow the object ID in each GC-root sub-record
# (per the HPROF binary format spec). 0x01 (ROOT_JNI_GLOBAL) carries a second
# ID whose width depends on id_size, so it is handled inline in the parser.
GC_ROOT_EXTRA_BYTES = {
    0xFF: 0,  # ROOT_UNKNOWN
    0x02: 8,  # ROOT_JNI_LOCAL: thread serial + frame number
    0x03: 8,  # ROOT_JAVA_FRAME: thread serial + frame number
    0x04: 4,  # ROOT_NATIVE_STACK: thread serial
    0x05: 0,  # ROOT_STICKY_CLASS
    0x06: 4,  # ROOT_THREAD_BLOCK: thread serial
    0x07: 0,  # ROOT_MONITOR_USED
    0x08: 8,  # ROOT_THREAD_OBJECT: thread serial + stacktrace serial
}


def _value_size(type_tag, id_size):
    """Byte width of a basic-type value; object refs (tag 2) use id_size."""
    return id_size if type_tag == 2 else TYPE_SIZES.get(type_tag, id_size)


def read_id(f, id_size):
    """Read one big-endian object identifier (4 or 8 bytes); None at EOF."""
    data = f.read(id_size)
    if len(data) < id_size:
        return None
    fmt = '>I' if id_size == 4 else '>Q'
    return struct.unpack(fmt, data)[0]


def parse_histogram(filepath):
    """Parse an hprof file and return {class_name: [instance_count, total_bytes]}.

    Streams record-by-record; only string/class tables and the running
    histogram are kept in memory.
    """
    strings = {}              # UTF8 string id -> decoded text
    class_id_to_name = {}     # class object id -> class name
    class_instance_sizes = {} # class object id -> declared instance size
    histogram = defaultdict(lambda: [0, 0])  # class name -> [count, bytes]

    with open(filepath, 'rb') as f:
        # Header: NUL-terminated format string (e.g. "JAVA PROFILE 1.0.2").
        while True:
            c = f.read(1)
            if not c or c == b'\x00':
                break

        id_size = struct.unpack('>I', f.read(4))[0]
        f.read(8)  # file creation timestamp (ignored)

        while True:
            tag_data = f.read(1)
            if not tag_data:
                break
            tag = tag_data[0]
            f.read(4)  # microseconds since header timestamp (ignored)
            length = struct.unpack('>I', f.read(4))[0]

            if tag == HPROF_UTF8:
                str_id = read_id(f, id_size)
                raw = f.read(length - id_size)
                strings[str_id] = raw.decode('utf-8', errors='replace')

            elif tag == HPROF_LOAD_CLASS:
                f.read(4)  # class serial number (unused)
                class_obj_id = read_id(f, id_size)
                f.read(4)  # stack trace serial (unused)
                name_id = read_id(f, id_size)
                if name_id in strings:
                    class_id_to_name[class_obj_id] = strings[name_id]

            elif tag in (HPROF_HEAP_DUMP, HPROF_HEAP_DUMP_SEG):
                end_pos = f.tell() + length
                while f.tell() < end_pos:
                    sub_tag_data = f.read(1)
                    if not sub_tag_data:
                        break
                    sub_tag = sub_tag_data[0]

                    if sub_tag == HPROF_GC_CLASS_DUMP:
                        class_obj_id = read_id(f, id_size)
                        f.read(4)           # stack trace serial
                        for _ in range(6):  # super, loader, signer, prot. domain, 2 reserved
                            read_id(f, id_size)
                        inst_size = struct.unpack('>I', f.read(4))[0]
                        class_instance_sizes[class_obj_id] = inst_size

                        # Constant pool: each entry is index + typed value.
                        cp_count = struct.unpack('>H', f.read(2))[0]
                        for _ in range(cp_count):
                            f.read(2)  # constant pool index
                            tp = f.read(1)[0]
                            f.read(_value_size(tp, id_size))

                        # Static fields: name id + typed value.
                        sf_count = struct.unpack('>H', f.read(2))[0]
                        for _ in range(sf_count):
                            read_id(f, id_size)  # field name
                            tp = f.read(1)[0]
                            f.read(_value_size(tp, id_size))

                        # Instance field descriptors: name id + type tag only.
                        if_count = struct.unpack('>H', f.read(2))[0]
                        for _ in range(if_count):
                            read_id(f, id_size)  # field name
                            f.read(1)            # field type

                    elif sub_tag == HPROF_GC_INSTANCE_DUMP:
                        read_id(f, id_size)  # object id
                        f.read(4)            # stack trace serial
                        class_id = read_id(f, id_size)
                        data_len = struct.unpack('>I', f.read(4))[0]
                        f.read(data_len)

                        name = class_id_to_name.get(class_id, f'unknown_0x{class_id:x}')
                        inst_size = class_instance_sizes.get(class_id, data_len)
                        histogram[name][0] += 1
                        histogram[name][1] += inst_size + data_len

                    elif sub_tag == HPROF_GC_OBJ_ARRAY_DUMP:
                        read_id(f, id_size)  # object id
                        f.read(4)            # stack trace serial
                        num_elements = struct.unpack('>I', f.read(4))[0]
                        array_class_id = read_id(f, id_size)
                        f.read(num_elements * id_size)

                        name = class_id_to_name.get(array_class_id, 'Object[]')
                        histogram[name + '[]'][0] += 1
                        histogram[name + '[]'][1] += num_elements * id_size

                    elif sub_tag == HPROF_GC_PRIM_ARRAY_DUMP:
                        read_id(f, id_size)  # object id
                        f.read(4)            # stack trace serial
                        num_elements = struct.unpack('>I', f.read(4))[0]
                        elem_type = f.read(1)[0]
                        elem_size = TYPE_SIZES.get(elem_type, 1)
                        f.read(num_elements * elem_size)

                        type_names = {4: 'boolean', 5: 'char', 6: 'float', 7: 'double',
                                      8: 'byte', 9: 'short', 10: 'int', 11: 'long'}
                        name = type_names.get(elem_type, 'unknown') + '[]'
                        histogram[name][0] += 1
                        histogram[name][1] += num_elements * elem_size

                    elif sub_tag == 0x01:
                        # ROOT_JNI_GLOBAL: object id + JNI global ref id.
                        read_id(f, id_size)
                        read_id(f, id_size)

                    elif sub_tag in GC_ROOT_EXTRA_BYTES:
                        # Other GC roots: object id + fixed-size extra payload.
                        # Skipping only the id (as a naive parser would) desyncs
                        # the stream and loses the rest of the segment.
                        read_id(f, id_size)
                        f.read(GC_ROOT_EXTRA_BYTES[sub_tag])

                    else:
                        # Unknown sub-tag: cannot determine its size, so skip
                        # the remainder of this heap-dump segment.
                        if end_pos - f.tell() > 0:
                            f.seek(end_pos)
                        break

            else:
                f.read(length)  # uninteresting record; skip its body

    return dict(histogram)


def print_histogram(hist, top_n=30):
    """Print the top_n classes by total bytes, followed by overall totals."""
    ranked = sorted(hist.items(), key=lambda item: item[1][1], reverse=True)
    print(f"{'Class':<60} {'Count':>10} {'Bytes':>12}")
    print('-' * 84)
    for name, (count, nbytes) in ranked[:top_n]:
        print(f"{name:<60} {count:>10} {nbytes:>12}")
    total_count = sum(v[0] for v in hist.values())
    total_bytes = sum(v[1] for v in hist.values())
    print('-' * 84)
    print(f"{'TOTAL':<60} {total_count:>10} {total_bytes:>12}")


def print_diff(pre_hist, post_hist, top_n=20):
    """Print per-class deltas between two histograms, largest |byte delta| first.

    Handles the no-difference case gracefully; the Total line's sign is
    computed from the total delta itself (not a leftover per-row sign).
    """
    all_classes = set(pre_hist) | set(post_hist)
    diffs = []
    for cls in all_classes:
        pre_count, pre_bytes = pre_hist.get(cls, (0, 0))
        post_count, post_bytes = post_hist.get(cls, (0, 0))
        d_bytes = post_bytes - pre_bytes
        if d_bytes != 0:
            diffs.append((cls, pre_count, post_count, post_count - pre_count,
                          pre_bytes, post_bytes, d_bytes))

    # Sort by absolute byte delta descending.
    diffs.sort(key=lambda x: abs(x[6]), reverse=True)

    print(f"{'Class':<55} {'PRE cnt':>8} {'POST cnt':>9} {'D cnt':>7} {'D bytes':>12}")
    print('-' * 93)
    for cls, pre_c, post_c, d_c, _pre_b, _post_b, d_b in diffs[:top_n]:
        print(f"{cls:<55} {pre_c:>8} {post_c:>9} {d_c:>+7} {d_b:>+12}")
    print('-' * 93)

    total_pre = sum(v[1] for v in pre_hist.values())
    total_post = sum(v[1] for v in post_hist.values())
    delta_mb = (total_post - total_pre) / 1024 / 1024
    print(f"Total: PRE={total_pre/1024/1024:.1f}MB POST={total_post/1024/1024:.1f}MB "
          f"Delta={delta_mb:+.1f}MB")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Analyze .hprof heap dump files')
    parser.add_argument('files', nargs='+', help='.hprof file(s)')
    parser.add_argument('--top', type=int, default=30, help='Top N classes to show')
    parser.add_argument('--diff', action='store_true', help='Compare two hprof files')
    args = parser.parse_args()

    if args.diff and len(args.files) == 2:
        print(f"Parsing {args.files[0]}...")
        pre = parse_histogram(args.files[0])
        print(f"Parsing {args.files[1]}...")
        post = parse_histogram(args.files[1])
        print()
        print_diff(pre, post, args.top)
    elif len(args.files) == 1:
        print(f"Parsing {args.files[0]}...")
        hist = parse_histogram(args.files[0])
        print()
        print_histogram(hist, args.top)
    else:
        parser.print_help()
(UTC) | +| `threads` | int | Live thread count (from /proc/PID/task) | +| `fds` | int | Open file descriptors (from /proc/PID/fd) | +| `rss_kb` | int | Resident set size in KB | +| `cpu_pct` | float | CPU usage percentage | +| `heap_used_kb` | long | Used heap (S1U+EU+OU from jstat) | +| `heap_max_kb` | long | Max heap capacity (S0C+S1C+EC+OC from jstat) | +| `gc_count` | int | Cumulative GC count (YGC+FGC+CGC) | +| `gc_time_ms` | long | Cumulative GC time in ms | + +## Key Snapshots + +Identify snapshots using lifecycle events from the benchmark log file (pattern: `[LIFECYCLE] timestamp=`): + +- **Baseline** = first monitor.csv row after `PRE_CREATE` lifecycle event +- **Peak** = row with highest `heap_used_kb` +- **Final** = last monitor.csv row (after `POST_CLOSE` lifecycle event + settle time) + +## Computed Metrics + +- `thread_delta = final.threads - baseline.threads` +- `heap_ratio = final.heap_used_kb / baseline.heap_used_kb` + +## Status Indicators + +- ✅ = passed / improved (delta ≤ 0 or within threshold) +- 🟡 = marginal (<10% change) +- 🔴 = failed / regressed (>10% worse or threshold exceeded) diff --git a/.github/skills/cosmos-benchmark-provision/SKILL.md b/.github/skills/cosmos-benchmark-provision/SKILL.md new file mode 100644 index 000000000000..84dcfa0356c8 --- /dev/null +++ b/.github/skills/cosmos-benchmark-provision/SKILL.md @@ -0,0 +1,215 @@ +--- +name: cosmos-benchmark-provision +description: Provision Azure infrastructure for Cosmos DB benchmarks — create or reuse Cosmos DB accounts, Application Insights, and Azure VMs. Verifies region capacity before creating resources. Use when the user needs to create benchmark resources, reuse existing ones, provision infrastructure, or set up a DR drill. Triggers on "provision", "create accounts", "create VM", "create app insights", "setup infrastructure", "DR drill setup", "reuse existing". +--- + +# Provision Benchmark Infrastructure + +Create or reuse Azure resources needed for a benchmark or DR drill. 
All resources must be in the **same region**. + +## Before You Start + +### 1. Choose region + +Ask the user for the target Azure region. All resources will be co-located. + +### 2. Verify capacity + +**Check quotas before creating anything.** If any check fails, prompt the user to choose a different region. + +```bash +# VM vCPU quota +az vm list-usage --location -o table | grep -i "Standard DSv5" + +# Cosmos DB account count (limit is typically 50 per subscription) +az cosmosdb list --query "length(@)" + +# App Insights limit check (200 per subscription per region) +az monitor app-insights component list --query "[?location==''] | length(@)" +``` + +### 3. Choose create vs. reuse + +For each resource type, ask the user: +- **Create new** — provision fresh resources +- **Reuse existing** — point to config files or discover from Azure + +## 1. Cosmos DB Accounts + +### Create new + +Ask user for: count (N), naming prefix, consistency level (default: Session), throughput (default: 10000 RU/s). 
+ +```bash +# Create resource group (shared by all benchmark resources) +az group create --name rg-cosmos-benchmark --location + +# Create N accounts in parallel +for i in $(seq 0 $((N-1))); do + az cosmosdb create \ + --resource-group rg-cosmos-benchmark \ + --name "${PREFIX}${i}" \ + --locations regionName= failoverPriority=0 \ + --default-consistency-level Session \ + --kind GlobalDocumentDB & +done +wait + +# Create database + container in each +for i in $(seq 0 $((N-1))); do + az cosmosdb sql database create \ + --resource-group rg-cosmos-benchmark \ + --account-name "${PREFIX}${i}" \ + --name benchdb + + az cosmosdb sql container create \ + --resource-group rg-cosmos-benchmark \ + --account-name "${PREFIX}${i}" \ + --database-name benchdb \ + --name benchcol \ + --partition-key-path /id \ + --throughput 10000 +done +``` + +### Export credentials + +Generate `clientHostAndKey.txt` for the **setup** skill: + +```bash +for i in $(seq 0 $((N-1))); do + ENDPOINT=$(az cosmosdb show -g rg-cosmos-benchmark -n "${PREFIX}${i}" --query documentEndpoint -o tsv) + KEY=$(az cosmosdb keys list -g rg-cosmos-benchmark -n "${PREFIX}${i}" --query primaryMasterKey -o tsv) + echo "${PREFIX}${i},${ENDPOINT},${KEY}" +done > clientHostAndKey.txt +``` + +### Reuse existing + +Ask user how to discover existing accounts: +- **Option A**: Point to an existing `clientHostAndKey.txt` file +- **Option B**: Discover from resource group: + ```bash + az cosmosdb list -g --query "[].{name:name, endpoint:documentEndpoint}" -o table + ``` + Then export credentials using the loop above with the discovered account names. + +## 2. 
Application Insights + +### Create new + +```bash +az monitor app-insights component create \ + --app \ + --location \ + --resource-group rg-cosmos-benchmark \ + --kind web \ + --application-type web +``` + +### Get connection string + +```bash +AI_CONN_STR=$(az monitor app-insights component show \ + --app \ + --resource-group rg-cosmos-benchmark \ + --query connectionString -o tsv) +echo "$AI_CONN_STR" > test-setup/app-insights-connection-string.txt +``` + +The **run** skill uses this via environment variable `APPLICATIONINSIGHTS_CONNECTION_STRING`. + +### Reuse existing + +Ask user how to discover: +- **Option A**: Provide the connection string directly +- **Option B**: Discover from resource group: + ```bash + az monitor app-insights component list -g --query "[].{name:name, connectionString:connectionString}" -o table + ``` + +Save to `test-setup/app-insights-connection-string.txt`. + +## 3. Azure VMs + +### Create new (via provision script) + +```bash +bash sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh \ + --new --location --create-key \ + [--size Standard_D16s_v5] \ + [--disk-size 256] \ + [--rg rg-cosmos-benchmark] \ + [--vm-name vm-benchmark-01] \ + [--skip-setup] +``` + +Script flags: +| Flag | Default | Description | +|---|---|---| +| `--new` | — | Create a new VM | +| `--location` | eastus | Azure region | +| `--create-key [path]` | — | Generate SSH key pair (optional path) | +| `--ssh-key ` | — | Use existing public key | +| `--size` | Standard_D16s_v5 | VM SKU | +| `--disk-size` | 256 | OS disk in GB | +| `--rg` | rg-cosmos-benchmark | Resource group | +| `--vm-name` | vm-benchmark-01 | VM name | +| `--skip-setup` | false | Skip auto-running setup-benchmark-vm.sh | + +### Create new (manual Azure CLI) + +```bash +az vm create \ + --resource-group rg-cosmos-benchmark \ + --name vm-benchmark-01 \ + --image Ubuntu2204 \ + --size Standard_D16s_v5 \ + --accelerated-networking true \ + --admin-username benchuser \ + --generate-ssh-keys 
\ + --os-disk-size-gb 256 \ + --storage-sku Premium_LRS +az vm open-port --resource-group rg-cosmos-benchmark --name vm-benchmark-01 --port 22 +``` + +### Reuse existing VM + +```bash +# Option A: provide IP directly +bash sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh \ + --existing --ip --user benchuser --key ~/.ssh/id_rsa + +# Option B: discover from resource group + VM name +bash sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh \ + --existing --rg --vm-name --key ~/.ssh/id_rsa +``` + +### Connection info saved + +The provision script saves: +- `.vm-ip` — VM public IP +- `.vm-user` — SSH username +- `.vm-key` — path to SSH private key +- `test-setup/vm-config.env` — all three in `KEY=VALUE` format + +Read `references/vm-sizing.md` for workload-specific VM sizing. + +## 4. Resource Group Cleanup + +When done with all benchmarks: +```bash +az group delete --name rg-cosmos-benchmark --yes --no-wait +``` + +## After Provisioning + +Use the **cosmos-benchmark-setup** skill to: +- Install JDK/Maven/tools on the VM +- Generate `tenants.json` from `clientHostAndKey.txt` +- Clone repo and build the benchmark JAR + +## References + +- **VM sizing by workload**: `references/vm-sizing.md` +- **Provision script**: `sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh` diff --git a/.github/skills/cosmos-benchmark-provision/references/vm-sizing.md b/.github/skills/cosmos-benchmark-provision/references/vm-sizing.md new file mode 100644 index 000000000000..08f35f89d955 --- /dev/null +++ b/.github/skills/cosmos-benchmark-provision/references/vm-sizing.md @@ -0,0 +1,22 @@ +# VM Sizing Reference + +## Recommended configurations by workload + +| Tenants | Scenario | VM Size | vCPUs | RAM | Notes | +|---------|----------|---------|-------|-----|-------| +| 1-10 | Quick test | Standard_D4s_v5 | 4 | 16 GB | Minimal | +| 10-50 | CHURN, SCALING | Standard_D16s_v5 | 16 | 64 GB | Default recommendation | +| 50-100 | POOL_PRESSURE | 
Standard_D32s_v5 | 32 | 128 GB | High concurrency | +| 100+ | SOAK (24h) | Standard_D16s_v5 | 16 | 64 GB | Long-running, moderate load | + +## Cost considerations + +- Use **spot instances** for non-critical test runs (~60-80% cheaper) +- **Deallocate** when not in use: `az vm deallocate -g -n ` +- **Start** when needed: `az vm start -g -n ` +- Consider **auto-shutdown** for overnight runs + +## Region selection + +Choose a region **co-located** with your Cosmos DB accounts to minimize network latency. +If accounts are in multiple regions, pick the region with the most accounts. diff --git a/.github/skills/cosmos-benchmark-run/SKILL.md b/.github/skills/cosmos-benchmark-run/SKILL.md new file mode 100644 index 000000000000..b3822ff908cf --- /dev/null +++ b/.github/skills/cosmos-benchmark-run/SKILL.md @@ -0,0 +1,140 @@ +--- +name: cosmos-benchmark-run +description: Run a Cosmos DB benchmark scenario. Supports branch/PR/commit checkout, CHURN preset and custom scenarios, auto-configures App Insights monitoring from provision output, runs in tmux on remote VMs, and supports multi-VM parallel execution. Triggers on "run benchmark", "execute test", "test fix on branch", "run from PR", "start benchmark", "DR drill". +--- + +# Run a Benchmark + +Execute a benchmark on one or more VMs. Always uses `run-benchmark.sh` wrapper (includes `monitor.sh` + git metadata capture). + +## VM Connection + +Auto-detect from provision output: + +```bash +VM_IP=$(cat .vm-ip) +VM_USER=$(cat .vm-user) +VM_KEY=$(cat .vm-key) +SSH_CMD="ssh -i $VM_KEY $VM_USER@$VM_IP" +``` + +## 1. 
Configure Monitoring + +### Application Insights (auto-configure from provision output) + +If `test-setup/app-insights-connection-string.txt` exists: + +```bash +AI_CONN_STR=$(cat test-setup/app-insights-connection-string.txt) +$SSH_CMD "echo 'export APPLICATIONINSIGHTS_CONNECTION_STRING=\"$AI_CONN_STR\"' >> ~/.bashrc" +``` + +If not found, ask the user whether to: +- Provide an App Insights connection string +- Skip App Insights (local CSV metrics only) + +### Graphite (optional) + +```bash +$SSH_CMD "echo 'export GRAPHITE_SERVICE_ADDRESS=\":\"' >> ~/.bashrc" +``` + +## 2. Scenario Selection + +Read `references/presets.md` for preset flag recipes. + +### CHURN preset (default — leak detection) + +Tests client create/close resource leaks (threads, connections, memory). + +``` +-cycles 5 -numberOfOperations 500 +``` + +The harness auto-applies when cycles > 1: +- `settleTimeMs=90000` +- `suppressCleanup=true` +- `gcBetweenCycles=true` + +### Custom scenarios + +Read `references/scenarios.md` for the full operation catalog (20 types) and tuning parameters. +Users can pass any combination of CLI flags for custom workloads. + +## 3. Execute on Single VM + +Always run inside tmux so the benchmark survives SSH disconnection. + +```bash +$SSH_CMD "tmux new-session -d -s bench 'cd ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark && \ + bash scripts/run-benchmark.sh CHURN ~/tenants.json ./results/ [extra-flags]'" +``` + +### Monitor progress + +```bash +# Attach to tmux session +$SSH_CMD -t "tmux attach -t bench" + +# Or peek at output without attaching +$SSH_CMD "tmux capture-pane -t bench -p | tail -30" + +# Check if monitor.csv is growing +$SSH_CMD "wc -l ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results//monitor.csv" +``` + +### Run naming convention + +Use descriptive names: `--`, e.g.: +``` +20260226-CHURN-fix-telemetry-leak +20260226-CHURN-main-baseline +``` + +## 4. 
Execute on Multiple VMs (parallel) + +For comparing versions or running different scenarios simultaneously: + +```bash +# VM 1: baseline (main branch) +ssh -i $VM_KEY $VM_USER@ "tmux new-session -d -s bench \ + 'cd ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark && \ + bash scripts/run-benchmark.sh CHURN ~/tenants.json ./results/baseline-main'" + +# VM 2: fix branch +ssh -i $VM_KEY $VM_USER@ "tmux new-session -d -s bench \ + 'cd ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark && \ + bash scripts/run-benchmark.sh CHURN ~/tenants.json ./results/fix-branch'" +``` + +To test different SDK versions on different VMs, use the **setup** skill on each VM with different branch/PR/commit targets before running. + +## 5. What run-benchmark.sh Does + +The wrapper script (`scripts/run-benchmark.sh`) handles: +1. Captures git metadata (branch, commit) → `git-info.json` +2. Launches JVM with `-Xmx8g -XX:+UseG1GC` + GC logging +3. Spawns `monitor.sh` in parallel for external JVM monitoring → `monitor.csv` +4. Cleans up monitoring on exit + +### Output directory structure + +``` +results// +├── git-info.json # branch, commit SHA +├── monitor.csv # external JVM metrics (threads, heap, FDs, GC) +├── metrics/ # Codahale CSV metrics (latency histograms, throughput) +├── gc.log # G1GC log +└── heap-dumps/ # if OOM or manually triggered +``` + +## After Run + +Suggest using the **cosmos-benchmark-analyze** skill to analyze results. 
+ +## References + +- **Preset flag recipes**: `references/presets.md` +- **Full operation catalog & custom scenarios**: `references/scenarios.md` +- **Run script**: `sdk/cosmos/azure-cosmos-benchmark/scripts/run-benchmark.sh` +- **Trigger script**: `sdk/cosmos/azure-cosmos-benchmark/scripts/trigger-benchmark.sh` diff --git a/.github/skills/cosmos-benchmark-run/references/presets.md b/.github/skills/cosmos-benchmark-run/references/presets.md new file mode 100644 index 000000000000..52460cdc05d3 --- /dev/null +++ b/.github/skills/cosmos-benchmark-run/references/presets.md @@ -0,0 +1,29 @@ +# Benchmark Scenario Presets + +CLI flag recipes for benchmark scenarios. +The benchmark harness uses smart defaults: when cycles > 1, it +automatically sets settleTimeMs=90000, suppressCleanup=true, +and gcBetweenCycles=true unless explicitly overridden. + +## CHURN (Leak Detection) + +Tests client lifecycle resource leaks (threads, connections, memory). + +``` +-cycles 5 -numberOfOperations 500 +``` + +The harness auto-applies: +- settleTimeMs=90000 (> 60s BoundedElastic evictor TTL) +- suppressCleanup=true (keep DB/container across cycles) +- gcBetweenCycles=true (separate real leaks from GC-eligible garbage) + +To override any default: `-cycles 5 -settleTimeMs 120000` + +## Adding New Presets + +New presets can be added here as needed. Each preset should define: +1. A name and description of what it tests +2. The minimal CLI flags needed +3. Which defaults the harness auto-applies +4. 
When to use this preset diff --git a/.github/skills/cosmos-benchmark-run/references/scenarios.md b/.github/skills/cosmos-benchmark-run/references/scenarios.md new file mode 100644 index 000000000000..c9d0c402c71f --- /dev/null +++ b/.github/skills/cosmos-benchmark-run/references/scenarios.md @@ -0,0 +1,75 @@ +# Operation Catalog & Custom Scenarios + +## All 20 Operation Types + +| Operation | Description | Use Case | +|---|---|---| +| `ReadThroughput` | Point reads, max throughput | Baseline read performance | +| `ReadLatency` | Point reads, measure latency | P50/P99 latency profiling | +| `WriteThroughput` | Point writes, max throughput | Write performance | +| `WriteLatency` | Point writes, measure latency | Write latency profiling | +| `QuerySingle` | Single-partition query | Query baseline | +| `QuerySingleMany` | Single-partition query, many results | Large result set handling | +| `QueryParallel` | Parallel cross-partition query | Fanout query performance | +| `QueryCross` | Cross-partition query | Cross-partition overhead | +| `QueryInClauseParallel` | IN clause with parallel execution | Batch lookup pattern | +| `QueryOrderby` | Query with ORDER BY | Sort performance | +| `QueryAggregate` | Aggregate query (COUNT, SUM) | Aggregation overhead | +| `QueryAggregateTopOrderby` | Aggregate + TOP + ORDER BY | Complex query performance | +| `QueryTopOrderby` | TOP + ORDER BY | Pagination pattern | +| `Mixed` | 90% read, 9% write, 1% query | Realistic mixed workload | +| `ReadMyWrites` | Write then read same document | Session consistency validation | +| `ReadManyLatency` | ReadMany API, measure latency | Batch read latency | +| `ReadManyThroughput` | ReadMany API, max throughput | Batch read throughput | +| `ReadAllItemsOfLogicalPartition` | Read all items in a partition | Partition scan performance | +| `CtlWorkload` | CTL benchmark workload | Internal benchmark | +| `LinkedInCtlWorkload` | LinkedIn-specific CTL variant | Partner-specific workload | + +## Custom 
Scenario Examples + +### High-concurrency write stress test + +```bash +-operation WriteThroughput -concurrency 100 -numberOfOperations 100000 \ +-documentDataFieldSize 1024 -documentDataFieldCount 5 +``` + +### Mixed workload with large documents + +```bash +-operation Mixed -concurrency 50 -numberOfOperations 50000 \ +-documentDataFieldSize 4096 -documentDataFieldCount 10 \ +-readWriteQueryReadManyPct "70,20,5,5" +``` + +### Query performance profiling + +```bash +-operation QueryParallel -concurrency 20 -numberOfOperations 10000 \ +-numberOfPreCreatedDocuments 10000 +``` + +### Encryption benchmark + +```bash +-operation ReadThroughput -concurrency 20 -numberOfOperations 10000 \ +-encryptionEnabled true -encryptedStringFieldCount 3 -encryptedLongFieldCount 2 +``` + +## Key Tuning Parameters + +| Flag | Default | Description | +|---|---|---| +| `-concurrency` | 20 | Concurrent operations | +| `-numberOfOperations` | 100000 | Total operations per run | +| `-numberOfPreCreatedDocuments` | 1000 | Documents seeded before benchmark | +| `-documentDataFieldSize` | 20 | Bytes per data field | +| `-documentDataFieldCount` | 1 | Number of data fields per document | +| `-connectionMode` | GATEWAY | GATEWAY or DIRECT | +| `-consistencyLevel` | SESSION | Session, Eventual, Strong, etc. 
| +| `-maxConnectionPoolSize` | 1000 | Max HTTP connections per client | +| `-cycles` | 1 | Lifecycle repetitions (>1 enables leak detection) | +| `-settleTimeMs` | 90000 | Wait between cycles (auto when cycles>1) | +| `-gcBetweenCycles` | true | Force GC between cycles (auto when cycles>1) | +| `-printingInterval` | 10 | Metrics reporting interval (seconds) | +| `-maxRunningTimeDuration` | - | Max wall-clock time (ISO 8601 duration) | diff --git a/.github/skills/cosmos-benchmark-setup/SKILL.md b/.github/skills/cosmos-benchmark-setup/SKILL.md new file mode 100644 index 000000000000..098f0aedb9a0 --- /dev/null +++ b/.github/skills/cosmos-benchmark-setup/SKILL.md @@ -0,0 +1,178 @@ +--- +name: cosmos-benchmark-setup +description: Set up the benchmark execution environment — install tools on VM, clone repo at a specific branch/PR/commit/tag, generate tenants.json, copy files, build the benchmark JAR, and verify readiness. Auto-detects VM from provision output. Triggers on "setup benchmark", "install JDK", "create tenants.json", "create config", "copy files to VM", "verify build", "clone repo on VM". +--- + +# Benchmark Environment Setup + +Prepare the execution environment after infrastructure is provisioned. + +## VM Connection + +Auto-detect VM connection info from provision output: + +```bash +VM_IP=$(cat .vm-ip) +VM_USER=$(cat .vm-user) +VM_KEY=$(cat .vm-key) +SSH_CMD="ssh -i $VM_KEY $VM_USER@$VM_IP" +``` + +If `.vm-ip` doesn't exist, ask the user for VM IP and SSH credentials. + +All remote commands below use `$SSH_CMD` as shorthand. + +## 1. 
Install Dependencies on VM + +Run the setup script: + +```bash +$SSH_CMD 'bash -s' < sdk/cosmos/azure-cosmos-benchmark/scripts/setup-benchmark-vm.sh +``` + +### What gets installed + +| Component | Version | Location | +|-----------|---------|----------| +| OpenJDK | 21 | System package | +| Maven | 3.9.12 | `/opt/apache-maven-3.9.12` | +| async-profiler | 3.0 | `/opt/async-profiler-3.0-linux-x64` | +| git, net-tools, sysstat, tmux | latest | System packages | + +### Verify + +```bash +$SSH_CMD 'java -version 2>&1 | head -1; /opt/apache-maven-3.9.12/bin/mvn --version 2>&1 | head -1; tmux -V; df -h / | tail -1' +``` + +## 2. Clone/Update Repo on VM + +### From a branch + +```bash +$SSH_CMD "git clone --depth 1 -b ~/azure-sdk-for-java" +``` + +If already cloned: +```bash +$SSH_CMD "cd ~/azure-sdk-for-java && git fetch --depth 1 origin && git checkout && git pull origin " +``` + +### From a PR number + +```bash +$SSH_CMD "cd ~/azure-sdk-for-java && git fetch origin pull//head:pr- && git checkout pr-" +``` + +### From a commit SHA + +```bash +$SSH_CMD "cd ~/azure-sdk-for-java && git fetch origin && git checkout " +``` + +### From a tag + +```bash +$SSH_CMD "cd ~/azure-sdk-for-java && git fetch --tags origin && git checkout tags/" +``` + +Ask the user which ref type they want. Default to branch if unspecified. + +## 3. Generate Benchmark Configuration + +### Multi-tenant mode (tenants.json) + +If `clientHostAndKey.txt` exists (created by provision skill), generate `tenants.json`: + +#### Sample tenants.json template + +```json +{ + "globalDefaults": { + "connectionMode": "GATEWAY", + "consistencyLevel": "SESSION", + "concurrency": "20", + "numberOfOperations": "100000", + "operation": "ReadThroughput", + "numberOfPreCreatedDocuments": "1000", + "connectionSharingAcrossClientsEnabled": "false", + "maxConnectionPoolSize": "1000", + "applicationName": "cosmos-bench" + }, + "tenants": [] +} +``` + +#### Generation steps + +1. 
Read `clientHostAndKey.txt` — each line: `<account-name>,<endpoint>,<key>`
Verify Readiness + +```bash +$SSH_CMD "echo '=== VM Check ==='; \ + java -version 2>&1 | head -1; \ + /opt/apache-maven-3.9.12/bin/mvn --version 2>&1 | head -1; \ + df -h / | tail -1; \ + ls ~/tenants.json 2>/dev/null && echo 'Config: ✅' || echo 'Config: ❌ MISSING'; \ + ls ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/target/*jar-with-dependencies.jar 2>/dev/null \ + && echo 'JAR: ✅' || echo 'JAR: ❌ MISSING'; \ + cd ~/azure-sdk-for-java && echo \"Branch: \$(git rev-parse --abbrev-ref HEAD) Commit: \$(git rev-parse --short HEAD)\"" +``` + +Checklist: +- ✅ JDK 21 installed +- ✅ Maven 3.8.1+ installed +- ✅ Repo cloned (correct branch/PR/commit) +- ✅ Benchmark JAR built +- ✅ Config file present (tenants.json or single-tenant credentials) +- ✅ Disk space >10 GB free + +## After Setup + +Suggest: "Ready to run. Use the **cosmos-benchmark-run** skill to start a benchmark." diff --git a/.github/skills/cosmos-benchmark-status/SKILL.md b/.github/skills/cosmos-benchmark-status/SKILL.md new file mode 100644 index 000000000000..37cd4efc179d --- /dev/null +++ b/.github/skills/cosmos-benchmark-status/SKILL.md @@ -0,0 +1,61 @@ +--- +name: cosmos-benchmark-status +description: Show the current state of the Cosmos DB benchmark environment — Azure resources, recent runs, VM status, build status, config, and App Insights health. Use when the user asks "what runs do I have", "benchmark status", "is the VM up", "show recent results", "list accounts", or wants an overview before starting a benchmark session. +--- + +# Benchmark Environment Status + +Check and report the current state of the entire benchmark environment. + +## Checks + +1. 
**Azure Resources** (requires `az` CLI): + - Cosmos DB accounts: `az cosmosdb list -g rg-cosmos-benchmark --query "[].{name:name, endpoint:documentEndpoint}" -o table` + - Application Insights: `az monitor app-insights component list -g rg-cosmos-benchmark --query "[].{name:name, connectionString:connectionString}" -o table` + - VMs: `az vm list -g rg-cosmos-benchmark -d --query "[].{name:name, ip:publicIps, state:powerState}" -o table` + +2. **App Insights Health**: Verify metrics are being received: + ```bash + az monitor app-insights query \ + --app \ + --resource-group rg-cosmos-benchmark \ + --analytics-query "customMetrics | where timestamp > ago(1h) | summarize count() by bin(timestamp, 5m) | order by timestamp desc | take 5" + ``` + If count > 0, metrics are flowing. If empty, App Insights may not be configured or the benchmark hasn't reported yet. + +3. **Recent results**: List directories under `sdk/cosmos/azure-cosmos-benchmark/results/` and `./results/`. + - For each: check if `monitor.csv` exists (📊 = complete, ❌ = incomplete) + - Read `git-info.json` for branch/commit if present + - Show most recent 5–10 runs + +4. **Benchmark VM**: Check for `.vm-ip` file in workspace root. + - If found, SSH to verify: `ssh -i $(cat .vm-key) $(cat .vm-user)@$(cat .vm-ip) "echo OK; uptime; java -version 2>&1 | head -1; ls ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/ 2>/dev/null | tail -5"` + - Report: reachable/unreachable, uptime, JDK version, runs on VM + +5. **Build status**: Check for `sdk/cosmos/azure-cosmos-benchmark/target/azure-cosmos-benchmark-*-jar-with-dependencies.jar`. + - Report: found (with file timestamp) or "Not built" + +6. **Config**: Check for `tenants.json` in workspace root and `sdk/cosmos/azure-cosmos-benchmark/`. 
+ - Report: found/not found, count of tenants if parseable + +## Output Format + +``` +☁️ Azure Resources: + Cosmos DB: 3 accounts (cosmosdb-bench-0, -1, -2) + App Insights: ✅ cosmos-bench-ai (receiving metrics: 142 events/5min) + VMs: ✅ vm-benchmark-01 (running, 4.154.169.45) + +📊 Recent Runs (local): + 📊 20260226-CHURN-fix-leak/ branch: fix-leak commit: abc1234 + 📊 20260225-CHURN-main-base/ branch: main commit: def5678 + ❌ 20260226-CHURN-experiment/ (incomplete) + +📊 Recent Runs (VM): + 📊 20260226-CHURN-fix-leak/ + 📊 20260225-CHURN-main-base/ + +🖥️ Benchmark VM: ✅ 4.154.169.45 (up 3 days, JDK 21.0.10) +🔨 Build: ✅ JAR found (2026-02-26 14:30) +📋 Config: ✅ tenants.json (3 tenants) +``` diff --git a/.github/skills/skill-creator/LICENSE.txt b/.github/skills/skill-creator/LICENSE.txt new file mode 100644 index 000000000000..7a4a3ea2424c --- /dev/null +++ b/.github/skills/skill-creator/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/.github/skills/skill-creator/SKILL.md b/.github/skills/skill-creator/SKILL.md new file mode 100644 index 000000000000..158979709538 --- /dev/null +++ b/.github/skills/skill-creator/SKILL.md @@ -0,0 +1,357 @@ +--- +name: skill-creator +description: Guide for creating effective skills. This skill should be used when users want to create a new skill (or update an existing skill) that extends Claude's capabilities with specialized knowledge, workflows, or tool integrations. +license: Complete terms in LICENSE.txt +--- + +# Skill Creator + +This skill provides guidance for creating effective skills. + +## About Skills + +Skills are modular, self-contained packages that extend Claude's capabilities by providing +specialized knowledge, workflows, and tools. Think of them as "onboarding guides" for specific +domains or tasks—they transform Claude from a general-purpose agent into a specialized agent +equipped with procedural knowledge that no model can fully possess. + +### What Skills Provide + +1. Specialized workflows - Multi-step procedures for specific domains +2. Tool integrations - Instructions for working with specific file formats or APIs +3. Domain expertise - Company-specific knowledge, schemas, business logic +4. 
Bundled resources - Scripts, references, and assets for complex and repetitive tasks + +## Core Principles + +### Concise is Key + +The context window is a public good. Skills share the context window with everything else Claude needs: system prompt, conversation history, other Skills' metadata, and the actual user request. + +**Default assumption: Claude is already very smart.** Only add context Claude doesn't already have. Challenge each piece of information: "Does Claude really need this explanation?" and "Does this paragraph justify its token cost?" + +Prefer concise examples over verbose explanations. + +### Set Appropriate Degrees of Freedom + +Match the level of specificity to the task's fragility and variability: + +**High freedom (text-based instructions)**: Use when multiple approaches are valid, decisions depend on context, or heuristics guide the approach. + +**Medium freedom (pseudocode or scripts with parameters)**: Use when a preferred pattern exists, some variation is acceptable, or configuration affects behavior. + +**Low freedom (specific scripts, few parameters)**: Use when operations are fragile and error-prone, consistency is critical, or a specific sequence must be followed. + +Think of Claude as exploring a path: a narrow bridge with cliffs needs specific guardrails (low freedom), while an open field allows many routes (high freedom). + +### Anatomy of a Skill + +Every skill consists of a required SKILL.md file and optional bundled resources: + +``` +skill-name/ +├── SKILL.md (required) +│ ├── YAML frontmatter metadata (required) +│ │ ├── name: (required) +│ │ ├── description: (required) +│ │ └── compatibility: (optional, rarely needed) +│ └── Markdown instructions (required) +└── Bundled Resources (optional) + ├── scripts/ - Executable code (Python/Bash/etc.) + ├── references/ - Documentation intended to be loaded into context as needed + └── assets/ - Files used in output (templates, icons, fonts, etc.) 
+``` + +#### SKILL.md (required) + +Every SKILL.md consists of: + +- **Frontmatter** (YAML): Contains `name` and `description` fields (required), plus optional fields like `license`, `metadata`, and `compatibility`. Only `name` and `description` are read by Claude to determine when the skill triggers, so be clear and comprehensive about what the skill is and when it should be used. The `compatibility` field is for noting environment requirements (target product, system packages, etc.) but most skills don't need it. +- **Body** (Markdown): Instructions and guidance for using the skill. Only loaded AFTER the skill triggers (if at all). + +#### Bundled Resources (optional) + +##### Scripts (`scripts/`) + +Executable code (Python/Bash/etc.) for tasks that require deterministic reliability or are repeatedly rewritten. + +- **When to include**: When the same code is being rewritten repeatedly or deterministic reliability is needed +- **Example**: `scripts/rotate_pdf.py` for PDF rotation tasks +- **Benefits**: Token efficient, deterministic, may be executed without loading into context +- **Note**: Scripts may still need to be read by Claude for patching or environment-specific adjustments + +##### References (`references/`) + +Documentation and reference material intended to be loaded as needed into context to inform Claude's process and thinking. 
+ +- **When to include**: For documentation that Claude should reference while working +- **Examples**: `references/finance.md` for financial schemas, `references/mnda.md` for company NDA template, `references/policies.md` for company policies, `references/api_docs.md` for API specifications +- **Use cases**: Database schemas, API documentation, domain knowledge, company policies, detailed workflow guides +- **Benefits**: Keeps SKILL.md lean, loaded only when Claude determines it's needed +- **Best practice**: If files are large (>10k words), include grep search patterns in SKILL.md +- **Avoid duplication**: Information should live in either SKILL.md or references files, not both. Prefer references files for detailed information unless it's truly core to the skill—this keeps SKILL.md lean while making information discoverable without hogging the context window. Keep only essential procedural instructions and workflow guidance in SKILL.md; move detailed reference material, schemas, and examples to references files. + +##### Assets (`assets/`) + +Files not intended to be loaded into context, but rather used within the output Claude produces. + +- **When to include**: When the skill needs files that will be used in the final output +- **Examples**: `assets/logo.png` for brand assets, `assets/slides.pptx` for PowerPoint templates, `assets/frontend-template/` for HTML/React boilerplate, `assets/font.ttf` for typography +- **Use cases**: Templates, images, icons, boilerplate code, fonts, sample documents that get copied or modified +- **Benefits**: Separates output resources from documentation, enables Claude to use files without loading them into context + +#### What to Not Include in a Skill + +A skill should only contain essential files that directly support its functionality. Do NOT create extraneous documentation or auxiliary files, including: + +- README.md +- INSTALLATION_GUIDE.md +- QUICK_REFERENCE.md +- CHANGELOG.md +- etc. 
The skill should only contain the information needed for an AI agent to do the job at hand. It should not contain auxiliary context
+ +**Pattern 2: Domain-specific organization** + +For Skills with multiple domains, organize content by domain to avoid loading irrelevant context: + +``` +bigquery-skill/ +├── SKILL.md (overview and navigation) +└── reference/ + ├── finance.md (revenue, billing metrics) + ├── sales.md (opportunities, pipeline) + ├── product.md (API usage, features) + └── marketing.md (campaigns, attribution) +``` + +When a user asks about sales metrics, Claude only reads sales.md. + +Similarly, for skills supporting multiple frameworks or variants, organize by variant: + +``` +cloud-deploy/ +├── SKILL.md (workflow + provider selection) +└── references/ + ├── aws.md (AWS deployment patterns) + ├── gcp.md (GCP deployment patterns) + └── azure.md (Azure deployment patterns) +``` + +When the user chooses AWS, Claude only reads aws.md. + +**Pattern 3: Conditional details** + +Show basic content, link to advanced content: + +```markdown +# DOCX Processing + +## Creating documents + +Use docx-js for new documents. See [DOCX-JS.md](DOCX-JS.md). + +## Editing documents + +For simple edits, modify the XML directly. + +**For tracked changes**: See [REDLINING.md](REDLINING.md) +**For OOXML details**: See [OOXML.md](OOXML.md) +``` + +Claude reads REDLINING.md or OOXML.md only when the user needs those features. + +**Important guidelines:** + +- **Avoid deeply nested references** - Keep references one level deep from SKILL.md. All reference files should link directly from SKILL.md. +- **Structure longer reference files** - For files longer than 100 lines, include a table of contents at the top so Claude can see the full scope when previewing. + +## Skill Creation Process + +Skill creation involves these steps: + +1. Understand the skill with concrete examples +2. Plan reusable skill contents (scripts, references, assets) +3. Initialize the skill (run init_skill.py) +4. Edit the skill (implement resources and write SKILL.md) +5. Package the skill (run package_skill.py) +6. 
Iterate based on real usage + +Follow these steps in order, skipping only if there is a clear reason why they are not applicable. + +### Step 1: Understanding the Skill with Concrete Examples + +Skip this step only when the skill's usage patterns are already clearly understood. It remains valuable even when working with an existing skill. + +To create an effective skill, clearly understand concrete examples of how the skill will be used. This understanding can come from either direct user examples or generated examples that are validated with user feedback. + +For example, when building an image-editor skill, relevant questions include: + +- "What functionality should the image-editor skill support? Editing, rotating, anything else?" +- "Can you give some examples of how this skill would be used?" +- "I can imagine users asking for things like 'Remove the red-eye from this image' or 'Rotate this image'. Are there other ways you imagine this skill being used?" +- "What would a user say that should trigger this skill?" + +To avoid overwhelming users, avoid asking too many questions in a single message. Start with the most important questions and follow up as needed for better effectiveness. + +Conclude this step when there is a clear sense of the functionality the skill should support. + +### Step 2: Planning the Reusable Skill Contents + +To turn concrete examples into an effective skill, analyze each example by: + +1. Considering how to execute on the example from scratch +2. Identifying what scripts, references, and assets would be helpful when executing these workflows repeatedly + +Example: When building a `pdf-editor` skill to handle queries like "Help me rotate this PDF," the analysis shows: + +1. Rotating a PDF requires re-writing the same code each time +2. 
A `scripts/rotate_pdf.py` script would be helpful to store in the skill + +Example: When designing a `frontend-webapp-builder` skill for queries like "Build me a todo app" or "Build me a dashboard to track my steps," the analysis shows: + +1. Writing a frontend webapp requires the same boilerplate HTML/React each time +2. An `assets/hello-world/` template containing the boilerplate HTML/React project files would be helpful to store in the skill + +Example: When building a `big-query` skill to handle queries like "How many users have logged in today?" the analysis shows: + +1. Querying BigQuery requires re-discovering the table schemas and relationships each time +2. A `references/schema.md` file documenting the table schemas would be helpful to store in the skill + +To establish the skill's contents, analyze each concrete example to create a list of the reusable resources to include: scripts, references, and assets. + +### Step 3: Initializing the Skill + +At this point, it is time to actually create the skill. + +Skip this step only if the skill being developed already exists, and iteration or packaging is needed. In this case, continue to the next step. + +When creating a new skill from scratch, always run the `init_skill.py` script. The script conveniently generates a new template skill directory that automatically includes everything a skill requires, making the skill creation process much more efficient and reliable. + +Usage: + +```bash +scripts/init_skill.py --path +``` + +The script: + +- Creates the skill directory at the specified path +- Generates a SKILL.md template with proper frontmatter and TODO placeholders +- Creates example resource directories: `scripts/`, `references/`, and `assets/` +- Adds example files in each directory that can be customized or deleted + +After initialization, customize or remove the generated SKILL.md and example files as needed. 
+ +### Step 4: Edit the Skill + +When editing the (newly-generated or existing) skill, remember that the skill is being created for another instance of Claude to use. Include information that would be beneficial and non-obvious to Claude. Consider what procedural knowledge, domain-specific details, or reusable assets would help another Claude instance execute these tasks more effectively. + +#### Learn Proven Design Patterns + +Consult these helpful guides based on your skill's needs: + +- **Multi-step processes**: See references/workflows.md for sequential workflows and conditional logic +- **Specific output formats or quality standards**: See references/output-patterns.md for template and example patterns + +These files contain established best practices for effective skill design. + +#### Start with Reusable Skill Contents + +To begin implementation, start with the reusable resources identified above: `scripts/`, `references/`, and `assets/` files. Note that this step may require user input. For example, when implementing a `brand-guidelines` skill, the user may need to provide brand assets or templates to store in `assets/`, or documentation to store in `references/`. + +Added scripts must be tested by actually running them to ensure there are no bugs and that the output matches what is expected. If there are many similar scripts, only a representative sample needs to be tested to ensure confidence that they all work while balancing time to completion. + +Any example files and directories not needed for the skill should be deleted. The initialization script creates example files in `scripts/`, `references/`, and `assets/` to demonstrate structure, but most skills won't need all of them. + +#### Update SKILL.md + +**Writing Guidelines:** Always use imperative/infinitive form. 
+ +##### Frontmatter + +Write the YAML frontmatter with `name` and `description`: + +- `name`: The skill name +- `description`: This is the primary triggering mechanism for your skill, and helps Claude understand when to use the skill. + - Include both what the Skill does and specific triggers/contexts for when to use it. + - Include all "when to use" information here - Not in the body. The body is only loaded after triggering, so "When to Use This Skill" sections in the body are not helpful to Claude. + - Example description for a `docx` skill: "Comprehensive document creation, editing, and analysis with support for tracked changes, comments, formatting preservation, and text extraction. Use when Claude needs to work with professional documents (.docx files) for: (1) Creating new documents, (2) Modifying or editing content, (3) Working with tracked changes, (4) Adding comments, or any other document tasks" + +Do not include any other fields in YAML frontmatter. + +##### Body + +Write instructions for using the skill and its bundled resources. + +### Step 5: Packaging a Skill + +Once development of the skill is complete, it must be packaged into a distributable .skill file that gets shared with the user. The packaging process automatically validates the skill first to ensure it meets all requirements: + +```bash +scripts/package_skill.py +``` + +Optional output directory specification: + +```bash +scripts/package_skill.py ./dist +``` + +The packaging script will: + +1. **Validate** the skill automatically, checking: + + - YAML frontmatter format and required fields + - Skill naming conventions and directory structure + - Description completeness and quality + - File organization and resource references + +2. **Package** the skill if validation passes, creating a .skill file named after the skill (e.g., `my-skill.skill`) that includes all files and maintains the proper directory structure for distribution. The .skill file is a zip file with a .skill extension. 
+ +If validation fails, the script will report the errors and exit without creating a package. Fix any validation errors and run the packaging command again. + +### Step 6: Iterate + +After testing the skill, users may request improvements. Often this happens right after using the skill, with fresh context of how the skill performed. + +**Iteration workflow:** + +1. Use the skill on real tasks +2. Notice struggles or inefficiencies +3. Identify how SKILL.md or bundled resources should be updated +4. Implement changes and test again diff --git a/.github/skills/skill-creator/references/output-patterns.md b/.github/skills/skill-creator/references/output-patterns.md new file mode 100644 index 000000000000..073ddda5f039 --- /dev/null +++ b/.github/skills/skill-creator/references/output-patterns.md @@ -0,0 +1,82 @@ +# Output Patterns + +Use these patterns when skills need to produce consistent, high-quality output. + +## Template Pattern + +Provide templates for output format. Match the level of strictness to your needs. + +**For strict requirements (like API responses or data formats):** + +```markdown +## Report structure + +ALWAYS use this exact template structure: + +# [Analysis Title] + +## Executive summary +[One-paragraph overview of key findings] + +## Key findings +- Finding 1 with supporting data +- Finding 2 with supporting data +- Finding 3 with supporting data + +## Recommendations +1. Specific actionable recommendation +2. Specific actionable recommendation +``` + +**For flexible guidance (when adaptation is useful):** + +```markdown +## Report structure + +Here is a sensible default format, but use your best judgment: + +# [Analysis Title] + +## Executive summary +[Overview] + +## Key findings +[Adapt sections based on what you discover] + +## Recommendations +[Tailor to the specific context] + +Adjust sections as needed for the specific analysis type. 
+``` + +## Examples Pattern + +For skills where output quality depends on seeing examples, provide input/output pairs: + +```markdown +## Commit message format + +Generate commit messages following these examples: + +**Example 1:** +Input: Added user authentication with JWT tokens +Output: +``` +feat(auth): implement JWT-based authentication + +Add login endpoint and token validation middleware +``` + +**Example 2:** +Input: Fixed bug where dates displayed incorrectly in reports +Output: +``` +fix(reports): correct date formatting in timezone conversion + +Use UTC timestamps consistently across report generation +``` + +Follow this style: type(scope): brief description, then detailed explanation. +``` + +Examples help Claude understand the desired style and level of detail more clearly than descriptions alone. diff --git a/.github/skills/skill-creator/references/workflows.md b/.github/skills/skill-creator/references/workflows.md new file mode 100644 index 000000000000..a350c3cc8136 --- /dev/null +++ b/.github/skills/skill-creator/references/workflows.md @@ -0,0 +1,28 @@ +# Workflow Patterns + +## Sequential Workflows + +For complex tasks, break operations into clear, sequential steps. It is often helpful to give Claude an overview of the process towards the beginning of SKILL.md: + +```markdown +Filling a PDF form involves these steps: + +1. Analyze the form (run analyze_form.py) +2. Create field mapping (edit fields.json) +3. Validate mapping (run validate_fields.py) +4. Fill the form (run fill_form.py) +5. Verify output (run verify_output.py) +``` + +## Conditional Workflows + +For tasks with branching logic, guide Claude through decision points: + +```markdown +1. Determine the modification type: + **Creating new content?** → Follow "Creation workflow" below + **Editing existing content?** → Follow "Editing workflow" below + +2. Creation workflow: [steps] +3. 
Editing workflow: [steps] +``` \ No newline at end of file diff --git a/.github/skills/skill-creator/scripts/init_skill.py b/.github/skills/skill-creator/scripts/init_skill.py new file mode 100644 index 000000000000..c544fc725d72 --- /dev/null +++ b/.github/skills/skill-creator/scripts/init_skill.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +""" +Skill Initializer - Creates a new skill from template + +Usage: + init_skill.py --path + +Examples: + init_skill.py my-new-skill --path skills/public + init_skill.py my-api-helper --path skills/private + init_skill.py custom-skill --path /custom/location +""" + +import sys +from pathlib import Path + + +SKILL_TEMPLATE = """--- +name: {skill_name} +description: [TODO: Complete and informative explanation of what the skill does and when to use it. Include WHEN to use this skill - specific scenarios, file types, or tasks that trigger it.] +--- + +# {skill_title} + +## Overview + +[TODO: 1-2 sentences explaining what this skill enables] + +## Structuring This Skill + +[TODO: Choose the structure that best fits this skill's purpose. Common patterns: + +**1. Workflow-Based** (best for sequential processes) +- Works well when there are clear step-by-step procedures +- Example: DOCX skill with "Workflow Decision Tree" → "Reading" → "Creating" → "Editing" +- Structure: ## Overview → ## Workflow Decision Tree → ## Step 1 → ## Step 2... + +**2. Task-Based** (best for tool collections) +- Works well when the skill offers different operations/capabilities +- Example: PDF skill with "Quick Start" → "Merge PDFs" → "Split PDFs" → "Extract Text" +- Structure: ## Overview → ## Quick Start → ## Task Category 1 → ## Task Category 2... + +**3. Reference/Guidelines** (best for standards or specifications) +- Works well for brand guidelines, coding standards, or requirements +- Example: Brand styling with "Brand Guidelines" → "Colors" → "Typography" → "Features" +- Structure: ## Overview → ## Guidelines → ## Specifications → ## Usage... + +**4. 
Capabilities-Based** (best for integrated systems) +- Works well when the skill provides multiple interrelated features +- Example: Product Management with "Core Capabilities" → numbered capability list +- Structure: ## Overview → ## Core Capabilities → ### 1. Feature → ### 2. Feature... + +Patterns can be mixed and matched as needed. Most skills combine patterns (e.g., start with task-based, add workflow for complex operations). + +Delete this entire "Structuring This Skill" section when done - it's just guidance.] + +## [TODO: Replace with the first main section based on chosen structure] + +[TODO: Add content here. See examples in existing skills: +- Code samples for technical skills +- Decision trees for complex workflows +- Concrete examples with realistic user requests +- References to scripts/templates/references as needed] + +## Resources + +This skill includes example resource directories that demonstrate how to organize different types of bundled resources: + +### scripts/ +Executable code (Python/Bash/etc.) that can be run directly to perform specific operations. + +**Examples from other skills:** +- PDF skill: `fill_fillable_fields.py`, `extract_form_field_info.py` - utilities for PDF manipulation +- DOCX skill: `document.py`, `utilities.py` - Python modules for document processing + +**Appropriate for:** Python scripts, shell scripts, or any executable code that performs automation, data processing, or specific operations. + +**Note:** Scripts may be executed without loading into context, but can still be read by Claude for patching or environment adjustments. + +### references/ +Documentation and reference material intended to be loaded into context to inform Claude's process and thinking. 
+ +**Examples from other skills:** +- Product management: `communication.md`, `context_building.md` - detailed workflow guides +- BigQuery: API reference documentation and query examples +- Finance: Schema documentation, company policies + +**Appropriate for:** In-depth documentation, API references, database schemas, comprehensive guides, or any detailed information that Claude should reference while working. + +### assets/ +Files not intended to be loaded into context, but rather used within the output Claude produces. + +**Examples from other skills:** +- Brand styling: PowerPoint template files (.pptx), logo files +- Frontend builder: HTML/React boilerplate project directories +- Typography: Font files (.ttf, .woff2) + +**Appropriate for:** Templates, boilerplate code, document templates, images, icons, fonts, or any files meant to be copied or used in the final output. + +--- + +**Any unneeded directories can be deleted.** Not every skill requires all three types of resources. +""" + +EXAMPLE_SCRIPT = '''#!/usr/bin/env python3 +""" +Example helper script for {skill_name} + +This is a placeholder script that can be executed directly. +Replace with actual implementation or delete if not needed. + +Example real scripts from other skills: +- pdf/scripts/fill_fillable_fields.py - Fills PDF form fields +- pdf/scripts/convert_pdf_to_images.py - Converts PDF pages to images +""" + +def main(): + print("This is an example script for {skill_name}") + # TODO: Add actual script logic here + # This could be data processing, file conversion, API calls, etc. + +if __name__ == "__main__": + main() +''' + +EXAMPLE_REFERENCE = """# Reference Documentation for {skill_title} + +This is a placeholder for detailed reference documentation. +Replace with actual reference content or delete if not needed. 
+ +Example real reference docs from other skills: +- product-management/references/communication.md - Comprehensive guide for status updates +- product-management/references/context_building.md - Deep-dive on gathering context +- bigquery/references/ - API references and query examples + +## When Reference Docs Are Useful + +Reference docs are ideal for: +- Comprehensive API documentation +- Detailed workflow guides +- Complex multi-step processes +- Information too lengthy for main SKILL.md +- Content that's only needed for specific use cases + +## Structure Suggestions + +### API Reference Example +- Overview +- Authentication +- Endpoints with examples +- Error codes +- Rate limits + +### Workflow Guide Example +- Prerequisites +- Step-by-step instructions +- Common patterns +- Troubleshooting +- Best practices +""" + +EXAMPLE_ASSET = """# Example Asset File + +This placeholder represents where asset files would be stored. +Replace with actual asset files (templates, images, fonts, etc.) or delete if not needed. + +Asset files are NOT intended to be loaded into context, but rather used within +the output Claude produces. + +Example asset files from other skills: +- Brand guidelines: logo.png, slides_template.pptx +- Frontend builder: hello-world/ directory with HTML/React boilerplate +- Typography: custom-font.ttf, font-family.woff2 +- Data: sample_data.csv, test_dataset.json + +## Common Asset Types + +- Templates: .pptx, .docx, boilerplate directories +- Images: .png, .jpg, .svg, .gif +- Fonts: .ttf, .otf, .woff, .woff2 +- Boilerplate code: Project directories, starter files +- Icons: .ico, .svg +- Data files: .csv, .json, .xml, .yaml + +Note: This is a text placeholder. Actual assets can be any file type. 
+""" + + +def title_case_skill_name(skill_name): + """Convert hyphenated skill name to Title Case for display.""" + return ' '.join(word.capitalize() for word in skill_name.split('-')) + + +def init_skill(skill_name, path): + """ + Initialize a new skill directory with template SKILL.md. + + Args: + skill_name: Name of the skill + path: Path where the skill directory should be created + + Returns: + Path to created skill directory, or None if error + """ + # Determine skill directory path + skill_dir = Path(path).resolve() / skill_name + + # Check if directory already exists + if skill_dir.exists(): + print(f"❌ Error: Skill directory already exists: {skill_dir}") + return None + + # Create skill directory + try: + skill_dir.mkdir(parents=True, exist_ok=False) + print(f"✅ Created skill directory: {skill_dir}") + except Exception as e: + print(f"❌ Error creating directory: {e}") + return None + + # Create SKILL.md from template + skill_title = title_case_skill_name(skill_name) + skill_content = SKILL_TEMPLATE.format( + skill_name=skill_name, + skill_title=skill_title + ) + + skill_md_path = skill_dir / 'SKILL.md' + try: + skill_md_path.write_text(skill_content) + print("✅ Created SKILL.md") + except Exception as e: + print(f"❌ Error creating SKILL.md: {e}") + return None + + # Create resource directories with example files + try: + # Create scripts/ directory with example script + scripts_dir = skill_dir / 'scripts' + scripts_dir.mkdir(exist_ok=True) + example_script = scripts_dir / 'example.py' + example_script.write_text(EXAMPLE_SCRIPT.format(skill_name=skill_name)) + example_script.chmod(0o755) + print("✅ Created scripts/example.py") + + # Create references/ directory with example reference doc + references_dir = skill_dir / 'references' + references_dir.mkdir(exist_ok=True) + example_reference = references_dir / 'api_reference.md' + example_reference.write_text(EXAMPLE_REFERENCE.format(skill_title=skill_title)) + print("✅ Created references/api_reference.md") + + 
# Create assets/ directory with example asset placeholder + assets_dir = skill_dir / 'assets' + assets_dir.mkdir(exist_ok=True) + example_asset = assets_dir / 'example_asset.txt' + example_asset.write_text(EXAMPLE_ASSET) + print("✅ Created assets/example_asset.txt") + except Exception as e: + print(f"❌ Error creating resource directories: {e}") + return None + + # Print next steps + print(f"\n✅ Skill '{skill_name}' initialized successfully at {skill_dir}") + print("\nNext steps:") + print("1. Edit SKILL.md to complete the TODO items and update the description") + print("2. Customize or delete the example files in scripts/, references/, and assets/") + print("3. Run the validator when ready to check the skill structure") + + return skill_dir + + +def main(): + if len(sys.argv) < 4 or sys.argv[2] != '--path': + print("Usage: init_skill.py --path ") + print("\nSkill name requirements:") + print(" - Kebab-case identifier (e.g., 'my-data-analyzer')") + print(" - Lowercase letters, digits, and hyphens only") + print(" - Max 64 characters") + print(" - Must match directory name exactly") + print("\nExamples:") + print(" init_skill.py my-new-skill --path skills/public") + print(" init_skill.py my-api-helper --path skills/private") + print(" init_skill.py custom-skill --path /custom/location") + sys.exit(1) + + skill_name = sys.argv[1] + path = sys.argv[3] + + print(f"🚀 Initializing skill: {skill_name}") + print(f" Location: {path}") + print() + + result = init_skill(skill_name, path) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.github/skills/skill-creator/scripts/package_skill.py b/.github/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 000000000000..5cd36cb16e13 --- /dev/null +++ b/.github/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python 
utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import sys +import zipfile +from pathlib import Path +from quick_validate import validate_skill + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. + + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + print(f"❌ Error: Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + print(f"❌ Error: Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + print(f"❌ Error: SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + print("🔍 Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + print(f"❌ Validation failed: {message}") + print(" Please fix the validation errors before packaging.") + return None + print(f"✅ {message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory + for file_path in skill_path.rglob('*'): + if file_path.is_file(): + # Calculate the relative path within the zip + arcname = file_path.relative_to(skill_path.parent) + zipf.write(file_path, arcname) + print(f" Added: {arcname}") + + print(f"\n✅ 
Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + print(f"❌ Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + print("Usage: python utils/package_skill.py [output-directory]") + print("\nExample:") + print(" python utils/package_skill.py skills/public/my-skill") + print(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + print(f"📦 Packaging skill: {skill_path}") + if output_dir: + print(f" Output directory: {output_dir}") + print() + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.github/skills/skill-creator/scripts/quick_validate.py b/.github/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 000000000000..ed8e1dddce77 --- /dev/null +++ b/.github/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib import Path + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as 
e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." 
+ + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + print(message) + sys.exit(0 if valid else 1) \ No newline at end of file From c43f5b6744149be4c16c2aed71ad2fd15ad01cb6 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Fri, 27 Feb 2026 14:50:32 -0800 Subject: [PATCH 03/22] Reorganize benchmark copilot: co-locate scripts with skills, consolidate runtime config Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 3 - sdk/cosmos/.gitignore | 2 - sdk/cosmos/azure-cosmos-benchmark/.gitignore | 23 +-- .../copilot}/agents/cosmos-benchmark.agent.md | 12 +- .../skills/cosmos-benchmark-analyze/SKILL.md | 6 +- .../references/kusto-schema.md | 0 .../references/parse_hprof.py | 0 .../references/thresholds.md | 0 .../scripts/generate-dashboard.py | 0 .../cosmos-benchmark-provision/SKILL.md | 20 +-- .../references/vm-sizing.md | 9 +- .../scripts/provision-benchmark-vm.sh | 16 +- .../scripts/setup-result-storage.sh | 0 .../skills/cosmos-benchmark-run/SKILL.md | 22 +-- .../references/presets.md | 0 .../references/scenarios.md | 0 .../scripts/capture-diagnostics.sh | 0 .../cosmos-benchmark-run}/scripts/monitor.sh | 0 .../scripts/run-benchmark.sh | 4 +- .../scripts/trigger-benchmark.sh | 0 .../skills/cosmos-benchmark-setup/SKILL.md | 10 +- .../references}/tenants-sample.json | 0 .../scripts/setup-benchmark-vm.sh | 2 +- .../skills/cosmos-benchmark-status/SKILL.md | 4 +- .../copilot}/skills/skill-creator/LICENSE.txt | 0 .../copilot}/skills/skill-creator/SKILL.md | 0 .../references/output-patterns.md | 0 .../skill-creator/references/workflows.md | 0 .../skill-creator/scripts/init_skill.py | 0 .../skill-creator/scripts/package_skill.py | 0 .../skill-creator/scripts/quick_validate.py | 0 .../cosmos/benchmark/BenchmarkConfig.java | 69 ++++++-- .../benchmark/BenchmarkOrchestrator.java | 145 ++++++++--------- .../benchmark/TenantWorkloadConfig.java 
| 65 +++++--- .../test-results/BENCHMARK_RESULTS.md | 154 ------------------ .../test-results/README.md | 39 ----- .../test-setup/README.md | 39 ----- 37 files changed, 217 insertions(+), 427 deletions(-) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/agents/cosmos-benchmark.agent.md (77%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-analyze/SKILL.md (96%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-analyze/references/kusto-schema.md (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-analyze/references/parse_hprof.py (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-analyze/references/thresholds.md (100%) rename sdk/cosmos/azure-cosmos-benchmark/{ => copilot/skills/cosmos-benchmark-analyze}/scripts/generate-dashboard.py (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-provision/SKILL.md (90%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-provision/references/vm-sizing.md (50%) rename sdk/cosmos/azure-cosmos-benchmark/{ => copilot/skills/cosmos-benchmark-provision}/scripts/provision-benchmark-vm.sh (91%) rename sdk/cosmos/azure-cosmos-benchmark/{ => copilot/skills/cosmos-benchmark-provision}/scripts/setup-result-storage.sh (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-run/SKILL.md (82%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-run/references/presets.md (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-run/references/scenarios.md (100%) rename sdk/cosmos/azure-cosmos-benchmark/{ => copilot/skills/cosmos-benchmark-run}/scripts/capture-diagnostics.sh (100%) rename sdk/cosmos/azure-cosmos-benchmark/{ => copilot/skills/cosmos-benchmark-run}/scripts/monitor.sh (100%) rename 
sdk/cosmos/azure-cosmos-benchmark/{ => copilot/skills/cosmos-benchmark-run}/scripts/run-benchmark.sh (95%) rename sdk/cosmos/azure-cosmos-benchmark/{ => copilot/skills/cosmos-benchmark-run}/scripts/trigger-benchmark.sh (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-setup/SKILL.md (94%) rename sdk/cosmos/azure-cosmos-benchmark/{test-setup => copilot/skills/cosmos-benchmark-setup/references}/tenants-sample.json (100%) rename sdk/cosmos/azure-cosmos-benchmark/{ => copilot/skills/cosmos-benchmark-setup}/scripts/setup-benchmark-vm.sh (97%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/cosmos-benchmark-status/SKILL.md (88%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/skill-creator/LICENSE.txt (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/skill-creator/SKILL.md (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/skill-creator/references/output-patterns.md (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/skill-creator/references/workflows.md (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/skill-creator/scripts/init_skill.py (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/skill-creator/scripts/package_skill.py (100%) rename {.github => sdk/cosmos/azure-cosmos-benchmark/copilot}/skills/skill-creator/scripts/quick_validate.py (100%) delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/test-results/BENCHMARK_RESULTS.md delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/test-results/README.md delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/test-setup/README.md diff --git a/.gitignore b/.gitignore index bd8efd5068c2..997c0e0648a2 100644 --- a/.gitignore +++ b/.gitignore @@ -126,6 +126,3 @@ stress-test-addons* # Temp typespec files TempTypeSpecFiles/ - -# Copilot skills (local agent config) -.github/skills/ diff --git a/sdk/cosmos/.gitignore 
b/sdk/cosmos/.gitignore index 3e6db8ffea6a..1ea74182f6bb 100644 --- a/sdk/cosmos/.gitignore +++ b/sdk/cosmos/.gitignore @@ -2,5 +2,3 @@ metastore_db/* spark-warehouse/* - -multi-tenancy-analysis.md diff --git a/sdk/cosmos/azure-cosmos-benchmark/.gitignore b/sdk/cosmos/azure-cosmos-benchmark/.gitignore index ffe22f016492..88bdda5b2ee0 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/.gitignore +++ b/sdk/cosmos/azure-cosmos-benchmark/.gitignore @@ -1,24 +1,5 @@ -# Test setup (contains secrets and tenant configs) -test-setup/ -!test-setup/README.md -!test-setup/tenants-sample.json - -# Test results (downloaded from VM) -test-results/ -!test-results/README.md - -# VM connection files -.vm-ip -.vm-user -.vm-key +# Benchmark config (contains secrets, tenant configs, VM connection info) +benchmark-config/ # Results directory results/ - -# Scripts -scripts/ - -# Benchmark documentation -BENCHMARK_RESULTS.md -IMPLEMENTATION_GUIDE.md -MULTI_TENANCY_TEST_PLAN.md diff --git a/.github/agents/cosmos-benchmark.agent.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/agents/cosmos-benchmark.agent.md similarity index 77% rename from .github/agents/cosmos-benchmark.agent.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/agents/cosmos-benchmark.agent.md index 24e7ec17880e..70f8ef25752f 100644 --- a/.github/agents/cosmos-benchmark.agent.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/agents/cosmos-benchmark.agent.md @@ -15,13 +15,13 @@ Determine user intent and follow the matching workflow: | User wants to... 
| Skill to load | |---|---| -| Provision Azure resources (Cosmos accounts, App Insights, VMs) | Read `.github/skills/cosmos-benchmark-provision/SKILL.md` | -| Set up environment (install tools, clone repo, generate config, build) | Read `.github/skills/cosmos-benchmark-setup/SKILL.md` | -| Run a benchmark (checkout branch/PR, select scenario, execute) | Read `.github/skills/cosmos-benchmark-run/SKILL.md` | -| Analyze results (CSV, compare runs, heap/thread dumps, reports, Kusto) | Read `.github/skills/cosmos-benchmark-analyze/SKILL.md` | -| Check status (resources, runs, VM, build, config overview) | Read `.github/skills/cosmos-benchmark-status/SKILL.md` | +| Provision Azure resources (Cosmos accounts, App Insights, VMs) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/SKILL.md` | +| Set up environment (install tools, clone repo, generate config, build) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/SKILL.md` | +| Run a benchmark (checkout branch/PR, select scenario, execute) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md` | +| Analyze results (CSV, compare runs, heap/thread dumps, reports, Kusto) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md` | +| Check status (resources, runs, VM, build, config overview) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-status/SKILL.md` | -When a skill references files in its `references/` directory, read them from the skill's directory (e.g., `.github/skills/cosmos-benchmark-analyze/references/thresholds.md`). +When a skill references files in its `references/` directory, read them from the skill's directory (e.g., `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/thresholds.md`). 
## Subagent Usage diff --git a/.github/skills/cosmos-benchmark-analyze/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md similarity index 96% rename from .github/skills/cosmos-benchmark-analyze/SKILL.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md index f1f1afaaf79f..675b6b4a1356 100644 --- a/.github/skills/cosmos-benchmark-analyze/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md @@ -11,7 +11,7 @@ Comprehensive post-run analysis: download results, CSV metrics, run comparison, Auto-detect VM connection: ```bash -VM_IP=$(cat .vm-ip); VM_USER=$(cat .vm-user); VM_KEY=$(cat .vm-key) +VM_IP=$(cat benchmark-config/vm-ip); VM_USER=$(cat benchmark-config/vm-user); VM_KEY=$(cat benchmark-config/vm-key) ``` Download a run's results: @@ -156,7 +156,7 @@ Produce a markdown report with embedded charts (as inline base64 images or Plotl ### Using generate-dashboard.py ```bash -python3 scripts/generate-dashboard.py \ +python3 copilot/skills/cosmos-benchmark-analyze/scripts/generate-dashboard.py \ /metrics \ /benchmark.log \ /report.html \ @@ -193,4 +193,4 @@ Placeholder: user will provide specific KQL queries for latency percentiles, err - **Pass/fail thresholds and CSV columns**: `references/thresholds.md` - **Python hprof parser**: `references/parse_hprof.py` - **Kusto table schema & ingestion**: `references/kusto-schema.md` -- **Dashboard generator**: `sdk/cosmos/azure-cosmos-benchmark/scripts/generate-dashboard.py` +- **Dashboard generator**: `scripts/generate-dashboard.py` diff --git a/.github/skills/cosmos-benchmark-analyze/references/kusto-schema.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/kusto-schema.md similarity index 100% rename from .github/skills/cosmos-benchmark-analyze/references/kusto-schema.md rename to 
sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/kusto-schema.md diff --git a/.github/skills/cosmos-benchmark-analyze/references/parse_hprof.py b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/parse_hprof.py similarity index 100% rename from .github/skills/cosmos-benchmark-analyze/references/parse_hprof.py rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/parse_hprof.py diff --git a/.github/skills/cosmos-benchmark-analyze/references/thresholds.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/thresholds.md similarity index 100% rename from .github/skills/cosmos-benchmark-analyze/references/thresholds.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/thresholds.md diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/generate-dashboard.py b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-dashboard.py similarity index 100% rename from sdk/cosmos/azure-cosmos-benchmark/scripts/generate-dashboard.py rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-dashboard.py diff --git a/.github/skills/cosmos-benchmark-provision/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/SKILL.md similarity index 90% rename from .github/skills/cosmos-benchmark-provision/SKILL.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/SKILL.md index 84dcfa0356c8..ad36bf5ad6e1 100644 --- a/.github/skills/cosmos-benchmark-provision/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/SKILL.md @@ -114,7 +114,7 @@ AI_CONN_STR=$(az monitor app-insights component show \ --app \ --resource-group rg-cosmos-benchmark \ --query connectionString -o tsv) -echo "$AI_CONN_STR" > 
test-setup/app-insights-connection-string.txt +echo "$AI_CONN_STR" > benchmark-config/app-insights-connection-string.txt ``` The **run** skill uses this via environment variable `APPLICATIONINSIGHTS_CONNECTION_STRING`. @@ -128,14 +128,14 @@ Ask user how to discover: az monitor app-insights component list -g --query "[].{name:name, connectionString:connectionString}" -o table ``` -Save to `test-setup/app-insights-connection-string.txt`. +Save to `benchmark-config/app-insights-connection-string.txt`. ## 3. Azure VMs ### Create new (via provision script) ```bash -bash sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh \ +bash copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh \ --new --location --create-key \ [--size Standard_D16s_v5] \ [--disk-size 256] \ @@ -177,21 +177,21 @@ az vm open-port --resource-group rg-cosmos-benchmark --name vm-benchmark-01 --po ```bash # Option A: provide IP directly -bash sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh \ +bash copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh \ --existing --ip --user benchuser --key ~/.ssh/id_rsa # Option B: discover from resource group + VM name -bash sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh \ +bash copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh \ --existing --rg --vm-name --key ~/.ssh/id_rsa ``` ### Connection info saved The provision script saves: -- `.vm-ip` — VM public IP -- `.vm-user` — SSH username -- `.vm-key` — path to SSH private key -- `test-setup/vm-config.env` — all three in `KEY=VALUE` format +- `benchmark-config/vm-ip` — VM public IP +- `benchmark-config/vm-user` — SSH username +- `benchmark-config/vm-key` — path to SSH private key +- `benchmark-config/vm-config.env` — all three in `KEY=VALUE` format Read `references/vm-sizing.md` for workload-specific VM sizing. 
@@ -212,4 +212,4 @@ Use the **cosmos-benchmark-setup** skill to: ## References - **VM sizing by workload**: `references/vm-sizing.md` -- **Provision script**: `sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh` +- **Provision script**: `scripts/provision-benchmark-vm.sh` diff --git a/.github/skills/cosmos-benchmark-provision/references/vm-sizing.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/references/vm-sizing.md similarity index 50% rename from .github/skills/cosmos-benchmark-provision/references/vm-sizing.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/references/vm-sizing.md index 08f35f89d955..b6a21a251f3d 100644 --- a/.github/skills/cosmos-benchmark-provision/references/vm-sizing.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/references/vm-sizing.md @@ -1,13 +1,8 @@ # VM Sizing Reference -## Recommended configurations by workload +## Default VM size -| Tenants | Scenario | VM Size | vCPUs | RAM | Notes | -|---------|----------|---------|-------|-----|-------| -| 1-10 | Quick test | Standard_D4s_v5 | 4 | 16 GB | Minimal | -| 10-50 | CHURN, SCALING | Standard_D16s_v5 | 16 | 64 GB | Default recommendation | -| 50-100 | POOL_PRESSURE | Standard_D32s_v5 | 32 | 128 GB | High concurrency | -| 100+ | SOAK (24h) | Standard_D16s_v5 | 16 | 64 GB | Long-running, moderate load | +Use **Standard_D16s_v5** (16 vCPUs, 64 GB RAM) for all benchmark workloads. 
## Cost considerations diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh similarity index 91% rename from sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh index 8b9a425848c1..c3d18a9ca7a5 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/scripts/provision-benchmark-vm.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh @@ -111,13 +111,13 @@ else exit 1 fi -echo "$VM_IP" > .vm-ip -echo "$SSH_USER" > .vm-user -echo "$SSH_PRIVATE_KEY" > .vm-key +echo "$VM_IP" > benchmark-config/vm-ip +echo "$SSH_USER" > benchmark-config/vm-user +echo "$SSH_PRIVATE_KEY" > benchmark-config/vm-key -# Also save to test-setup/ for organized access -mkdir -p test-setup -echo "VM_IP=$VM_IP" > test-setup/vm-config.env -echo "VM_USER=$SSH_USER" >> test-setup/vm-config.env -echo "VM_KEY_PATH=$SSH_PRIVATE_KEY" >> test-setup/vm-config.env +# Also save to benchmark-config/ for organized access +mkdir -p benchmark-config +echo "VM_IP=$VM_IP" > benchmark-config/vm-config.env +echo "VM_USER=$SSH_USER" >> benchmark-config/vm-config.env +echo "VM_KEY_PATH=$SSH_PRIVATE_KEY" >> benchmark-config/vm-config.env echo "=== Ready: $(ssh_cmd) ${SSH_USER}@${VM_IP} ===" diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/setup-result-storage.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/setup-result-storage.sh similarity index 100% rename from sdk/cosmos/azure-cosmos-benchmark/scripts/setup-result-storage.sh rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/setup-result-storage.sh diff --git a/.github/skills/cosmos-benchmark-run/SKILL.md 
b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md similarity index 82% rename from .github/skills/cosmos-benchmark-run/SKILL.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md index b3822ff908cf..dc60bf3899b9 100644 --- a/.github/skills/cosmos-benchmark-run/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md @@ -12,9 +12,9 @@ Execute a benchmark on one or more VMs. Always uses `run-benchmark.sh` wrapper ( Auto-detect from provision output: ```bash -VM_IP=$(cat .vm-ip) -VM_USER=$(cat .vm-user) -VM_KEY=$(cat .vm-key) +VM_IP=$(cat benchmark-config/vm-ip) +VM_USER=$(cat benchmark-config/vm-user) +VM_KEY=$(cat benchmark-config/vm-key) SSH_CMD="ssh -i $VM_KEY $VM_USER@$VM_IP" ``` @@ -22,10 +22,10 @@ SSH_CMD="ssh -i $VM_KEY $VM_USER@$VM_IP" ### Application Insights (auto-configure from provision output) -If `test-setup/app-insights-connection-string.txt` exists: +If `benchmark-config/app-insights-connection-string.txt` exists: ```bash -AI_CONN_STR=$(cat test-setup/app-insights-connection-string.txt) +AI_CONN_STR=$(cat benchmark-config/app-insights-connection-string.txt) $SSH_CMD "echo 'export APPLICATIONINSIGHTS_CONNECTION_STRING=\"$AI_CONN_STR\"' >> ~/.bashrc" ``` @@ -67,7 +67,7 @@ Always run inside tmux so the benchmark survives SSH disconnection. 
```bash $SSH_CMD "tmux new-session -d -s bench 'cd ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark && \ - bash scripts/run-benchmark.sh CHURN ~/tenants.json ./results/ [extra-flags]'" + bash copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh CHURN ~/tenants.json ./results/ [extra-flags]'" ``` ### Monitor progress @@ -99,19 +99,19 @@ For comparing versions or running different scenarios simultaneously: # VM 1: baseline (main branch) ssh -i $VM_KEY $VM_USER@ "tmux new-session -d -s bench \ 'cd ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark && \ - bash scripts/run-benchmark.sh CHURN ~/tenants.json ./results/baseline-main'" + bash copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh CHURN ~/tenants.json ./results/baseline-main'" # VM 2: fix branch ssh -i $VM_KEY $VM_USER@ "tmux new-session -d -s bench \ 'cd ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark && \ - bash scripts/run-benchmark.sh CHURN ~/tenants.json ./results/fix-branch'" + bash copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh CHURN ~/tenants.json ./results/fix-branch'" ``` To test different SDK versions on different VMs, use the **setup** skill on each VM with different branch/PR/commit targets before running. ## 5. What run-benchmark.sh Does -The wrapper script (`scripts/run-benchmark.sh`) handles: +The wrapper script (`copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh`) handles: 1. Captures git metadata (branch, commit) → `git-info.json` 2. Launches JVM with `-Xmx8g -XX:+UseG1GC` + GC logging 3. Spawns `monitor.sh` in parallel for external JVM monitoring → `monitor.csv` @@ -136,5 +136,5 @@ Suggest using the **cosmos-benchmark-analyze** skill to analyze results. 
- **Preset flag recipes**: `references/presets.md` - **Full operation catalog & custom scenarios**: `references/scenarios.md` -- **Run script**: `sdk/cosmos/azure-cosmos-benchmark/scripts/run-benchmark.sh` -- **Trigger script**: `sdk/cosmos/azure-cosmos-benchmark/scripts/trigger-benchmark.sh` +- **Run script**: `scripts/run-benchmark.sh` +- **Trigger script**: `scripts/trigger-benchmark.sh` diff --git a/.github/skills/cosmos-benchmark-run/references/presets.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/references/presets.md similarity index 100% rename from .github/skills/cosmos-benchmark-run/references/presets.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/references/presets.md diff --git a/.github/skills/cosmos-benchmark-run/references/scenarios.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/references/scenarios.md similarity index 100% rename from .github/skills/cosmos-benchmark-run/references/scenarios.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/references/scenarios.md diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/capture-diagnostics.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/capture-diagnostics.sh similarity index 100% rename from sdk/cosmos/azure-cosmos-benchmark/scripts/capture-diagnostics.sh rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/capture-diagnostics.sh diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/monitor.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/monitor.sh similarity index 100% rename from sdk/cosmos/azure-cosmos-benchmark/scripts/monitor.sh rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/monitor.sh diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/run-benchmark.sh 
b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh similarity index 95% rename from sdk/cosmos/azure-cosmos-benchmark/scripts/run-benchmark.sh rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh index 67414bd8046e..6039a1975d34 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/scripts/run-benchmark.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh @@ -8,8 +8,8 @@ set -euo pipefail SCENARIO=${1:-SCALING} TENANTS_FILE=${2:-tenants.json} -if [[ ! -f "$TENANTS_FILE" && -f "sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants.json" ]]; then - TENANTS_FILE="sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants.json" +if [[ ! -f "$TENANTS_FILE" && -f "sdk/cosmos/azure-cosmos-benchmark/benchmark-config/tenants.json" ]]; then + TENANTS_FILE="sdk/cosmos/azure-cosmos-benchmark/benchmark-config/tenants.json" fi OUTPUT_DIR=${3:-./results/$(date +%Y%m%dT%H%M%S)-${SCENARIO}} diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/trigger-benchmark.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/trigger-benchmark.sh similarity index 100% rename from sdk/cosmos/azure-cosmos-benchmark/scripts/trigger-benchmark.sh rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/trigger-benchmark.sh diff --git a/.github/skills/cosmos-benchmark-setup/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/SKILL.md similarity index 94% rename from .github/skills/cosmos-benchmark-setup/SKILL.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/SKILL.md index 098f0aedb9a0..d172db221b47 100644 --- a/.github/skills/cosmos-benchmark-setup/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/SKILL.md @@ -12,13 +12,13 @@ Prepare the execution environment after infrastructure is provisioned. 
Auto-detect VM connection info from provision output: ```bash -VM_IP=$(cat .vm-ip) -VM_USER=$(cat .vm-user) -VM_KEY=$(cat .vm-key) +VM_IP=$(cat benchmark-config/vm-ip) +VM_USER=$(cat benchmark-config/vm-user) +VM_KEY=$(cat benchmark-config/vm-key) SSH_CMD="ssh -i $VM_KEY $VM_USER@$VM_IP" ``` -If `.vm-ip` doesn't exist, ask the user for VM IP and SSH credentials. +If `benchmark-config/vm-ip` doesn't exist, ask the user for VM IP and SSH credentials. All remote commands below use `$SSH_CMD` as shorthand. @@ -27,7 +27,7 @@ All remote commands below use `$SSH_CMD` as shorthand. Run the setup script: ```bash -$SSH_CMD 'bash -s' < sdk/cosmos/azure-cosmos-benchmark/scripts/setup-benchmark-vm.sh +$SSH_CMD 'bash -s' < copilot/skills/cosmos-benchmark-setup/scripts/setup-benchmark-vm.sh ``` ### What gets installed diff --git a/sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants-sample.json b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/references/tenants-sample.json similarity index 100% rename from sdk/cosmos/azure-cosmos-benchmark/test-setup/tenants-sample.json rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/references/tenants-sample.json diff --git a/sdk/cosmos/azure-cosmos-benchmark/scripts/setup-benchmark-vm.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/scripts/setup-benchmark-vm.sh similarity index 97% rename from sdk/cosmos/azure-cosmos-benchmark/scripts/setup-benchmark-vm.sh rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/scripts/setup-benchmark-vm.sh index 371cb9bc72df..8fbb6c6cf148 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/scripts/setup-benchmark-vm.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/scripts/setup-benchmark-vm.sh @@ -33,4 +33,4 @@ mvn -e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true mvn -e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true 
-Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true -pl ,azure-cosmos-benchmark clean package -P package-assembly echo "=== Setup complete ===" -echo "Next: Set APPLICATIONINSIGHTS_CONNECTION_STRING and copy tenants.json from test-setup/ to the VM" +echo "Next: Set APPLICATIONINSIGHTS_CONNECTION_STRING and copy tenants.json from benchmark-config/ to the VM" diff --git a/.github/skills/cosmos-benchmark-status/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-status/SKILL.md similarity index 88% rename from .github/skills/cosmos-benchmark-status/SKILL.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-status/SKILL.md index 37cd4efc179d..b4178ed22119 100644 --- a/.github/skills/cosmos-benchmark-status/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-status/SKILL.md @@ -28,8 +28,8 @@ Check and report the current state of the entire benchmark environment. - Read `git-info.json` for branch/commit if present - Show most recent 5–10 runs -4. **Benchmark VM**: Check for `.vm-ip` file in workspace root. - - If found, SSH to verify: `ssh -i $(cat .vm-key) $(cat .vm-user)@$(cat .vm-ip) "echo OK; uptime; java -version 2>&1 | head -1; ls ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/ 2>/dev/null | tail -5"` +4. **Benchmark VM**: Check for `benchmark-config/vm-ip` file in workspace root. + - If found, SSH to verify: `ssh -i $(cat benchmark-config/vm-key) $(cat benchmark-config/vm-user)@$(cat benchmark-config/vm-ip) "echo OK; uptime; java -version 2>&1 | head -1; ls ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/ 2>/dev/null | tail -5"` - Report: reachable/unreachable, uptime, JDK version, runs on VM 5. **Build status**: Check for `sdk/cosmos/azure-cosmos-benchmark/target/azure-cosmos-benchmark-*-jar-with-dependencies.jar`. 
diff --git a/.github/skills/skill-creator/LICENSE.txt b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/LICENSE.txt similarity index 100% rename from .github/skills/skill-creator/LICENSE.txt rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/LICENSE.txt diff --git a/.github/skills/skill-creator/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/SKILL.md similarity index 100% rename from .github/skills/skill-creator/SKILL.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/SKILL.md diff --git a/.github/skills/skill-creator/references/output-patterns.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/references/output-patterns.md similarity index 100% rename from .github/skills/skill-creator/references/output-patterns.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/references/output-patterns.md diff --git a/.github/skills/skill-creator/references/workflows.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/references/workflows.md similarity index 100% rename from .github/skills/skill-creator/references/workflows.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/references/workflows.md diff --git a/.github/skills/skill-creator/scripts/init_skill.py b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/scripts/init_skill.py similarity index 100% rename from .github/skills/skill-creator/scripts/init_skill.py rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/scripts/init_skill.py diff --git a/.github/skills/skill-creator/scripts/package_skill.py b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/scripts/package_skill.py similarity index 100% rename from .github/skills/skill-creator/scripts/package_skill.py rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/scripts/package_skill.py diff --git 
a/.github/skills/skill-creator/scripts/quick_validate.py b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/scripts/quick_validate.py similarity index 100% rename from .github/skills/skill-creator/scripts/quick_validate.py rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator/scripts/quick_validate.py diff --git a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkConfig.java b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkConfig.java index 484bb3b6128a..a1e564e2ef85 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkConfig.java +++ b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkConfig.java @@ -3,6 +3,8 @@ package com.azure.cosmos.benchmark; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,8 +37,6 @@ public class BenchmarkConfig { // -- Reporting -- private String reportingDirectory; - private String graphiteEndpoint; - private int graphiteEndpointPort = 2003; private int printingInterval = 10; private String resultUploadEndpoint; private String resultUploadKey; @@ -48,6 +48,11 @@ public class BenchmarkConfig { private String branchName = ""; private String commitId = ""; + // -- JVM-global system properties (apply to all tenants, set once at startup) -- + private boolean isPartitionLevelCircuitBreakerEnabled = true; + private boolean isPerPartitionAutomaticFailoverRequired = true; + private int minConnectionPoolSizePerEndpoint = 0; + // -- Tenants (each carries its full effective config) -- private List tenantWorkloads = Collections.emptyList(); @@ -65,8 +70,13 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce config.suppressCleanup = cfg.isSuppressCleanup(); if (config.cycles > 1) { - config.settleTimeMs = 
Math.max(DEFAULT_SETTLE_TIME_MS, cfg.getSettleTimeMs()); - config.suppressCleanup = true; // suppress container/database cleanup + long configuredSettleTimeMs = cfg.getSettleTimeMs(); + // Only apply the default settle time when the configuration uses the sentinel -1. + // An explicit value (including 0 to disable settling) should be respected. + config.settleTimeMs = (configuredSettleTimeMs == -1) + ? DEFAULT_SETTLE_TIME_MS + : configuredSettleTimeMs; + config.suppressCleanup = true; // suppress container/database cleanup } config.gcBetweenCycles = cfg.isGcBetweenCycles(); @@ -75,10 +85,6 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce // Reporting config.reportingDirectory = cfg.getReportingDirectory() != null ? cfg.getReportingDirectory().getPath() : null; - if (cfg.getGraphiteEndpoint() != null) { - config.graphiteEndpoint = cfg.getGraphiteEndpoint(); - config.graphiteEndpointPort = cfg.getGraphiteEndpointPort(); - } config.printingInterval = cfg.getPrintingInterval(); config.resultUploadEndpoint = cfg.getServiceEndpointForRunResultsUploadAccount(); config.resultUploadKey = cfg.getMasterKeyForRunResultsUploadAccount(); @@ -97,10 +103,18 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce logger.info("Loading tenant configs from {}. 
" + "Workload parameters from tenants.json will take priority over CLI args.", tenantsFile); config.tenantWorkloads = TenantWorkloadConfig.parseTenantsFile(new File(tenantsFile)); + + // Extract JVM-global system properties from globalDefaults + config.loadGlobalSystemPropertiesFromTenantsFile(new File(tenantsFile)); } else { // Single tenant from CLI args - use fromConfiguration() to copy ALL fields config.tenantWorkloads = Collections.singletonList( TenantWorkloadConfig.fromConfiguration(cfg)); + + // JVM-global system properties from CLI + config.isPartitionLevelCircuitBreakerEnabled = cfg.isPartitionLevelCircuitBreakerEnabled(); + config.isPerPartitionAutomaticFailoverRequired = cfg.isPerPartitionAutomaticFailoverRequired(); + config.minConnectionPoolSizePerEndpoint = cfg.getMinConnectionPoolSizePerEndpoint(); } return config; @@ -115,8 +129,6 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce public boolean isEnableJvmStats() { return enableJvmStats; } public String getReportingDirectory() { return reportingDirectory; } - public String getGraphiteEndpoint() { return graphiteEndpoint; } - public int getGraphiteEndpointPort() { return graphiteEndpointPort; } public int getPrintingInterval() { return printingInterval; } public String getResultUploadEndpoint() { return resultUploadEndpoint; } public String getResultUploadKey() { return resultUploadKey; } @@ -127,14 +139,47 @@ public static BenchmarkConfig fromConfiguration(Configuration cfg) throws IOExce public String getBranchName() { return branchName; } public String getCommitId() { return commitId; } + public boolean isPartitionLevelCircuitBreakerEnabled() { return isPartitionLevelCircuitBreakerEnabled; } + public boolean isPerPartitionAutomaticFailoverRequired() { return isPerPartitionAutomaticFailoverRequired; } + public int getMinConnectionPoolSizePerEndpoint() { return minConnectionPoolSizePerEndpoint; } + public List getTenantWorkloads() { return tenantWorkloads; } 
@Override public String toString() { return String.format( "BenchmarkConfig{cycles=%d, settleTimeMs=%d, suppressCleanup=%s, " + - "gcBetweenCycles=%s, tenants=%d, reportingDirectory=%s}", + "gcBetweenCycles=%s, tenants=%d, reportingDirectory=%s, " + + "circuitBreaker=%s, ppaf=%s, minConnPoolSize=%d}", cycles, settleTimeMs, suppressCleanup, gcBetweenCycles, - tenantWorkloads.size(), reportingDirectory); + tenantWorkloads.size(), reportingDirectory, + isPartitionLevelCircuitBreakerEnabled, isPerPartitionAutomaticFailoverRequired, + minConnectionPoolSizePerEndpoint); + } + + /** + * Reads JVM-global system properties from the globalDefaults section of a tenants.json file. + * These properties are JVM-wide and cannot vary per tenant. + */ + private void loadGlobalSystemPropertiesFromTenantsFile(File tenantsFile) throws IOException { + ObjectMapper mapper = new ObjectMapper(); + JsonNode root = mapper.readTree(tenantsFile); + JsonNode defaults = root.get("globalDefaults"); + if (defaults == null || !defaults.isObject()) { + return; + } + + if (defaults.has("isPartitionLevelCircuitBreakerEnabled")) { + isPartitionLevelCircuitBreakerEnabled = + Boolean.parseBoolean(defaults.get("isPartitionLevelCircuitBreakerEnabled").asText()); + } + if (defaults.has("isPerPartitionAutomaticFailoverRequired")) { + isPerPartitionAutomaticFailoverRequired = + Boolean.parseBoolean(defaults.get("isPerPartitionAutomaticFailoverRequired").asText()); + } + if (defaults.has("minConnectionPoolSizePerEndpoint")) { + minConnectionPoolSizePerEndpoint = + Integer.parseInt(defaults.get("minConnectionPoolSizePerEndpoint").asText()); + } } } diff --git a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java index f266fedc11d8..e8f840f0c481 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java 
+++ b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java @@ -6,9 +6,6 @@ import com.codahale.metrics.ConsoleReporter; import com.codahale.metrics.CsvReporter; import com.codahale.metrics.ScheduledReporter; -import com.codahale.metrics.graphite.Graphite; -import com.codahale.metrics.graphite.GraphiteReporter; -import com.codahale.metrics.MetricFilter; import com.azure.cosmos.CosmosClient; import com.azure.cosmos.CosmosClientBuilder; import com.codahale.metrics.MetricRegistry; @@ -20,7 +17,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.net.InetSocketAddress; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -33,6 +29,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; /** * Benchmark orchestrator. Sets up infrastructure (metrics, reporters, system properties), @@ -57,7 +54,7 @@ public void run(BenchmarkConfig config) throws Exception { return; } - setGlobalSystemProperties(config.getTenantWorkloads().get(0)); + setGlobalSystemProperties(config); // Set up shared metric registry MetricRegistry registry = new MetricRegistry(); @@ -72,20 +69,9 @@ public void run(BenchmarkConfig config) throws Exception { // Prepare all tenants (inject shared state, set defaults) prepareTenants(config); - // Reporter selection: Graphite > CSV > Console (same pattern as original AsyncBenchmark) + // Reporter selection: CSV > Console ScheduledReporter reporter; - if (config.getGraphiteEndpoint() != null) { - Graphite graphite = new Graphite(new InetSocketAddress( - config.getGraphiteEndpoint(), - config.getGraphiteEndpointPort())); - reporter = GraphiteReporter.forRegistry(registry) - .convertDurationsTo(TimeUnit.MILLISECONDS) - .convertRatesTo(TimeUnit.SECONDS) - .filter(MetricFilter.ALL) - .build(graphite); - logger.info("Graphite reporter started -> {}:{}", - 
config.getGraphiteEndpoint(), config.getGraphiteEndpointPort()); - } else if (config.getReportingDirectory() != null) { + if (config.getReportingDirectory() != null) { Path metricsDir = Paths.get(config.getReportingDirectory(), "metrics"); Files.createDirectories(metricsDir); reporter = CsvReporter.forRegistry(registry) @@ -167,41 +153,65 @@ private void runLifecycleLoop(BenchmarkConfig config, MetricRegistry registry, logger.info("Starting benchmark: {} cycles x {} tenants", totalCycles, tenants.size()); long startTime = System.currentTimeMillis(); - for (int cycle = 1; cycle <= totalCycles; cycle++) { - reporter.report(); - logger.info("[LIFECYCLE] CYCLE_START cycle={} timestamp={}", cycle, Instant.now()); - - // 1. Create clients - List> benchmarks = createBenchmarks(config, registry); - reporter.report(); - logger.info("[LIFECYCLE] POST_CREATE cycle={} clients={} timestamp={}", - cycle, benchmarks.size(), Instant.now()); - - // 2. Run workload in parallel - runWorkload(benchmarks, cycle); - reporter.report(); - logger.info("[LIFECYCLE] POST_WORKLOAD cycle={} timestamp={}", cycle, Instant.now()); - - // 3. Close all clients - shutdownBenchmarks(benchmarks, cycle); - reporter.report(); - logger.info("[LIFECYCLE] POST_CLOSE cycle={} timestamp={}", cycle, Instant.now()); - - // 4. Settle - if (config.getSettleTimeMs() > 0) { - logger.info(" Settling for {}ms...", config.getSettleTimeMs()); - long halfSettle = config.getSettleTimeMs() / 2; - Thread.sleep(halfSettle); - if (config.isGcBetweenCycles()) { - System.gc(); + AtomicInteger threadCounter = new AtomicInteger(0); + ExecutorService executor = Executors.newFixedThreadPool(tenants.size(), r -> { + Thread t = new Thread(r, "tenant-worker-" + threadCounter.getAndIncrement()); + t.setDaemon(false); + return t; + }); + + try { + for (int cycle = 1; cycle <= totalCycles; cycle++) { + reporter.report(); + logger.info("[LIFECYCLE] CYCLE_START cycle={} timestamp={}", cycle, Instant.now()); + + // 1. 
Create clients + List> benchmarks = createBenchmarks(config, registry); + reporter.report(); + logger.info("[LIFECYCLE] POST_CREATE cycle={} clients={} timestamp={}", + cycle, benchmarks.size(), Instant.now()); + + // 2. Run workload in parallel + runWorkload(benchmarks, cycle, executor); + reporter.report(); + logger.info("[LIFECYCLE] POST_WORKLOAD cycle={} timestamp={}", cycle, Instant.now()); + + // 3. Close all clients + shutdownBenchmarks(benchmarks, cycle); + reporter.report(); + logger.info("[LIFECYCLE] POST_CLOSE cycle={} timestamp={}", cycle, Instant.now()); + + // 4. Settle + if (config.getSettleTimeMs() > 0) { + logger.info(" Settling for {}ms...", config.getSettleTimeMs()); + long halfSettle = config.getSettleTimeMs() / 2; + Thread.sleep(halfSettle); + if (config.isGcBetweenCycles()) { + System.gc(); + } + Thread.sleep(config.getSettleTimeMs() - halfSettle); + if (config.isGcBetweenCycles()) { + System.gc(); + } } - Thread.sleep(config.getSettleTimeMs() - halfSettle); - if (config.isGcBetweenCycles()) { - System.gc(); + reporter.report(); + logger.info("[LIFECYCLE] POST_SETTLE cycle={} timestamp={}", cycle, Instant.now()); + } + } finally { + executor.shutdown(); + try { + if (!executor.awaitTermination(60, TimeUnit.SECONDS)) { + logger.warn("Executor did not terminate within the timeout"); + executor.shutdownNow(); + if (!executor.awaitTermination(60, TimeUnit.SECONDS)) { + logger.error("Executor did not terminate after shutdownNow"); + } } + } catch (InterruptedException e) { + logger.warn("Interrupted while awaiting executor termination", e); + executor.shutdownNow(); + Thread.currentThread().interrupt(); } - reporter.report(); - logger.info("[LIFECYCLE] POST_SETTLE cycle={} timestamp={}", cycle, Instant.now()); } long durationSec = (System.currentTimeMillis() - startTime) / 1000; @@ -217,12 +227,7 @@ private List> createBenchmarks(BenchmarkConfig config, MetricR return benchmarks; } - private void runWorkload(List> benchmarks, int cycle) throws 
Exception { - ExecutorService executor = Executors.newFixedThreadPool(benchmarks.size(), r -> { - Thread t = new Thread(r, "tenant-worker"); - t.setDaemon(false); - return t; - }); + private void runWorkload(List> benchmarks, int cycle, ExecutorService executor) throws Exception { List> futures = new ArrayList<>(); final int currentCycle = cycle; for (AsyncBenchmark benchmark : benchmarks) { @@ -237,7 +242,6 @@ private void runWorkload(List> benchmarks, int cycle) throws E for (Future f : futures) { f.get(); } - executor.shutdown(); } private void shutdownBenchmarks(List> benchmarks, int cycle) { @@ -330,15 +334,6 @@ private MeterRegistry buildCosmosMicrometerRegistry() { return tempCfg.getAzureMonitorMeterRegistry(); } - String graphiteAddress = System.getProperty("azure.cosmos.monitoring.graphite.serviceAddress", - StringUtils.defaultString( - com.google.common.base.Strings.emptyToNull( - System.getenv("GRAPHITE_SERVICE_ADDRESS")), null)); - if (graphiteAddress != null) { - Configuration tempCfg = new Configuration(); - return tempCfg.getGraphiteMeterRegistry(); - } - return null; } @@ -355,10 +350,8 @@ private void clearGlobalSystemProperties() { System.clearProperty("COSMOS.MIN_CONNECTION_POOL_SIZE_PER_ENDPOINT"); } - private void setGlobalSystemProperties(TenantWorkloadConfig firstTenant) { - String circuitBreakerEnabled = firstTenant.getIsPartitionLevelCircuitBreakerEnabled(); - if (circuitBreakerEnabled == null) circuitBreakerEnabled = "true"; - if (Boolean.parseBoolean(circuitBreakerEnabled)) { + private void setGlobalSystemProperties(BenchmarkConfig config) { + if (config.isPartitionLevelCircuitBreakerEnabled()) { System.setProperty("COSMOS.PARTITION_LEVEL_CIRCUIT_BREAKER_CONFIG", "{\"isPartitionLevelCircuitBreakerEnabled\": true, " + "\"circuitBreakerType\": \"CONSECUTIVE_EXCEPTION_COUNT_BASED\"," @@ -368,21 +361,21 @@ private void setGlobalSystemProperties(TenantWorkloadConfig firstTenant) { 
System.setProperty("COSMOS.ALLOWED_PARTITION_UNAVAILABILITY_DURATION_IN_SECONDS", "30"); } - String ppafEnabled = firstTenant.getIsPerPartitionAutomaticFailoverRequired(); - if (ppafEnabled == null) ppafEnabled = "true"; - if (Boolean.parseBoolean(ppafEnabled)) { + if (config.isPerPartitionAutomaticFailoverRequired()) { System.setProperty("COSMOS.IS_PER_PARTITION_AUTOMATIC_FAILOVER_ENABLED", "true"); System.setProperty("COSMOS.IS_SESSION_TOKEN_FALSE_PROGRESS_MERGE_ENABLED", "true"); System.setProperty("COSMOS.E2E_TIMEOUT_ERROR_HIT_THRESHOLD_FOR_PPAF", "5"); System.setProperty("COSMOS.E2E_TIMEOUT_ERROR_HIT_TIME_WINDOW_IN_SECONDS_FOR_PPAF", "120"); } - if (firstTenant.getMinConnectionPoolSizePerEndpoint() >= 1) { + if (config.getMinConnectionPoolSizePerEndpoint() >= 1) { System.setProperty("COSMOS.MIN_CONNECTION_POOL_SIZE_PER_ENDPOINT", - String.valueOf(firstTenant.getMinConnectionPoolSizePerEndpoint())); + String.valueOf(config.getMinConnectionPoolSizePerEndpoint())); } logger.info("Global system properties set (circuit breaker: {}, PPAF: {}, minConnPoolSize: {})", - circuitBreakerEnabled, ppafEnabled, firstTenant.getMinConnectionPoolSizePerEndpoint()); + config.isPartitionLevelCircuitBreakerEnabled(), + config.isPerPartitionAutomaticFailoverRequired(), + config.getMinConnectionPoolSizePerEndpoint()); } } diff --git a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/TenantWorkloadConfig.java b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/TenantWorkloadConfig.java index 48fc05a993e6..c28c2da81173 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/TenantWorkloadConfig.java +++ b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/TenantWorkloadConfig.java @@ -122,6 +122,12 @@ public class TenantWorkloadConfig { @JsonProperty("nonPointOperationLatencyThresholdMs") private Integer nonPointOperationLatencyThresholdMs; + /** + * Per-client flag: controls 
region-scoped session capturing on the CosmosClientBuilder. + * Unlike JVM-global system properties (circuit breaker, PPAF, minConnectionPoolSize), + * this is set per-client via {@code CosmosClientBuilderAccessor.setRegionScopedSessionCapturingEnabled} + * and can genuinely differ per tenant. + */ @JsonProperty("isRegionScopedSessionContainerEnabled") private Boolean isRegionScopedSessionContainerEnabled; @@ -146,9 +152,6 @@ public class TenantWorkloadConfig { @JsonProperty("aggressiveWarmupDuration") private String aggressiveWarmupDuration; - @JsonProperty("minConnectionPoolSizePerEndpoint") - private Integer minConnectionPoolSizePerEndpoint; - // ======== Connection params ======== @JsonProperty("connectionMode") @@ -176,14 +179,6 @@ public class TenantWorkloadConfig { /** Cosmos SDK micrometer registry (set by orchestrator, not from JSON). */ private transient MeterRegistry cosmosMicrometerRegistry; - // ======== System property flags ======== - - @JsonProperty("isPartitionLevelCircuitBreakerEnabled") - private String isPartitionLevelCircuitBreakerEnabled; - - @JsonProperty("isPerPartitionAutomaticFailoverRequired") - private String isPerPartitionAutomaticFailoverRequired; - public TenantWorkloadConfig() {} @@ -270,10 +265,6 @@ public Duration getAggressiveWarmupDuration() { return Duration.parse(aggressiveWarmupDuration); } - public int getMinConnectionPoolSizePerEndpoint() { - return minConnectionPoolSizePerEndpoint != null ? 
minConnectionPoolSizePerEndpoint : 0; - } - public ConnectionMode getConnectionMode() { if (connectionMode == null) return ConnectionMode.DIRECT; return ConnectionMode.valueOf(connectionMode.toUpperCase()); @@ -300,9 +291,6 @@ public List getPreferredRegionsList() { public boolean isSuppressCleanup() { return suppressCleanup; } public MeterRegistry getCosmosMicrometerRegistry() { return cosmosMicrometerRegistry; } - public String getIsPartitionLevelCircuitBreakerEnabled() { return isPartitionLevelCircuitBreakerEnabled; } - public String getIsPerPartitionAutomaticFailoverRequired() { return isPerPartitionAutomaticFailoverRequired; } - /** * Builds a TokenCredential for managed identity authentication. */ @@ -425,8 +413,6 @@ private void applyField(String key, String value, boolean overwrite) { if (overwrite || proactiveConnectionRegionsCount == null) proactiveConnectionRegionsCount = Integer.parseInt(value); break; case "aggressiveWarmupDuration": if (overwrite || aggressiveWarmupDuration == null) aggressiveWarmupDuration = value; break; - case "minConnectionPoolSizePerEndpoint": - if (overwrite || minConnectionPoolSizePerEndpoint == null) minConnectionPoolSizePerEndpoint = Integer.parseInt(value); break; case "connectionMode": if (overwrite || connectionMode == null) connectionMode = value; break; case "consistencyLevel": @@ -437,10 +423,12 @@ private void applyField(String key, String value, boolean overwrite) { if (overwrite || preferredRegionsList == null) preferredRegionsList = value; break; case "manageDatabase": if (overwrite || manageDatabase == null) manageDatabase = Boolean.parseBoolean(value); break; + // JVM-global properties (minConnectionPoolSizePerEndpoint, isPartitionLevelCircuitBreakerEnabled, + // isPerPartitionAutomaticFailoverRequired) are handled in BenchmarkConfig, not per-tenant. 
+ case "minConnectionPoolSizePerEndpoint": case "isPartitionLevelCircuitBreakerEnabled": - if (overwrite || isPartitionLevelCircuitBreakerEnabled == null) isPartitionLevelCircuitBreakerEnabled = value; break; case "isPerPartitionAutomaticFailoverRequired": - if (overwrite || isPerPartitionAutomaticFailoverRequired == null) isPerPartitionAutomaticFailoverRequired = value; break; + break; default: logger.debug("Unknown config key '{}' (value: {})", key, value); break; @@ -512,15 +500,14 @@ public static TenantWorkloadConfig fromConfiguration(Configuration cfg) { if (cfg.getAggressiveWarmupDuration() != null) { t.aggressiveWarmupDuration = cfg.getAggressiveWarmupDuration().toString(); } - t.minConnectionPoolSizePerEndpoint = cfg.getMinConnectionPoolSizePerEndpoint(); // Connection t.preferredRegionsList = cfg.getPreferredRegionsList() != null ? String.join(",", cfg.getPreferredRegionsList()) : null; - // System property flags - t.isPartitionLevelCircuitBreakerEnabled = String.valueOf(cfg.isPartitionLevelCircuitBreakerEnabled()); - t.isPerPartitionAutomaticFailoverRequired = String.valueOf(cfg.isPerPartitionAutomaticFailoverRequired()); + // Note: JVM-global system properties (isPartitionLevelCircuitBreakerEnabled, + // isPerPartitionAutomaticFailoverRequired, minConnectionPoolSizePerEndpoint) + // are handled in BenchmarkConfig, not per-tenant. 
return t; } @@ -552,6 +539,7 @@ public static List parseTenantsFile(File tenantsFile) thro for (JsonNode tenantNode : tenantsNode) { TenantWorkloadConfig tenant = OBJECT_MAPPER.treeToValue(tenantNode, TenantWorkloadConfig.class); tenant.applyMap(globalDefaults, false); + validateTenantConfig(tenant); tenants.add(tenant); } } @@ -559,4 +547,29 @@ public static List parseTenantsFile(File tenantsFile) thro logger.info("Parsed {} tenants from {}", tenants.size(), tenantsFile.getName()); return tenants; } + + private static void validateTenantConfig(TenantWorkloadConfig tenant) { + List missing = new ArrayList<>(); + if (isNullOrEmpty(tenant.getServiceEndpoint())) { + missing.add("serviceEndpoint"); + } + if (isNullOrEmpty(tenant.getDatabaseId())) { + missing.add("databaseId"); + } + if (isNullOrEmpty(tenant.getContainerId())) { + missing.add("containerId"); + } + if (!tenant.isManagedIdentityRequired() + && isNullOrEmpty(tenant.getMasterKey())) { + missing.add("masterKey (required when isManagedIdentityRequired is not true)"); + } + if (!missing.isEmpty()) { + throw new IllegalArgumentException( + "Tenant '" + tenant.getId() + "' is missing required configuration: " + missing); + } + } + + private static boolean isNullOrEmpty(String value) { + return value == null || value.isEmpty(); + } } diff --git a/sdk/cosmos/azure-cosmos-benchmark/test-results/BENCHMARK_RESULTS.md b/sdk/cosmos/azure-cosmos-benchmark/test-results/BENCHMARK_RESULTS.md deleted file mode 100644 index baaeae94e16c..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/test-results/BENCHMARK_RESULTS.md +++ /dev/null @@ -1,154 +0,0 @@ -# Multi-Tenancy Benchmark Results Tracker - -## Summary - -| Run ID | Date (UTC) | Scenario | Branch | Commit | Threads | Verdict | -|--------|-----------|----------|--------|--------|---------|---------| -| R6 | 2026-02-20 20:01 | CHURN 3-cycle (90s settle) | fix-a1-telemetry-close | 8bc3caa168e | 0 growth | PASS - no leak | - ---- - -## R6 — Multi-cycle CHURN with 90s 
settle (2026-02-20) — CONCLUSIVE - -- **Branch**: `fix-a1-telemetry-close` (A1/A2 fix reverted, benchmark enhancements kept) -- **Commit**: `8bc3caa168e` -- **VM**: `benchuser@4.154.169.45` -- **Results dir**: `results/20260220T200151-CHURN-90s-settle/` -- **Settle time**: 90s per cycle (> 60s BoundedElastic evictor TTL) - -### Per-Cycle Results - -| Cycle | Threads | Thread delta | Heap (MB) | FDs | -|-------|---------|-------------|-----------|-----| -| 1 | 63 | +57 | 28 | 58 | -| 2 | 63 | +57 | 38 | 57 | -| 3 | 63 | +57 | 38 | 57 | - -### Leak Verdict - -| Metric | Value | Result | -|--------|-------|--------| -| Thread growth (cycle 1 to 3) | **0** | PASS | -| `transport-response-bounded-elastic` after settle | **0** (fully evicted) | PASS | -| Heap stable across cycles 2-3 | 38 MB = 38 MB | PASS | -| FDs stable | 57 = 57 | PASS | - -### Conclusion - -**No thread leak exists.** The thread growth seen in R4/R5 (5s settle) was entirely due to -BoundedElastic evictor TTL timing. With 90s settle (> 60s TTL), all idle workers are -reclaimed and thread count is perfectly stable at 63 across all cycles. - -The `transport-response-bounded-elastic` threads that showed growth in R4 (8 to 17) and R5 -(8 to 37) do NOT appear at all after 90s settle -- they were all temporary workers that -the evictor correctly cleaned up. - ---- - - - - ---- - -## Findings & Action Items - -### F1: reactor-http-epoll Threads (Shared LoopResources) - -16 `reactor-http-epoll` threads appear after any Cosmos client usage and persist for the JVM lifetime. -NOT a leak -- Reactor Netty's global default event-loop pool (`LoopResources.DEFAULT`). 
- -- Default thread count: `Runtime.getRuntime().availableProcessors()` (16 on D16s_v5 VM) -- Shared by ALL `ReactorNettyClient` instances (Gateway HTTP client, IMDS metadata client) -- Cosmos SDK does NOT customize `LoopResources` -- relies on reactor-netty defaults -- Configurable via system property: `reactor.netty.ioWorkerCount=` - -**Multi-tenancy concern**: 100+ clients sharing 16 event-loop threads could cause contention. - -| Action | Description | Complexity | -|--------|-------------|------------| -| A23 | Benchmark event-loop contention at 100+ clients with varying `ioWorkerCount` | Low | -| A24 | Consider exposing `ioWorkerCount` as a Cosmos SDK config for multi-tenant scenarios | Low | - -### F2: ClientTelemetry Cleanup Opportunities - -`ClientTelemetry` has significant dead code. Still instantiated per-client, creating an -IMDS `HttpClient` pool each time even though metadata is cached statically after first call. - -**Active code (6)**: constructor, `init()`, `recordValue()`, `getClientTelemetryConfig()`, -`isClientMetricsEnabled()`, `getMachineId()` - -**Dead code (7)**: `close()` (was no-op with wrong log message), `blockingGetOrLoadMachineId()`, -`DEFAULT_CLIENT_TELEMETRY_ENABLED`, 4x `TCP_NEW_CHANNEL_LATENCY_*` constants - -**IMDS HttpClient should be static**: `metadataHttpClient` is per-instance but -`azureVmMetaDataSingleton` is static -- only first call does HTTP. All subsequent instances -create unused `ConnectionProvider` objects. 
- -| Action | Description | Complexity | -|--------|-------------|------------| -| A25 | Make IMDS `metadataHttpClient` a static singleton (lazy-init) | Low | -| A26 | Remove dead code: `blockingGetOrLoadMachineId()`, `TCP_NEW_CHANNEL_LATENCY_*` constants | Low | -| A27 | Fix `close()` log message (says "GlobalEndpointManager closed" -- copy-paste error) | Trivial | - -### F3: transport-response-bounded-elastic Thread Growth - -Only thread group that grows across CHURN cycles (8 after 1 cycle, 17 after 5 with fix, 37 without). - -Reactor `BoundedElasticScheduler` creates workers on-demand, evicts after 60s TTL. -CHURN cycle settle time (5s) is shorter than evictor TTL, so workers accumulate temporarily. - -**CONFIRMED NOT A LEAK (R6)**: With 90s settle (> 60s TTL), thread count is perfectly -stable at 63 across all cycles. All bounded-elastic workers are properly evicted. -The growth seen in R4/R5 was purely timing -- 5s settle was too short for the 60s evictor. - -| Action | Description | Status | -|--------|-------------|--------| -| A28 | Increase CHURN settle time to 60s+ and verify bounded-elastic threads stabilize | DONE (R6) -- CONFIRMED | -| A29 | Investigate if any bounded-elastic schedulers should be disposed on client close | NOT NEEDED -- no leak | - -### F4: A1/A2 Fix Impact Reassessment - -**Original hypothesis**: `ClientTelemetry.close()` no-op leaked IMDS pool threads and -GlobalEndpointManager scheduler threads per client lifecycle. - -**Revised understanding**: - -| Resource | Actual behavior | -|----------|----------------| -| IMDS HttpClient pool | Created per-instance but unused after first client (metadata cached statically). Leaked `ConnectionProvider` wastes minor memory, no FDs or threads. | -| GlobalEndpointManager scheduler | Already properly closed via `RxDocumentClientImpl.close()`. Never broken. | -| reactor-http-epoll threads | Shared `LoopResources.DEFAULT` singleton -- same count (16) fix vs no-fix. 
| -| Thread count diff (80 vs 100) | Entirely in `transport-response-bounded-elastic` (17 vs 37). Likely timing-related. | - -**Conclusion**: A1/A2 fix is correct for code hygiene but has minimal observable impact on -threads, heap, or FDs in CHURN testing. Real multi-tenancy concerns are: -- Event-loop contention at scale (A23) -- Unnecessary per-instance IMDS client creation (A25) -- Bounded-elastic growth during rapid churn (A28/A29) diff --git a/sdk/cosmos/azure-cosmos-benchmark/test-results/README.md b/sdk/cosmos/azure-cosmos-benchmark/test-results/README.md deleted file mode 100644 index 9dee7631785c..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/test-results/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Test Results - -This directory stores benchmark results downloaded from the VM. - -Each run is stored in a subdirectory named by run ID: -``` -test-results/ - R2-20260220T055015-CHURN/ - resource_snapshots.csv - benchmark.log - gc.log - git-info.json - thread-dumps/ - heap-dumps/ - R3-20260220T064730-CHURN-prefix/ - ... -``` - -## Downloading Results from VM - -```bash -# Download a specific run: -scp -r benchuser@:~/azure-sdk-for-java/results/ test-results/ - -# Download all results: -scp -r benchuser@:~/azure-sdk-for-java/results/* test-results/ -``` - -## Analyzing Results - -Use the multi-tenancy-benchmark-analyze skill or run: -```bash -# Compare heap dumps: -python3 .github/skills/multi-tenancy-benchmark-heapdump/references/parse_hprof.py \ - --diff test-results//heap-dumps/heap-PRE_CLOSE-*.hprof \ - test-results//heap-dumps/heap-POST_CLOSE-*.hprof --top 20 -``` - -All files in this directory are gitignored. 
diff --git a/sdk/cosmos/azure-cosmos-benchmark/test-setup/README.md b/sdk/cosmos/azure-cosmos-benchmark/test-setup/README.md deleted file mode 100644 index 2081b0ad515c..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/test-setup/README.md +++ /dev/null @@ -1,39 +0,0 @@ -# Test Setup Files - -This directory contains configuration files needed to run multi-tenancy benchmarks. - -## Files - -| File | Purpose | Checked in? | -|------|---------|-------------| -| `tenants-sample.json` | Template for tenant configuration | Yes (reference) | -| `tenants.json` | Actual tenant config with endpoints and keys | No (gitignored) | -| `clientHostAndKey.txt` | Raw credentials CSV used to generate tenants.json | No (gitignored) | -| `vm-config.env` | VM connection info (IP, user, key path) | No (gitignored) | - -## Setup Steps - -1. Copy `tenants-sample.json` to `tenants.json` -2. Fill in your Cosmos DB account endpoints and master keys -3. Optionally create `clientHostAndKey.txt` in CSV format: - ``` - ,, - ``` - Then use the benchmark setup skill to auto-generate `tenants.json` from it. - -## VM Connection - -After provisioning a benchmark VM, save connection info: -```bash -# Created by provision-benchmark-vm.sh -VM_IP= -VM_USER=benchuser -VM_KEY_PATH=~/.ssh/id_rsa -``` - -## Deploying to VM - -Copy setup files to the VM: -```bash -scp -i $VM_KEY_PATH test-setup/tenants.json $VM_USER@$VM_IP:~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/tenants.json -``` From 1d0cbdc60c062db84f1dbc9c436649dc83e7c112 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 10:31:06 -0800 Subject: [PATCH 04/22] Restructure cosmos benchmark agent to 3 deterministic skills Consolidate the benchmark agent from 5 skills down to 3, with deterministic script-driven flows replacing inline commands. 
Skills: - setup-resources: provision Azure infra (Cosmos DB, App Insights, VM) with parallel creation, capacity validation, region fallback, and verification gate - run: clone/build/verify/execute benchmarks via single SSH session per ref, supports multiple refs for comparison, SIMPLE/EXPAND/CHURN presets - analyze: download results to config-dir/results, generate markdown report with time-series SVG charts and multi-run comparison tables Key changes: - Rename provision -> setup-resources, merge setup into run, remove status - .github/skills and .github/agents use symlinks to copilot/ (single source) - Default region westus2, resource group rg-cosmos-benchmark-YYYYMMDD - Config directory prompted with credential-in-repo warning - provision-all.sh orchestrates parallel resource creation + verification - vm-prepare-and-run.sh consolidates checkout/build/verify/run in 1 SSH session - run-all-refs.sh loops over user-provided refs with per-ref result directories - generate-report.py reads monitor.csv + metrics/*.csv, outputs report.md - Remove parse_hprof.py, kusto-schema.md, generate-dashboard.py (deferred) - Remove trigger-benchmark.sh (superseded by vm-prepare-and-run.sh) - Merge setup-benchmark-vm.sh into provision-benchmark-vm.sh Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/agents/cosmos-benchmark.agent.md | 1 + .github/skills/cosmos-benchmark-analyze | 1 + .github/skills/cosmos-benchmark-run | 1 + .../skills/cosmos-benchmark-setup-resources | 1 + .github/skills/skill-creator | 1 + .../copilot/agents/cosmos-benchmark.agent.md | 19 +- .../skills/cosmos-benchmark-analyze/SKILL.md | 229 ++++------- .../references/kusto-schema.md | 113 ------ .../references/parse_hprof.py | 234 ----------- .../scripts/download-results.sh | 87 ++++ .../scripts/generate-dashboard.py | 157 -------- .../scripts/generate-report.py | 373 ++++++++++++++++++ .../cosmos-benchmark-provision/SKILL.md | 215 ---------- .../scripts/setup-result-storage.sh | 33 -- 
.../skills/cosmos-benchmark-run/SKILL.md | 168 ++++---- .../references/tenants-sample.json | 0 .../scripts/generate-tenants.sh | 97 +++++ .../scripts/run-all-refs.sh | 99 +++++ .../scripts/trigger-benchmark.sh | 100 ----- .../scripts/vm-prepare-and-run.sh | 138 +++++++ .../cosmos-benchmark-setup-resources/SKILL.md | 221 +++++++++++ .../references/vm-sizing.md | 0 .../scripts/create-cosmos-accounts.sh | 68 ++++ .../scripts/export-cosmos-credentials.sh | 82 ++++ .../scripts/find-region.sh | 94 +++++ .../scripts/provision-all.sh | 217 ++++++++++ .../scripts/provision-benchmark-vm.sh | 54 ++- .../scripts/validate-capacity.sh | 180 +++++++++ .../scripts/verify-resources.sh | 137 +++++++ .../skills/cosmos-benchmark-setup/SKILL.md | 178 --------- .../scripts/setup-benchmark-vm.sh | 36 -- .../skills/cosmos-benchmark-status/SKILL.md | 61 --- 32 files changed, 2010 insertions(+), 1385 deletions(-) create mode 120000 .github/agents/cosmos-benchmark.agent.md create mode 120000 .github/skills/cosmos-benchmark-analyze create mode 120000 .github/skills/cosmos-benchmark-run create mode 120000 .github/skills/cosmos-benchmark-setup-resources create mode 120000 .github/skills/skill-creator delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/kusto-schema.md delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/parse_hprof.py create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/download-results.sh delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-dashboard.py create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-report.py delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/SKILL.md delete mode 100644 
sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/setup-result-storage.sh rename sdk/cosmos/azure-cosmos-benchmark/copilot/skills/{cosmos-benchmark-setup => cosmos-benchmark-run}/references/tenants-sample.json (100%) create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/generate-tenants.sh create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/trigger-benchmark.sh create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh create mode 100644 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md rename sdk/cosmos/azure-cosmos-benchmark/copilot/skills/{cosmos-benchmark-provision => cosmos-benchmark-setup-resources}/references/vm-sizing.md (100%) create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/create-cosmos-accounts.sh create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/export-cosmos-credentials.sh create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/find-region.sh create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-all.sh rename sdk/cosmos/azure-cosmos-benchmark/copilot/skills/{cosmos-benchmark-provision => cosmos-benchmark-setup-resources}/scripts/provision-benchmark-vm.sh (70%) mode change 100644 => 100755 create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/validate-capacity.sh create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/verify-resources.sh delete mode 100644 
sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/SKILL.md delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/scripts/setup-benchmark-vm.sh delete mode 100644 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-status/SKILL.md diff --git a/.github/agents/cosmos-benchmark.agent.md b/.github/agents/cosmos-benchmark.agent.md new file mode 120000 index 000000000000..6b0fda774736 --- /dev/null +++ b/.github/agents/cosmos-benchmark.agent.md @@ -0,0 +1 @@ +../../sdk/cosmos/azure-cosmos-benchmark/copilot/agents/cosmos-benchmark.agent.md \ No newline at end of file diff --git a/.github/skills/cosmos-benchmark-analyze b/.github/skills/cosmos-benchmark-analyze new file mode 120000 index 000000000000..43c8d9f0439c --- /dev/null +++ b/.github/skills/cosmos-benchmark-analyze @@ -0,0 +1 @@ +../../sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze \ No newline at end of file diff --git a/.github/skills/cosmos-benchmark-run b/.github/skills/cosmos-benchmark-run new file mode 120000 index 000000000000..062d4316bae0 --- /dev/null +++ b/.github/skills/cosmos-benchmark-run @@ -0,0 +1 @@ +../../sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run \ No newline at end of file diff --git a/.github/skills/cosmos-benchmark-setup-resources b/.github/skills/cosmos-benchmark-setup-resources new file mode 120000 index 000000000000..28fb89755f8b --- /dev/null +++ b/.github/skills/cosmos-benchmark-setup-resources @@ -0,0 +1 @@ +../../sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources \ No newline at end of file diff --git a/.github/skills/skill-creator b/.github/skills/skill-creator new file mode 120000 index 000000000000..14fa4e2696f5 --- /dev/null +++ b/.github/skills/skill-creator @@ -0,0 +1 @@ +../../sdk/cosmos/azure-cosmos-benchmark/copilot/skills/skill-creator \ No newline at end of file diff --git 
a/sdk/cosmos/azure-cosmos-benchmark/copilot/agents/cosmos-benchmark.agent.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/agents/cosmos-benchmark.agent.md index 70f8ef25752f..e438f49754e9 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/agents/cosmos-benchmark.agent.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/agents/cosmos-benchmark.agent.md @@ -1,13 +1,13 @@ --- name: Cosmos Benchmark -description: Cosmos DB benchmark agent — provision infrastructure, set up environments, run benchmarks, and analyze results. Supports both single-tenant and multi-tenant configurations. Use for benchmark/DR drill workflows. +description: Cosmos DB benchmark agent — set up resources, run benchmarks, and analyze results. Supports both single-tenant and multi-tenant configurations. Use for benchmark/DR drill workflows. tools: ['readFile', 'listDir', 'runInTerminal', 'search', 'grep', 'fileSearch', 'agent'] -argument-hint: "provision accounts, setup VM, run benchmark, analyze results, or check status" +argument-hint: "setup resources, run benchmark, or analyze results" --- # Cosmos Benchmark Agent -You are a Cosmos DB benchmark specialist. You help with the full benchmark/DR drill lifecycle: provisioning infrastructure, setting up environments, running benchmarks, and analyzing results. +You are a Cosmos DB benchmark specialist. You help with the full benchmark/DR drill lifecycle: provisioning infrastructure, running benchmarks, and analyzing results. ## Routing @@ -15,11 +15,9 @@ Determine user intent and follow the matching workflow: | User wants to... 
| Skill to load | |---|---| -| Provision Azure resources (Cosmos accounts, App Insights, VMs) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/SKILL.md` | -| Set up environment (install tools, clone repo, generate config, build) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/SKILL.md` | -| Run a benchmark (checkout branch/PR, select scenario, execute) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md` | -| Analyze results (CSV, compare runs, heap/thread dumps, reports, Kusto) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md` | -| Check status (resources, runs, VM, build, config overview) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-status/SKILL.md` | +| Set up resources (create/reuse Cosmos accounts, App Insights, VMs, install tools) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md` | +| Run a benchmark (clone repo, build, configure, execute scenarios) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md` | +| Analyze results (CSV metrics, compare runs, heap/thread dumps, reports, Kusto) | Read `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md` | When a skill references files in its `references/` directory, read them from the skill's directory (e.g., `sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/thresholds.md`). @@ -29,7 +27,7 @@ For complex multi-step workflows, use subagents to keep context clean: - **Analyze after run**: Spawn a subagent to analyze results so run context doesn't pollute analysis. - **Parallel analysis**: Spawn parallel subagents for multiple result directories. -- **Provision + setup**: For full DR drill setup, spawn sequential subagents for provision → setup. 
+- **Parallel resource creation**: During setup resources, the `provision-all.sh` script handles parallelism automatically. ## Benchmark Modes @@ -44,7 +42,6 @@ Both use the same JAR, orchestrator, and monitoring infrastructure. After completing one task, suggest the natural next step: -- After **provision** → suggest **setup** -- After **setup** → suggest **run** +- After **setup resources** → suggest **run** - After **run** → suggest **analyze** - After **analyze** (if baseline exists) → suggest comparing with previous run diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md index 675b6b4a1356..2ecb7e4511a0 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/SKILL.md @@ -1,196 +1,127 @@ --- name: cosmos-benchmark-analyze -description: Analyze Cosmos DB benchmark results — download from VM, parse CSV metrics, compare runs, analyze heap/thread dumps, generate markdown reports, export to Kusto, and query Application Insights. Triggers on "analyze results", "compare runs", "leak check", "did it pass", "heap dump", "thread dump", "generate report", "export to kusto", "regression check", monitor.csv, or result directories. +description: Analyze Cosmos DB benchmark results — download from VM, generate markdown reports with time-series charts and comparison tables, apply pass/fail thresholds. Triggers on "analyze results", "compare runs", "leak check", "did it pass", "generate report", "regression check", or result directories. --- # Analyze Benchmark Results -Comprehensive post-run analysis: download results, CSV metrics, run comparison, heap/thread dumps, reports, and Kusto export. +Download results, generate a markdown report with metrics analysis, time-series charts, and multi-run comparison. -## 1. 
Download Results from VM +## Step 1 — Download Results -Auto-detect VM connection: -```bash -VM_IP=$(cat benchmark-config/vm-ip); VM_USER=$(cat benchmark-config/vm-user); VM_KEY=$(cat benchmark-config/vm-key) -``` +Download results from the VM to the local config directory: -Download a run's results: ```bash -scp -i $VM_KEY -r $VM_USER@$VM_IP:~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/ \ - ./results/ -``` - -Each run lives in its own directory. Never overwrite previous results — this enables baseline tracking. - -## 2. CSV Metrics Analysis - -### Workflow +# List available runs on VM +bash scripts/download-results.sh --config-dir "$CONFIG_DIR" --list -1. Find `monitor.csv` in the result directory. -2. Parse CSV columns (see `references/thresholds.md` for column definitions). -3. Cross-reference with lifecycle events from the benchmark log file (pattern: `[LIFECYCLE] timestamp=`). -4. Extract snapshots: - - **Baseline** = first row after `PRE_CREATE` - - **Peak** = row with highest `heap_used_kb` - - **Final** = last row (after `POST_CLOSE` + settle) -5. Compute: `thread_delta`, `heap_ratio` -6. Apply thresholds from `references/thresholds.md`. - -### Output Format +# Download a specific run +bash scripts/download-results.sh --config-dir "$CONFIG_DIR" --run-name +# Download all runs +bash scripts/download-results.sh --config-dir "$CONFIG_DIR" --all ``` -📊 Benchmark Results: -Branch: Commit: Scenario: - -HEAP: Baseline=MB Peak=MB After close=MB -THREADS: Baseline= Peak= After close= -FDs: Peak= -GC: Count= Time=ms -✅/🔴 Thread leak: delta= (threshold: ≤2) -✅/🔴 Memory leak: ratio= (threshold: ≤1.1) +Results are saved to `$CONFIG_DIR/results//`. Each run directory contains: -Overall: ✅ PASSED / 🔴 FAILED ``` - -## 3. Compare Two Runs - -1. Read `monitor.csv` from both directories. -2. Extract baseline, peak, final for each. -3. Read `git-info.json` from each for branch/commit. -4. Compute deltas. 
- -### Output - -``` -📊 Comparing: - Before: (branch: , commit: ) - After: (branch: , commit: ) - -| Metric | Before | After | Delta | Status | -|---------------------------|--------|--------|---------|--------| -| Threads after close | 218 | 19 | -199 | ✅ | -| Heap after close (MB) | 342 | 134 | -208 | ✅ | -| Heap ratio | 2.67 | 1.05 | -1.62 | ✅ | -| Peak FDs | 4428 | 4312 | -116 | ✅ | -| GC count | 847 | 812 | -35 | ✅ | - -Overall: ✅ Fix validated / 🔴 Regression detected +/ +├── monitor.csv # JVM metrics (threads, heap, FDs, GC, RSS, CPU) +├── metrics/ # Codahale CSV metrics (throughput, latency per operation) +│ ├── #Successful Operations.csv +│ ├── #Unsuccessful Operations.csv +│ └── ... # Per-tenant and per-operation variants +├── git-info.json # branch, commit SHA +├── gc.log # G1GC log +└── benchmark.log # Benchmark stdout/stderr ``` -Status: ✅ improved, 🟡 marginal (<10%), 🔴 regressed (>10% worse) - -## 4. Heap Dump Analysis - -### Locate heap dumps - -``` -results//heap-dumps/heap-PRE_CLOSE-*.hprof -results//heap-dumps/heap-POST_CLOSE-*.hprof -``` +## Step 2 — Generate Report -### Option A: HeapDumpAnalyzer (built into benchmark JAR) +Generate a markdown report from the downloaded results: ```bash -java -cp azure-cosmos-benchmark-*-jar-with-dependencies.jar \ - com.azure.cosmos.benchmark.HeapDumpAnalyzer +python3 scripts/generate-report.py \ + --results-dir "$CONFIG_DIR/results" \ + --output "$CONFIG_DIR/results/report.md" ``` -### Option B: Python hprof parser (lightweight, no deps) +To analyze specific runs only: ```bash -python3 references/parse_hprof.py --top 30 -python3 references/parse_hprof.py --diff --top 20 +python3 scripts/generate-report.py \ + --results-dir "$CONFIG_DIR/results" \ + --runs "20260302-SIMPLE-main,20260302-SIMPLE-fix-leak" ``` -### Option C: YourKit (detailed, requires license) +### What the report contains -If YourKit is installed on the VM: +#### Per-run summary -```bash -# Open snapshot in YourKit CLI -/bin/profiler.sh -export 
-snapshot= -csv -outdir= -``` +For each run, the report includes: +- **Git info**: branch, commit +- **JVM metrics table**: baseline, peak, and final values for threads, heap, RSS, FDs, GC +- **Pass/fail verdict**: thread leak (delta ≤2) and memory leak (ratio ≤1.1) checks +- **Throughput table**: Codahale metrics (ops/sec mean, 1m, 5m rates) from `metrics/*.csv` +- **Time-series SVG charts**: inline sparklines for threads, heap, FDs, RSS, GC count, CPU over time -Or analyze interactively via YourKit GUI by downloading the .hprof files locally. +#### Multi-run comparison table (when ≥2 runs) -### Interpret results +If multiple runs are present, the report includes: +- **Side-by-side metrics comparison**: threads, heap, heap ratio, thread delta, FDs, GC, RSS for each run +- **Throughput comparison**: ops/sec for each operation across runs -Look for classes with more instances/bytes after close — these indicate objects not released during `CosmosAsyncClient.close()`. Common suspects: Reactor schedulers, Netty connection pools, background threads, unbounded caches. +### Metrics analyzed -## 5. 
Thread Dump Analysis +From **`monitor.csv`** (JVM-level, sampled every 60s): +| Metric | Description | +|---|---| +| threads | Live thread count | +| heap_used_kb | Used heap (S1U+EU+OU from jstat) | +| heap_max_kb | Max heap capacity | +| rss_kb | Resident set size | +| fds | Open file descriptors | +| cpu_pct | CPU usage percentage | +| gc_count | Cumulative GC count | +| gc_time_ms | Cumulative GC time | -### Capture thread dump during benchmark +From **`metrics/*.csv`** (Codahale, per-operation): +| File | Metrics | +|---|---| +| `#Successful Operations.csv` | count, mean_rate, m1_rate, m5_rate | +| `#Unsuccessful Operations.csv` | count, mean_rate, m1_rate, m5_rate | +| Per-tenant/operation variants | Same columns per operation type | -```bash -# On the VM, while benchmark is running: -jcmd Thread.print > results//thread-dump-$(date +%s).txt +### Pass/fail thresholds -# Or using jstack: -jstack > results//thread-dump-$(date +%s).txt -``` +See `references/thresholds.md` for full details: -Capture multiple dumps at intervals to identify stuck or leaked threads: -```bash -for i in 1 2 3; do jcmd Thread.print > results//thread-dump-$i.txt; sleep 30; done -``` +| Check | Threshold | Verdict | +|---|---|---| +| Thread delta (final − baseline) | ≤ 2 | ✅ / 🔴 | +| Heap ratio (final / baseline) | ≤ 1.1 | ✅ / 🔴 | +| P99 latency scaling | < 5× at N=100 vs N=1 | 🟡 warn | +| Throughput scaling | > 0.7× at N=100 vs N=1 | 🟡 warn | -### Analyze thread dumps +## Step 3 — Thread Dump Analysis (optional) + +If thread dumps were captured during the run (via `capture-diagnostics.sh`): Look for: -- **Thread count growth**: Compare total thread counts across dumps -- **Stuck threads**: Same thread in same stack across multiple dumps -- **Leaked pools**: Thread pool threads that should have been shut down after client close -- **Daemon vs non-daemon**: Non-daemon threads prevent JVM exit +- **Thread count growth**: compare total counts across dumps +- **Stuck threads**: same thread in 
same stack across dumps +- **Leaked pools**: threads that should have been shut down after client close -Key thread name patterns in Cosmos SDK: +Key Cosmos SDK thread name patterns: - `cosmos-parallel-*` — SDK parallel scheduler - `reactor-http-*` — Reactor Netty event loop - `boundedElastic-*` — Reactor bounded elastic pool - `globalEndpointManager-*` — Cosmos endpoint refresh -## 6. Generate Markdown Report - -Produce a markdown report with embedded charts (as inline base64 images or Plotly HTML): - -### Using generate-dashboard.py - -```bash -python3 copilot/skills/cosmos-benchmark-analyze/scripts/generate-dashboard.py \ - /metrics \ - /benchmark.log \ - /report.html \ - /monitor.csv -``` - -Arguments: ` [monitor.csv]` - -Then reference the HTML dashboard in the markdown report or generate a standalone markdown file with metrics tables and verdicts. - -## 7. Export to Kusto - -See `references/kusto-schema.md` for: -- Table schema (`BenchmarkResults`, `BenchmarkSummary`) -- CSV enrichment commands (add run metadata to monitor.csv) -- `.ingest` commands for Azure Data Explorer -- Sample queries (latest runs, compare runs, trend over time) - -## 8. Application Insights Queries - -If the benchmark was run with `APPLICATIONINSIGHTS_CONNECTION_STRING`, query metrics: - -```bash -az monitor app-insights query \ - --app \ - --resource-group rg-cosmos-benchmark \ - --analytics-query "" -``` - -Placeholder: user will provide specific KQL queries for latency percentiles, error rates, and throughput over time. - -## References +## Scripts Reference -- **Pass/fail thresholds and CSV columns**: `references/thresholds.md` -- **Python hprof parser**: `references/parse_hprof.py` -- **Kusto table schema & ingestion**: `references/kusto-schema.md` -- **Dashboard generator**: `scripts/generate-dashboard.py` +| Script | Purpose | +|---|---| +| `scripts/download-results.sh` | Download results from VM to `$CONFIG_DIR/results/`. 
| +| `scripts/generate-report.py` | Generate markdown report with metrics, charts, and comparison tables. | +| `references/thresholds.md` | Pass/fail thresholds and monitor.csv column definitions. | diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/kusto-schema.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/kusto-schema.md deleted file mode 100644 index 540f15392a38..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/kusto-schema.md +++ /dev/null @@ -1,113 +0,0 @@ -# Kusto Table Schema for Benchmark Results - -## Table: BenchmarkResults - -```kql -.create table BenchmarkResults ( - RunId: string, - Timestamp: datetime, - Branch: string, - CommitSha: string, - Scenario: string, - TenantCount: int, - Operation: string, - ConnectionMode: string, - // Monitor metrics (from monitor.csv) - Threads: int, - FileDescriptors: int, - RssKb: long, - CpuPct: real, - HeapUsedKb: long, - HeapMaxKb: long, - GcCount: int, - GcTimeMs: long, - // Computed metrics - Phase: string, - ThreadDelta: int, - HeapRatio: real, - // Verdict - Passed: bool, - FailReason: string -) -``` - -## Ingestion from CSV - -### Step 1: Prepare CSV for ingestion - -Add run metadata columns to monitor.csv: - -```bash -# On the VM or locally after downloading results: -RUN_ID="" -BRANCH=$(jq -r .branch results//git-info.json) -COMMIT=$(jq -r .commitId results//git-info.json) - -awk -v run="$RUN_ID" -v branch="$BRANCH" -v commit="$COMMIT" \ - 'NR>1 {print run","$0","branch","commit}' \ - results//monitor.csv > results//monitor-enriched.csv -``` - -### Step 2: Ingest into Kusto - -```kql -.ingest into table BenchmarkResults ( - h'https://.blob.core.windows.net//monitor-enriched.csv' -) with (format='csv', ignoreFirstRecord=true) -``` - -Or inline from local file via Kusto Explorer / Azure Data Explorer web UI. 
- -### Step 3: Query examples - -```kql -// Latest runs summary -BenchmarkResults -| summarize MaxThreads=max(Threads), MaxHeapMB=max(HeapUsedKb)/1024, - FinalThreads=arg_max(Timestamp, Threads), FinalHeapKb=arg_max(Timestamp, HeapUsedKb) - by RunId, Branch, Scenario -| order by Timestamp desc - -// Compare two runs -let baseline = "20260226-CHURN-main-baseline"; -let fix = "20260226-CHURN-fix-leak"; -BenchmarkResults -| where RunId in (baseline, fix) -| summarize MaxThreads=max(Threads), MaxHeapMB=max(HeapUsedKb)/1024 by RunId -| order by RunId - -// Trend over time (multiple runs) -BenchmarkResults -| where Scenario == "CHURN" -| summarize FinalThreads=arg_max(Timestamp, Threads) by RunId, Branch -| order by Timestamp asc -| render timechart -``` - -## Table: BenchmarkSummary - -Aggregated per-run summary (one row per run): - -```kql -.create table BenchmarkSummary ( - RunId: string, - Timestamp: datetime, - Branch: string, - CommitSha: string, - Scenario: string, - TenantCount: int, - BaselineThreads: int, - PeakThreads: int, - FinalThreads: int, - ThreadDelta: int, - BaselineHeapKb: long, - PeakHeapKb: long, - FinalHeapKb: long, - HeapRatio: real, - PeakFDs: int, - TotalGcCount: int, - TotalGcTimeMs: long, - Passed: bool, - FailReasons: string -) -``` diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/parse_hprof.py b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/parse_hprof.py deleted file mode 100644 index f58fd86be534..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/references/parse_hprof.py +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/env python3 -""" -Lightweight .hprof histogram parser for benchmark heap dump analysis. -Does NOT load the entire heap into memory -- streams through the binary format. 
- -Usage: - python3 parse_hprof.py [--top N] - python3 parse_hprof.py --diff [--top N] -""" - -import struct -import sys -import argparse -from collections import defaultdict - -# HPROF binary format constants -HPROF_UTF8 = 0x01 -HPROF_LOAD_CLASS = 0x02 -HPROF_HEAP_DUMP = 0x0C -HPROF_HEAP_DUMP_SEG = 0x1C -HPROF_HEAP_DUMP_END = 0x2C - -# Heap dump sub-tags -HPROF_GC_CLASS_DUMP = 0x20 -HPROF_GC_INSTANCE_DUMP = 0x21 -HPROF_GC_OBJ_ARRAY_DUMP = 0x22 -HPROF_GC_PRIM_ARRAY_DUMP = 0x23 - -TYPE_SIZES = {2: 4, 4: 1, 5: 2, 6: 4, 7: 8, 8: 1, 9: 2, 10: 4, 11: 8} - - -def read_id(f, id_size): - data = f.read(id_size) - if len(data) < id_size: - return None - if id_size == 4: - return struct.unpack('>I', data)[0] - return struct.unpack('>Q', data)[0] - - -def parse_histogram(filepath): - """Parse an hprof file and return a dict of {class_name: (instance_count, total_bytes)}.""" - strings = {} # id -> utf8 string - class_names = {} # class_serial -> string_id - class_id_to_name = {} # class_obj_id -> class_name - class_instance_sizes = {} # class_obj_id -> instance_size - histogram = defaultdict(lambda: [0, 0]) # class_name -> [count, bytes] - - with open(filepath, 'rb') as f: - # Read header - header = b'' - while True: - c = f.read(1) - if c == b'\x00': - break - header += c - - id_size = struct.unpack('>I', f.read(4))[0] - f.read(8) # timestamp - - while True: - tag_data = f.read(1) - if not tag_data: - break - tag = tag_data[0] - f.read(4) # timestamp - length = struct.unpack('>I', f.read(4))[0] - - if tag == HPROF_UTF8: - str_id = read_id(f, id_size) - name = f.read(length - id_size).decode('utf-8', errors='replace') - strings[str_id] = name - - elif tag == HPROF_LOAD_CLASS: - serial = struct.unpack('>I', f.read(4))[0] - class_obj_id = read_id(f, id_size) - f.read(4) # stack trace serial - name_id = read_id(f, id_size) - if name_id in strings: - class_id_to_name[class_obj_id] = strings[name_id] - - elif tag in (HPROF_HEAP_DUMP, HPROF_HEAP_DUMP_SEG): - end_pos = f.tell() + 
length - while f.tell() < end_pos: - sub_tag_data = f.read(1) - if not sub_tag_data: - break - sub_tag = sub_tag_data[0] - - if sub_tag == HPROF_GC_CLASS_DUMP: - class_obj_id = read_id(f, id_size) - f.read(4) # stack serial - read_id(f, id_size) # super - read_id(f, id_size) # classloader - read_id(f, id_size) # signer - read_id(f, id_size) # protection domain - read_id(f, id_size) # reserved1 - read_id(f, id_size) # reserved2 - inst_size = struct.unpack('>I', f.read(4))[0] - class_instance_sizes[class_obj_id] = inst_size - - # constant pool - cp_count = struct.unpack('>H', f.read(2))[0] - for _ in range(cp_count): - f.read(2) # index - tp = f.read(1)[0] - f.read(TYPE_SIZES.get(tp, id_size)) - - # static fields - sf_count = struct.unpack('>H', f.read(2))[0] - for _ in range(sf_count): - read_id(f, id_size) # name - tp = f.read(1)[0] - f.read(TYPE_SIZES.get(tp, id_size)) - - # instance fields - if_count = struct.unpack('>H', f.read(2))[0] - for _ in range(if_count): - read_id(f, id_size) # name - f.read(1) # type - - elif sub_tag == HPROF_GC_INSTANCE_DUMP: - read_id(f, id_size) # obj id - f.read(4) # stack serial - class_id = read_id(f, id_size) - data_len = struct.unpack('>I', f.read(4))[0] - f.read(data_len) - - name = class_id_to_name.get(class_id, f'unknown_0x{class_id:x}') - inst_size = class_instance_sizes.get(class_id, data_len) - histogram[name][0] += 1 - histogram[name][1] += inst_size + data_len - - elif sub_tag == HPROF_GC_OBJ_ARRAY_DUMP: - read_id(f, id_size) # obj id - f.read(4) # stack serial - num_elements = struct.unpack('>I', f.read(4))[0] - array_class_id = read_id(f, id_size) - f.read(num_elements * id_size) - - name = class_id_to_name.get(array_class_id, 'Object[]') - histogram[name + '[]'][0] += 1 - histogram[name + '[]'][1] += num_elements * id_size - - elif sub_tag == HPROF_GC_PRIM_ARRAY_DUMP: - read_id(f, id_size) # obj id - f.read(4) # stack serial - num_elements = struct.unpack('>I', f.read(4))[0] - elem_type = f.read(1)[0] - elem_size = 
TYPE_SIZES.get(elem_type, 1) - f.read(num_elements * elem_size) - - type_names = {4: 'boolean', 5: 'char', 6: 'float', 7: 'double', - 8: 'byte', 9: 'short', 10: 'int', 11: 'long'} - name = type_names.get(elem_type, 'unknown') + '[]' - histogram[name][0] += 1 - histogram[name][1] += num_elements * elem_size - - elif sub_tag == 0xFF: - read_id(f, id_size) # obj id - elif sub_tag in (0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08): - read_id(f, id_size) # obj id - else: - # Unknown sub-tag, try to skip to end - remaining = end_pos - f.tell() - if remaining > 0: - f.seek(end_pos) - break - - else: - f.read(length) - - return dict(histogram) - - -def print_histogram(hist, top_n=30): - sorted_items = sorted(hist.items(), key=lambda x: x[1][1], reverse=True) - print(f"{'Class':<60} {'Count':>10} {'Bytes':>12}") - print('-' * 84) - for name, (count, bytes_) in sorted_items[:top_n]: - print(f"{name:<60} {count:>10} {bytes_:>12}") - total_count = sum(v[0] for v in hist.values()) - total_bytes = sum(v[1] for v in hist.values()) - print('-' * 84) - print(f"{'TOTAL':<60} {total_count:>10} {total_bytes:>12}") - - -def print_diff(pre_hist, post_hist, top_n=20): - all_classes = set(pre_hist.keys()) | set(post_hist.keys()) - diffs = [] - for cls in all_classes: - pre_count, pre_bytes = pre_hist.get(cls, (0, 0)) - post_count, post_bytes = post_hist.get(cls, (0, 0)) - d_count = post_count - pre_count - d_bytes = post_bytes - pre_bytes - if d_bytes != 0: - diffs.append((cls, pre_count, post_count, d_count, pre_bytes, post_bytes, d_bytes)) - - # Sort by absolute byte delta descending - diffs.sort(key=lambda x: abs(x[6]), reverse=True) - - print(f"{'Class':<55} {'PRE cnt':>8} {'POST cnt':>9} {'D cnt':>7} {'D bytes':>12}") - print('-' * 93) - for cls, pre_c, post_c, d_c, pre_b, post_b, d_b in diffs[:top_n]: - sign = '+' if d_b > 0 else '' - print(f"{cls:<55} {pre_c:>8} {post_c:>9} {d_c:>+7} {sign}{d_b:>11}") - print('-' * 93) - - total_pre = sum(v[1] for v in pre_hist.values()) - 
total_post = sum(v[1] for v in post_hist.values()) - print(f"Total: PRE={total_pre/1024/1024:.1f}MB POST={total_post/1024/1024:.1f}MB Delta={sign}{(total_post-total_pre)/1024/1024:.1f}MB") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Analyze .hprof heap dump files') - parser.add_argument('files', nargs='+', help='.hprof file(s)') - parser.add_argument('--top', type=int, default=30, help='Top N classes to show') - parser.add_argument('--diff', action='store_true', help='Compare two hprof files') - args = parser.parse_args() - - if args.diff and len(args.files) == 2: - print(f"Parsing {args.files[0]}...") - pre = parse_histogram(args.files[0]) - print(f"Parsing {args.files[1]}...") - post = parse_histogram(args.files[1]) - print() - print_diff(pre, post, args.top) - elif len(args.files) == 1: - print(f"Parsing {args.files[0]}...") - hist = parse_histogram(args.files[0]) - print() - print_histogram(hist, args.top) - else: - parser.print_help() diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/download-results.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/download-results.sh new file mode 100755 index 000000000000..049aec0f8e51 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/download-results.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# download-results.sh — Download benchmark results from VM to local machine +# +# Usage: +# ./download-results.sh --config-dir --run-name [--output-dir ./results] +# ./download-results.sh --config-dir --all [--output-dir ./results] +# ./download-results.sh --config-dir --list +# +# Modes: +# --run-name Download a specific run directory +# --all Download all runs from the VM +# --list List available runs on the VM (no download) + +set -euo pipefail + +CONFIG_DIR="" +RUN_NAME="" +OUTPUT_DIR="" +ALL=false +LIST=false 
+REMOTE_RESULTS="~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results" + +while [[ $# -gt 0 ]]; do + case $1 in + --config-dir) CONFIG_DIR="$2"; shift 2 ;; + --run-name) RUN_NAME="$2"; shift 2 ;; + --output-dir) OUTPUT_DIR="$2"; shift 2 ;; + --all) ALL=true; shift ;; + --list) LIST=true; shift ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +if [[ -z "$CONFIG_DIR" ]]; then + echo "Usage: $0 --config-dir (--run-name | --all | --list)" >&2 + exit 1 +fi + +# Default output to $CONFIG_DIR/results +[[ -z "$OUTPUT_DIR" ]] && OUTPUT_DIR="$CONFIG_DIR/results" + +VM_IP=$(cat "$CONFIG_DIR/vm-ip") +VM_USER=$(cat "$CONFIG_DIR/vm-user") +VM_KEY=$(cat "$CONFIG_DIR/vm-key") +SCP_CMD="scp -i $VM_KEY -o StrictHostKeyChecking=no -r" +SSH_CMD="ssh -i $VM_KEY -o StrictHostKeyChecking=no $VM_USER@$VM_IP" + +if [[ "$LIST" == "true" ]]; then + echo "Available runs on VM ($VM_IP):" + $SSH_CMD "ls -1d $REMOTE_RESULTS/*/ 2>/dev/null | while read d; do + NAME=\$(basename \$d) + HAS_MONITOR=\$(test -f \$d/monitor.csv && echo '📊' || echo '❌') + GIT_INFO='' + if [[ -f \$d/git-info.json ]]; then + GIT_INFO=\$(python3 -c \"import json; d=json.load(open('\$d/git-info.json')); print(f\\\"branch={d.get('branch','?')} commit={d.get('commit','?')}\\\")\" 2>/dev/null || echo '') + fi + echo \" \$HAS_MONITOR \$NAME \$GIT_INFO\" + done" 2>/dev/null || echo " (no runs found)" + exit 0 +fi + +mkdir -p "$OUTPUT_DIR" + +if [[ -n "$RUN_NAME" ]]; then + echo "Downloading: $RUN_NAME" + $SCP_CMD "$VM_USER@$VM_IP:$REMOTE_RESULTS/$RUN_NAME" "$OUTPUT_DIR/" + echo "✅ Downloaded to $OUTPUT_DIR/$RUN_NAME" + +elif [[ "$ALL" == "true" ]]; then + echo "Downloading all runs from VM..." 
+ RUNS=$($SSH_CMD "ls -1d $REMOTE_RESULTS/*/ 2>/dev/null | xargs -I{} basename {}" || echo "") + if [[ -z "$RUNS" ]]; then + echo "No runs found on VM" + exit 0 + fi + COUNT=0 + for RUN in $RUNS; do + echo " Downloading: $RUN" + $SCP_CMD "$VM_USER@$VM_IP:$REMOTE_RESULTS/$RUN" "$OUTPUT_DIR/" + COUNT=$((COUNT + 1)) + done + echo "✅ Downloaded $COUNT run(s) to $OUTPUT_DIR/" + +else + echo "Provide --run-name, --all, or --list" >&2 + exit 1 +fi diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-dashboard.py b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-dashboard.py deleted file mode 100644 index 41ac591345f8..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-dashboard.py +++ /dev/null @@ -1,157 +0,0 @@ -import csv, re, os, sys -from datetime import datetime, timezone -import plotly.graph_objects as go -from plotly.subplots import make_subplots - -metrics_dir = sys.argv[1] -log_file = sys.argv[2] -output_html = sys.argv[3] -# Optional: external monitor CSV -monitor_csv = sys.argv[4] if len(sys.argv) > 4 else None - -# Parse lifecycle events -lifecycle = [] -with open(log_file, 'r', encoding='utf-8', errors='ignore') as f: - for line in f: - m = re.search(r'\[LIFECYCLE\]\s+(\S+).*timestamp=(\S+)', line) - if m: - event = m.group(1) - ts_str = m.group(2) - try: - dt = datetime.fromisoformat(ts_str.replace('Z', '+00:00')) - lifecycle.append((dt.isoformat(), event)) - except: - pass - -def read_metric(filename, value_col=1): - filepath = os.path.join(metrics_dir, filename) - if not os.path.exists(filepath): - return [], [] - times, vals = [], [] - seen = set() - with open(filepath, 'r') as f: - reader = csv.reader(f) - next(reader) - for row in reader: - t = int(row[0]) - if t in seen: - continue - seen.add(t) - dt = datetime.fromtimestamp(t, tz=timezone.utc) - try: - v = float(row[value_col]) - except: - continue - 
times.append(dt.isoformat()) - vals.append(v) - return times, vals - -def read_monitor_csv(filepath): - if not filepath or not os.path.exists(filepath): - return {} - cols = {} - with open(filepath, 'r') as f: - reader = csv.DictReader(f) - for row in reader: - for key in row: - if key not in cols: - cols[key] = [] - cols[key].append(row[key]) - # Convert timestamps - if 'timestamp' in cols: - cols['time'] = cols['timestamp'] - elif 'epoch' in cols: - cols['time'] = [datetime.fromtimestamp(int(e), tz=timezone.utc).isoformat() for e in cols['epoch']] - # Convert numeric cols - for key in ['cpu_pct', 'rss_mb', 'vsz_mb', 'threads', 'fd_count', 'tcp_established', 'tcp_time_wait', 'tcp_close_wait']: - if key in cols: - cols[key] = [float(v) for v in cols[key]] - return cols - -# Load in-process metrics -heap_t, heap_v = read_metric('memory.heap.used.csv') -heap_v_mb = [v / (1024*1024) for v in heap_v] -thread_t, thread_v = read_metric('threads.count.csv') -success_t, success_v = read_metric('#Successful Operations.csv', value_col=1) -gc_t, gc_v = read_metric('gc.G1-Young-Generation.count.csv') - -# Load external monitor -mon = read_monitor_csv(monitor_csv) -has_monitor = 'time' in mon and len(mon['time']) > 0 - -# Determine layout -if has_monitor: - rows = 7 - titles = ('Heap Memory (MB) [JVM]', 'RSS Memory (MB) [OS]', 'CPU % [OS]', - 'Thread Count [JVM]', 'File Descriptors [OS]', 'TCP Connections [OS]', - 'Successful Ops (cumulative)') - heights = [0.16, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14] -else: - rows = 4 - titles = ('Heap Memory (MB)', 'Thread Count', 'Successful Ops (cumulative)', 'GC Count (G1 Young)') - heights = [0.3, 0.25, 0.25, 0.2] - -fig = make_subplots(rows=rows, cols=1, shared_xaxes=True, vertical_spacing=0.04, - subplot_titles=titles, row_heights=heights) - -if has_monitor: - # Row 1: Heap (JVM) - fig.add_trace(go.Scatter(x=heap_t, y=heap_v_mb, mode='lines', name='Heap Used', - line=dict(color='#2196F3', width=1), fill='tozeroy', 
fillcolor='rgba(33,150,243,0.1)'), row=1, col=1) - # Row 2: RSS (OS) - fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('rss_mb', []), mode='lines', name='RSS', - line=dict(color='#E91E63', width=2)), row=2, col=1) - # Row 3: CPU (OS) - fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('cpu_pct', []), mode='lines', name='CPU %', - line=dict(color='#F44336', width=1.5), fill='tozeroy', fillcolor='rgba(244,67,54,0.1)'), row=3, col=1) - # Row 4: Threads (JVM) - fig.add_trace(go.Scatter(x=thread_t, y=thread_v, mode='lines', name='Threads (JVM)', - line=dict(color='#FF9800', width=2)), row=4, col=1) - fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('threads', []), mode='lines', name='Threads (OS)', - line=dict(color='#FF9800', width=1, dash='dot')), row=4, col=1) - # Row 5: FDs (OS) - fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('fd_count', []), mode='lines', name='File Descriptors', - line=dict(color='#795548', width=2)), row=5, col=1) - # Row 6: TCP (OS) - fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('tcp_established', []), mode='lines', name='TCP ESTAB', - line=dict(color='#009688', width=2)), row=6, col=1) - fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('tcp_time_wait', []), mode='lines', name='TCP TIME_WAIT', - line=dict(color='#CDDC39', width=1)), row=6, col=1) - fig.add_trace(go.Scatter(x=mon['time'], y=mon.get('tcp_close_wait', []), mode='lines', name='TCP CLOSE_WAIT', - line=dict(color='#FF5722', width=1)), row=6, col=1) - # Row 7: Success ops - fig.add_trace(go.Scatter(x=success_t, y=success_v, mode='lines', name='Success Ops', - line=dict(color='#4CAF50', width=2)), row=7, col=1) -else: - fig.add_trace(go.Scatter(x=heap_t, y=heap_v_mb, mode='lines', name='Heap Used MB', - line=dict(color='#2196F3', width=1), fill='tozeroy', fillcolor='rgba(33,150,243,0.1)'), row=1, col=1) - fig.add_trace(go.Scatter(x=thread_t, y=thread_v, mode='lines', name='Threads', - line=dict(color='#FF9800', width=2)), row=2, col=1) - 
fig.add_trace(go.Scatter(x=success_t, y=success_v, mode='lines', name='Success Ops', - line=dict(color='#4CAF50', width=2)), row=3, col=1) - fig.add_trace(go.Scatter(x=gc_t, y=gc_v, mode='lines', name='GC Count', - line=dict(color='#9C27B0', width=1.5)), row=4, col=1) - -# Lifecycle vertical lines -event_colors = {'CYCLE_START': 'green', 'POST_CREATE': 'blue', 'POST_WORKLOAD': 'orange', - 'POST_CLOSE': 'red', 'POST_SETTLE': 'gray', 'COMPLETE': 'black', 'PRE_CREATE': 'lightgray'} -shapes = [] -annotations = [] -for dt_str, event in lifecycle: - color = event_colors.get(event, 'gray') - shapes.append(dict(type='line', x0=dt_str, x1=dt_str, y0=0, y1=1, yref='paper', - line=dict(color=color, width=1, dash='dot'))) - short = event.replace('POST_', '').replace('CYCLE_', '') - annotations.append(dict(x=dt_str, y=1.02, yref='paper', text=short, - showarrow=False, font=dict(size=7, color=color), textangle=-45)) - -monitor_label = ' + OS Monitor' if has_monitor else '' -fig.update_layout( - shapes=shapes, annotations=annotations, - title=f'CHURN Benchmark Dashboard{monitor_label}', - height=250 * rows, showlegend=True, hovermode='x unified', template='plotly_white', - legend=dict(orientation='h', y=-0.03) -) - -fig.write_html(output_html, include_plotlyjs=True) -print(f'Dashboard ({rows} panels, monitor={"yes" if has_monitor else "no"}): {output_html}') diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-report.py b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-report.py new file mode 100755 index 000000000000..b185282008e7 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-analyze/scripts/generate-report.py @@ -0,0 +1,373 @@ +#!/usr/bin/env python3 +""" +generate-report.py — Generate a markdown benchmark report from result directories. + +Usage: + python3 generate-report.py --results-dir [--output report.md] [--runs run1,run2,...] 
+ +Reads: + - monitor.csv (JVM metrics: threads, heap, FDs, GC) + - metrics/*.csv (Codahale: throughput, latency) + - git-info.json (branch, commit) + - gc.log (GC pause summary) + +Generates: + - Markdown report with per-run summaries, time-series tables, inline SVG charts, + and a comparison table if multiple runs are present. +""" + +import argparse +import csv +import json +import os +import sys +from datetime import datetime +from pathlib import Path + + +def read_csv(path): + """Read a CSV file and return list of dicts.""" + if not os.path.isfile(path): + return [] + with open(path, newline="") as f: + reader = csv.DictReader(f) + return [row for row in reader] + + +def read_json(path): + """Read a JSON file.""" + if not os.path.isfile(path): + return {} + with open(path) as f: + return json.load(f) + + +def safe_int(val, default=0): + try: + return int(float(val)) + except (ValueError, TypeError): + return default + + +def safe_float(val, default=0.0): + try: + return float(val) + except (ValueError, TypeError): + return default + + +def parse_monitor(rows): + """Extract baseline, peak, and final snapshots from monitor.csv rows.""" + if not rows: + return None + numeric_rows = [] + for r in rows: + numeric_rows.append({ + "timestamp": r.get("timestamp", ""), + "threads": safe_int(r.get("threads")), + "fds": safe_int(r.get("fds")), + "rss_kb": safe_int(r.get("rss_kb")), + "cpu_pct": safe_float(r.get("cpu_pct")), + "heap_used_kb": safe_int(r.get("heap_used_kb")), + "heap_max_kb": safe_int(r.get("heap_max_kb")), + "gc_count": safe_int(r.get("gc_count")), + "gc_time_ms": safe_int(r.get("gc_time_ms")), + }) + + baseline = numeric_rows[0] + peak = max(numeric_rows, key=lambda r: r["heap_used_kb"]) + final = numeric_rows[-1] + + return { + "baseline": baseline, + "peak": peak, + "final": final, + "rows": numeric_rows, + "count": len(numeric_rows), + } + + +def parse_throughput(metrics_dir): + """Read Codahale CSV metrics for throughput.""" + result = {} + if not 
os.path.isdir(metrics_dir): + return result + for fname in os.listdir(metrics_dir): + if not fname.endswith(".csv"): + continue + path = os.path.join(metrics_dir, fname) + rows = read_csv(path) + if not rows: + continue + label = fname.replace(".csv", "") + last = rows[-1] + result[label] = { + "count": safe_int(last.get("count", 0)), + "mean_rate": safe_float(last.get("mean_rate", 0)), + "m1_rate": safe_float(last.get("m1_rate", 0)), + "m5_rate": safe_float(last.get("m5_rate", 0)), + "rows": rows, + } + return result + + +def generate_svg_chart(rows, key, label, width=600, height=150): + """Generate an inline SVG sparkline chart for a metric over time.""" + values = [r[key] for r in rows] + if not values or max(values) == 0: + return "" + + n = len(values) + max_val = max(values) + min_val = min(values) + val_range = max_val - min_val if max_val != min_val else 1 + padding = 10 + + points = [] + for i, v in enumerate(values): + x = padding + (i / max(n - 1, 1)) * (width - 2 * padding) + y = height - padding - ((v - min_val) / val_range) * (height - 2 * padding) + points.append(f"{x:.1f},{y:.1f}") + + polyline = " ".join(points) + + svg = f""" + {label} (min={min_val:,.0f}, max={max_val:,.0f}) + + +""" + return svg + + +def run_summary(run_dir): + """Generate summary data for a single run.""" + git_info = read_json(os.path.join(run_dir, "git-info.json")) + monitor_rows = read_csv(os.path.join(run_dir, "monitor.csv")) + monitor = parse_monitor(monitor_rows) + throughput = parse_throughput(os.path.join(run_dir, "metrics")) + run_name = os.path.basename(run_dir) + + return { + "name": run_name, + "dir": run_dir, + "git": git_info, + "monitor": monitor, + "throughput": throughput, + } + + +def format_run_section(summary): + """Format a markdown section for one run.""" + lines = [] + name = summary["name"] + git = summary["git"] + monitor = summary["monitor"] + throughput = summary["throughput"] + + branch = git.get("branch", git.get("branchName", "?")) + commit = 
git.get("commitId", git.get("commit", "?")) + + lines.append(f"### {name}") + lines.append(f"") + lines.append(f"**Branch:** `{branch}` | **Commit:** `{commit}`") + lines.append("") + + if monitor: + b, p, f_ = monitor["baseline"], monitor["peak"], monitor["final"] + thread_delta = f_["threads"] - b["threads"] + heap_ratio = f_["heap_used_kb"] / b["heap_used_kb"] if b["heap_used_kb"] > 0 else 0 + thread_status = "✅" if thread_delta <= 2 else "🔴" + heap_status = "✅" if heap_ratio <= 1.1 else "🔴" + overall = "✅ PASSED" if (thread_delta <= 2 and heap_ratio <= 1.1) else "🔴 FAILED" + + lines.append("#### JVM Metrics") + lines.append("") + lines.append("| Metric | Baseline | Peak | Final |") + lines.append("|--------|----------|------|-------|") + lines.append(f"| Threads | {b['threads']} | {p['threads']} | {f_['threads']} |") + lines.append(f"| Heap (MB) | {b['heap_used_kb']//1024} | {p['heap_used_kb']//1024} | {f_['heap_used_kb']//1024} |") + lines.append(f"| RSS (MB) | {b['rss_kb']//1024} | {p['rss_kb']//1024} | {f_['rss_kb']//1024} |") + lines.append(f"| FDs | {b['fds']} | {p['fds']} | {f_['fds']} |") + lines.append(f"| GC count | {b['gc_count']} | — | {f_['gc_count']} |") + lines.append(f"| GC time (ms) | {b['gc_time_ms']} | — | {f_['gc_time_ms']} |") + lines.append("") + lines.append(f"- {thread_status} Thread leak: delta={thread_delta} (threshold: ≤2)") + lines.append(f"- {heap_status} Memory leak: ratio={heap_ratio:.2f} (threshold: ≤1.1)") + lines.append(f"- **Overall: {overall}**") + lines.append("") + + # Time-series charts + rows = monitor["rows"] + if len(rows) > 2: + lines.append("#### Time Series") + lines.append("") + for key, label in [ + ("threads", "Threads"), + ("heap_used_kb", "Heap Used (KB)"), + ("fds", "File Descriptors"), + ("rss_kb", "RSS (KB)"), + ("gc_count", "GC Count"), + ("cpu_pct", "CPU %"), + ]: + svg = generate_svg_chart(rows, key, label) + if svg: + lines.append(svg) + lines.append("") + + if throughput: + lines.append("#### Throughput 
Metrics") + lines.append("") + lines.append("| Metric | Count | Mean Rate (ops/s) | 1m Rate | 5m Rate |") + lines.append("|--------|-------|-------------------|---------|---------|") + for label, data in sorted(throughput.items()): + lines.append( + f"| {label} | {data['count']:,} | {data['mean_rate']:.1f} | " + f"{data['m1_rate']:.1f} | {data['m5_rate']:.1f} |" + ) + lines.append("") + + return "\n".join(lines) + + +def format_comparison(summaries): + """Format a comparison table across multiple runs.""" + lines = [] + lines.append("## Comparison") + lines.append("") + + # Header + header = "| Metric |" + separator = "|--------|" + for s in summaries: + branch = s["git"].get("branch", s["git"].get("branchName", "?")) + commit = s["git"].get("commitId", s["git"].get("commit", "?"))[:7] + header += f" {branch} ({commit}) |" + separator += "--------|" + lines.append(header) + lines.append(separator) + + # Rows + metrics = [ + ("Threads (final)", lambda s: s["monitor"]["final"]["threads"] if s["monitor"] else "—"), + ("Heap final (MB)", lambda s: s["monitor"]["final"]["heap_used_kb"] // 1024 if s["monitor"] else "—"), + ("Heap ratio", lambda s: f"{s['monitor']['final']['heap_used_kb'] / s['monitor']['baseline']['heap_used_kb']:.2f}" if s["monitor"] and s["monitor"]["baseline"]["heap_used_kb"] > 0 else "—"), + ("Thread delta", lambda s: s["monitor"]["final"]["threads"] - s["monitor"]["baseline"]["threads"] if s["monitor"] else "—"), + ("Peak FDs", lambda s: s["monitor"]["peak"]["fds"] if s["monitor"] else "—"), + ("GC count", lambda s: s["monitor"]["final"]["gc_count"] if s["monitor"] else "—"), + ("GC time (ms)", lambda s: s["monitor"]["final"]["gc_time_ms"] if s["monitor"] else "—"), + ("RSS peak (MB)", lambda s: s["monitor"]["peak"]["rss_kb"] // 1024 if s["monitor"] else "—"), + ] + + for label, extractor in metrics: + row = f"| {label} |" + for s in summaries: + val = extractor(s) + row += f" {val} |" + lines.append(row) + + # Throughput comparison + 
all_throughput_keys = set() + for s in summaries: + all_throughput_keys.update(s["throughput"].keys()) + + if all_throughput_keys: + lines.append("") + lines.append("### Throughput Comparison") + lines.append("") + header = "| Metric |" + separator = "|--------|" + for s in summaries: + branch = s["git"].get("branch", s["git"].get("branchName", "?"))[:20] + header += f" {branch} |" + separator += "--------|" + lines.append(header) + lines.append(separator) + + for key in sorted(all_throughput_keys): + row = f"| {key} (ops/s) |" + for s in summaries: + if key in s["throughput"]: + row += f" {s['throughput'][key]['mean_rate']:.1f} |" + else: + row += " — |" + lines.append(row) + + lines.append("") + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser(description="Generate benchmark report") + parser.add_argument("--results-dir", required=True, help="Directory containing run subdirectories") + parser.add_argument("--output", default=None, help="Output markdown file (default: /report.md)") + parser.add_argument("--runs", default=None, help="Comma-separated run names (default: all in results-dir)") + args = parser.parse_args() + + results_dir = args.results_dir + if not os.path.isdir(results_dir): + print(f"ERROR: {results_dir} not found", file=sys.stderr) + sys.exit(1) + + # Find runs + if args.runs: + run_names = [r.strip() for r in args.runs.split(",")] + else: + run_names = sorted([ + d for d in os.listdir(results_dir) + if os.path.isdir(os.path.join(results_dir, d)) + ]) + + if not run_names: + print("No runs found", file=sys.stderr) + sys.exit(1) + + output_path = args.output or os.path.join(results_dir, "report.md") + + # Generate report + summaries = [] + for name in run_names: + run_dir = os.path.join(results_dir, name) + if not os.path.isdir(run_dir): + print(f"WARNING: {run_dir} not found, skipping", file=sys.stderr) + continue + summaries.append(run_summary(run_dir)) + + report = [] + report.append(f"# Benchmark Report") + 
report.append(f"") + report.append(f"Generated: {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}") + report.append(f"") + report.append(f"Runs: {len(summaries)}") + report.append(f"") + + # Comparison table (if multiple runs) + if len(summaries) > 1: + report.append(format_comparison(summaries)) + + # Per-run details + report.append("## Run Details") + report.append("") + for s in summaries: + report.append(format_run_section(s)) + report.append("---") + report.append("") + + # Write output + content = "\n".join(report) + with open(output_path, "w") as f: + f.write(content) + + print(f"✅ Report generated: {output_path}") + print(f" Runs analyzed: {len(summaries)}") + for s in summaries: + branch = s["git"].get("branch", s["git"].get("branchName", "?")) + overall = "?" + if s["monitor"]: + td = s["monitor"]["final"]["threads"] - s["monitor"]["baseline"]["threads"] + hr = s["monitor"]["final"]["heap_used_kb"] / s["monitor"]["baseline"]["heap_used_kb"] if s["monitor"]["baseline"]["heap_used_kb"] > 0 else 0 + overall = "✅" if (td <= 2 and hr <= 1.1) else "🔴" + print(f" {overall} {s['name']} ({branch})") + + +if __name__ == "__main__": + main() diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/SKILL.md deleted file mode 100644 index ad36bf5ad6e1..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/SKILL.md +++ /dev/null @@ -1,215 +0,0 @@ ---- -name: cosmos-benchmark-provision -description: Provision Azure infrastructure for Cosmos DB benchmarks — create or reuse Cosmos DB accounts, Application Insights, and Azure VMs. Verifies region capacity before creating resources. Use when the user needs to create benchmark resources, reuse existing ones, provision infrastructure, or set up a DR drill. 
Triggers on "provision", "create accounts", "create VM", "create app insights", "setup infrastructure", "DR drill setup", "reuse existing". ---- - -# Provision Benchmark Infrastructure - -Create or reuse Azure resources needed for a benchmark or DR drill. All resources must be in the **same region**. - -## Before You Start - -### 1. Choose region - -Ask the user for the target Azure region. All resources will be co-located. - -### 2. Verify capacity - -**Check quotas before creating anything.** If any check fails, prompt the user to choose a different region. - -```bash -# VM vCPU quota -az vm list-usage --location -o table | grep -i "Standard DSv5" - -# Cosmos DB account count (limit is typically 50 per subscription) -az cosmosdb list --query "length(@)" - -# App Insights limit check (200 per subscription per region) -az monitor app-insights component list --query "[?location==''] | length(@)" -``` - -### 3. Choose create vs. reuse - -For each resource type, ask the user: -- **Create new** — provision fresh resources -- **Reuse existing** — point to config files or discover from Azure - -## 1. Cosmos DB Accounts - -### Create new - -Ask user for: count (N), naming prefix, consistency level (default: Session), throughput (default: 10000 RU/s). 
- -```bash -# Create resource group (shared by all benchmark resources) -az group create --name rg-cosmos-benchmark --location - -# Create N accounts in parallel -for i in $(seq 0 $((N-1))); do - az cosmosdb create \ - --resource-group rg-cosmos-benchmark \ - --name "${PREFIX}${i}" \ - --locations regionName= failoverPriority=0 \ - --default-consistency-level Session \ - --kind GlobalDocumentDB & -done -wait - -# Create database + container in each -for i in $(seq 0 $((N-1))); do - az cosmosdb sql database create \ - --resource-group rg-cosmos-benchmark \ - --account-name "${PREFIX}${i}" \ - --name benchdb - - az cosmosdb sql container create \ - --resource-group rg-cosmos-benchmark \ - --account-name "${PREFIX}${i}" \ - --database-name benchdb \ - --name benchcol \ - --partition-key-path /id \ - --throughput 10000 -done -``` - -### Export credentials - -Generate `clientHostAndKey.txt` for the **setup** skill: - -```bash -for i in $(seq 0 $((N-1))); do - ENDPOINT=$(az cosmosdb show -g rg-cosmos-benchmark -n "${PREFIX}${i}" --query documentEndpoint -o tsv) - KEY=$(az cosmosdb keys list -g rg-cosmos-benchmark -n "${PREFIX}${i}" --query primaryMasterKey -o tsv) - echo "${PREFIX}${i},${ENDPOINT},${KEY}" -done > clientHostAndKey.txt -``` - -### Reuse existing - -Ask user how to discover existing accounts: -- **Option A**: Point to an existing `clientHostAndKey.txt` file -- **Option B**: Discover from resource group: - ```bash - az cosmosdb list -g --query "[].{name:name, endpoint:documentEndpoint}" -o table - ``` - Then export credentials using the loop above with the discovered account names. - -## 2. 
Application Insights - -### Create new - -```bash -az monitor app-insights component create \ - --app \ - --location \ - --resource-group rg-cosmos-benchmark \ - --kind web \ - --application-type web -``` - -### Get connection string - -```bash -AI_CONN_STR=$(az monitor app-insights component show \ - --app \ - --resource-group rg-cosmos-benchmark \ - --query connectionString -o tsv) -echo "$AI_CONN_STR" > benchmark-config/app-insights-connection-string.txt -``` - -The **run** skill uses this via environment variable `APPLICATIONINSIGHTS_CONNECTION_STRING`. - -### Reuse existing - -Ask user how to discover: -- **Option A**: Provide the connection string directly -- **Option B**: Discover from resource group: - ```bash - az monitor app-insights component list -g --query "[].{name:name, connectionString:connectionString}" -o table - ``` - -Save to `benchmark-config/app-insights-connection-string.txt`. - -## 3. Azure VMs - -### Create new (via provision script) - -```bash -bash copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh \ - --new --location --create-key \ - [--size Standard_D16s_v5] \ - [--disk-size 256] \ - [--rg rg-cosmos-benchmark] \ - [--vm-name vm-benchmark-01] \ - [--skip-setup] -``` - -Script flags: -| Flag | Default | Description | -|---|---|---| -| `--new` | — | Create a new VM | -| `--location` | eastus | Azure region | -| `--create-key [path]` | — | Generate SSH key pair (optional path) | -| `--ssh-key ` | — | Use existing public key | -| `--size` | Standard_D16s_v5 | VM SKU | -| `--disk-size` | 256 | OS disk in GB | -| `--rg` | rg-cosmos-benchmark | Resource group | -| `--vm-name` | vm-benchmark-01 | VM name | -| `--skip-setup` | false | Skip auto-running setup-benchmark-vm.sh | - -### Create new (manual Azure CLI) - -```bash -az vm create \ - --resource-group rg-cosmos-benchmark \ - --name vm-benchmark-01 \ - --image Ubuntu2204 \ - --size Standard_D16s_v5 \ - --accelerated-networking true \ - --admin-username benchuser \ - 
--generate-ssh-keys \ - --os-disk-size-gb 256 \ - --storage-sku Premium_LRS -az vm open-port --resource-group rg-cosmos-benchmark --name vm-benchmark-01 --port 22 -``` - -### Reuse existing VM - -```bash -# Option A: provide IP directly -bash copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh \ - --existing --ip --user benchuser --key ~/.ssh/id_rsa - -# Option B: discover from resource group + VM name -bash copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh \ - --existing --rg --vm-name --key ~/.ssh/id_rsa -``` - -### Connection info saved - -The provision script saves: -- `benchmark-config/vm-ip` — VM public IP -- `benchmark-config/vm-user` — SSH username -- `benchmark-config/vm-key` — path to SSH private key -- `benchmark-config/vm-config.env` — all three in `KEY=VALUE` format - -Read `references/vm-sizing.md` for workload-specific VM sizing. - -## 4. Resource Group Cleanup - -When done with all benchmarks: -```bash -az group delete --name rg-cosmos-benchmark --yes --no-wait -``` - -## After Provisioning - -Use the **cosmos-benchmark-setup** skill to: -- Install JDK/Maven/tools on the VM -- Generate `tenants.json` from `clientHostAndKey.txt` -- Clone repo and build the benchmark JAR - -## References - -- **VM sizing by workload**: `references/vm-sizing.md` -- **Provision script**: `scripts/provision-benchmark-vm.sh` diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/setup-result-storage.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/setup-result-storage.sh deleted file mode 100644 index 69faaa231418..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/setup-result-storage.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# setup-result-storage.sh — Create Cosmos DB containers for benchmark results -# See §9.2.3 of the test plan. 
- -set -euo pipefail - -RESOURCE_GROUP="${1:-rg-cosmos-benchmark}" -ACCOUNT_NAME="${2:-cosmos-bench-results}" -LOCATION="${3:-eastus}" - -echo "=== Creating result storage ===" -echo " RG: $RESOURCE_GROUP" -echo " Account: $ACCOUNT_NAME" -echo " Location: $LOCATION" - -az cosmosdb create --name "$ACCOUNT_NAME" --resource-group "$RESOURCE_GROUP" \ - --default-consistency-level Session --locations regionName="$LOCATION" failoverPriority=0 - -az cosmosdb sql database create --account-name "$ACCOUNT_NAME" \ - --resource-group "$RESOURCE_GROUP" --name benchresults - -az cosmosdb sql container create --account-name "$ACCOUNT_NAME" \ - --resource-group "$RESOURCE_GROUP" --database-name benchresults \ - --name runs --partition-key-path /scenario --throughput 400 - -az cosmosdb sql container create --account-name "$ACCOUNT_NAME" \ - --resource-group "$RESOURCE_GROUP" --database-name benchresults \ - --name snapshots --partition-key-path /testRunId --throughput 400 --default-ttl 2592000 - -echo "" -echo "=== Result storage ready ===" -echo " export RESULT_COSMOS_ENDPOINT=$(az cosmosdb show -n $ACCOUNT_NAME -g $RESOURCE_GROUP --query documentEndpoint -o tsv)" -echo " export RESULT_COSMOS_KEY=$(az cosmosdb keys list -n $ACCOUNT_NAME -g $RESOURCE_GROUP --query primaryMasterKey -o tsv)" diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md index dc60bf3899b9..37970d3510d4 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md @@ -1,123 +1,118 @@ --- name: cosmos-benchmark-run -description: Run a Cosmos DB benchmark scenario. Supports branch/PR/commit checkout, CHURN preset and custom scenarios, auto-configures App Insights monitoring from provision output, runs in tmux on remote VMs, and supports multi-VM parallel execution. 
Triggers on "run benchmark", "execute test", "test fix on branch", "run from PR", "start benchmark", "DR drill". +description: Build and run Cosmos DB benchmarks — clone repo at a branch/PR/commit, generate tenants.json, build the benchmark JAR, and execute scenarios on remote VMs. Supports multiple refs for comparison. Triggers on "run benchmark", "execute test", "start benchmark", "build benchmark", "create tenants.json", "DR drill". --- -# Run a Benchmark +# Run Benchmark -Execute a benchmark on one or more VMs. Always uses `run-benchmark.sh` wrapper (includes `monitor.sh` + git metadata capture). +Clone, build, and execute a benchmark on a provisioned VM. All operations are implemented as scripts. Each ref uses a single SSH session for checkout → build → verify → run. ## VM Connection -Auto-detect from provision output: +Read VM connection info from the config directory provided during resource setup: ```bash -VM_IP=$(cat benchmark-config/vm-ip) -VM_USER=$(cat benchmark-config/vm-user) -VM_KEY=$(cat benchmark-config/vm-key) -SSH_CMD="ssh -i $VM_KEY $VM_USER@$VM_IP" +CONFIG_DIR="" ``` -## 1. Configure Monitoring +## Step 1 — Collect Inputs -### Application Insights (auto-configure from provision output) +Ask the user for: -If `benchmark-config/app-insights-connection-string.txt` exists: +1. **Refs to benchmark** — one or more branches, commits, PRs, or tags: + - Single: `main` + - Multiple for comparison: `main, fix/telemetry-leak` + - PR + baseline: `PR#12345, main` + - Commit SHAs: `abc1234, def5678` -```bash -AI_CONN_STR=$(cat benchmark-config/app-insights-connection-string.txt) -$SSH_CMD "echo 'export APPLICATIONINSIGHTS_CONNECTION_STRING=\"$AI_CONN_STR\"' >> ~/.bashrc" -``` +2. 
**Scenario preset** (default: `SIMPLE`): -If not found, ask the user whether to: -- Provide an App Insights connection string -- Skip App Insights (local CSV metrics only) + | Preset | Operations | Duration | Use case | + |---|---|---|---| + | **SIMPLE** | ReadThroughput | ~30 min | Quick validation, single-op benchmark | + | **EXPAND** | ReadThroughput → WriteThroughput → QueryOrderby | ~90 min | Full performance profile | + | **CHURN** | Configurable (cycles) | Varies | Leak detection, resource cleanup validation | -### Graphite (optional) + See `references/scenarios.md` for custom operation flags. -```bash -$SSH_CMD "echo 'export GRAPHITE_SERVICE_ADDRESS=\":\"' >> ~/.bashrc" -``` +3. **tenants.json customization** — use defaults or adjust operation, concurrency, connection mode. -## 2. Scenario Selection +## Step 2 — Generate Config -Read `references/presets.md` for preset flag recipes. +Generate `tenants.json` from the credentials exported during resource setup: -### CHURN preset (default — leak detection) - -Tests client create/close resource leaks (threads, connections, memory). - -``` --cycles 5 -numberOfOperations 500 +```bash +bash scripts/generate-tenants.sh \ + --config-dir "$CONFIG_DIR" \ + --output tenants.json \ + --copy-to-vm ``` -The harness auto-applies when cycles > 1: -- `settleTimeMs=90000` -- `suppressCleanup=true` -- `gcBetweenCycles=true` - -### Custom scenarios +Options: +| Flag | Default | Description | +|---|---|---| +| `--config-dir` | — | **Required.** Path to config directory | +| `--output` | `tenants.json` | Output file path | +| `--operation` | `ReadThroughput` | Default operation | +| `--connection-mode` | `GATEWAY` | `GATEWAY` or `DIRECT` | +| `--concurrency` | `20` | Concurrent operations | +| `--copy-to-vm` | false | SCP the file to `~/tenants.json` on VM | -Read `references/scenarios.md` for the full operation catalog (20 types) and tuning parameters. -Users can pass any combination of CLI flags for custom workloads. 
+Ensure `tenants.json` and `clientHostAndKey.txt` are in `.gitignore` — they contain secrets. -## 3. Execute on Single VM +## Step 3 — Run Benchmarks -Always run inside tmux so the benchmark survives SSH disconnection. +### Single or multiple refs ```bash -$SSH_CMD "tmux new-session -d -s bench 'cd ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark && \ - bash copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh CHURN ~/tenants.json ./results/ [extra-flags]'" +bash scripts/run-all-refs.sh \ + --config-dir "$CONFIG_DIR" \ + --refs "main, fix/telemetry-leak" \ + --scenario SIMPLE ``` -### Monitor progress +The orchestrator, for each ref, uses **a single SSH session** to: +1. Checkout the ref (auto-detects branch/PR/commit/tag) +2. Build linting-extensions + cosmos benchmark JAR +3. Verify readiness (JDK, JAR, config, disk) +4. Execute the benchmark -```bash -# Attach to tmux session -$SSH_CMD -t "tmux attach -t bench" +Results are saved to `results/--/` on the VM. -# Or peek at output without attaching -$SSH_CMD "tmux capture-pane -t bench -p | tail -30" +If any ref fails, it's skipped and the next ref proceeds. A summary is printed at the end. -# Check if monitor.csv is growing -$SSH_CMD "wc -l ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results//monitor.csv" +### Multiple VMs (parallel) + +Run `run-all-refs.sh` with different `--config-dir` paths pointing to different VMs: + +```bash +bash scripts/run-all-refs.sh --config-dir "$CONFIG_DIR_VM1" --refs "main" --scenario SIMPLE & +bash scripts/run-all-refs.sh --config-dir "$CONFIG_DIR_VM2" --refs "fix/leak" --scenario SIMPLE & +wait ``` ### Run naming convention -Use descriptive names: `--`, e.g.: +Runs are named `--`, e.g.: ``` -20260226-CHURN-fix-telemetry-leak -20260226-CHURN-main-baseline +20260302-SIMPLE-main +20260302-EXPAND-fix-telemetry-leak +20260302-CHURN-PR-12345 ``` -## 4. 
Execute on Multiple VMs (parallel) - -For comparing versions or running different scenarios simultaneously: +### Monitor progress ```bash -# VM 1: baseline (main branch) -ssh -i $VM_KEY $VM_USER@ "tmux new-session -d -s bench \ - 'cd ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark && \ - bash copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh CHURN ~/tenants.json ./results/baseline-main'" - -# VM 2: fix branch -ssh -i $VM_KEY $VM_USER@ "tmux new-session -d -s bench \ - 'cd ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark && \ - bash copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh CHURN ~/tenants.json ./results/fix-branch'" -``` +SSH_CMD="ssh -i $(cat $CONFIG_DIR/vm-key) $(cat $CONFIG_DIR/vm-user)@$(cat $CONFIG_DIR/vm-ip)" -To test different SDK versions on different VMs, use the **setup** skill on each VM with different branch/PR/commit targets before running. - -## 5. What run-benchmark.sh Does +# Peek at output +$SSH_CMD "tmux capture-pane -t bench -p | tail -30" -The wrapper script (`copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh`) handles: -1. Captures git metadata (branch, commit) → `git-info.json` -2. Launches JVM with `-Xmx8g -XX:+UseG1GC` + GC logging -3. Spawns `monitor.sh` in parallel for external JVM monitoring → `monitor.csv` -4. Cleans up monitoring on exit +# Check monitor.csv +$SSH_CMD "wc -l ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results//monitor.csv" +``` -### Output directory structure +## Output Directory Structure ``` results// @@ -130,11 +125,18 @@ results// ## After Run -Suggest using the **cosmos-benchmark-analyze** skill to analyze results. - -## References - -- **Preset flag recipes**: `references/presets.md` -- **Full operation catalog & custom scenarios**: `references/scenarios.md` -- **Run script**: `scripts/run-benchmark.sh` -- **Trigger script**: `scripts/trigger-benchmark.sh` +Suggest using the **cosmos-benchmark-analyze** skill to download and analyze results. 
+ +## Scripts Reference + +| Script | Purpose | +|---|---| +| `scripts/run-all-refs.sh` | **Orchestrator.** For each ref: sends `vm-prepare-and-run.sh` to VM via single SSH. | +| `scripts/vm-prepare-and-run.sh` | **Runs ON VM.** Checkout → build → verify → run for one ref. | +| `scripts/generate-tenants.sh` | Generate tenants.json from clientHostAndKey.txt. | +| `scripts/run-benchmark.sh` | Execute benchmark with monitoring (git metadata, GC log, monitor.csv). | +| `scripts/monitor.sh` | External JVM monitoring (spawned by run-benchmark.sh). | +| `scripts/capture-diagnostics.sh` | Capture thread/heap dumps and JFR recordings during a live run. | +| `references/tenants-sample.json` | Template for tenants.json structure. | +| `references/presets.md` | Preset flag recipes (SIMPLE, EXPAND, CHURN). | +| `references/scenarios.md` | Full operation catalog (20+ types). | diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/references/tenants-sample.json b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/references/tenants-sample.json similarity index 100% rename from sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/references/tenants-sample.json rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/references/tenants-sample.json diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/generate-tenants.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/generate-tenants.sh new file mode 100755 index 000000000000..61efcff1e0a1 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/generate-tenants.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# generate-tenants.sh — Generate tenants.json from clientHostAndKey.txt +# +# Usage: +# ./generate-tenants.sh --config-dir [--output tenants.json] [--operation ReadThroughput] +# [--connection-mode GATEWAY] [--concurrency 20] [--copy-to-vm] 
+# +# Reads $CONFIG_DIR/clientHostAndKey.txt (format: name,endpoint,key per line) +# Generates tenants.json with globalDefaults and tenant entries. + +set -euo pipefail + +CONFIG_DIR="" +OUTPUT="tenants.json" +OPERATION="ReadThroughput" +CONNECTION_MODE="GATEWAY" +CONCURRENCY="20" +NUM_OPERATIONS="100000" +NUM_PRECREATED="1000" +COPY_TO_VM=false + +while [[ $# -gt 0 ]]; do + case $1 in + --config-dir) CONFIG_DIR="$2"; shift 2 ;; + --output) OUTPUT="$2"; shift 2 ;; + --operation) OPERATION="$2"; shift 2 ;; + --connection-mode) CONNECTION_MODE="$2"; shift 2 ;; + --concurrency) CONCURRENCY="$2"; shift 2 ;; + --num-operations) NUM_OPERATIONS="$2"; shift 2 ;; + --num-precreated) NUM_PRECREATED="$2"; shift 2 ;; + --copy-to-vm) COPY_TO_VM=true; shift ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +if [[ -z "$CONFIG_DIR" ]]; then + echo "Usage: $0 --config-dir [options]" >&2 + exit 1 +fi + +INPUT="$CONFIG_DIR/clientHostAndKey.txt" +if [[ ! -f "$INPUT" ]]; then + echo "ERROR: $INPUT not found" >&2 + exit 1 +fi + +# Build tenants array +TENANTS="" +INDEX=0 +while IFS=',' read -r NAME ENDPOINT KEY; do + [[ -z "$NAME" ]] && continue + if [[ $INDEX -gt 0 ]]; then + TENANTS="$TENANTS," + fi + TENANTS="$TENANTS + { + \"id\": \"tenant-$INDEX\", + \"serviceEndpoint\": \"$ENDPOINT\", + \"masterKey\": \"$KEY\", + \"databaseId\": \"benchdb\", + \"containerId\": \"benchcol\" + }" + INDEX=$((INDEX + 1)) +done < "$INPUT" + +# Write tenants.json +cat > "$OUTPUT" << EOF +{ + "globalDefaults": { + "connectionMode": "$CONNECTION_MODE", + "consistencyLevel": "SESSION", + "concurrency": "$CONCURRENCY", + "numberOfOperations": "$NUM_OPERATIONS", + "operation": "$OPERATION", + "numberOfPreCreatedDocuments": "$NUM_PRECREATED", + "connectionSharingAcrossClientsEnabled": "false", + "maxConnectionPoolSize": "1000", + "applicationName": "cosmos-bench" + }, + "tenants": [$TENANTS + ] +} +EOF + +echo "Generated $OUTPUT with $INDEX tenant(s)" +echo " Operation: $OPERATION" +echo " 
Connection mode: $CONNECTION_MODE" +echo " Concurrency: $CONCURRENCY" + +# Copy to VM if requested +if [[ "$COPY_TO_VM" == "true" ]]; then + VM_IP=$(cat "$CONFIG_DIR/vm-ip") + VM_USER=$(cat "$CONFIG_DIR/vm-user") + VM_KEY=$(cat "$CONFIG_DIR/vm-key") + scp -i "$VM_KEY" -o StrictHostKeyChecking=no "$OUTPUT" "$VM_USER@$VM_IP:~/tenants.json" + echo "Copied to VM: ~/tenants.json" +fi diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh new file mode 100755 index 000000000000..451fee359b11 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# run-all-refs.sh — Orchestrate benchmark execution across multiple git refs +# +# Usage: +# ./run-all-refs.sh --config-dir --refs "main,fix/leak" [--scenario SIMPLE] +# +# For each ref, runs vm-prepare-and-run.sh on the VM via a single SSH session +# (checkout → build → verify → run — all in one connection). 
+ +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +CONFIG_DIR="" +SCENARIO="SIMPLE" +REFS_CSV="" +TENANTS_FILE="~/tenants.json" +EXTRA_FLAGS="" + +while [[ $# -gt 0 ]]; do + case $1 in + --config-dir) CONFIG_DIR="$2"; shift 2 ;; + --scenario) SCENARIO="$2"; shift 2 ;; + --refs) REFS_CSV="$2"; shift 2 ;; + --tenants-file) TENANTS_FILE="$2"; shift 2 ;; + --extra-flags) EXTRA_FLAGS="$2"; shift 2 ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +if [[ -z "$CONFIG_DIR" || -z "$REFS_CSV" ]]; then + echo "Usage: $0 --config-dir --refs \"ref1,ref2,...\" [--scenario SIMPLE]" >&2 + exit 1 +fi + +VM_IP=$(cat "$CONFIG_DIR/vm-ip") +VM_USER=$(cat "$CONFIG_DIR/vm-user") +VM_KEY=$(cat "$CONFIG_DIR/vm-key") +SSH_CMD="ssh -i $VM_KEY -o StrictHostKeyChecking=no $VM_USER@$VM_IP" + +IFS=',' read -ra REFS <<< "$REFS_CSV" +TOTAL=${#REFS[@]} +DATE=$(date +%Y%m%d) + +echo "=============================================" +echo " Benchmark Run: $TOTAL ref(s), scenario=$SCENARIO" +echo "=============================================" +for i in "${!REFS[@]}"; do + echo " [$((i+1))/$TOTAL] $(echo "${REFS[$i]}" | xargs)" +done +echo "=============================================" +echo "" + +SUCCEEDED=0 +FAILED=0 +RESULTS=() + +for i in "${!REFS[@]}"; do + REF=$(echo "${REFS[$i]}" | xargs) + REF_LABEL=$(echo "$REF" | tr '/' '-' | tr '#' '-') + RUN_NAME="${DATE}-${SCENARIO}-${REF_LABEL}" + SEQ="[$((i+1))/$TOTAL]" + + echo "" + echo "$SEQ Starting: $REF → $RUN_NAME" + echo " (single SSH session: checkout → build → verify → run)" + + # Send vm-prepare-and-run.sh to VM and execute — 1 SSH session per ref + $SSH_CMD "bash -s" < "$SCRIPT_DIR/vm-prepare-and-run.sh" \ + -- "$REF" "$SCENARIO" "$TENANTS_FILE" "$RUN_NAME" $EXTRA_FLAGS + RUN_EXIT=$? 
+ + if [[ $RUN_EXIT -eq 0 ]]; then + echo "$SEQ ✅ Completed: $REF → results/$RUN_NAME" + SUCCEEDED=$((SUCCEEDED + 1)) + RESULTS+=("✅ $RUN_NAME") + else + echo "$SEQ ❌ Failed: $REF (exit=$RUN_EXIT)" + FAILED=$((FAILED + 1)) + RESULTS+=("❌ $RUN_NAME") + fi +done + +echo "" +echo "=============================================" +echo " Summary: $SUCCEEDED/$TOTAL succeeded, $FAILED/$TOTAL failed" +echo "=============================================" +for R in "${RESULTS[@]}"; do + echo " $R" +done +echo "" + +if [[ $TOTAL -gt 1 ]]; then + echo "💡 Download and compare results:" + echo " bash scripts/download-results.sh --config-dir $CONFIG_DIR --all" +fi + +[[ $FAILED -gt 0 ]] && exit 1 +exit 0 diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/trigger-benchmark.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/trigger-benchmark.sh deleted file mode 100644 index 48660ee16c87..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/trigger-benchmark.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/bash -# trigger-benchmark.sh — Checkout a branch/PR, build, and run benchmark -# -# Usage: -# ./trigger-benchmark.sh --branch --scenario --tenants [options] -# ./trigger-benchmark.sh --pr --scenario --tenants [options] -# ./trigger-benchmark.sh --compare --scenario --tenants - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -SDK_DIR="${SDK_DIR:-$(cd "$SCRIPT_DIR/../../.." 
&& pwd)}" -BRANCH="" -PR_NUMBER="" -COMPARE_A="" -COMPARE_B="" -SCENARIO="SCALING" -TENANTS_FILE="tenants.json" -RESULT_SINK="CSV" -SKIP_BUILD=false -EXTRA_ARGS="" - -while [[ $# -gt 0 ]]; do - case $1 in - --branch) BRANCH="$2"; shift ;; - --pr) PR_NUMBER="$2"; shift ;; - --compare) COMPARE_A="$2"; COMPARE_B="$3"; shift 2 ;; - --scenario) SCENARIO="$2"; shift ;; - --tenants) TENANTS_FILE="$2"; shift ;; - --result-sink) RESULT_SINK="$2"; shift ;; - --sdk-dir) SDK_DIR="$2"; shift ;; - --skip-build) SKIP_BUILD=true ;; - *) EXTRA_ARGS="$EXTRA_ARGS $1" ;; - esac - shift -done - -build_and_run() { - local ref="$1" - local label="$2" - - echo "" - echo "════════════════════════════════════════════════════════" - echo " Building and running: $label ($ref)" - echo "════════════════════════════════════════════════════════" - - cd "$SDK_DIR" - - # Checkout - if [[ "$ref" =~ ^[0-9]+$ ]]; then - echo "Fetching PR #${ref}..." - git fetch origin pull/${ref}/head:pr-${ref} - git checkout pr-${ref} - else - echo "Checking out branch: $ref" - git fetch origin "$ref" - git checkout "$ref" - git pull origin "$ref" 2>/dev/null || true - fi - - COMMIT_ID=$(git rev-parse --short HEAD) - BRANCH_NAME=$(git rev-parse --abbrev-ref HEAD) - echo " Commit: $COMMIT_ID" - - # Build - if [[ "$SKIP_BUILD" == "false" ]]; then - echo "Building azure-cosmos + benchmark module..." - mvn install -pl sdk/cosmos/azure-cosmos -am -DskipTests -q - mvn package -pl sdk/cosmos/azure-cosmos-benchmark -DskipTests -q - echo "Build complete." 
- fi - - # Run - local OUTPUT_DIR="./results/$(date +%Y%m%dT%H%M%S)-${label}-${SCENARIO}" - "$SCRIPT_DIR/run-benchmark.sh" "$SCENARIO" "$TENANTS_FILE" "$OUTPUT_DIR" \ - --branch "$BRANCH_NAME" \ - ${PR_NUMBER:+--pr "$PR_NUMBER"} \ - --result-sink "$RESULT_SINK" \ - $EXTRA_ARGS - - echo " Results: $OUTPUT_DIR" - echo "$OUTPUT_DIR" >> .last-benchmark-runs -} - -if [[ -n "$COMPARE_A" && -n "$COMPARE_B" ]]; then - build_and_run "$COMPARE_A" "before" - build_and_run "$COMPARE_B" "after" - echo "" - echo "════════════════════════════════════════════════════════" - echo " Both runs complete. Compare results:" - tail -2 .last-benchmark-runs - echo "════════════════════════════════════════════════════════" -elif [[ -n "$PR_NUMBER" ]]; then - build_and_run "$PR_NUMBER" "pr-${PR_NUMBER}" -elif [[ -n "$BRANCH" ]]; then - build_and_run "$BRANCH" "$BRANCH" -else - echo "Usage: $0 --branch | --pr | --compare " - exit 1 -fi diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh new file mode 100755 index 000000000000..df66f898d582 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh @@ -0,0 +1,138 @@ +#!/bin/bash +# vm-prepare-and-run.sh — Runs ON the VM. Checkout, build, verify, and run benchmark for one ref. +# +# Usage (executed on VM via SSH): +# bash vm-prepare-and-run.sh [extra-flags...] +# +# Ref auto-detection: +# PR# → fetches pull request +# tag: → checks out tag +# 7-40 hex chars → commit SHA +# anything else → branch name +# +# Exit 0 on success, 1 on failure. 
+ +set -uo pipefail + +REF="${1:?Usage: $0 [extra-flags...]}" +SCENARIO="${2:-SIMPLE}" +TENANTS_FILE="${3:-~/tenants.json}" +RUN_NAME="${4:-$(date +%Y%m%d)-${SCENARIO}-run}" +shift 4 || true +EXTRA_FLAGS="$*" + +REPO_DIR=~/azure-sdk-for-java +BENCH_DIR=$REPO_DIR/sdk/cosmos/azure-cosmos-benchmark +export PATH=/opt/apache-maven-3.9.12/bin:$PATH +MAVEN_FLAGS="-e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true -Dspotbugs.skip=true -Drevapi.skip=true" + +echo "=============================================" +echo " Ref: $REF" +echo " Scenario: $SCENARIO" +echo " Run name: $RUN_NAME" +echo "=============================================" + +# --- Step 1: Checkout --- +echo "" +echo "=== [1/4] Checkout: $REF ===" + +if [[ ! -d "$REPO_DIR/.git" ]]; then + echo "Cloning repo..." + git clone --depth 1 https://github.com/Azure/azure-sdk-for-java.git "$REPO_DIR" +fi + +cd "$REPO_DIR" + +if [[ "$REF" =~ ^PR#([0-9]+)$ ]]; then + PR_NUM="${BASH_REMATCH[1]}" + git fetch origin "pull/$PR_NUM/head:pr-$PR_NUM" + git checkout "pr-$PR_NUM" +elif [[ "$REF" =~ ^tag:(.+)$ ]]; then + TAG="${BASH_REMATCH[1]}" + git fetch --tags origin + git checkout "tags/$TAG" +elif [[ "$REF" =~ ^[0-9a-f]{7,40}$ ]]; then + git fetch origin + git checkout "$REF" +else + git fetch --depth 1 origin "$REF" + git checkout "$REF" + git pull origin "$REF" 2>/dev/null || true +fi + +BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "detached") +COMMIT=$(git rev-parse --short HEAD) +echo "Checked out: $BRANCH @ $COMMIT" + +# --- Step 2: Build --- +echo "" +echo "=== [2/4] Build ===" + +cd "$REPO_DIR" +mvn -e -DskipTests -pl sdk/tools/linting-extensions install + +cd "$REPO_DIR/sdk/cosmos" +mvn $MAVEN_FLAGS -pl ,azure-cosmos -am clean install +mvn $MAVEN_FLAGS -pl ,azure-cosmos-test clean install +mvn $MAVEN_FLAGS -pl ,azure-cosmos-encryption clean install +mvn $MAVEN_FLAGS -pl ,azure-cosmos-benchmark clean package -P package-assembly + +JAR=$(ls 
"$BENCH_DIR/target/"*jar-with-dependencies.jar 2>/dev/null | head -1) +if [[ -z "$JAR" ]]; then + echo "❌ Build failed — JAR not found" + exit 1 +fi +echo "✅ Built: $(basename $JAR)" + +# --- Step 3: Verify --- +echo "" +echo "=== [3/4] Verify ===" + +READY=true + +if ! java -version 2>&1 | grep -qi "openjdk"; then + echo " ❌ JDK not found"; READY=false +else + echo " ✅ JDK: $(java -version 2>&1 | head -1)" +fi + +TENANTS_RESOLVED=$(eval echo "$TENANTS_FILE") +if [[ -f "$TENANTS_RESOLVED" ]]; then + echo " ✅ Config: $TENANTS_FILE" +else + echo " ❌ Config: $TENANTS_FILE not found"; READY=false +fi + +DISK_AVAIL=$(df -BG / | tail -1 | awk '{print $4}' | tr -d 'G') +if [[ "$DISK_AVAIL" -ge 10 ]]; then + echo " ✅ Disk: ${DISK_AVAIL}GB" +else + echo " ❌ Disk: ${DISK_AVAIL}GB (<10GB)"; READY=false +fi + +if [[ "$READY" != "true" ]]; then + echo "❌ Readiness check failed" + exit 1 +fi + +# --- Step 4: Run --- +echo "" +echo "=== [4/4] Run: $SCENARIO ===" + +cd "$BENCH_DIR" +if [[ -f "copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh" ]]; then + bash copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh \ + "$SCENARIO" "$TENANTS_FILE" "./results/$RUN_NAME" $EXTRA_FLAGS +else + echo "WARNING: run-benchmark.sh not found, running JAR directly" + mkdir -p "results/$RUN_NAME" + java -Xmx8g -Xms8g -XX:+UseG1GC \ + -Xlog:gc*:"results/$RUN_NAME/gc.log" \ + -jar "$JAR" \ + -tenantsFile "$TENANTS_RESOLVED" \ + -reportingDirectory "results/$RUN_NAME/metrics" \ + 2>&1 | tee "results/$RUN_NAME/benchmark.log" +fi + +echo "" +echo "✅ Completed: $REF ($BRANCH @ $COMMIT) → results/$RUN_NAME" diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md new file mode 100644 index 000000000000..9f2b7ee37930 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md @@ -0,0 +1,221 @@ +--- 
+name: cosmos-benchmark-setup-resources +description: Set up Azure resources for Cosmos DB benchmarks — create or reuse Cosmos DB accounts, Application Insights, and Azure VMs. Installs JDK/Maven on VMs. Validates region capacity before creating resources. Triggers on "provision", "create accounts", "create VM", "setup resources", "setup infrastructure", "DR drill setup", "reuse existing". +--- + +# Setup Resources + +Create or reuse Azure resources needed for a benchmark or DR drill, and prepare the VM with build tools. This is the first step of the benchmark workflow. + +All deterministic operations are implemented as scripts in `scripts/`. The agent's role is to gather user input, run the appropriate scripts, and report results. + +## Step 1 — Config Directory + +Ask the user where to persist resource configuration files (VM connection info, Cosmos DB credentials, App Insights connection strings). All downstream skills read from this directory. + +```bash +CONFIG_DIR="" +mkdir -p "$CONFIG_DIR" +``` + +**⚠️ If the chosen path is inside the git repository**, warn the user: +> "This directory is inside the git repo. Files here contain credentials (keys, connection strings) and risk being committed. Consider using a path outside the repo, or ensure it is added to `.gitignore`." + +If the user confirms a repo-internal path, add it to `.gitignore`: +```bash +echo "$CONFIG_DIR/" >> .gitignore +``` + +### Reuse existing config + +Check if the config directory already has all expected files: + +```bash +ls "$CONFIG_DIR"/vm-ip "$CONFIG_DIR"/vm-user "$CONFIG_DIR"/vm-key \ + "$CONFIG_DIR"/clientHostAndKey.txt \ + "$CONFIG_DIR"/app-insights-connection-string.txt 2>/dev/null +``` + +If all files are present → skip to the **setup benchmark** skill. If some are missing → create only the missing resources. 
+ +## Step 2 — Confirm Tenant and Subscription + +```bash +az account show --query "{tenant:tenantId, subscription:name, subscriptionId:id}" -o table +``` + +Ask the user to confirm or switch: +- **Switch tenant**: `az login --tenant ` +- **Switch subscription**: `az account set --subscription ` +- **List subscriptions**: `az account list --query "[].{name:name, id:id, tenantId:tenantId}" -o table` + +## Step 3 — Gather Resource Requirements + +Ask the user what resources are needed. Defaults: + +| Resource | Default | Customizable | +|---|---|---| +| Azure VM | 1 × Standard_D16s_v5 | Count, size, disk | +| Cosmos DB accounts | 1 | Count (N), naming prefix, consistency level | +| Application Insights | 1 | Name | +| Resource group | `rg-cosmos-benchmark-YYYYMMDD` (today's date) | Name | +| Region | `westus2` | Any Azure region | + +For each resource type, ask: **Create new** or **Reuse existing**. + +## Step 4 — Validate Capacity and Select Region + +Run the validation script against the preferred region (default: `westus2`): + +```bash +bash scripts/validate-capacity.sh \ + --region \ + --vm-size \ + --vm-count \ + --cosmos-count \ + --app-insights-count 1 +``` + +The script checks: +- VM SKU availability in the region (and suggests similar SKUs if exact match unavailable) +- vCPU quota (enough headroom for requested VMs) +- Cosmos DB account quota (subscription-wide limit of ~50) +- Application Insights quota (per-region limit of ~200) +- Resource provider registration (auto-registers if needed) + +**Output**: JSON summary with per-check pass/fail, available capacity, and messages. + +### If preferred region fails + +Run the region finder to automatically scan candidate regions: + +```bash +bash scripts/find-region.sh \ + --preferred \ + --vm-size \ + --vm-count \ + --cosmos-count +``` + +The script: +1. Validates the preferred region first +2. 
If it fails, tries candidates: `westus2`, `eastus`, `eastus2`, `westus3`, `centralus`, `northeurope`, `westeurope`, `southeastasia`, `australiaeast` +3. Outputs the first region where all checks pass +4. If a region has a similar but not identical VM SKU, reports the alternative +5. If no region works, outputs `NONE` and a failure summary + +**Present the result to the user for confirmation** before proceeding. + +## Step 5 — Create Resources + +Once region and requirements are confirmed, run the orchestrator to create all resources in parallel: + +```bash +bash scripts/provision-all.sh \ + --config-dir "$CONFIG_DIR" \ + --region \ + --rg $RG \ + --cosmos-prefix \ + --cosmos-count \ + --app-insights-name \ + --vm-name vm-benchmark-01 \ + --vm-size Standard_D16s_v5 \ + --create-key +``` + +The orchestrator: +1. Creates the resource group +2. Launches **in parallel**: Cosmos DB accounts, App Insights, and VM creation +3. Waits for all three to complete +4. Exports Cosmos DB credentials to `$CONFIG_DIR/clientHostAndKey.txt` +5. Runs `verify-resources.sh` to confirm everything is ready + +Each sub-task logs to `$CONFIG_DIR/logs/` for debugging if anything fails. 
+ +**Do not proceed to the next step until `provision-all.sh` exits with code 0.** + +### Reuse existing resources + +If the user wants to reuse existing resources instead of creating new ones, handle each resource type individually: + +#### Cosmos DB — reuse + +Point to an existing `clientHostAndKey.txt`, or discover accounts from a resource group: + +```bash +bash scripts/export-cosmos-credentials.sh \ + --rg $RG --discover --config-dir "$CONFIG_DIR" +``` + +#### App Insights — reuse + +Discover and save the connection string: + +```bash +az monitor app-insights component list -g $RG \ + --query "[].{name:name, connectionString:connectionString}" -o table + +# Save the chosen connection string +echo "" > "$CONFIG_DIR/app-insights-connection-string.txt" +``` + +#### VM — reuse + +```bash +bash scripts/provision-benchmark-vm.sh \ + --existing --ip --user benchuser --key ~/.ssh/id_rsa \ + --config-dir "$CONFIG_DIR" +``` + +### After reuse: verify + +Always run the verification gate after reusing resources: + +```bash +bash scripts/verify-resources.sh --config-dir "$CONFIG_DIR" +``` + +## Step 6 — Verification Gate + +**This step is mandatory before proceeding.** If `provision-all.sh` was used, verification already ran. If resources were reused or created manually, run explicitly: + +```bash +bash scripts/verify-resources.sh --config-dir "$CONFIG_DIR" [--cosmos-count ] +``` + +The script checks: +- ✅ Config directory exists with expected files +- ✅ `clientHostAndKey.txt` has the expected number of entries +- ✅ `app-insights-connection-string.txt` is non-empty +- ✅ VM is SSH-reachable +- ✅ JDK and Maven are installed on the VM + +**Exit code 0 = proceed. 
Exit code 1 = fix issues first.** + +## Resource Group Cleanup + +When done with all benchmarks: +```bash +az group delete --name $RG --yes --no-wait +``` + +## After Setup Resources + +Proceed to the **cosmos-benchmark-run** skill to: +- Clone the repo (at a specific branch/PR/commit) on the VM +- Generate `tenants.json` from `$CONFIG_DIR/clientHostAndKey.txt` +- Build the benchmark JAR +- Execute the benchmark + +## Scripts Reference + +| Script | Purpose | +|---|---| +| `scripts/provision-all.sh` | **Orchestrator.** Creates RG → launches Cosmos/AppInsights/VM in parallel → exports credentials → verifies. | +| `scripts/validate-capacity.sh` | Check region capacity (VM SKU, quotas). JSON output. | +| `scripts/find-region.sh` | Find first region passing all capacity checks. | +| `scripts/create-cosmos-accounts.sh` | Create N Cosmos DB accounts with progress logging. | +| `scripts/export-cosmos-credentials.sh` | Export Cosmos DB credentials to `clientHostAndKey.txt`. | +| `scripts/provision-benchmark-vm.sh` | Create/connect VM, install tools (JDK/Maven/tmux), save config. | +| `scripts/verify-resources.sh` | **Gate.** Verify all resources are ready before proceeding. | +| `references/vm-sizing.md` | VM sizing recommendations by workload. 
| diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/references/vm-sizing.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/references/vm-sizing.md similarity index 100% rename from sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/references/vm-sizing.md rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/references/vm-sizing.md diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/create-cosmos-accounts.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/create-cosmos-accounts.sh new file mode 100755 index 000000000000..91430440d477 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/create-cosmos-accounts.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# create-cosmos-accounts.sh — Create Cosmos DB accounts sequentially with status tracking +# +# Usage: +# ./create-cosmos-accounts.sh --rg --prefix --count --region +# +# Accounts are created one at a time. Progress is logged to /tmp/cosmos-account-creation.log. 
+# Run in background with: nohup ./create-cosmos-accounts.sh [args] & + +set -uo pipefail + +RG="" +PREFIX="" +COUNT=1 +REGION="westus2" +CONSISTENCY="Session" +LOG="/tmp/cosmos-account-creation.log" + +while [[ $# -gt 0 ]]; do + case $1 in + --rg) RG="$2"; shift 2 ;; + --prefix) PREFIX="$2"; shift 2 ;; + --count) COUNT="$2"; shift 2 ;; + --region) REGION="$2"; shift 2 ;; + --consistency) CONSISTENCY="$2"; shift 2 ;; + --log) LOG="$2"; shift 2 ;; + *) echo "Unknown flag: $1"; exit 1 ;; + esac +done + +if [[ -z "$RG" || -z "$PREFIX" ]]; then + echo "Usage: $0 --rg --prefix [--count N] [--region region]" + exit 1 +fi + +echo "$(date): Starting creation of $COUNT Cosmos DB accounts" | tee "$LOG" +echo " Resource group: $RG" | tee -a "$LOG" +echo " Prefix: $PREFIX" | tee -a "$LOG" +echo " Region: $REGION" | tee -a "$LOG" +echo " Consistency: $CONSISTENCY" | tee -a "$LOG" +echo "" | tee -a "$LOG" + +SUCCESS=0 +FAILED=0 + +for i in $(seq 0 $((COUNT - 1))); do + NAME="${PREFIX}${i}" + echo "$(date): [$((i+1))/$COUNT] Creating ${NAME}..." | tee -a "$LOG" + if az cosmosdb create \ + --resource-group "$RG" \ + --name "$NAME" \ + --locations regionName="$REGION" failoverPriority=0 \ + --default-consistency-level "$CONSISTENCY" \ + --kind GlobalDocumentDB 2>>"$LOG" 1>/dev/null; then + echo "$(date): ✅ ${NAME} created" | tee -a "$LOG" + SUCCESS=$((SUCCESS + 1)) + else + echo "$(date): ❌ ${NAME} FAILED" | tee -a "$LOG" + FAILED=$((FAILED + 1)) + fi +done + +echo "" | tee -a "$LOG" +echo "$(date): Complete. 
Succeeded: $SUCCESS / $COUNT, Failed: $FAILED / $COUNT" | tee -a "$LOG" + +if [[ $FAILED -gt 0 ]]; then + exit 1 +fi diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/export-cosmos-credentials.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/export-cosmos-credentials.sh new file mode 100755 index 000000000000..ed7bac174979 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/export-cosmos-credentials.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# export-cosmos-credentials.sh — Export Cosmos DB account credentials to clientHostAndKey.txt +# +# Usage: +# ./export-cosmos-credentials.sh --rg --prefix --count --config-dir +# ./export-cosmos-credentials.sh --rg --discover --config-dir +# +# Modes: +# --prefix + --count: Export credentials for accounts named 0, 1, ... +# --discover: List all accounts in the resource group and export all + +set -euo pipefail + +RG="" +PREFIX="" +COUNT=0 +CONFIG_DIR="" +DISCOVER=false + +while [[ $# -gt 0 ]]; do + case $1 in + --rg) RG="$2"; shift 2 ;; + --prefix) PREFIX="$2"; shift 2 ;; + --count) COUNT="$2"; shift 2 ;; + --config-dir) CONFIG_DIR="$2"; shift 2 ;; + --discover) DISCOVER=true; shift ;; + *) echo "Unknown flag: $1" >&2; exit 1 ;; + esac +done + +if [[ -z "$RG" || -z "$CONFIG_DIR" ]]; then + echo "Usage: $0 --rg --config-dir (--prefix --count | --discover)" >&2 + exit 1 +fi + +mkdir -p "$CONFIG_DIR" +OUTPUT_FILE="$CONFIG_DIR/clientHostAndKey.txt" + +if [[ "$DISCOVER" == "true" ]]; then + echo "Discovering Cosmos DB accounts in resource group: $RG" + ACCOUNTS=$(az cosmosdb list -g "$RG" --query "[].name" -o tsv 2>/dev/null) + if [[ -z "$ACCOUNTS" ]]; then + echo "No Cosmos DB accounts found in $RG" >&2 + exit 1 + fi + EXPORTED=0 + > "$OUTPUT_FILE" + for ACCT in $ACCOUNTS; do + ENDPOINT=$(az cosmosdb show -g "$RG" -n "$ACCT" --query documentEndpoint -o tsv) + KEY=$(az cosmosdb keys list 
-g "$RG" -n "$ACCT" --query primaryMasterKey -o tsv) + echo "${ACCT},${ENDPOINT},${KEY}" >> "$OUTPUT_FILE" + EXPORTED=$((EXPORTED + 1)) + echo " Exported: $ACCT ($EXPORTED)" + done + echo "Exported $EXPORTED accounts to $OUTPUT_FILE" + +elif [[ -n "$PREFIX" && "$COUNT" -gt 0 ]]; then + echo "Exporting credentials for $COUNT accounts with prefix: $PREFIX" + EXPORTED=0 + FAILED=0 + > "$OUTPUT_FILE" + for i in $(seq 0 $((COUNT - 1))); do + NAME="${PREFIX}${i}" + ENDPOINT=$(az cosmosdb show -g "$RG" -n "$NAME" --query documentEndpoint -o tsv 2>/dev/null) + if [[ -z "$ENDPOINT" ]]; then + echo " ❌ $NAME not found — skipping" >&2 + FAILED=$((FAILED + 1)) + continue + fi + KEY=$(az cosmosdb keys list -g "$RG" -n "$NAME" --query primaryMasterKey -o tsv) + echo "${NAME},${ENDPOINT},${KEY}" >> "$OUTPUT_FILE" + EXPORTED=$((EXPORTED + 1)) + echo " ✅ $NAME ($EXPORTED/$COUNT)" + done + echo "" + echo "Exported $EXPORTED/$COUNT accounts to $OUTPUT_FILE" + [[ $FAILED -gt 0 ]] && echo "⚠️ $FAILED accounts not found" >&2 + +else + echo "Provide --prefix + --count or --discover" >&2 + exit 1 +fi diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/find-region.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/find-region.sh new file mode 100755 index 000000000000..bb1817d8be33 --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/find-region.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# find-region.sh — Find a region with capacity for all benchmark resources +# +# Usage: +# ./find-region.sh [--preferred westus2] [--vm-size Standard_D16s_v5] [--vm-count 1] \ +# [--cosmos-count 1] [--app-insights-count 1] +# +# Checks the preferred region first. If it fails, tries candidate regions. +# Prints the first valid region name to stdout. Exits 1 if none found. 
+# +# Output (stdout): +# Line 1: region name (or "NONE" if no region found) +# Line 2+: JSON capacity report for the selected region (or summary of failures) + +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PREFERRED="westus2" +VM_SIZE="Standard_D16s_v5" +VM_COUNT=1 +COSMOS_COUNT=1 +APP_INSIGHTS_COUNT=1 + +while [[ $# -gt 0 ]]; do + case $1 in + --preferred) PREFERRED="$2"; shift 2 ;; + --vm-size) VM_SIZE="$2"; shift 2 ;; + --vm-count) VM_COUNT="$2"; shift 2 ;; + --cosmos-count) COSMOS_COUNT="$2"; shift 2 ;; + --app-insights-count) APP_INSIGHTS_COUNT="$2"; shift 2 ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +CANDIDATES=("westus2" "eastus" "eastus2" "westus3" "centralus" "northeurope" "westeurope" "southeastasia" "australiaeast") + +validate_region() { + local region="$1" + bash "$SCRIPT_DIR/validate-capacity.sh" \ + --region "$region" \ + --vm-size "$VM_SIZE" \ + --vm-count "$VM_COUNT" \ + --cosmos-count "$COSMOS_COUNT" \ + --app-insights-count "$APP_INSIGHTS_COUNT" 2>/dev/null +} + +# Try preferred region first +echo "Checking preferred region: $PREFERRED..." >&2 +RESULT=$(validate_region "$PREFERRED") +if [[ $? -eq 0 ]]; then + echo "$PREFERRED" + echo "$RESULT" + exit 0 +fi + +echo "Preferred region $PREFERRED failed capacity checks." >&2 +echo "$RESULT" | python3 -c " +import sys, json +try: + d = json.load(sys.stdin) + for k in ['vm_sku', 'vm_quota', 'cosmos_db', 'app_insights']: + if not d[k]['passed']: + print(f\" ❌ {k}: {d[k]['message']}\", file=sys.stderr) +except: pass +" 2>&2 || true + +echo "" >&2 +echo "Searching candidate regions..." >&2 + +FAILURES="" +for REGION in "${CANDIDATES[@]}"; do + [[ "$REGION" == "$PREFERRED" ]] && continue + echo " Checking $REGION..." >&2 + RESULT=$(validate_region "$REGION") + if [[ $? 
-eq 0 ]]; then + echo "$REGION" + echo "$RESULT" + exit 0 + fi + FAILURES="$FAILURES\n$REGION: $(echo "$RESULT" | python3 -c " +import sys, json +try: + d = json.load(sys.stdin) + fails = [f\"{k}: {d[k]['message']}\" for k in ['vm_sku','vm_quota','cosmos_db','app_insights'] if not d[k]['passed']] + print('; '.join(fails)) +except: print('parse error') +" 2>/dev/null || echo "check error")" +done + +echo "" >&2 +echo "No candidate region has capacity for all resources." >&2 +echo -e "Failures:$FAILURES" >&2 +echo "NONE" +exit 1 diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-all.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-all.sh new file mode 100755 index 000000000000..fb8584d62c3f --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-all.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# provision-all.sh — Orchestrate parallel creation of all benchmark resources +# +# Usage: +# ./provision-all.sh --config-dir --region --rg \ +# [--cosmos-prefix ] [--cosmos-count ] [--cosmos-consistency Session] \ +# [--app-insights-name ] \ +# [--vm-name ] [--vm-size Standard_D16s_v5] [--vm-disk-size 256] \ +# [--create-key] [--skip-vm-setup] +# +# Creates resource group, then launches Cosmos DB, App Insights, and VM creation +# in parallel. Waits for all to complete. Exports credentials. Runs verify-resources.sh. 
+# +# Exit codes: +# 0 — all resources provisioned and verified +# 1 — one or more resources failed + +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +# Required +CONFIG_DIR="" +REGION="westus2" +RG="rg-cosmos-benchmark-$(date +%Y%m%d)" + +# Cosmos DB +COSMOS_PREFIX="cosmos-bench-" +COSMOS_COUNT=1 +COSMOS_CONSISTENCY="Session" + +# App Insights +APP_INSIGHTS_NAME="cosmos-bench-ai" + +# VM +VM_NAME="vm-benchmark-01" +VM_SIZE="Standard_D16s_v5" +VM_DISK_SIZE=256 +CREATE_KEY=true +SKIP_VM_SETUP=false + +while [[ $# -gt 0 ]]; do + case $1 in + --config-dir) CONFIG_DIR="$2"; shift 2 ;; + --region) REGION="$2"; shift 2 ;; + --rg) RG="$2"; shift 2 ;; + --cosmos-prefix) COSMOS_PREFIX="$2"; shift 2 ;; + --cosmos-count) COSMOS_COUNT="$2"; shift 2 ;; + --cosmos-consistency) COSMOS_CONSISTENCY="$2"; shift 2 ;; + --app-insights-name) APP_INSIGHTS_NAME="$2"; shift 2 ;; + --vm-name) VM_NAME="$2"; shift 2 ;; + --vm-size) VM_SIZE="$2"; shift 2 ;; + --vm-disk-size) VM_DISK_SIZE="$2"; shift 2 ;; + --create-key) CREATE_KEY=true; shift ;; + --skip-vm-setup) SKIP_VM_SETUP=true; shift ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +if [[ -z "$CONFIG_DIR" ]]; then + echo "ERROR: --config-dir is required" >&2 + exit 1 +fi + +mkdir -p "$CONFIG_DIR" +LOG_DIR="$CONFIG_DIR/logs" +mkdir -p "$LOG_DIR" + +echo "=============================================" +echo " Provisioning Benchmark Resources" +echo "=============================================" +echo " Config dir: $CONFIG_DIR" +echo " Region: $REGION" +echo " Resource group: $RG" +echo " Cosmos DB: $COSMOS_COUNT account(s), prefix=$COSMOS_PREFIX" +echo " App Insights: $APP_INSIGHTS_NAME" +echo " VM: $VM_NAME ($VM_SIZE)" +echo "=============================================" +echo "" + +# --- Step 1: Create resource group (must complete before parallel creation) --- +echo "[1/4] Creating resource group: $RG in $REGION" +az group create --name "$RG" --location "$REGION" -o none 2>&1 | tee "$LOG_DIR/rg.log" 
+echo "" + +# --- Step 2: Launch parallel resource creation --- +echo "[2/4] Creating resources in parallel..." +echo "" + +COSMOS_PID="" +AI_PID="" +VM_PID="" + +# Cosmos DB accounts (background) +echo " Starting: Cosmos DB ($COSMOS_COUNT accounts)..." +( + bash "$SCRIPT_DIR/create-cosmos-accounts.sh" \ + --rg "$RG" --prefix "$COSMOS_PREFIX" --count "$COSMOS_COUNT" \ + --region "$REGION" --consistency "$COSMOS_CONSISTENCY" \ + --log "$LOG_DIR/cosmos-accounts.log" \ + && echo "COSMOS_SUCCESS" > "$LOG_DIR/cosmos-status" \ + || echo "COSMOS_FAILED" > "$LOG_DIR/cosmos-status" +) & +COSMOS_PID=$! + +# App Insights (background) +echo " Starting: Application Insights ($APP_INSIGHTS_NAME)..." +( + az monitor app-insights component create \ + --app "$APP_INSIGHTS_NAME" \ + --location "$REGION" \ + --resource-group "$RG" \ + --kind web --application-type web \ + -o none 2>&1 | tee "$LOG_DIR/app-insights.log" + + az monitor app-insights component show \ + --app "$APP_INSIGHTS_NAME" \ + --resource-group "$RG" \ + --query connectionString -o tsv > "$CONFIG_DIR/app-insights-connection-string.txt" 2>>"$LOG_DIR/app-insights.log" + + if [[ -s "$CONFIG_DIR/app-insights-connection-string.txt" ]]; then + echo "AI_SUCCESS" > "$LOG_DIR/ai-status" + else + echo "AI_FAILED" > "$LOG_DIR/ai-status" + fi +) & +AI_PID=$! + +# VM (background) +echo " Starting: VM ($VM_NAME)..." +VM_EXTRA_FLAGS="" +[[ "$CREATE_KEY" == "true" ]] && VM_EXTRA_FLAGS="$VM_EXTRA_FLAGS --create-key" +[[ "$SKIP_VM_SETUP" == "true" ]] && VM_EXTRA_FLAGS="$VM_EXTRA_FLAGS --skip-setup" +( + bash "$SCRIPT_DIR/provision-benchmark-vm.sh" \ + --new --location "$REGION" \ + --rg "$RG" --vm-name "$VM_NAME" \ + --size "$VM_SIZE" --disk-size "$VM_DISK_SIZE" \ + --config-dir "$CONFIG_DIR" \ + $VM_EXTRA_FLAGS \ + 2>&1 | tee "$LOG_DIR/vm.log" + + if [[ -f "$CONFIG_DIR/vm-ip" ]]; then + echo "VM_SUCCESS" > "$LOG_DIR/vm-status" + else + echo "VM_FAILED" > "$LOG_DIR/vm-status" + fi +) & +VM_PID=$! 
+ +echo "" +echo " PIDs: Cosmos=$COSMOS_PID, AppInsights=$AI_PID, VM=$VM_PID" +echo "" + +# --- Step 3: Wait for all to complete --- +echo "[3/4] Waiting for all resources to complete..." +echo "" + +OVERALL_OK=true + +wait $COSMOS_PID +COSMOS_EXIT=$? +COSMOS_STATUS=$(cat "$LOG_DIR/cosmos-status" 2>/dev/null || echo "COSMOS_UNKNOWN") +if [[ "$COSMOS_STATUS" == "COSMOS_SUCCESS" ]]; then + echo " ✅ Cosmos DB: $COSMOS_COUNT account(s) created" +else + echo " ❌ Cosmos DB: creation failed (exit=$COSMOS_EXIT). See $LOG_DIR/cosmos-accounts.log" + OVERALL_OK=false +fi + +wait $AI_PID +AI_EXIT=$? +AI_STATUS=$(cat "$LOG_DIR/ai-status" 2>/dev/null || echo "AI_UNKNOWN") +if [[ "$AI_STATUS" == "AI_SUCCESS" ]]; then + echo " ✅ App Insights: $APP_INSIGHTS_NAME created" +else + echo " ❌ App Insights: creation failed (exit=$AI_EXIT). See $LOG_DIR/app-insights.log" + OVERALL_OK=false +fi + +wait $VM_PID +VM_EXIT=$? +VM_STATUS=$(cat "$LOG_DIR/vm-status" 2>/dev/null || echo "VM_UNKNOWN") +if [[ "$VM_STATUS" == "VM_SUCCESS" ]]; then + echo " ✅ VM: $VM_NAME created" +else + echo " ❌ VM: creation failed (exit=$VM_EXIT). See $LOG_DIR/vm.log" + OVERALL_OK=false +fi + +echo "" + +if [[ "$OVERALL_OK" == "false" ]]; then + echo "❌ One or more resources failed to create. Check logs in $LOG_DIR/" + exit 1 +fi + +# --- Step 3b: Export Cosmos DB credentials (must happen after accounts are created) --- +echo " Exporting Cosmos DB credentials..." +bash "$SCRIPT_DIR/export-cosmos-credentials.sh" \ + --rg "$RG" --prefix "$COSMOS_PREFIX" --count "$COSMOS_COUNT" \ + --config-dir "$CONFIG_DIR" 2>&1 | tee "$LOG_DIR/export-credentials.log" + +if [[ ! -s "$CONFIG_DIR/clientHostAndKey.txt" ]]; then + echo " ❌ Credential export failed. See $LOG_DIR/export-credentials.log" + exit 1 +fi +echo "" + +# --- Step 4: Verify all resources --- +echo "[4/4] Verifying all resources..." +echo "" +bash "$SCRIPT_DIR/verify-resources.sh" \ + --config-dir "$CONFIG_DIR" \ + --cosmos-count "$COSMOS_COUNT" + +exit $? 
diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-benchmark-vm.sh old mode 100644 new mode 100755 similarity index 70% rename from sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh rename to sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-benchmark-vm.sh index c3d18a9ca7a5..36a3d2686f62 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-provision/scripts/provision-benchmark-vm.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-benchmark-vm.sh @@ -3,16 +3,18 @@ # See §6.5 of the test plan for full details. # # Usage: -# ./provision-benchmark-vm.sh --new --location eastus [--create-key] [--ssh-key ] [options] -# ./provision-benchmark-vm.sh --existing --ip --user --key -# ./provision-benchmark-vm.sh --existing --rg --vm-name --key +# ./provision-benchmark-vm.sh --new --location westus2 [--create-key] [--ssh-key ] [--config-dir ] [options] +# ./provision-benchmark-vm.sh --existing --ip --user --key [--config-dir ] +# ./provision-benchmark-vm.sh --existing --rg --vm-name --key [--config-dir ] +# +# Config directory: where VM connection info is saved. Required. 
set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" MODE="" -LOCATION="eastus" -RG="rg-cosmos-benchmark" +LOCATION="westus2" +RG="rg-cosmos-benchmark-$(date +%Y%m%d)" VM_NAME="vm-benchmark-01" VM_SIZE="Standard_D16s_v5" VM_IP="" @@ -23,6 +25,7 @@ CREATE_KEY=false CREATE_KEY_PATH="" DISK_SIZE=256 SETUP_AFTER_CREATE=true +CONFIG_DIR="" while [[ $# -gt 0 ]]; do case $1 in @@ -44,6 +47,7 @@ while [[ $# -gt 0 ]]; do ;; --disk-size) DISK_SIZE="$2"; shift ;; --skip-setup) SETUP_AFTER_CREATE=false ;; + --config-dir) CONFIG_DIR="$2"; shift ;; *) echo "Unknown option: $1"; exit 1 ;; esac shift @@ -96,8 +100,23 @@ if [[ "$MODE" == "new" ]]; then echo "VM created. IP: $VM_IP" if [[ "$SETUP_AFTER_CREATE" == "true" ]]; then - echo "=== Running setup script ===" - $(ssh_cmd) -o StrictHostKeyChecking=no "${SSH_USER}@${VM_IP}" 'bash -s' < "$SCRIPT_DIR/setup-benchmark-vm.sh" + echo "=== Installing tools on VM ===" + $(ssh_cmd) -o StrictHostKeyChecking=no "${SSH_USER}@${VM_IP}" 'bash -s' << 'SETUP_SCRIPT' +set -euo pipefail +echo "=== Installing JDK, Maven, tools ===" +sudo apt-get update && sudo apt-get install -y openjdk-21-jdk git net-tools iproute2 sysstat procps tmux + +wget -q https://dlcdn.apache.org/maven/maven-3/3.9.12/binaries/apache-maven-3.9.12-bin.tar.gz -O /tmp/maven.tar.gz +sudo tar -xzf /tmp/maven.tar.gz -C /opt/ +sudo ln -sf /opt/apache-maven-3.9.12/bin/mvn /usr/local/bin/mvn + +wget -qO /tmp/async-profiler.tar.gz \ + https://github.com/async-profiler/async-profiler/releases/download/v3.0/async-profiler-3.0-linux-x64.tar.gz +sudo tar -xzf /tmp/async-profiler.tar.gz -C /opt/ +echo 'export PATH=$PATH:/opt/apache-maven-3.9.12/bin:/opt/async-profiler-3.0-linux-x64/bin' >> ~/.bashrc + +echo "=== VM tool setup complete ===" +SETUP_SCRIPT fi elif [[ "$MODE" == "existing" ]]; then @@ -111,13 +130,18 @@ else exit 1 fi -echo "$VM_IP" > benchmark-config/vm-ip -echo "$SSH_USER" > benchmark-config/vm-user -echo "$SSH_PRIVATE_KEY" > benchmark-config/vm-key +if [[ -z 
"$CONFIG_DIR" ]]; then + echo "ERROR: --config-dir is required" >&2 + exit 1 +fi + +mkdir -p "$CONFIG_DIR" +echo "$VM_IP" > "$CONFIG_DIR/vm-ip" +echo "$SSH_USER" > "$CONFIG_DIR/vm-user" +echo "$SSH_PRIVATE_KEY" > "$CONFIG_DIR/vm-key" -# Also save to benchmark-config/ for organized access -mkdir -p benchmark-config -echo "VM_IP=$VM_IP" > benchmark-config/vm-config.env -echo "VM_USER=$SSH_USER" >> benchmark-config/vm-config.env -echo "VM_KEY_PATH=$SSH_PRIVATE_KEY" >> benchmark-config/vm-config.env +echo "VM_IP=$VM_IP" > "$CONFIG_DIR/vm-config.env" +echo "VM_USER=$SSH_USER" >> "$CONFIG_DIR/vm-config.env" +echo "VM_KEY_PATH=$SSH_PRIVATE_KEY" >> "$CONFIG_DIR/vm-config.env" echo "=== Ready: $(ssh_cmd) ${SSH_USER}@${VM_IP} ===" +echo "=== Config saved to: $CONFIG_DIR ===" diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/validate-capacity.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/validate-capacity.sh new file mode 100755 index 000000000000..74fa6b25857a --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/validate-capacity.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# validate-capacity.sh — Check whether a region has capacity for benchmark resources +# +# Usage: +# ./validate-capacity.sh --region [--vm-size Standard_D16s_v5] [--vm-count 1] \ +# [--cosmos-count 1] [--app-insights-count 1] +# +# Exit codes: +# 0 — all checks passed +# 1 — one or more checks failed +# +# Output: JSON summary to stdout with per-check results + +set -uo pipefail + +REGION="westus2" +VM_SIZE="Standard_D16s_v5" +VM_COUNT=1 +COSMOS_COUNT=1 +APP_INSIGHTS_COUNT=1 + +while [[ $# -gt 0 ]]; do + case $1 in + --region) REGION="$2"; shift 2 ;; + --vm-size) VM_SIZE="$2"; shift 2 ;; + --vm-count) VM_COUNT="$2"; shift 2 ;; + --cosmos-count) COSMOS_COUNT="$2"; shift 2 ;; + --app-insights-count) APP_INSIGHTS_COUNT="$2"; shift 2 ;; + *) echo "Unknown option: 
$1" >&2; exit 1 ;; + esac +done + +PASS=true +VM_SKU_OK=false +VM_QUOTA_OK=false +COSMOS_OK=false +APP_INSIGHTS_OK=false +VM_SKU_MSG="" +VM_QUOTA_MSG="" +COSMOS_MSG="" +APP_INSIGHTS_MSG="" +AVAILABLE_VCPUS=0 +COSMOS_AVAILABLE=0 +APP_INSIGHTS_AVAILABLE=0 +SUGGESTED_SKU="" + +# --- 1. Resource providers --- +for NS in Microsoft.Compute Microsoft.DocumentDB Microsoft.Insights; do + STATE=$(az provider show --namespace "$NS" --query "registrationState" -o tsv 2>/dev/null || echo "Unknown") + if [[ "$STATE" != "Registered" ]]; then + echo "Registering $NS..." >&2 + az provider register --namespace "$NS" 2>/dev/null + fi +done + +# --- 2. VM SKU availability --- +# Check if the exact SKU is available (no restrictions) +RESTRICTED=$(az vm list-skus --location "$REGION" --size "$VM_SIZE" \ + --query "[?restrictions[?reasonCode=='NotAvailableForSubscription']] | length(@)" -o tsv 2>/dev/null || echo "0") +TOTAL_SKUS=$(az vm list-skus --location "$REGION" --size "$VM_SIZE" \ + --query "length(@)" -o tsv 2>/dev/null || echo "0") + +if [[ "$TOTAL_SKUS" -gt 0 && "$RESTRICTED" -eq 0 ]]; then + VM_SKU_OK=true + VM_SKU_MSG="$VM_SIZE available" +else + VM_SKU_MSG="$VM_SIZE not available" + PASS=false + # Try to find a similar SKU in the same family + # Extract family prefix (e.g., Standard_D16s_v5 -> standardDSv5Family, also try v4) + FAMILY_BASE=$(echo "$VM_SIZE" | sed -E 's/Standard_D([0-9]+)s_v[0-9]+/standardDSv/') + VCPUS_NEEDED=$(echo "$VM_SIZE" | sed -E 's/Standard_D([0-9]+)s_v[0-9]+/\1/') + for VER in 5 4 3; do + FAMILY="${FAMILY_BASE}${VER}Family" + ALT=$(az vm list-skus --location "$REGION" \ + --query "[?family=='$FAMILY' && restrictions[0]==null].name" -o tsv 2>/dev/null | head -5) + if [[ -n "$ALT" ]]; then + # Find a SKU with matching vCPU count + for SKU in $ALT; do + SKU_VCPUS=$(az vm list-skus --location "$REGION" --size "$SKU" \ + --query "[0].capabilities[?name=='vCPUs'].value | [0]" -o tsv 2>/dev/null || echo "0") + if [[ "$SKU_VCPUS" == "$VCPUS_NEEDED" ]]; then 
+ SUGGESTED_SKU="$SKU" + break 2 + fi + done + fi + done + if [[ -n "$SUGGESTED_SKU" ]]; then + VM_SKU_MSG="$VM_SIZE not available; similar SKU: $SUGGESTED_SKU" + fi +fi + +# --- 3. VM vCPU quota --- +# Parse "Standard DSv5 Family vCPUs" usage line +VCPUS_PER_VM=$(echo "$VM_SIZE" | sed -E 's/Standard_D([0-9]+)s_v[0-9]+/\1/') +VCPUS_NEEDED=$((VCPUS_PER_VM * VM_COUNT)) + +USAGE_LINE=$(az vm list-usage --location "$REGION" -o tsv 2>/dev/null | grep -i "Standard DSv5 Family" || echo "") +if [[ -n "$USAGE_LINE" ]]; then + CURRENT_USAGE=$(echo "$USAGE_LINE" | awk '{print $1}') + LIMIT=$(echo "$USAGE_LINE" | awk '{print $2}') + AVAILABLE_VCPUS=$((LIMIT - CURRENT_USAGE)) + if [[ $AVAILABLE_VCPUS -ge $VCPUS_NEEDED ]]; then + VM_QUOTA_OK=true + VM_QUOTA_MSG="$AVAILABLE_VCPUS vCPUs available (need $VCPUS_NEEDED)" + else + VM_QUOTA_MSG="Only $AVAILABLE_VCPUS vCPUs available (need $VCPUS_NEEDED)" + PASS=false + fi +else + # No usage line found — could mean zero usage or family not available + VM_QUOTA_OK=true + VM_QUOTA_MSG="No existing usage found; quota assumed available" + AVAILABLE_VCPUS=999 +fi + +# --- 4. Cosmos DB account quota --- +CURRENT_COSMOS=$(az cosmosdb list --query "length(@)" -o tsv 2>/dev/null || echo "0") +COSMOS_LIMIT=50 +COSMOS_AVAILABLE=$((COSMOS_LIMIT - CURRENT_COSMOS)) +if [[ $COSMOS_AVAILABLE -ge $COSMOS_COUNT ]]; then + COSMOS_OK=true + COSMOS_MSG="$COSMOS_AVAILABLE slots available (need $COSMOS_COUNT)" +else + COSMOS_MSG="Only $COSMOS_AVAILABLE slots available (need $COSMOS_COUNT, limit $COSMOS_LIMIT)" + PASS=false +fi + +# --- 5. 
Application Insights quota --- +CURRENT_AI=$(az monitor app-insights component list \ + --query "[?location=='$REGION'] | length(@)" -o tsv 2>/dev/null || echo "0") +AI_LIMIT=200 +AI_AVAILABLE=$((AI_LIMIT - CURRENT_AI)) +if [[ $AI_AVAILABLE -ge $APP_INSIGHTS_COUNT ]]; then + APP_INSIGHTS_OK=true + APP_INSIGHTS_MSG="$AI_AVAILABLE slots available (need $APP_INSIGHTS_COUNT)" +else + APP_INSIGHTS_MSG="Only $AI_AVAILABLE slots available (need $APP_INSIGHTS_COUNT, limit $AI_LIMIT)" + PASS=false +fi + +# --- Output JSON summary --- +cat < [--cosmos-count ] +# +# Checks: +# 1. Config directory exists with expected files +# 2. clientHostAndKey.txt has the expected number of entries +# 3. App Insights connection string is non-empty +# 4. VM is SSH-reachable and has JDK + Maven installed +# +# Exit codes: +# 0 — all checks passed, ready to proceed +# 1 — one or more checks failed + +set -uo pipefail + +CONFIG_DIR="" +COSMOS_COUNT=0 # 0 means skip count validation, just check file exists + +while [[ $# -gt 0 ]]; do + case $1 in + --config-dir) CONFIG_DIR="$2"; shift 2 ;; + --cosmos-count) COSMOS_COUNT="$2"; shift 2 ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +if [[ -z "$CONFIG_DIR" ]]; then + echo "Usage: $0 --config-dir [--cosmos-count N]" >&2 + exit 1 +fi + +PASS=true +CHECKS="" + +check() { + local name="$1" result="$2" msg="$3" + if [[ "$result" == "true" ]]; then + CHECKS="$CHECKS\n ✅ $name: $msg" + else + CHECKS="$CHECKS\n ❌ $name: $msg" + PASS=false + fi +} + +# --- 1. Config directory --- +if [[ -d "$CONFIG_DIR" ]]; then + check "Config directory" "true" "$CONFIG_DIR exists" +else + check "Config directory" "false" "$CONFIG_DIR not found" + echo -e "$CHECKS" + exit 1 +fi + +# --- 2. 
Cosmos DB credentials --- +if [[ -f "$CONFIG_DIR/clientHostAndKey.txt" ]]; then + LINE_COUNT=$(wc -l < "$CONFIG_DIR/clientHostAndKey.txt" | tr -d ' ') + if [[ "$COSMOS_COUNT" -gt 0 && "$LINE_COUNT" -ne "$COSMOS_COUNT" ]]; then + check "Cosmos DB credentials" "false" "Expected $COSMOS_COUNT entries, found $LINE_COUNT" + else + check "Cosmos DB credentials" "true" "$LINE_COUNT account(s) in clientHostAndKey.txt" + fi +else + check "Cosmos DB credentials" "false" "clientHostAndKey.txt not found" +fi + +# --- 3. App Insights connection string --- +if [[ -f "$CONFIG_DIR/app-insights-connection-string.txt" ]]; then + AI_CONN=$(cat "$CONFIG_DIR/app-insights-connection-string.txt" | tr -d '[:space:]') + if [[ -n "$AI_CONN" ]]; then + check "App Insights" "true" "Connection string present" + else + check "App Insights" "false" "Connection string file is empty" + fi +else + check "App Insights" "false" "app-insights-connection-string.txt not found" +fi + +# --- 4. VM connection files --- +VM_FILES_OK=true +for F in vm-ip vm-user vm-key; do + if [[ ! -f "$CONFIG_DIR/$F" ]]; then + VM_FILES_OK=false + fi +done + +if [[ "$VM_FILES_OK" == "true" ]]; then + check "VM config files" "true" "vm-ip, vm-user, vm-key present" + + # --- 5. 
VM SSH connectivity + tools --- + VM_IP=$(cat "$CONFIG_DIR/vm-ip") + VM_USER=$(cat "$CONFIG_DIR/vm-user") + VM_KEY=$(cat "$CONFIG_DIR/vm-key") + + SSH_OUTPUT=$(ssh -i "$VM_KEY" -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ + "$VM_USER@$VM_IP" \ + 'echo "SSH_OK"; java -version 2>&1 | head -1; /opt/apache-maven-3.9.12/bin/mvn --version 2>&1 | head -1' 2>/dev/null || echo "SSH_FAILED") + + if echo "$SSH_OUTPUT" | grep -q "SSH_OK"; then + check "VM SSH" "true" "Reachable at $VM_IP" + + if echo "$SSH_OUTPUT" | grep -qi "openjdk"; then + JDK_VER=$(echo "$SSH_OUTPUT" | grep -i "openjdk" | head -1) + check "VM JDK" "true" "$JDK_VER" + else + check "VM JDK" "false" "JDK not found on VM" + fi + + if echo "$SSH_OUTPUT" | grep -qi "maven"; then + MVN_VER=$(echo "$SSH_OUTPUT" | grep -i "maven" | head -1) + check "VM Maven" "true" "$MVN_VER" + else + check "VM Maven" "false" "Maven not found on VM" + fi + else + check "VM SSH" "false" "Cannot reach $VM_IP" + fi +else + check "VM config files" "false" "Missing one or more of: vm-ip, vm-user, vm-key" +fi + +# --- Output --- +echo "" +echo "=== Resource Verification ===" +echo " Config: $CONFIG_DIR" +echo -e "$CHECKS" +echo "" + +if [[ "$PASS" == "true" ]]; then + echo "✅ All checks passed — ready to proceed to benchmark setup." + exit 0 +else + echo "❌ Some checks failed — fix issues before proceeding." + exit 1 +fi diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/SKILL.md deleted file mode 100644 index d172db221b47..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/SKILL.md +++ /dev/null @@ -1,178 +0,0 @@ ---- -name: cosmos-benchmark-setup -description: Set up the benchmark execution environment — install tools on VM, clone repo at a specific branch/PR/commit/tag, generate tenants.json, copy files, build the benchmark JAR, and verify readiness. 
Auto-detects VM from provision output. Triggers on "setup benchmark", "install JDK", "create tenants.json", "create config", "copy files to VM", "verify build", "clone repo on VM". ---- - -# Benchmark Environment Setup - -Prepare the execution environment after infrastructure is provisioned. - -## VM Connection - -Auto-detect VM connection info from provision output: - -```bash -VM_IP=$(cat benchmark-config/vm-ip) -VM_USER=$(cat benchmark-config/vm-user) -VM_KEY=$(cat benchmark-config/vm-key) -SSH_CMD="ssh -i $VM_KEY $VM_USER@$VM_IP" -``` - -If `benchmark-config/vm-ip` doesn't exist, ask the user for VM IP and SSH credentials. - -All remote commands below use `$SSH_CMD` as shorthand. - -## 1. Install Dependencies on VM - -Run the setup script: - -```bash -$SSH_CMD 'bash -s' < copilot/skills/cosmos-benchmark-setup/scripts/setup-benchmark-vm.sh -``` - -### What gets installed - -| Component | Version | Location | -|-----------|---------|----------| -| OpenJDK | 21 | System package | -| Maven | 3.9.12 | `/opt/apache-maven-3.9.12` | -| async-profiler | 3.0 | `/opt/async-profiler-3.0-linux-x64` | -| git, net-tools, sysstat, tmux | latest | System packages | - -### Verify - -```bash -$SSH_CMD 'java -version 2>&1 | head -1; /opt/apache-maven-3.9.12/bin/mvn --version 2>&1 | head -1; tmux -V; df -h / | tail -1' -``` - -## 2. Clone/Update Repo on VM - -### From a branch - -```bash -$SSH_CMD "git clone --depth 1 -b ~/azure-sdk-for-java" -``` - -If already cloned: -```bash -$SSH_CMD "cd ~/azure-sdk-for-java && git fetch --depth 1 origin && git checkout && git pull origin " -``` - -### From a PR number - -```bash -$SSH_CMD "cd ~/azure-sdk-for-java && git fetch origin pull//head:pr- && git checkout pr-" -``` - -### From a commit SHA - -```bash -$SSH_CMD "cd ~/azure-sdk-for-java && git fetch origin && git checkout " -``` - -### From a tag - -```bash -$SSH_CMD "cd ~/azure-sdk-for-java && git fetch --tags origin && git checkout tags/" -``` - -Ask the user which ref type they want. 
Default to branch if unspecified. - -## 3. Generate Benchmark Configuration - -### Multi-tenant mode (tenants.json) - -If `clientHostAndKey.txt` exists (created by provision skill), generate `tenants.json`: - -#### Sample tenants.json template - -```json -{ - "globalDefaults": { - "connectionMode": "GATEWAY", - "consistencyLevel": "SESSION", - "concurrency": "20", - "numberOfOperations": "100000", - "operation": "ReadThroughput", - "numberOfPreCreatedDocuments": "1000", - "connectionSharingAcrossClientsEnabled": "false", - "maxConnectionPoolSize": "1000", - "applicationName": "cosmos-bench" - }, - "tenants": [] -} -``` - -#### Generation steps - -1. Read `clientHostAndKey.txt` — each line: `,,` -2. For each line, create a tenant entry: - ```json - { "id": "tenant-", "serviceEndpoint": "", "masterKey": "", "databaseId": "benchdb", "containerId": "benchcol" } - ``` -3. Ask user if they want to customize `globalDefaults` (operation type, concurrency, connection mode, etc.) or use the template defaults. -4. Write to `sdk/cosmos/azure-cosmos-benchmark/tenants.json` -5. Verify: parse output, confirm tenant count, first/last endpoints. - -### Single-tenant mode - -No config file needed. The **run** skill will construct CLI flags directly: -``` --serviceEndpoint -masterKey -databaseId benchdb -containerId benchcol -``` - -### Gitignore - -Ensure `clientHostAndKey.txt` and `tenants.json` are in `.gitignore` — they contain secrets. - -## 4. Copy Config Files to VM - -```bash -scp -i $VM_KEY -o StrictHostKeyChecking=no tenants.json $VM_USER@$VM_IP:~/tenants.json -``` - -## 5. Build the Benchmark JAR - -All builds run in a tmux session to survive SSH disconnection. 
- -Common Maven flags: `MAVEN_FLAGS="-e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true -Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true"` - -```bash -$SSH_CMD "tmux new-session -d -s build 'export PATH=/opt/apache-maven-3.9.12/bin:\$PATH && \ - cd ~/azure-sdk-for-java/sdk/cosmos && \ - mvn $MAVEN_FLAGS -pl ,azure-cosmos -am clean install && \ - mvn $MAVEN_FLAGS -pl ,azure-cosmos-test clean install && \ - mvn $MAVEN_FLAGS -pl ,azure-cosmos-encryption clean install && \ - mvn $MAVEN_FLAGS -pl ,azure-cosmos-benchmark clean package -P package-assembly && \ - echo BUILD_COMPLETE'" -``` - -Monitor build progress: -```bash -$SSH_CMD "tmux capture-pane -t build -p | tail -20" -``` - -## 6. Verify Readiness - -```bash -$SSH_CMD "echo '=== VM Check ==='; \ - java -version 2>&1 | head -1; \ - /opt/apache-maven-3.9.12/bin/mvn --version 2>&1 | head -1; \ - df -h / | tail -1; \ - ls ~/tenants.json 2>/dev/null && echo 'Config: ✅' || echo 'Config: ❌ MISSING'; \ - ls ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/target/*jar-with-dependencies.jar 2>/dev/null \ - && echo 'JAR: ✅' || echo 'JAR: ❌ MISSING'; \ - cd ~/azure-sdk-for-java && echo \"Branch: \$(git rev-parse --abbrev-ref HEAD) Commit: \$(git rev-parse --short HEAD)\"" -``` - -Checklist: -- ✅ JDK 21 installed -- ✅ Maven 3.8.1+ installed -- ✅ Repo cloned (correct branch/PR/commit) -- ✅ Benchmark JAR built -- ✅ Config file present (tenants.json or single-tenant credentials) -- ✅ Disk space >10 GB free - -## After Setup - -Suggest: "Ready to run. Use the **cosmos-benchmark-run** skill to start a benchmark." 
diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/scripts/setup-benchmark-vm.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/scripts/setup-benchmark-vm.sh deleted file mode 100644 index 8fbb6c6cf148..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup/scripts/setup-benchmark-vm.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# setup-benchmark-vm.sh — Run once after VM creation to install dependencies -# See §6.3 of the test plan. - -set -euo pipefail - -echo "=== Setting up benchmark VM ===" - -# JDK + networking tools -sudo apt-get update && sudo apt-get install -y openjdk-21-jdk git net-tools iproute2 sysstat procps tmux - -# Maven 3.9+ (Ubuntu 22.04 apt provides 3.6.3 which is too old for SDK plugins) -wget -q https://dlcdn.apache.org/maven/maven-3/3.9.12/binaries/apache-maven-3.9.12-bin.tar.gz -O /tmp/maven.tar.gz -sudo tar -xzf /tmp/maven.tar.gz -C /opt/ -sudo ln -sf /opt/apache-maven-3.9.12/bin/mvn /usr/local/bin/mvn -export PATH=/opt/apache-maven-3.9.12/bin:$PATH - -# Async-profiler -wget -qO /tmp/async-profiler.tar.gz \ - https://github.com/async-profiler/async-profiler/releases/download/v3.0/async-profiler-3.0-linux-x64.tar.gz -sudo tar -xzf /tmp/async-profiler.tar.gz -C /opt/ -echo 'export PATH=$PATH:/opt/async-profiler-3.0-linux-x64/bin' >> ~/.bashrc - -# Clone SDK -git clone https://github.com/Azure/azure-sdk-for-java.git ~/azure-sdk-for-java -cd ~/azure-sdk-for-java - -# Build benchmark module (must run from sdk/cosmos) -cd sdk/cosmos -mvn -e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true -Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true -pl ,azure-cosmos -am clean install -mvn -e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true -Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true -pl ,azure-cosmos-test clean install -mvn -e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true 
-Dcodesnippet.skip=true -Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true -pl ,azure-cosmos-encryption clean install -mvn -e -DskipTests -Dgpg.skip -Dmaven.javadoc.skip=true -Dcodesnippet.skip=true -Dspotbugs.skip=true -Dcheckstyle.skip=true -Drevapi.skip=true -pl ,azure-cosmos-benchmark clean package -P package-assembly - -echo "=== Setup complete ===" -echo "Next: Set APPLICATIONINSIGHTS_CONNECTION_STRING and copy tenants.json from benchmark-config/ to the VM" diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-status/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-status/SKILL.md deleted file mode 100644 index b4178ed22119..000000000000 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-status/SKILL.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -name: cosmos-benchmark-status -description: Show the current state of the Cosmos DB benchmark environment — Azure resources, recent runs, VM status, build status, config, and App Insights health. Use when the user asks "what runs do I have", "benchmark status", "is the VM up", "show recent results", "list accounts", or wants an overview before starting a benchmark session. ---- - -# Benchmark Environment Status - -Check and report the current state of the entire benchmark environment. - -## Checks - -1. **Azure Resources** (requires `az` CLI): - - Cosmos DB accounts: `az cosmosdb list -g rg-cosmos-benchmark --query "[].{name:name, endpoint:documentEndpoint}" -o table` - - Application Insights: `az monitor app-insights component list -g rg-cosmos-benchmark --query "[].{name:name, connectionString:connectionString}" -o table` - - VMs: `az vm list -g rg-cosmos-benchmark -d --query "[].{name:name, ip:publicIps, state:powerState}" -o table` - -2. 
**App Insights Health**: Verify metrics are being received: - ```bash - az monitor app-insights query \ - --app \ - --resource-group rg-cosmos-benchmark \ - --analytics-query "customMetrics | where timestamp > ago(1h) | summarize count() by bin(timestamp, 5m) | order by timestamp desc | take 5" - ``` - If count > 0, metrics are flowing. If empty, App Insights may not be configured or the benchmark hasn't reported yet. - -3. **Recent results**: List directories under `sdk/cosmos/azure-cosmos-benchmark/results/` and `./results/`. - - For each: check if `monitor.csv` exists (📊 = complete, ❌ = incomplete) - - Read `git-info.json` for branch/commit if present - - Show most recent 5–10 runs - -4. **Benchmark VM**: Check for `benchmark-config/vm-ip` file in workspace root. - - If found, SSH to verify: `ssh -i $(cat benchmark-config/vm-key) $(cat benchmark-config/vm-user)@$(cat benchmark-config/vm-ip) "echo OK; uptime; java -version 2>&1 | head -1; ls ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/ 2>/dev/null | tail -5"` - - Report: reachable/unreachable, uptime, JDK version, runs on VM - -5. **Build status**: Check for `sdk/cosmos/azure-cosmos-benchmark/target/azure-cosmos-benchmark-*-jar-with-dependencies.jar`. - - Report: found (with file timestamp) or "Not built" - -6. **Config**: Check for `tenants.json` in workspace root and `sdk/cosmos/azure-cosmos-benchmark/`. 
- - Report: found/not found, count of tenants if parseable - -## Output Format - -``` -☁️ Azure Resources: - Cosmos DB: 3 accounts (cosmosdb-bench-0, -1, -2) - App Insights: ✅ cosmos-bench-ai (receiving metrics: 142 events/5min) - VMs: ✅ vm-benchmark-01 (running, 4.154.169.45) - -📊 Recent Runs (local): - 📊 20260226-CHURN-fix-leak/ branch: fix-leak commit: abc1234 - 📊 20260225-CHURN-main-base/ branch: main commit: def5678 - ❌ 20260226-CHURN-experiment/ (incomplete) - -📊 Recent Runs (VM): - 📊 20260226-CHURN-fix-leak/ - 📊 20260225-CHURN-main-base/ - -🖥️ Benchmark VM: ✅ 4.154.169.45 (up 3 days, JDK 21.0.10) -🔨 Build: ✅ JAR found (2026-02-26 14:30) -📋 Config: ✅ tenants.json (3 tenants) -``` From ada2942e48d51805fdd9fbc1be82e4c8a7272988 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 15:00:24 -0800 Subject: [PATCH 05/22] Improve benchmark capacity validation and region finder scripts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add timestamped progress logging to validate-capacity.sh - Fix restriction detection to handle all types (Zone, NotAvailableForSubscription) - Replace slow per-SKU API calls with single-call alternative SKU search - Add --find-alternatives flag to control similar SKU search - Add restriction_reason field to JSON output - Derive quota family dynamically from effective SKU - Add --fallback-regions flag to find-region.sh for user-specified regions - Implement 4-phase search: preferred exact → preferred similar → fallback exact → fallback similar - Add [N/M] progress updates printed as each region completes - Add --stop-on-first flag (default: true) - Fix integration bugs: JSON path, exit code logic, stdin-based parsing - Update SKILL.md to document new flags and search strategy Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../cosmos-benchmark-setup-resources/SKILL.md | 47 ++-- .../scripts/find-region.sh | 248 +++++++++++++++--- .../scripts/validate-capacity.sh 
| 175 +++++++++--- 3 files changed, 383 insertions(+), 87 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md index 9f2b7ee37930..83dba4829f7b 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md @@ -73,17 +73,19 @@ bash scripts/validate-capacity.sh \ --vm-size \ --vm-count \ --cosmos-count \ - --app-insights-count 1 + --app-insights-count 1 \ + --find-alternatives true ``` -The script checks: -- VM SKU availability in the region (and suggests similar SKUs if exact match unavailable) -- vCPU quota (enough headroom for requested VMs) +The script checks (with timestamped progress logging to stderr): +- Resource provider registration (auto-registers if needed) +- VM SKU availability (detects all restriction types: Zone, NotAvailableForSubscription) +- Alternative D-series SKUs with matching vCPU count (single API call, if `--find-alternatives true`) +- vCPU quota (dynamically resolves the quota family from the effective SKU) - Cosmos DB account quota (subscription-wide limit of ~50) - Application Insights quota (per-region limit of ~200) -- Resource provider registration (auto-registers if needed) -**Output**: JSON summary with per-check pass/fail, available capacity, and messages. +**Output**: JSON summary with per-check pass/fail, `restriction_reason`, `suggested_alternative`, and messages. ### If preferred region fails @@ -94,15 +96,30 @@ bash scripts/find-region.sh \ --preferred \ --vm-size \ --vm-count \ - --cosmos-count + --cosmos-count \ + --fallback-regions "eastus,centralus,westeurope" \ + --stop-on-first true +``` + +The script uses a 4-phase search strategy: +1. Preferred region with exact SKU +2. 
Preferred region with similar SKUs (finds alternatives before giving up on the region) +3. Fallback regions with exact SKU +4. Fallback regions with similar SKUs + +Progress is reported to stderr as each region is checked: +``` +[1/9] westus2: ❌ Standard_D16s_v5 restricted (similar: Standard_D16ads_v7 available) +[2/9] eastus: ❌ not listed +[3/9] centralus: ✅ Standard_D16s_v5 available ``` -The script: -1. Validates the preferred region first -2. If it fails, tries candidates: `westus2`, `eastus`, `eastus2`, `westus3`, `centralus`, `northeurope`, `westeurope`, `southeastasia`, `australiaeast` -3. Outputs the first region where all checks pass -4. If a region has a similar but not identical VM SKU, reports the alternative -5. If no region works, outputs `NONE` and a failure summary +**Output** (stdout): Line 1 = region, Line 2 = VM size (may differ if alternative), Line 3+ = JSON. + +| Flag | Default | Description | +|---|---|---| +| `--fallback-regions` | (built-in list) | Comma-separated regions to try instead of defaults | +| `--stop-on-first` | `true` | Stop at first exact match; `false` checks all regions | **Present the result to the user for confirmation** before proceeding. @@ -212,8 +229,8 @@ Proceed to the **cosmos-benchmark-run** skill to: | Script | Purpose | |---|---| | `scripts/provision-all.sh` | **Orchestrator.** Creates RG → launches Cosmos/AppInsights/VM in parallel → exports credentials → verifies. | -| `scripts/validate-capacity.sh` | Check region capacity (VM SKU, quotas). JSON output. | -| `scripts/find-region.sh` | Find first region passing all capacity checks. | +| `scripts/validate-capacity.sh` | Check region capacity (VM SKU, quotas, restrictions). Logs progress. Finds alternative SKUs. JSON output. | +| `scripts/find-region.sh` | 4-phase region search: exact→similar in preferred, then fallbacks. Supports `--fallback-regions`, `--stop-on-first`. | | `scripts/create-cosmos-accounts.sh` | Create N Cosmos DB accounts with progress logging. 
| | `scripts/export-cosmos-credentials.sh` | Export Cosmos DB credentials to `clientHostAndKey.txt`. | | `scripts/provision-benchmark-vm.sh` | Create/connect VM, install tools (JDK/Maven/tmux), save config. | diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/find-region.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/find-region.sh index bb1817d8be33..292ce60d3cb4 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/find-region.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/find-region.sh @@ -3,14 +3,22 @@ # # Usage: # ./find-region.sh [--preferred westus2] [--vm-size Standard_D16s_v5] [--vm-count 1] \ -# [--cosmos-count 1] [--app-insights-count 1] +# [--cosmos-count 1] [--app-insights-count 1] \ +# [--fallback-regions "eastus,centralus,westeurope"] \ +# [--stop-on-first true|false] # -# Checks the preferred region first. If it fails, tries candidate regions. -# Prints the first valid region name to stdout. Exits 1 if none found. +# Search order: +# 1. Preferred region with exact SKU +# 2. Preferred region with similar SKUs (--find-alternatives) +# 3. Fallback regions with exact SKU +# 4. 
Fallback regions with similar SKUs # # Output (stdout): # Line 1: region name (or "NONE" if no region found) -# Line 2+: JSON capacity report for the selected region (or summary of failures) +# Line 2: VM size (may differ from requested if using an alternative SKU) +# Line 3+: JSON capacity report for the selected region +# +# stderr: progress/status messages with running tally set -uo pipefail @@ -20,6 +28,8 @@ VM_SIZE="Standard_D16s_v5" VM_COUNT=1 COSMOS_COUNT=1 APP_INSIGHTS_COUNT=1 +FALLBACK_REGIONS="" +STOP_ON_FIRST=true while [[ $# -gt 0 ]]; do case $1 in @@ -28,67 +38,227 @@ while [[ $# -gt 0 ]]; do --vm-count) VM_COUNT="$2"; shift 2 ;; --cosmos-count) COSMOS_COUNT="$2"; shift 2 ;; --app-insights-count) APP_INSIGHTS_COUNT="$2"; shift 2 ;; + --fallback-regions) FALLBACK_REGIONS="$2"; shift 2 ;; + --stop-on-first) STOP_ON_FIRST="$2"; shift 2 ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done -CANDIDATES=("westus2" "eastus" "eastus2" "westus3" "centralus" "northeurope" "westeurope" "southeastasia" "australiaeast") +# Build candidate list: user-supplied fallback regions replace the defaults +DEFAULT_CANDIDATES=("westus2" "eastus" "eastus2" "westus3" "centralus" "northeurope" "westeurope" "southeastasia" "australiaeast") +if [[ -n "$FALLBACK_REGIONS" ]]; then + IFS=',' read -ra CANDIDATES <<< "$FALLBACK_REGIONS" +else + CANDIDATES=("${DEFAULT_CANDIDATES[@]}") +fi + +# Remove preferred region from candidates (it's checked first separately) +FALLBACK=() +for r in "${CANDIDATES[@]}"; do + [[ "$r" == "$PREFERRED" ]] || FALLBACK+=("$r") +done + +# Total regions to check for progress tally (preferred + fallbacks) +TOTAL=$(( 1 + ${#FALLBACK[@]} )) +CHECK_NUM=0 + +# Emit a progress line to stderr, flushed immediately +progress() { + CHECK_NUM=$(( CHECK_NUM + 1 )) + printf "[%d/%d] %s\n" "$CHECK_NUM" "$TOTAL" "$1" >&2 +} validate_region() { local region="$1" + local find_alt="${2:-false}" + local extra_args=() + if [[ "$find_alt" == "true" ]]; then + 
extra_args+=("--find-alternatives" "true") + fi bash "$SCRIPT_DIR/validate-capacity.sh" \ --region "$region" \ --vm-size "$VM_SIZE" \ --vm-count "$VM_COUNT" \ --cosmos-count "$COSMOS_COUNT" \ - --app-insights-count "$APP_INSIGHTS_COUNT" 2>/dev/null + --app-insights-count "$APP_INSIGHTS_COUNT" \ + "${extra_args[@]}" 2>/dev/null } -# Try preferred region first -echo "Checking preferred region: $PREFERRED..." >&2 -RESULT=$(validate_region "$PREFERRED") -if [[ $? -eq 0 ]]; then - echo "$PREFERRED" - echo "$RESULT" - exit 0 -fi +# Extract a short status description from validate-capacity JSON output +status_summary() { + local json="$1" + local find_alt="$2" + echo "$json" | python3 -c " +import sys, json +try: + d = json.load(sys.stdin) + sku = d.get('vm_sku', {}) + if sku.get('passed'): + print('available') + else: + msg = sku.get('message', 'failed') + alt_name = d.get('vm_sku', {}).get('suggested_alternative', '') + if alt_name: + print(f\"{msg} (similar: {alt_name} available)\") + else: + print(msg) +except: + print('check error') +" 2>/dev/null || echo "check error" +} -echo "Preferred region $PREFERRED failed capacity checks." >&2 -echo "$RESULT" | python3 -c " +# Extract alternative SKU name from JSON if present +extract_alternative_sku() { + local json="$1" + echo "$json" | python3 -c " import sys, json try: d = json.load(sys.stdin) - for k in ['vm_sku', 'vm_quota', 'cosmos_db', 'app_insights']: - if not d[k]['passed']: - print(f\" ❌ {k}: {d[k]['message']}\", file=sys.stderr) -except: pass -" 2>&2 || true + print(d.get('vm_sku', {}).get('suggested_alternative', '')) +except: + print('') +" 2>/dev/null || echo "" +} -echo "" >&2 -echo "Searching candidate regions..." >&2 - -FAILURES="" -for REGION in "${CANDIDATES[@]}"; do - [[ "$REGION" == "$PREFERRED" ]] && continue - echo " Checking $REGION..." >&2 - RESULT=$(validate_region "$REGION") - if [[ $? 
-eq 0 ]]; then - echo "$REGION" - echo "$RESULT" +# Track the best result found so far +BEST_REGION="" +BEST_VM_SIZE="" +BEST_RESULT="" +BEST_TYPE="" # "exact" or "similar" + +emit_result() { + echo "$BEST_REGION" + echo "$BEST_VM_SIZE" + echo "$BEST_RESULT" +} + +# --------------------------------------------------------------------------- +# Phase 1: Preferred region — exact SKU +# --------------------------------------------------------------------------- +RESULT=$(validate_region "$PREFERRED" false) +RC=$? +SUMMARY=$(status_summary "$RESULT" false) +if [[ $RC -eq 0 ]]; then + progress "$PREFERRED: ✅ $VM_SIZE $SUMMARY" + BEST_REGION="$PREFERRED" + BEST_VM_SIZE="$VM_SIZE" + BEST_RESULT="$RESULT" + BEST_TYPE="exact" + if [[ "$STOP_ON_FIRST" == "true" ]]; then + echo "Found exact SKU match in preferred region." >&2 + emit_result exit 0 fi - FAILURES="$FAILURES\n$REGION: $(echo "$RESULT" | python3 -c " +else + # Phase 2: Preferred region — similar SKUs + ALT_RESULT=$(validate_region "$PREFERRED" true) + ALT_SUMMARY=$(status_summary "$ALT_RESULT" true) + ALT_SKU=$(extract_alternative_sku "$ALT_RESULT") + OTHER_CHECKS_OK=$(echo "$ALT_RESULT" | python3 -c " import sys, json try: d = json.load(sys.stdin) - fails = [f\"{k}: {d[k]['message']}\" for k in ['vm_sku','vm_quota','cosmos_db','app_insights'] if not d[k]['passed']] - print('; '.join(fails)) -except: print('parse error') -" 2>/dev/null || echo "check error")" + cosmos_ok = d.get('cosmos_db', {}).get('passed', False) + ai_ok = d.get('app_insights', {}).get('passed', False) + quota_ok = d.get('vm_quota', {}).get('passed', False) + print('true' if (cosmos_ok and ai_ok and quota_ok) else 'false') +except: print('false') +" 2>/dev/null || echo "false") + if [[ -n "$ALT_SKU" && "$OTHER_CHECKS_OK" == "true" ]]; then + progress "$PREFERRED: ❌ $VM_SIZE $SUMMARY (similar: $ALT_SKU available)" + echo "Preferred region has alternative SKU $ALT_SKU; checking fallbacks for exact match..." 
>&2 + BEST_REGION="$PREFERRED" + BEST_VM_SIZE="$ALT_SKU" + BEST_RESULT="$ALT_RESULT" + BEST_TYPE="similar" + else + progress "$PREFERRED: ❌ $VM_SIZE $SUMMARY" + fi +fi + +# --------------------------------------------------------------------------- +# Phase 3: Fallback regions — exact SKU +# --------------------------------------------------------------------------- +for REGION in "${FALLBACK[@]}"; do + RESULT=$(validate_region "$REGION" false) + RC=$? + SUMMARY=$(status_summary "$RESULT" false) + if [[ $RC -eq 0 ]]; then + progress "$REGION: ✅ $VM_SIZE $SUMMARY" + # Exact match in a fallback region — always preferred over a similar SKU + if [[ "$BEST_TYPE" != "exact" ]]; then + BEST_REGION="$REGION" + BEST_VM_SIZE="$VM_SIZE" + BEST_RESULT="$RESULT" + BEST_TYPE="exact" + fi + if [[ "$STOP_ON_FIRST" == "true" ]]; then + echo "Found exact SKU match in $REGION." >&2 + emit_result + exit 0 + fi + else + progress "$REGION: ❌ $VM_SIZE $SUMMARY" + fi +done + +# If we already have an exact match (stop-on-first=false mode), return it +if [[ "$BEST_TYPE" == "exact" ]]; then + echo "Best option: exact SKU in $BEST_REGION." >&2 + emit_result + exit 0 +fi + +# --------------------------------------------------------------------------- +# Phase 4: Fallback regions — similar SKUs +# --------------------------------------------------------------------------- +echo "" >&2 +echo "No exact SKU match found. Searching fallback regions for similar SKUs..." 
>&2 +for REGION in "${FALLBACK[@]}"; do + ALT_RESULT=$(validate_region "$REGION" true) + ALT_SKU=$(extract_alternative_sku "$ALT_RESULT") + OTHER_CHECKS_OK=$(echo "$ALT_RESULT" | python3 -c " +import sys, json +try: + d = json.load(sys.stdin) + cosmos_ok = d.get('cosmos_db', {}).get('passed', False) + ai_ok = d.get('app_insights', {}).get('passed', False) + quota_ok = d.get('vm_quota', {}).get('passed', False) + print('true' if (cosmos_ok and ai_ok and quota_ok) else 'false') +except: print('false') +" 2>/dev/null || echo "false") + if [[ -n "$ALT_SKU" && "$OTHER_CHECKS_OK" == "true" ]]; then + echo " $REGION: similar SKU $ALT_SKU available" >&2 + # Keep the first similar-SKU fallback found, unless preferred already has one + if [[ -z "$BEST_REGION" ]]; then + BEST_REGION="$REGION" + BEST_VM_SIZE="$ALT_SKU" + BEST_RESULT="$ALT_RESULT" + BEST_TYPE="similar" + fi + if [[ "$STOP_ON_FIRST" == "true" ]]; then + break + fi + else + echo " $REGION: no similar SKU found" >&2 + fi done +# --------------------------------------------------------------------------- +# Emit final result +# --------------------------------------------------------------------------- +if [[ -n "$BEST_REGION" ]]; then + if [[ "$BEST_VM_SIZE" != "$VM_SIZE" ]]; then + echo "Selected $BEST_REGION with alternative SKU $BEST_VM_SIZE (requested $VM_SIZE)." >&2 + else + echo "Selected $BEST_REGION with $BEST_VM_SIZE." >&2 + fi + emit_result + exit 0 +fi + echo "" >&2 -echo "No candidate region has capacity for all resources." >&2 -echo -e "Failures:$FAILURES" >&2 +echo "No region found with capacity for requested or similar SKUs." 
>&2 echo "NONE" +echo "$VM_SIZE" exit 1 diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/validate-capacity.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/validate-capacity.sh index 74fa6b25857a..0600d53220ec 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/validate-capacity.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/validate-capacity.sh @@ -3,7 +3,7 @@ # # Usage: # ./validate-capacity.sh --region [--vm-size Standard_D16s_v5] [--vm-count 1] \ -# [--cosmos-count 1] [--app-insights-count 1] +# [--cosmos-count 1] [--app-insights-count 1] [--find-alternatives true|false] # # Exit codes: # 0 — all checks passed @@ -18,6 +18,11 @@ VM_SIZE="Standard_D16s_v5" VM_COUNT=1 COSMOS_COUNT=1 APP_INSIGHTS_COUNT=1 +FIND_ALTERNATIVES=true + +log() { + printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2 +} while [[ $# -gt 0 ]]; do case $1 in @@ -26,6 +31,7 @@ while [[ $# -gt 0 ]]; do --vm-count) VM_COUNT="$2"; shift 2 ;; --cosmos-count) COSMOS_COUNT="$2"; shift 2 ;; --app-insights-count) APP_INSIGHTS_COUNT="$2"; shift 2 ;; + --find-alternatives) FIND_ALTERNATIVES="$2"; shift 2 ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done @@ -43,60 +49,157 @@ AVAILABLE_VCPUS=0 COSMOS_AVAILABLE=0 APP_INSIGHTS_AVAILABLE=0 SUGGESTED_SKU="" +RESTRICTION_REASON="" # --- 1. Resource providers --- +log "Checking resource provider registrations..." for NS in Microsoft.Compute Microsoft.DocumentDB Microsoft.Insights; do STATE=$(az provider show --namespace "$NS" --query "registrationState" -o tsv 2>/dev/null || echo "Unknown") if [[ "$STATE" != "Registered" ]]; then - echo "Registering $NS..." >&2 + log "Registering $NS..." az provider register --namespace "$NS" 2>/dev/null fi done +log "Resource provider check complete." # --- 2. 
VM SKU availability --- -# Check if the exact SKU is available (no restrictions) -RESTRICTED=$(az vm list-skus --location "$REGION" --size "$VM_SIZE" \ - --query "[?restrictions[?reasonCode=='NotAvailableForSubscription']] | length(@)" -o tsv 2>/dev/null || echo "0") -TOTAL_SKUS=$(az vm list-skus --location "$REGION" --size "$VM_SIZE" \ - --query "length(@)" -o tsv 2>/dev/null || echo "0") +log "Checking VM SKU availability in $REGION..." + +# Fetch the requested SKU with its full restrictions array +SKU_JSON=$(az vm list-skus --location "$REGION" --size "$VM_SIZE" --resource-type virtualMachines \ + --query "[0].{name:name, restrictions:restrictions, family:family, caps:capabilities[?name=='vCPUs'].value | [0]}" \ + -o json 2>/dev/null || echo "null") -if [[ "$TOTAL_SKUS" -gt 0 && "$RESTRICTED" -eq 0 ]]; then +# A SKU is available only if it exists AND has no restrictions at all +SKU_EXISTS=false +SKU_UNRESTRICTED=false +if [[ "$SKU_JSON" != "null" && "$SKU_JSON" != "" ]]; then + SKU_EXISTS=true + RESTRICTION_COUNT=$(echo "$SKU_JSON" | python3 -c " +import sys, json +data = json.load(sys.stdin) +r = data.get('restrictions') or [] +print(len(r)) +" 2>/dev/null || echo "0") + if [[ "$RESTRICTION_COUNT" -eq 0 ]]; then + SKU_UNRESTRICTED=true + else + # Describe the restriction reasons + RESTRICTION_REASON=$(echo "$SKU_JSON" | python3 -c " +import sys, json +data = json.load(sys.stdin) +reasons = [] +for r in (data.get('restrictions') or []): + rtype = r.get('type', 'Unknown') + code = r.get('reasonCode', 'Unknown') + reasons.append(f'{rtype}/{code}') +print('; '.join(reasons)) +" 2>/dev/null || echo "Unknown") + fi +fi + +# Derive vCPU count from the SKU name (e.g., Standard_D16s_v5 -> 16) +VCPUS_PER_VM=$(echo "$VM_SIZE" | sed -E 's/Standard_D([0-9]+)[a-z]*_v[0-9]+/\1/') + +if [[ "$SKU_EXISTS" == "true" && "$SKU_UNRESTRICTED" == "true" ]]; then VM_SKU_OK=true VM_SKU_MSG="$VM_SIZE available" else - VM_SKU_MSG="$VM_SIZE not available" + if [[ "$SKU_EXISTS" == "false" 
]]; then + VM_SKU_MSG="$VM_SIZE not found in $REGION" + RESTRICTION_REASON="NotFound" + else + VM_SKU_MSG="$VM_SIZE restricted ($RESTRICTION_REASON)" + fi PASS=false - # Try to find a similar SKU in the same family - # Extract family prefix (e.g., Standard_D16s_v5 -> standardDSv5Family, also try v4) - FAMILY_BASE=$(echo "$VM_SIZE" | sed -E 's/Standard_D([0-9]+)s_v[0-9]+/standardDSv/') - VCPUS_NEEDED=$(echo "$VM_SIZE" | sed -E 's/Standard_D([0-9]+)s_v[0-9]+/\1/') - for VER in 5 4 3; do - FAMILY="${FAMILY_BASE}${VER}Family" - ALT=$(az vm list-skus --location "$REGION" \ - --query "[?family=='$FAMILY' && restrictions[0]==null].name" -o tsv 2>/dev/null | head -5) - if [[ -n "$ALT" ]]; then - # Find a SKU with matching vCPU count - for SKU in $ALT; do - SKU_VCPUS=$(az vm list-skus --location "$REGION" --size "$SKU" \ - --query "[0].capabilities[?name=='vCPUs'].value | [0]" -o tsv 2>/dev/null || echo "0") - if [[ "$SKU_VCPUS" == "$VCPUS_NEEDED" ]]; then - SUGGESTED_SKU="$SKU" - break 2 - fi - done + + # Try to find a similar D-series SKU with the same vCPU count + if [[ "$FIND_ALTERNATIVES" == "true" ]]; then + log "Searching for alternative D-series SKUs with $VCPUS_PER_VM vCPUs..." 
+ + # Single API call: fetch all D-series SKUs with their vCPU count and restrictions + ALL_D_SKUS=$(az vm list-skus --location "$REGION" --resource-type virtualMachines \ + --query "[?starts_with(name, 'Standard_D')].{name:name, vcpus:capabilities[?name=='vCPUs'].value | [0], family:family, restrictions:restrictions}" \ + -o json 2>/dev/null || echo "[]") + + # Use python3 to filter unrestricted SKUs with matching vCPUs and pick the best match + SUGGESTED_SKU=$(echo "$ALL_D_SKUS" | python3 -c " +import sys, json + +data = json.load(sys.stdin) +requested = '$VM_SIZE' +vcpus_needed = '$VCPUS_PER_VM' + +# Extract generation from SKU name (e.g., Standard_D16s_v5 -> 5) +def get_generation(name): + import re + m = re.search(r'_v(\d+)$', name) + return int(m.group(1)) if m else 0 + +req_gen = get_generation(requested) +candidates = [] +for sku in data: + name = sku.get('name', '') + if name == requested: + continue + vcpus = sku.get('vcpus', '0') + if str(vcpus) != str(vcpus_needed): + continue + restrictions = sku.get('restrictions') or [] + if len(restrictions) > 0: + continue + gen = get_generation(name) + # Score: prefer same generation, then newer generations, then older + if gen == req_gen: + score = 0 + elif gen > req_gen: + score = gen - req_gen + else: + score = 100 + (req_gen - gen) + candidates.append((score, name)) + +candidates.sort() +if candidates: + print(candidates[0][1]) +else: + print('') +" 2>/dev/null || echo "") + + if [[ -n "$SUGGESTED_SKU" ]]; then + VM_SKU_MSG="$VM_SKU_MSG; similar SKU: $SUGGESTED_SKU" + log "Found alternative: $SUGGESTED_SKU" + else + log "No alternative D-series SKU found with $VCPUS_PER_VM vCPUs." fi - done - if [[ -n "$SUGGESTED_SKU" ]]; then - VM_SKU_MSG="$VM_SIZE not available; similar SKU: $SUGGESTED_SKU" fi fi +log "VM SKU check complete." # --- 3. 
VM vCPU quota --- -# Parse "Standard DSv5 Family vCPUs" usage line -VCPUS_PER_VM=$(echo "$VM_SIZE" | sed -E 's/Standard_D([0-9]+)s_v[0-9]+/\1/') +# Determine which SKU to check quota for (use suggested alternative if primary failed) +EFFECTIVE_SKU="$VM_SIZE" +if [[ "$VM_SKU_OK" == "false" && -n "$SUGGESTED_SKU" ]]; then + EFFECTIVE_SKU="$SUGGESTED_SKU" +fi + +# Derive the quota family display name from the SKU +# e.g., Standard_D16s_v5 -> "Standard DSv5 Family", Standard_D16ds_v5 -> "Standard DDSv5 Family" +QUOTA_FAMILY=$(echo "$EFFECTIVE_SKU" | python3 -c " +import sys, re +sku = sys.stdin.read().strip() +m = re.match(r'Standard_D(\d+)([a-z]*)_v(\d+)', sku) +if m: + suffix = m.group(2).upper() # e.g., 's' -> 'S', 'ds' -> 'DS' + ver = m.group(3) + print(f'Standard D{suffix}v{ver} Family') +else: + print('Standard DSv5 Family') +" 2>/dev/null || echo "Standard DSv5 Family") + VCPUS_NEEDED=$((VCPUS_PER_VM * VM_COUNT)) -USAGE_LINE=$(az vm list-usage --location "$REGION" -o tsv 2>/dev/null | grep -i "Standard DSv5 Family" || echo "") +log "Checking vCPU quota for $QUOTA_FAMILY in $REGION..." +USAGE_LINE=$(az vm list-usage --location "$REGION" -o tsv 2>/dev/null | grep -i "$QUOTA_FAMILY" || echo "") if [[ -n "$USAGE_LINE" ]]; then CURRENT_USAGE=$(echo "$USAGE_LINE" | awk '{print $1}') LIMIT=$(echo "$USAGE_LINE" | awk '{print $2}') @@ -114,8 +217,10 @@ else VM_QUOTA_MSG="No existing usage found; quota assumed available" AVAILABLE_VCPUS=999 fi +log "vCPU quota check complete." # --- 4. Cosmos DB account quota --- +log "Checking Cosmos DB account quota..." CURRENT_COSMOS=$(az cosmosdb list --query "length(@)" -o tsv 2>/dev/null || echo "0") COSMOS_LIMIT=50 COSMOS_AVAILABLE=$((COSMOS_LIMIT - CURRENT_COSMOS)) @@ -126,8 +231,10 @@ else COSMOS_MSG="Only $COSMOS_AVAILABLE slots available (need $COSMOS_COUNT, limit $COSMOS_LIMIT)" PASS=false fi +log "Cosmos DB check complete." # --- 5. Application Insights quota --- +log "Checking Application Insights quota in $REGION..." 
CURRENT_AI=$(az monitor app-insights component list \ --query "[?location=='$REGION'] | length(@)" -o tsv 2>/dev/null || echo "0") AI_LIMIT=200 @@ -139,6 +246,7 @@ else APP_INSIGHTS_MSG="Only $AI_AVAILABLE slots available (need $APP_INSIGHTS_COUNT, limit $AI_LIMIT)" PASS=false fi +log "Application Insights check complete." # --- Output JSON summary --- cat < Date: Mon, 2 Mar 2026 15:07:31 -0800 Subject: [PATCH 06/22] Add pre-flight capacity gate and logging to provision-all.sh - Add capacity validation step before resource creation that blocks unless all checks pass (VM SKU, quota, Cosmos DB, App Insights) - Add --skip-capacity-check flag to override the gate - Add timestamped log() function for all progress messages - Add elapsed time tracking per resource and total provisioning time - Fix JSON parsing to match validate-capacity.sh output format - Update SKILL.md to document new behavior and flag Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../cosmos-benchmark-setup-resources/SKILL.md | 13 ++- .../scripts/provision-all.sh | 110 +++++++++++++----- 2 files changed, 85 insertions(+), 38 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md index 83dba4829f7b..33b6656269ce 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/SKILL.md @@ -141,11 +141,12 @@ bash scripts/provision-all.sh \ ``` The orchestrator: -1. Creates the resource group -2. Launches **in parallel**: Cosmos DB accounts, App Insights, and VM creation -3. Waits for all three to complete -4. Exports Cosmos DB credentials to `$CONFIG_DIR/clientHostAndKey.txt` -5. Runs `verify-resources.sh` to confirm everything is ready +1. 
**Pre-flight capacity gate** — runs `validate-capacity.sh` to verify all resources are available in the region. **Blocks creation unless all checks pass** (override with `--skip-capacity-check`) +2. Creates the resource group +3. Launches **in parallel**: Cosmos DB accounts, App Insights, and VM creation +4. Waits for all three to complete (with elapsed time logging) +5. Exports Cosmos DB credentials to `$CONFIG_DIR/clientHostAndKey.txt` +6. Runs `verify-resources.sh` to confirm everything is ready Each sub-task logs to `$CONFIG_DIR/logs/` for debugging if anything fails. @@ -228,7 +229,7 @@ Proceed to the **cosmos-benchmark-run** skill to: | Script | Purpose | |---|---| -| `scripts/provision-all.sh` | **Orchestrator.** Creates RG → launches Cosmos/AppInsights/VM in parallel → exports credentials → verifies. | +| `scripts/provision-all.sh` | **Orchestrator.** Pre-flight capacity gate → RG → parallel Cosmos/AppInsights/VM → credentials → verify. `--skip-capacity-check` overrides gate. | | `scripts/validate-capacity.sh` | Check region capacity (VM SKU, quotas, restrictions). Logs progress. Finds alternative SKUs. JSON output. | | `scripts/find-region.sh` | 4-phase region search: exact→similar in preferred, then fallbacks. Supports `--fallback-regions`, `--stop-on-first`. | | `scripts/create-cosmos-accounts.sh` | Create N Cosmos DB accounts with progress logging. 
| diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-all.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-all.sh index fb8584d62c3f..d91ff828f4a9 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-all.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-setup-resources/scripts/provision-all.sh @@ -6,7 +6,7 @@ # [--cosmos-prefix ] [--cosmos-count ] [--cosmos-consistency Session] \ # [--app-insights-name ] \ # [--vm-name ] [--vm-size Standard_D16s_v5] [--vm-disk-size 256] \ -# [--create-key] [--skip-vm-setup] +# [--create-key] [--skip-vm-setup] [--skip-capacity-check] # # Creates resource group, then launches Cosmos DB, App Insights, and VM creation # in parallel. Waits for all to complete. Exports credentials. Runs verify-resources.sh. @@ -19,6 +19,10 @@ set -uo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +log() { + printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" +} + # Required CONFIG_DIR="" REGION="westus2" @@ -38,6 +42,7 @@ VM_SIZE="Standard_D16s_v5" VM_DISK_SIZE=256 CREATE_KEY=true SKIP_VM_SETUP=false +SKIP_CAPACITY_CHECK=false while [[ $# -gt 0 ]]; do case $1 in @@ -53,6 +58,7 @@ while [[ $# -gt 0 ]]; do --vm-disk-size) VM_DISK_SIZE="$2"; shift 2 ;; --create-key) CREATE_KEY=true; shift ;; --skip-vm-setup) SKIP_VM_SETUP=true; shift ;; + --skip-capacity-check) SKIP_CAPACITY_CHECK=true; shift ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done @@ -66,6 +72,8 @@ mkdir -p "$CONFIG_DIR" LOG_DIR="$CONFIG_DIR/logs" mkdir -p "$LOG_DIR" +PROVISION_START=$(date +%s) + echo "=============================================" echo " Provisioning Benchmark Resources" echo "=============================================" @@ -78,21 +86,56 @@ echo " VM: $VM_NAME ($VM_SIZE)" echo "=============================================" echo "" -# --- Step 1: Create resource group 
(must complete before parallel creation) --- -echo "[1/4] Creating resource group: $RG in $REGION" +# --- Step 1: Pre-flight capacity validation --- +log "[1/5] Pre-flight capacity check: validating $REGION for $VM_SIZE..." +bash "$SCRIPT_DIR/validate-capacity.sh" \ + --region "$REGION" --vm-size "$VM_SIZE" --vm-count 1 \ + --cosmos-count "$COSMOS_COUNT" --app-insights-count 1 \ + > "$LOG_DIR/capacity-check.json" 2>>"$LOG_DIR/capacity-check.log" + +CAPACITY_ALL_PASSED=$(python3 -c " +import json +data = json.load(open('$LOG_DIR/capacity-check.json')) +print('true' if data.get('passed', False) else 'false') +" 2>/dev/null || echo "false") + +if [[ "$CAPACITY_ALL_PASSED" == "true" ]]; then + log " ✅ All capacity checks passed for $REGION" +else + log " ⚠️ Capacity check failures detected:" + python3 -c " +import json +data = json.load(open('$LOG_DIR/capacity-check.json')) +for key in ['vm_sku', 'vm_quota', 'cosmos_db', 'app_insights']: + section = data.get(key, {}) + if not section.get('passed', False): + print(f' - [{key}] {section.get("message", "no details")}') +" 2>/dev/null || log " - (could not parse capacity-check.json)" + if [[ "$SKIP_CAPACITY_CHECK" == "true" ]]; then + log " ⚠️ --skip-capacity-check set, continuing despite failures." + else + log "❌ Capacity validation failed. Resources will NOT be created." + log " Use --skip-capacity-check to override this gate." + exit 1 + fi +fi +log "" + +# --- Step 2: Create resource group (must complete before parallel creation) --- +log "[2/5] Creating resource group: $RG in $REGION" az group create --name "$RG" --location "$REGION" -o none 2>&1 | tee "$LOG_DIR/rg.log" -echo "" +log "" -# --- Step 2: Launch parallel resource creation --- -echo "[2/4] Creating resources in parallel..." -echo "" +# --- Step 3: Launch parallel resource creation --- +log "[3/5] Creating resources in parallel..." 
+log "" COSMOS_PID="" AI_PID="" VM_PID="" # Cosmos DB accounts (background) -echo " Starting: Cosmos DB ($COSMOS_COUNT accounts)..." +log " Starting: Cosmos DB ($COSMOS_COUNT accounts)..." ( bash "$SCRIPT_DIR/create-cosmos-accounts.sh" \ --rg "$RG" --prefix "$COSMOS_PREFIX" --count "$COSMOS_COUNT" \ @@ -104,7 +147,7 @@ echo " Starting: Cosmos DB ($COSMOS_COUNT accounts)..." COSMOS_PID=$! # App Insights (background) -echo " Starting: Application Insights ($APP_INSIGHTS_NAME)..." +log " Starting: Application Insights ($APP_INSIGHTS_NAME)..." ( az monitor app-insights component create \ --app "$APP_INSIGHTS_NAME" \ @@ -127,7 +170,7 @@ echo " Starting: Application Insights ($APP_INSIGHTS_NAME)..." AI_PID=$! # VM (background) -echo " Starting: VM ($VM_NAME)..." +log " Starting: VM ($VM_NAME)..." VM_EXTRA_FLAGS="" [[ "$CREATE_KEY" == "true" ]] && VM_EXTRA_FLAGS="$VM_EXTRA_FLAGS --create-key" [[ "$SKIP_VM_SETUP" == "true" ]] && VM_EXTRA_FLAGS="$VM_EXTRA_FLAGS --skip-setup" @@ -148,23 +191,24 @@ VM_EXTRA_FLAGS="" ) & VM_PID=$! -echo "" -echo " PIDs: Cosmos=$COSMOS_PID, AppInsights=$AI_PID, VM=$VM_PID" -echo "" +log "" +log " PIDs: Cosmos=$COSMOS_PID, AppInsights=$AI_PID, VM=$VM_PID" +log "" -# --- Step 3: Wait for all to complete --- -echo "[3/4] Waiting for all resources to complete..." -echo "" +# --- Step 4: Wait for all to complete --- +log "[4/5] Waiting for all resources to complete..." +log "" +START_WAIT=$(date +%s) OVERALL_OK=true wait $COSMOS_PID COSMOS_EXIT=$? COSMOS_STATUS=$(cat "$LOG_DIR/cosmos-status" 2>/dev/null || echo "COSMOS_UNKNOWN") if [[ "$COSMOS_STATUS" == "COSMOS_SUCCESS" ]]; then - echo " ✅ Cosmos DB: $COSMOS_COUNT account(s) created" + log " ✅ Cosmos DB: $COSMOS_COUNT account(s) created ($(( $(date +%s) - START_WAIT ))s elapsed)" else - echo " ❌ Cosmos DB: creation failed (exit=$COSMOS_EXIT). See $LOG_DIR/cosmos-accounts.log" + log " ❌ Cosmos DB: creation failed (exit=$COSMOS_EXIT). 
See $LOG_DIR/cosmos-accounts.log" OVERALL_OK=false fi @@ -172,9 +216,9 @@ wait $AI_PID AI_EXIT=$? AI_STATUS=$(cat "$LOG_DIR/ai-status" 2>/dev/null || echo "AI_UNKNOWN") if [[ "$AI_STATUS" == "AI_SUCCESS" ]]; then - echo " ✅ App Insights: $APP_INSIGHTS_NAME created" + log " ✅ App Insights: $APP_INSIGHTS_NAME created ($(( $(date +%s) - START_WAIT ))s elapsed)" else - echo " ❌ App Insights: creation failed (exit=$AI_EXIT). See $LOG_DIR/app-insights.log" + log " ❌ App Insights: creation failed (exit=$AI_EXIT). See $LOG_DIR/app-insights.log" OVERALL_OK=false fi @@ -182,36 +226,38 @@ wait $VM_PID VM_EXIT=$? VM_STATUS=$(cat "$LOG_DIR/vm-status" 2>/dev/null || echo "VM_UNKNOWN") if [[ "$VM_STATUS" == "VM_SUCCESS" ]]; then - echo " ✅ VM: $VM_NAME created" + log " ✅ VM: $VM_NAME created ($(( $(date +%s) - START_WAIT ))s elapsed)" else - echo " ❌ VM: creation failed (exit=$VM_EXIT). See $LOG_DIR/vm.log" + log " ❌ VM: creation failed (exit=$VM_EXIT). See $LOG_DIR/vm.log" OVERALL_OK=false fi -echo "" +log "" if [[ "$OVERALL_OK" == "false" ]]; then - echo "❌ One or more resources failed to create. Check logs in $LOG_DIR/" + log "❌ One or more resources failed to create. Check logs in $LOG_DIR/" exit 1 fi -# --- Step 3b: Export Cosmos DB credentials (must happen after accounts are created) --- -echo " Exporting Cosmos DB credentials..." +# --- Step 4b: Export Cosmos DB credentials (must happen after accounts are created) --- +log " Exporting Cosmos DB credentials..." bash "$SCRIPT_DIR/export-cosmos-credentials.sh" \ --rg "$RG" --prefix "$COSMOS_PREFIX" --count "$COSMOS_COUNT" \ --config-dir "$CONFIG_DIR" 2>&1 | tee "$LOG_DIR/export-credentials.log" if [[ ! -s "$CONFIG_DIR/clientHostAndKey.txt" ]]; then - echo " ❌ Credential export failed. See $LOG_DIR/export-credentials.log" + log " ❌ Credential export failed. See $LOG_DIR/export-credentials.log" exit 1 fi -echo "" +log "" -# --- Step 4: Verify all resources --- -echo "[4/4] Verifying all resources..." 
-echo "" +# --- Step 5: Verify all resources --- +log "[5/5] Verifying all resources..." +log "" bash "$SCRIPT_DIR/verify-resources.sh" \ --config-dir "$CONFIG_DIR" \ --cosmos-count "$COSMOS_COUNT" -exit $? +VERIFY_EXIT=$? +log "Total provisioning time: $(( $(date +%s) - PROVISION_START ))s" +exit $VERIFY_EXIT From 4dd3696c22ceeee4edc5d0b6009b645f71d66fc4 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 20:37:55 -0800 Subject: [PATCH 07/22] Run benchmarks in tmux with async orchestration - Wrap benchmark execution in tmux session ('bench') on VM so the process survives SSH disconnections - Add async execution guidance to SKILL.md so the agent runs the orchestrator in background mode, keeping the user's context free - Use scenario-based poll intervals (2min for SIMPLE, 5min for EXPAND/CHURN) instead of 10s fixed polling - Expand monitoring section with local and VM-side status checks Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/cosmos-benchmark-run/SKILL.md | 39 +++++++++- .../scripts/vm-prepare-and-run.sh | 73 +++++++++++++++---- 2 files changed, 94 insertions(+), 18 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md index 37970d3510d4..c788269281b7 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md @@ -5,7 +5,7 @@ description: Build and run Cosmos DB benchmarks — clone repo at a branch/PR/co # Run Benchmark -Clone, build, and execute a benchmark on a provisioned VM. All operations are implemented as scripts. Each ref uses a single SSH session for checkout → build → verify → run. +Clone, build, and execute a benchmark on a provisioned VM. All operations are implemented as scripts. Each ref uses a single SSH session for checkout → build → verify → run. 
The benchmark execution runs inside a **tmux session** on the VM for resilience against SSH disconnections. ## VM Connection @@ -75,7 +75,7 @@ The orchestrator, for each ref, uses **a single SSH session** to: 1. Checkout the ref (auto-detects branch/PR/commit/tag) 2. Build linting-extensions + cosmos benchmark JAR 3. Verify readiness (JDK, JAR, config, disk) -4. Execute the benchmark +4. Execute the benchmark (inside a tmux session for resilience) Results are saved to `results/--/` on the VM. @@ -100,16 +100,47 @@ Runs are named `--`, e.g.: 20260302-CHURN-PR-12345 ``` +### Async execution (non-blocking) + +**Always run the orchestrator in async mode** so the user can continue working while benchmarks run. Use the `bash` tool with `mode="async"`: + +```bash +# Launches in background, returns a shellId for monitoring +bash scripts/run-all-refs.sh \ + --config-dir "$CONFIG_DIR" \ + --refs "main, fix/telemetry-leak" \ + --scenario SIMPLE +``` + +After launching, the user can: +- **Continue working** on other tasks in the main context +- **Check status** at any time (see Monitor progress below) +- **Get notified** when the orchestrator reports completion via `read_bash` + +The benchmark itself runs in a **tmux session** (`bench`) on the VM, so it survives SSH disconnections. Even if the local orchestrator process is interrupted, the benchmark continues on the VM. + ### Monitor progress +**Local orchestrator output** (shows which ref is running, build progress): + +Use `read_bash` with the shellId from the async launch to check the latest output. 
+ +**VM-side benchmark output** (real-time metrics, live logs): + ```bash SSH_CMD="ssh -i $(cat $CONFIG_DIR/vm-key) $(cat $CONFIG_DIR/vm-user)@$(cat $CONFIG_DIR/vm-ip)" -# Peek at output +# Peek at live benchmark output in the tmux session $SSH_CMD "tmux capture-pane -t bench -p | tail -30" -# Check monitor.csv +# Check if tmux session is still running +$SSH_CMD "tmux has-session -t bench 2>/dev/null && echo 'Running' || echo 'Finished'" + +# Check monitor.csv row count (grows every 60s) $SSH_CMD "wc -l ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results//monitor.csv" + +# Attach to live session (interactive — for debugging only) +$SSH_CMD -t "tmux attach -t bench" ``` ## Output Directory Structure diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh index df66f898d582..f40609c515be 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh @@ -115,24 +115,69 @@ if [[ "$READY" != "true" ]]; then exit 1 fi -# --- Step 4: Run --- +# --- Step 4: Run in tmux (survives SSH disconnection) --- echo "" -echo "=== [4/4] Run: $SCENARIO ===" +echo "=== [4/4] Run: $SCENARIO (tmux session: bench) ===" +cd "$BENCH_DIR" +RESULTS_DIR="./results/$RUN_NAME" +mkdir -p "$RESULTS_DIR" + +# End any previous benchmark tmux session gracefully +tmux send-keys -t bench C-c 2>/dev/null || true +sleep 1 +tmux send-keys -t bench "exit" Enter 2>/dev/null || true +sleep 1 + +# Write run script with resolved paths (executed inside tmux) +cat > "$RESULTS_DIR/.run.sh" <&1 | tee "results/$RUN_NAME/benchmark.log" + JAR=\$(ls target/*jar-with-dependencies.jar 2>/dev/null | head -1) + java -Xmx8g -Xms8g -XX:+UseG1GC \\ + -Xlog:gc*:"$RESULTS_DIR/gc.log" \\ + -jar "\$JAR" \\ + 
-tenantsFile "$TENANTS_RESOLVED" \\ + -reportingDirectory "$RESULTS_DIR/metrics" \\ + 2>&1 | tee "$RESULTS_DIR/benchmark.log" +fi +echo \$? > "$RESULTS_DIR/.exit-code" +EOF +chmod +x "$RESULTS_DIR/.run.sh" + +# Start benchmark in tmux -- process persists even if SSH disconnects +tmux new-session -d -s bench "bash '$RESULTS_DIR/.run.sh'" +echo " Benchmark running in tmux session 'bench'" +echo " Monitor: tmux capture-pane -t bench -p | tail -30" + +# Poll interval based on scenario duration (SIMPLE ~30min, EXPAND ~90min, CHURN varies) +case "$SCENARIO" in + SIMPLE) POLL_INTERVAL=120 ;; # 2 min + EXPAND) POLL_INTERVAL=300 ;; # 5 min + CHURN) POLL_INTERVAL=300 ;; # 5 min + *) POLL_INTERVAL=120 ;; # 2 min default +esac + +# Wait for tmux session to complete +echo " Poll interval: ${POLL_INTERVAL}s" +while tmux has-session -t bench 2>/dev/null; do + sleep $POLL_INTERVAL +done + +# Read exit code written by the run script +BENCH_EXIT=$(cat "$RESULTS_DIR/.exit-code" 2>/dev/null || echo 1) + +if [[ "$BENCH_EXIT" -eq 0 ]]; then + echo "" + echo "Completed: $REF ($BRANCH @ $COMMIT) -> results/$RUN_NAME" +else + echo "" + echo "Benchmark failed (exit code: $BENCH_EXIT)" + exit "$BENCH_EXIT" fi - -echo "" -echo "✅ Completed: $REF ($BRANCH @ $COMMIT) → results/$RUN_NAME" From b79b2ef414b05c039894b3580b7ddb706d6d961e Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 20:43:31 -0800 Subject: [PATCH 08/22] Support remote/branch ref format in benchmark checkout - Detect refs like 'xinlian12/branchName' by checking if the part before the first slash matches an existing git remote - If remote exists, fetch from that remote; otherwise treat the slash as part of the branch name on origin - Document fork branch format in SKILL.md ref examples Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/cosmos-benchmark-run/SKILL.md | 1 + .../scripts/vm-prepare-and-run.sh | 22 ++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff 
--git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md index c788269281b7..36a19630da9b 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md @@ -24,6 +24,7 @@ Ask the user for: - Multiple for comparison: `main, fix/telemetry-leak` - PR + baseline: `PR#12345, main` - Commit SHAs: `abc1234, def5678` + - Fork branches: `xinlian12/wireConnectionSharingInBenchmark` (auto-detects remote) 2. **Scenario preset** (default: `SIMPLE`): diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh index f40609c515be..02a8e4b292b0 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh @@ -55,9 +55,25 @@ elif [[ "$REF" =~ ^[0-9a-f]{7,40}$ ]]; then git fetch origin git checkout "$REF" else - git fetch --depth 1 origin "$REF" - git checkout "$REF" - git pull origin "$REF" 2>/dev/null || true + # Detect remote/branch format (e.g., xinlian12/wireConnectionSharingInBenchmark) + if [[ "$REF" == */* ]]; then + REMOTE_NAME="${REF%%/*}" + BRANCH_NAME="${REF#*/}" + if git remote | grep -qx "$REMOTE_NAME"; then + echo "Fetching $BRANCH_NAME from remote $REMOTE_NAME" + git fetch --depth 1 "$REMOTE_NAME" "$BRANCH_NAME" + git checkout FETCH_HEAD + else + # Slash is part of the branch name (e.g., feature/foo on origin) + git fetch --depth 1 origin "$REF" + git checkout "$REF" + git pull origin "$REF" 2>/dev/null || true + fi + else + git fetch --depth 1 origin "$REF" + git checkout "$REF" + git pull origin "$REF" 2>/dev/null || true + fi fi 
BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "detached") From 3a61da6e4b7032e2c56fff13f48deb1660b4f19f Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 20:45:39 -0800 Subject: [PATCH 09/22] Add early exit detection and troubleshooting to run skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Instruct agent to proactively verify the run is progressing after async launch — if the shell exits too quickly, investigate - Add diagnosis steps: check results dirs, git state, JAR, tmux - Document common failures table (checkout, build, startup, SSH) - Require confirming with user before relaunching after a failure Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/cosmos-benchmark-run/SKILL.md | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md index 36a19630da9b..05b32fd78fe1 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md @@ -144,6 +144,41 @@ $SSH_CMD "wc -l ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/< $SSH_CMD -t "tmux attach -t bench" ``` +### Early exit detection and troubleshooting + +After launching the orchestrator in async mode, **proactively verify** that the run is progressing. If the async shell exits within a few minutes (expected runtime is 30–90+ min), the run likely failed early. + +**Detection**: When checking status via `read_bash`, if the shell has already exited or accepts new commands, investigate immediately — do not assume success. + +**Diagnosis steps** (run on the VM via SSH): + +```bash +SSH_CMD="ssh -i $(cat $CONFIG_DIR/vm-key) $(cat $CONFIG_DIR/vm-user)@$(cat $CONFIG_DIR/vm-ip)" + +# 1. 
Check if any new results directories were created today +$SSH_CMD "ls -lt ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/ | head -5" + +# 2. Check git state (last checkout, remotes available) +$SSH_CMD "cd ~/azure-sdk-for-java && git log --oneline -1 && git remote -v" + +# 3. Check if the benchmark JAR exists +$SSH_CMD "ls ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/target/*jar-with-dependencies.jar 2>/dev/null || echo 'JAR not found — build may have failed'" + +# 4. Check tmux session +$SSH_CMD "tmux has-session -t bench 2>/dev/null && echo 'Running' || echo 'No session'" +``` + +**Common failures and fixes**: + +| Symptom | Likely cause | Fix | +|---|---|---| +| No new results directory | Checkout failed | Check if ref exists; for fork branches use `remote/branch` format (e.g., `xinlian12/branchName`) — the script auto-detects remotes configured on the VM | +| JAR not found | Build failed | SSH in and check Maven output; common issues: disk space, dependency download failures | +| tmux session exited immediately | Benchmark startup error | Check `results//benchmark.log` for errors (e.g., invalid tenants.json, connection failures) | +| Orchestrator exits but tmux still running | SSH timeout during poll wait | Benchmark is fine — tmux session survives. Check results via SSH directly | + +**If diagnosis reveals an issue**, fix it and confirm with the user before relaunching. Do not silently retry. 
+ ## Output Directory Structure ``` From 86e373008c022263ced75511042a4e3447eb5707 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 20:56:17 -0800 Subject: [PATCH 10/22] Add check-status.sh for single-command VM status checks - New script checks tmux session, results directories (with per-run status), git state, build status, and optionally system resources - Supports --run-name for run-specific details (monitor samples, metrics, disk usage) and --verbose for system resource info - Updated SKILL.md to reference check-status.sh in monitoring and troubleshooting sections - Fix SSH stdin consumption in while-read loop with -n flag Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/cosmos-benchmark-run/SKILL.md | 49 +++---- .../scripts/check-status.sh | 137 ++++++++++++++++++ 2 files changed, 157 insertions(+), 29 deletions(-) create mode 100755 sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/check-status.sh diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md index 05b32fd78fe1..2087c797b2af 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md @@ -122,27 +122,29 @@ The benchmark itself runs in a **tmux session** (`bench`) on the VM, so it survi ### Monitor progress -**Local orchestrator output** (shows which ref is running, build progress): - -Use `read_bash` with the shellId from the async launch to check the latest output. 
- -**VM-side benchmark output** (real-time metrics, live logs): +**Quick status check** — use the `check-status.sh` script: ```bash -SSH_CMD="ssh -i $(cat $CONFIG_DIR/vm-key) $(cat $CONFIG_DIR/vm-user)@$(cat $CONFIG_DIR/vm-ip)" +bash scripts/check-status.sh --config-dir "$CONFIG_DIR" -# Peek at live benchmark output in the tmux session -$SSH_CMD "tmux capture-pane -t bench -p | tail -30" +# With run-specific details +bash scripts/check-status.sh --config-dir "$CONFIG_DIR" --run-name 20260303-SIMPLE-main -# Check if tmux session is still running -$SSH_CMD "tmux has-session -t bench 2>/dev/null && echo 'Running' || echo 'Finished'" +# With system resource info +bash scripts/check-status.sh --config-dir "$CONFIG_DIR" --verbose +``` -# Check monitor.csv row count (grows every 60s) -$SSH_CMD "wc -l ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results//monitor.csv" +The script checks in a single call: +- Tmux session status (running / completed) with latest output +- Results directories with per-run status (in progress / completed / failed) +- Git state (branch, commit) +- Build status (JAR present) +- Run-specific details when `--run-name` is provided (monitor samples, metrics, disk) +- System resources when `--verbose` is set (disk, memory, load) -# Attach to live session (interactive — for debugging only) -$SSH_CMD -t "tmux attach -t bench" -``` +**Local orchestrator output** (shows which ref is currently running): + +Use `read_bash` with the shellId from the async launch. ### Early exit detection and troubleshooting @@ -150,22 +152,10 @@ After launching the orchestrator in async mode, **proactively verify** that the **Detection**: When checking status via `read_bash`, if the shell has already exited or accepts new commands, investigate immediately — do not assume success. 
-**Diagnosis steps** (run on the VM via SSH): +**Diagnosis**: Run `check-status.sh` which checks all of the above in one call: ```bash -SSH_CMD="ssh -i $(cat $CONFIG_DIR/vm-key) $(cat $CONFIG_DIR/vm-user)@$(cat $CONFIG_DIR/vm-ip)" - -# 1. Check if any new results directories were created today -$SSH_CMD "ls -lt ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/ | head -5" - -# 2. Check git state (last checkout, remotes available) -$SSH_CMD "cd ~/azure-sdk-for-java && git log --oneline -1 && git remote -v" - -# 3. Check if the benchmark JAR exists -$SSH_CMD "ls ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/target/*jar-with-dependencies.jar 2>/dev/null || echo 'JAR not found — build may have failed'" - -# 4. Check tmux session -$SSH_CMD "tmux has-session -t bench 2>/dev/null && echo 'Running' || echo 'No session'" +bash scripts/check-status.sh --config-dir "$CONFIG_DIR" --verbose ``` **Common failures and fixes**: @@ -204,6 +194,7 @@ Suggest using the **cosmos-benchmark-analyze** skill to download and analyze res | `scripts/run-benchmark.sh` | Execute benchmark with monitoring (git metadata, GC log, monitor.csv). | | `scripts/monitor.sh` | External JVM monitoring (spawned by run-benchmark.sh). | | `scripts/capture-diagnostics.sh` | Capture thread/heap dumps and JFR recordings during a live run. | +| `scripts/check-status.sh` | **Status checker.** Tmux, results, git, build, system resources — all in one call. | | `references/tenants-sample.json` | Template for tenants.json structure. | | `references/presets.md` | Preset flag recipes (SIMPLE, EXPAND, CHURN). | | `references/scenarios.md` | Full operation catalog (20+ types). 
| diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/check-status.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/check-status.sh new file mode 100755 index 000000000000..b95bbd39b3aa --- /dev/null +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/check-status.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# check-status.sh — Check benchmark status on the VM +# +# Usage: +# ./check-status.sh --config-dir [--run-name ] [--verbose] + +set -uo pipefail + +CONFIG_DIR="" +RUN_NAME="" +VERBOSE=false + +while [[ $# -gt 0 ]]; do + case $1 in + --config-dir) CONFIG_DIR="$2"; shift 2 ;; + --run-name) RUN_NAME="$2"; shift 2 ;; + --verbose) VERBOSE=true; shift ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +if [[ -z "$CONFIG_DIR" ]]; then + echo "Usage: $0 --config-dir [--run-name ] [--verbose]" >&2 + exit 1 +fi + +VM_IP=$(cat "$CONFIG_DIR/vm-ip") +VM_USER=$(cat "$CONFIG_DIR/vm-user") +VM_KEY=$(cat "$CONFIG_DIR/vm-key") +SSH_CMD="ssh -i $VM_KEY -o StrictHostKeyChecking=no -o ConnectTimeout=10 $VM_USER@$VM_IP" + +BENCH_DIR="~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark" + +echo "=== Benchmark Status ===" +echo " VM: $VM_USER@$VM_IP" +echo "" + +# 1. tmux session +echo "--- Tmux Session ---" +TMUX_STATUS=$($SSH_CMD "tmux has-session -t bench 2>/dev/null && echo 'RUNNING' || echo 'NONE'" 2>/dev/null) +if [[ "$TMUX_STATUS" == "RUNNING" ]]; then + echo " Status: ✅ Running" + echo "" + echo " Latest output:" + $SSH_CMD "tmux capture-pane -t bench -p | tail -15" 2>/dev/null | sed 's/^/ /' +else + echo " Status: ⏹️ No active session (build phase or completed)" +fi +echo "" + +# 2. 
Results directories +echo "--- Results ---" +RESULTS=$($SSH_CMD "ls -lt $BENCH_DIR/results/ 2>/dev/null | grep '^d' | head -5" 2>/dev/null) +if [[ -n "$RESULTS" ]]; then + echo "$RESULTS" | while read -r line; do + DIR_NAME=$(echo "$line" | awk '{print $NF}') + DIR_TIME=$(echo "$line" | awk '{print $6, $7, $8}') + # Check for exit code file + EXIT_FILE=$($SSH_CMD -n "cat $BENCH_DIR/results/$DIR_NAME/.exit-code 2>/dev/null" 2>/dev/null) + if [[ -n "$EXIT_FILE" ]]; then + if [[ "$EXIT_FILE" == "0" ]]; then + STATUS="✅ Completed" + else + STATUS="❌ Failed (exit=$EXIT_FILE)" + fi + else + # Check if monitor.csv exists and is growing + MONITOR_LINES=$($SSH_CMD -n "wc -l < $BENCH_DIR/results/$DIR_NAME/monitor.csv 2>/dev/null" 2>/dev/null | tr -d ' ') + if [[ -n "$MONITOR_LINES" && "$MONITOR_LINES" -gt 0 ]]; then + STATUS="🔄 In progress (monitor: ${MONITOR_LINES} samples)" + else + STATUS="⏳ Starting or build-only" + fi + fi + echo " $DIR_NAME ($DIR_TIME) $STATUS" + done +else + echo " No results directories found" +fi +echo "" + +# 3. Git state +echo "--- Git ---" +$SSH_CMD "cd ~/azure-sdk-for-java && echo \" Branch: \$(git rev-parse --abbrev-ref HEAD 2>/dev/null)\" && echo \" Commit: \$(git log --oneline -1 2>/dev/null)\"" 2>/dev/null +echo "" + +# 4. 
Run-specific details +if [[ -n "$RUN_NAME" ]]; then + echo "--- Run: $RUN_NAME ---" + RUN_DIR="$BENCH_DIR/results/$RUN_NAME" + + # git-info.json + GIT_INFO=$($SSH_CMD "cat $RUN_DIR/git-info.json 2>/dev/null" 2>/dev/null) + if [[ -n "$GIT_INFO" ]]; then + echo " Git info: $GIT_INFO" + fi + + # Monitor CSV + MONITOR_LINES=$($SSH_CMD "wc -l < $RUN_DIR/monitor.csv 2>/dev/null" 2>/dev/null | tr -d ' ') + if [[ -n "$MONITOR_LINES" && "$MONITOR_LINES" -gt 0 ]]; then + echo " Monitor samples: $MONITOR_LINES" + if [[ "$VERBOSE" == "true" ]]; then + echo " Last 3 monitor entries:" + $SSH_CMD "tail -3 $RUN_DIR/monitor.csv 2>/dev/null" 2>/dev/null | sed 's/^/ /' + fi + fi + + # Metrics files + METRIC_COUNT=$($SSH_CMD "ls $RUN_DIR/metrics/*.csv 2>/dev/null | wc -l" 2>/dev/null | tr -d ' ') + if [[ -n "$METRIC_COUNT" && "$METRIC_COUNT" -gt 0 ]]; then + echo " Metric files: $METRIC_COUNT" + fi + + # Disk usage + DISK=$($SSH_CMD "du -sh $RUN_DIR 2>/dev/null" 2>/dev/null | awk '{print $1}') + if [[ -n "$DISK" ]]; then + echo " Disk usage: $DISK" + fi + echo "" +fi + +# 5. JAR status +echo "--- Build ---" +JAR=$($SSH_CMD "ls $BENCH_DIR/target/*jar-with-dependencies.jar 2>/dev/null | head -1" 2>/dev/null) +if [[ -n "$JAR" ]]; then + echo " JAR: ✅ $(basename $JAR)" +else + echo " JAR: ❌ Not found (build may be in progress or failed)" +fi +echo "" + +# 6. 
System resources (verbose only) +if [[ "$VERBOSE" == "true" ]]; then + echo "--- System ---" + $SSH_CMD "echo \" Disk: \$(df -h / | tail -1 | awk '{print \$4}') available\" && echo \" Memory: \$(free -h 2>/dev/null | awk '/Mem:/{print \$3\"/\"\$2}' || echo 'N/A')\" && echo \" Load: \$(uptime | sed 's/.*load average/load average/')\"" 2>/dev/null + echo "" +fi From 38d43689aacd03871767e9c3a6e28b94a6e97851 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 21:05:46 -0800 Subject: [PATCH 11/22] Copy scripts to VM via SCP instead of stdin piping - SCP vm-prepare-and-run.sh, run-benchmark.sh, monitor.sh, and capture-diagnostics.sh to ~/benchmark-scripts/ on the VM - Execute remotely via 'bash ~/benchmark-scripts/vm-prepare-and-run.sh' instead of 'bash -s' stdin piping which broke heredocs - Update vm-prepare-and-run.sh to reference co-located scripts from ~/benchmark-scripts/ in the tmux run script Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../scripts/run-all-refs.sh | 23 +++++++++++++++---- .../scripts/vm-prepare-and-run.sh | 5 ++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh index 451fee359b11..7c2e7eb223f5 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh @@ -4,8 +4,8 @@ # Usage: # ./run-all-refs.sh --config-dir --refs "main,fix/leak" [--scenario SIMPLE] # -# For each ref, runs vm-prepare-and-run.sh on the VM via a single SSH session -# (checkout → build → verify → run — all in one connection). +# Copies scripts to the VM via SCP, then for each ref executes +# vm-prepare-and-run.sh remotely (checkout → build → verify → run). 
set -uo pipefail @@ -37,6 +37,19 @@ VM_IP=$(cat "$CONFIG_DIR/vm-ip") VM_USER=$(cat "$CONFIG_DIR/vm-user") VM_KEY=$(cat "$CONFIG_DIR/vm-key") SSH_CMD="ssh -i $VM_KEY -o StrictHostKeyChecking=no $VM_USER@$VM_IP" +SCP_CMD="scp -i $VM_KEY -o StrictHostKeyChecking=no" + +# Copy scripts to VM (avoids stdin piping issues with heredocs) +VM_SCRIPTS_DIR="~/benchmark-scripts" +$SSH_CMD "mkdir -p $VM_SCRIPTS_DIR" +for SCRIPT_FILE in vm-prepare-and-run.sh run-benchmark.sh monitor.sh capture-diagnostics.sh; do + if [[ -f "$SCRIPT_DIR/$SCRIPT_FILE" ]]; then + $SCP_CMD "$SCRIPT_DIR/$SCRIPT_FILE" "$VM_USER@$VM_IP:$VM_SCRIPTS_DIR/$SCRIPT_FILE" + fi +done +$SSH_CMD "chmod +x $VM_SCRIPTS_DIR/*.sh" +echo "Scripts copied to VM:$VM_SCRIPTS_DIR" +echo "" IFS=',' read -ra REFS <<< "$REFS_CSV" TOTAL=${#REFS[@]} @@ -65,9 +78,9 @@ for i in "${!REFS[@]}"; do echo "$SEQ Starting: $REF → $RUN_NAME" echo " (single SSH session: checkout → build → verify → run)" - # Send vm-prepare-and-run.sh to VM and execute — 1 SSH session per ref - $SSH_CMD "bash -s" < "$SCRIPT_DIR/vm-prepare-and-run.sh" \ - -- "$REF" "$SCENARIO" "$TENANTS_FILE" "$RUN_NAME" $EXTRA_FLAGS + # Execute the script already on the VM (copied via SCP at startup) + $SSH_CMD "bash $VM_SCRIPTS_DIR/vm-prepare-and-run.sh \ + '$REF' '$SCENARIO' '$TENANTS_FILE' '$RUN_NAME' $EXTRA_FLAGS" RUN_EXIT=$? 
if [[ $RUN_EXIT -eq 0 ]]; then diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh index 02a8e4b292b0..472b8d3d1f42 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh @@ -150,8 +150,9 @@ cat > "$RESULTS_DIR/.run.sh" < Date: Mon, 2 Mar 2026 21:08:48 -0800 Subject: [PATCH 12/22] Use repo scripts after checkout, SCP only the bootstrapper - run-all-refs.sh now only SCPs vm-prepare-and-run.sh (the bootstrapper) instead of all 4 scripts - After checkout, vm-prepare-and-run.sh resolves scripts from the cloned repo (copilot/skills/.../scripts/) so they match the ref being benchmarked - Falls back to ~/benchmark-scripts/ if the repo doesn't include the scripts yet (e.g., older branches) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../cosmos-benchmark-run/scripts/run-all-refs.sh | 14 +++++++------- .../scripts/vm-prepare-and-run.sh | 16 +++++++++++++--- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh index 7c2e7eb223f5..b33245a9444b 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh @@ -39,16 +39,16 @@ VM_KEY=$(cat "$CONFIG_DIR/vm-key") SSH_CMD="ssh -i $VM_KEY -o StrictHostKeyChecking=no $VM_USER@$VM_IP" SCP_CMD="scp -i $VM_KEY -o StrictHostKeyChecking=no" -# Copy scripts to VM (avoids stdin piping issues with heredocs) +# Copy bootstrapper to VM (avoids stdin piping 
issues with heredocs). +# Only vm-prepare-and-run.sh is copied here — after checkout, the remaining +# scripts (run-benchmark.sh, monitor.sh, etc.) are resolved from the cloned +# repo so they match the ref being benchmarked. Falls back to ~/benchmark-scripts/ +# if the repo version doesn't include them yet. VM_SCRIPTS_DIR="~/benchmark-scripts" $SSH_CMD "mkdir -p $VM_SCRIPTS_DIR" -for SCRIPT_FILE in vm-prepare-and-run.sh run-benchmark.sh monitor.sh capture-diagnostics.sh; do - if [[ -f "$SCRIPT_DIR/$SCRIPT_FILE" ]]; then - $SCP_CMD "$SCRIPT_DIR/$SCRIPT_FILE" "$VM_USER@$VM_IP:$VM_SCRIPTS_DIR/$SCRIPT_FILE" - fi -done +$SCP_CMD "$SCRIPT_DIR/vm-prepare-and-run.sh" "$VM_USER@$VM_IP:$VM_SCRIPTS_DIR/vm-prepare-and-run.sh" $SSH_CMD "chmod +x $VM_SCRIPTS_DIR/*.sh" -echo "Scripts copied to VM:$VM_SCRIPTS_DIR" +echo "Bootstrapper copied to VM:$VM_SCRIPTS_DIR/vm-prepare-and-run.sh" echo "" IFS=',' read -ra REFS <<< "$REFS_CSV" diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh index 472b8d3d1f42..84b978a9d7bf 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh @@ -80,6 +80,17 @@ BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "detached") COMMIT=$(git rev-parse --short HEAD) echo "Checked out: $BRANCH @ $COMMIT" +# Resolve script directory: prefer repo scripts (match the ref), fall back to ~/benchmark-scripts +REPO_SCRIPTS_DIR="$BENCH_DIR/copilot/skills/cosmos-benchmark-run/scripts" +FALLBACK_SCRIPTS_DIR=~/benchmark-scripts +if [[ -d "$REPO_SCRIPTS_DIR" ]]; then + VM_SCRIPTS_DIR="$REPO_SCRIPTS_DIR" + echo "Using repo scripts: $VM_SCRIPTS_DIR" +else + VM_SCRIPTS_DIR="$FALLBACK_SCRIPTS_DIR" + echo "Using fallback scripts: $VM_SCRIPTS_DIR" +fi 
+ # --- Step 2: Build --- echo "" echo "=== [2/4] Build ===" @@ -150,9 +161,8 @@ cat > "$RESULTS_DIR/.run.sh" < Date: Mon, 2 Mar 2026 21:11:56 -0800 Subject: [PATCH 13/22] Add --force-copy-scripts flag for testing local script changes - run-all-refs.sh: --force-copy-scripts copies ALL scripts to VM (not just the bootstrapper) and passes --force-scripts to the bootstrapper - vm-prepare-and-run.sh: --force-scripts overrides repo-first resolution, using ~/benchmark-scripts/ (the SCP'd copies) instead - Default behavior unchanged: repo scripts used after checkout Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../scripts/run-all-refs.sh | 33 ++++++++++++++----- .../scripts/vm-prepare-and-run.sh | 23 +++++++++++-- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh index b33245a9444b..bd9d4d7a7bac 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh @@ -3,6 +3,7 @@ # # Usage: # ./run-all-refs.sh --config-dir --refs "main,fix/leak" [--scenario SIMPLE] +# ./run-all-refs.sh --config-dir --refs "main" --force-copy-scripts # # Copies scripts to the VM via SCP, then for each ref executes # vm-prepare-and-run.sh remotely (checkout → build → verify → run). 
@@ -16,6 +17,7 @@ SCENARIO="SIMPLE" REFS_CSV="" TENANTS_FILE="~/tenants.json" EXTRA_FLAGS="" +FORCE_COPY_SCRIPTS=false while [[ $# -gt 0 ]]; do case $1 in @@ -24,6 +26,7 @@ while [[ $# -gt 0 ]]; do --refs) REFS_CSV="$2"; shift 2 ;; --tenants-file) TENANTS_FILE="$2"; shift 2 ;; --extra-flags) EXTRA_FLAGS="$2"; shift 2 ;; + --force-copy-scripts) FORCE_COPY_SCRIPTS=true; shift ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done @@ -39,16 +42,26 @@ VM_KEY=$(cat "$CONFIG_DIR/vm-key") SSH_CMD="ssh -i $VM_KEY -o StrictHostKeyChecking=no $VM_USER@$VM_IP" SCP_CMD="scp -i $VM_KEY -o StrictHostKeyChecking=no" -# Copy bootstrapper to VM (avoids stdin piping issues with heredocs). -# Only vm-prepare-and-run.sh is copied here — after checkout, the remaining -# scripts (run-benchmark.sh, monitor.sh, etc.) are resolved from the cloned -# repo so they match the ref being benchmarked. Falls back to ~/benchmark-scripts/ -# if the repo version doesn't include them yet. +# Copy scripts to VM. +# Default: only the bootstrapper (vm-prepare-and-run.sh) is copied; after +# checkout, remaining scripts are resolved from the cloned repo. +# --force-copy-scripts: copies ALL scripts and tells the bootstrapper to +# prefer ~/benchmark-scripts/ over repo versions (for testing local changes). 
VM_SCRIPTS_DIR="~/benchmark-scripts" $SSH_CMD "mkdir -p $VM_SCRIPTS_DIR" -$SCP_CMD "$SCRIPT_DIR/vm-prepare-and-run.sh" "$VM_USER@$VM_IP:$VM_SCRIPTS_DIR/vm-prepare-and-run.sh" -$SSH_CMD "chmod +x $VM_SCRIPTS_DIR/*.sh" -echo "Bootstrapper copied to VM:$VM_SCRIPTS_DIR/vm-prepare-and-run.sh" +if [[ "$FORCE_COPY_SCRIPTS" == "true" ]]; then + for SCRIPT_FILE in vm-prepare-and-run.sh run-benchmark.sh monitor.sh capture-diagnostics.sh; do + if [[ -f "$SCRIPT_DIR/$SCRIPT_FILE" ]]; then + $SCP_CMD "$SCRIPT_DIR/$SCRIPT_FILE" "$VM_USER@$VM_IP:$VM_SCRIPTS_DIR/$SCRIPT_FILE" + fi + done + $SSH_CMD "chmod +x $VM_SCRIPTS_DIR/*.sh" + echo "All scripts copied to VM:$VM_SCRIPTS_DIR (force mode)" +else + $SCP_CMD "$SCRIPT_DIR/vm-prepare-and-run.sh" "$VM_USER@$VM_IP:$VM_SCRIPTS_DIR/vm-prepare-and-run.sh" + $SSH_CMD "chmod +x $VM_SCRIPTS_DIR/*.sh" + echo "Bootstrapper copied to VM:$VM_SCRIPTS_DIR/vm-prepare-and-run.sh" +fi echo "" IFS=',' read -ra REFS <<< "$REFS_CSV" @@ -79,8 +92,10 @@ for i in "${!REFS[@]}"; do echo " (single SSH session: checkout → build → verify → run)" # Execute the script already on the VM (copied via SCP at startup) + FORCE_FLAG="" + [[ "$FORCE_COPY_SCRIPTS" == "true" ]] && FORCE_FLAG="--force-scripts" $SSH_CMD "bash $VM_SCRIPTS_DIR/vm-prepare-and-run.sh \ - '$REF' '$SCENARIO' '$TENANTS_FILE' '$RUN_NAME' $EXTRA_FLAGS" + '$REF' '$SCENARIO' '$TENANTS_FILE' '$RUN_NAME' $FORCE_FLAG $EXTRA_FLAGS" RUN_EXIT=$? 
if [[ $RUN_EXIT -eq 0 ]]; then diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh index 84b978a9d7bf..ac9dc2e0cb4f 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh @@ -14,7 +14,19 @@ set -uo pipefail -REF="${1:?Usage: $0 [extra-flags...]}" +# Extract --force-scripts flag if present (can appear anywhere in args) +FORCE_SCRIPTS=false +ARGS=() +for arg in "$@"; do + if [[ "$arg" == "--force-scripts" ]]; then + FORCE_SCRIPTS=true + else + ARGS+=("$arg") + fi +done +set -- "${ARGS[@]}" + +REF="${1:?Usage: $0 [--force-scripts] [extra-flags...]}" SCENARIO="${2:-SIMPLE}" TENANTS_FILE="${3:-~/tenants.json}" RUN_NAME="${4:-$(date +%Y%m%d)-${SCENARIO}-run}" @@ -80,10 +92,15 @@ BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "detached") COMMIT=$(git rev-parse --short HEAD) echo "Checked out: $BRANCH @ $COMMIT" -# Resolve script directory: prefer repo scripts (match the ref), fall back to ~/benchmark-scripts +# Resolve script directory: +# --force-scripts → use ~/benchmark-scripts/ (for testing local script changes) +# default → prefer repo scripts (match the ref), fall back to ~/benchmark-scripts/ REPO_SCRIPTS_DIR="$BENCH_DIR/copilot/skills/cosmos-benchmark-run/scripts" FALLBACK_SCRIPTS_DIR=~/benchmark-scripts -if [[ -d "$REPO_SCRIPTS_DIR" ]]; then +if [[ "$FORCE_SCRIPTS" == "true" ]]; then + VM_SCRIPTS_DIR="$FALLBACK_SCRIPTS_DIR" + echo "Using forced scripts: $VM_SCRIPTS_DIR (--force-scripts)" +elif [[ -d "$REPO_SCRIPTS_DIR" ]]; then VM_SCRIPTS_DIR="$REPO_SCRIPTS_DIR" echo "Using repo scripts: $VM_SCRIPTS_DIR" else From 745bfcde04e0a63c61c6e6bcaf752517e1b7330e Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 21:21:01 
-0800 Subject: [PATCH 14/22] Wrap entire pipeline in tmux, not just the benchmark step - run-all-refs.sh now starts vm-prepare-and-run.sh inside a tmux session, so checkout, build, verify AND run all survive SSH disconnection - vm-prepare-and-run.sh Step 4 simplified: runs run-benchmark.sh directly (no nested tmux, no .run.sh heredoc generation) - Polling and exit code logic moved to run-all-refs.sh orchestrator Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../scripts/run-all-refs.sh | 37 ++++++++++-- .../scripts/vm-prepare-and-run.sh | 58 +++++-------------- 2 files changed, 44 insertions(+), 51 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh index bd9d4d7a7bac..0fba4249bb23 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh @@ -88,15 +88,40 @@ for i in "${!REFS[@]}"; do SEQ="[$((i+1))/$TOTAL]" echo "" - echo "$SEQ Starting: $REF → $RUN_NAME" - echo " (single SSH session: checkout → build → verify → run)" + echo "$SEQ Starting: $REF -> $RUN_NAME" + echo " (tmux session: checkout -> build -> verify -> run)" - # Execute the script already on the VM (copied via SCP at startup) FORCE_FLAG="" [[ "$FORCE_COPY_SCRIPTS" == "true" ]] && FORCE_FLAG="--force-scripts" - $SSH_CMD "bash $VM_SCRIPTS_DIR/vm-prepare-and-run.sh \ - '$REF' '$SCENARIO' '$TENANTS_FILE' '$RUN_NAME' $FORCE_FLAG $EXTRA_FLAGS" - RUN_EXIT=$? 
+ BENCH_DIR_VM="~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark" + EXIT_CODE_FILE="$BENCH_DIR_VM/results/$RUN_NAME/.exit-code" + + # End any previous tmux session gracefully + $SSH_CMD "tmux send-keys -t bench C-c 2>/dev/null; sleep 1; tmux send-keys -t bench exit Enter 2>/dev/null; sleep 1" 2>/dev/null || true + + # Start entire pipeline in tmux (all steps survive SSH disconnection) + $SSH_CMD "mkdir -p $BENCH_DIR_VM/results/$RUN_NAME && \ + tmux new-session -d -s bench \ + 'bash $VM_SCRIPTS_DIR/vm-prepare-and-run.sh \ + \"$REF\" \"$SCENARIO\" \"$TENANTS_FILE\" \"$RUN_NAME\" $FORCE_FLAG $EXTRA_FLAGS; \ + echo \$? > $EXIT_CODE_FILE'" + echo "$SEQ tmux session started on VM" + + # Poll until tmux session ends + case "$SCENARIO" in + SIMPLE) POLL_INTERVAL=120 ;; + EXPAND) POLL_INTERVAL=300 ;; + CHURN) POLL_INTERVAL=300 ;; + *) POLL_INTERVAL=120 ;; + esac + echo "$SEQ Polling every ${POLL_INTERVAL}s..." + while $SSH_CMD "tmux has-session -t bench 2>/dev/null" 2>/dev/null; do + sleep $POLL_INTERVAL + done + + # Read exit code from the VM + RUN_EXIT=$($SSH_CMD "cat $EXIT_CODE_FILE 2>/dev/null || echo 1" 2>/dev/null) + RUN_EXIT=$(echo "$RUN_EXIT" | tr -d '[:space:]') if [[ $RUN_EXIT -eq 0 ]]; then echo "$SEQ ✅ Completed: $REF → results/$RUN_NAME" diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh index ac9dc2e0cb4f..a2be23534844 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/vm-prepare-and-run.sh @@ -159,63 +159,31 @@ if [[ "$READY" != "true" ]]; then exit 1 fi -# --- Step 4: Run in tmux (survives SSH disconnection) --- +# --- Step 4: Run --- +# The entire vm-prepare-and-run.sh runs inside a tmux session started by +# run-all-refs.sh, so all steps 
(checkout, build, verify, run) survive +# SSH disconnection. No nested tmux needed here. echo "" -echo "=== [4/4] Run: $SCENARIO (tmux session: bench) ===" +echo "=== [4/4] Run: $SCENARIO ===" cd "$BENCH_DIR" RESULTS_DIR="./results/$RUN_NAME" mkdir -p "$RESULTS_DIR" -# End any previous benchmark tmux session gracefully -tmux send-keys -t bench C-c 2>/dev/null || true -sleep 1 -tmux send-keys -t bench "exit" Enter 2>/dev/null || true -sleep 1 - -# Write run script with resolved paths (executed inside tmux) -cat > "$RESULTS_DIR/.run.sh" </dev/null | head -1) - java -Xmx8g -Xms8g -XX:+UseG1GC \\ - -Xlog:gc*:"$RESULTS_DIR/gc.log" \\ - -jar "\$JAR" \\ - -tenantsFile "$TENANTS_RESOLVED" \\ - -reportingDirectory "$RESULTS_DIR/metrics" \\ + java -Xmx8g -Xms8g -XX:+UseG1GC \ + -Xlog:gc*:"$RESULTS_DIR/gc.log" \ + -jar "$JAR" \ + -tenantsFile "$TENANTS_RESOLVED" \ + -reportingDirectory "$RESULTS_DIR/metrics" \ 2>&1 | tee "$RESULTS_DIR/benchmark.log" + BENCH_EXIT=$? fi -echo \$? > "$RESULTS_DIR/.exit-code" -EOF -chmod +x "$RESULTS_DIR/.run.sh" - -# Start benchmark in tmux -- process persists even if SSH disconnects -tmux new-session -d -s bench "bash '$RESULTS_DIR/.run.sh'" -echo " Benchmark running in tmux session 'bench'" -echo " Monitor: tmux capture-pane -t bench -p | tail -30" - -# Poll interval based on scenario duration (SIMPLE ~30min, EXPAND ~90min, CHURN varies) -case "$SCENARIO" in - SIMPLE) POLL_INTERVAL=120 ;; # 2 min - EXPAND) POLL_INTERVAL=300 ;; # 5 min - CHURN) POLL_INTERVAL=300 ;; # 5 min - *) POLL_INTERVAL=120 ;; # 2 min default -esac - -# Wait for tmux session to complete -echo " Poll interval: ${POLL_INTERVAL}s" -while tmux has-session -t bench 2>/dev/null; do - sleep $POLL_INTERVAL -done - -# Read exit code written by the run script -BENCH_EXIT=$(cat "$RESULTS_DIR/.exit-code" 2>/dev/null || echo 1) if [[ "$BENCH_EXIT" -eq 0 ]]; then echo "" From 2b103807dd225f5fb4bd9b6e5567d9d329243318 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 
21:28:09 -0800 Subject: [PATCH 15/22] Fix tmux launch quoting with launcher script approach - Write a small /tmp/bench-launch.sh on the VM that wraps vm-prepare-and-run.sh and writes the exit code - Avoids nested quoting issues (SSH -> tmux -> bash -> args) - Fix stale EXIT_CODE_FILE variable reference Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../scripts/run-all-refs.sh | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh index 0fba4249bb23..d2a8826c267f 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh @@ -94,17 +94,22 @@ for i in "${!REFS[@]}"; do FORCE_FLAG="" [[ "$FORCE_COPY_SCRIPTS" == "true" ]] && FORCE_FLAG="--force-scripts" BENCH_DIR_VM="~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark" - EXIT_CODE_FILE="$BENCH_DIR_VM/results/$RUN_NAME/.exit-code" # End any previous tmux session gracefully - $SSH_CMD "tmux send-keys -t bench C-c 2>/dev/null; sleep 1; tmux send-keys -t bench exit Enter 2>/dev/null; sleep 1" 2>/dev/null || true + $SSH_CMD 'tmux send-keys -t bench C-c 2>/dev/null; sleep 1; tmux send-keys -t bench exit Enter 2>/dev/null; sleep 1' 2>/dev/null || true + + # Write a small launcher script on the VM to avoid nested quoting issues + $SSH_CMD "cat > /tmp/bench-launch.sh << 'LAUNCHER' +#!/bin/bash +bash ~/benchmark-scripts/vm-prepare-and-run.sh "\$@" +EXIT_CODE=\$? 
+echo \$EXIT_CODE > ~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark/results/\$4/.exit-code +exit \$EXIT_CODE +LAUNCHER +chmod +x /tmp/bench-launch.sh" # Start entire pipeline in tmux (all steps survive SSH disconnection) - $SSH_CMD "mkdir -p $BENCH_DIR_VM/results/$RUN_NAME && \ - tmux new-session -d -s bench \ - 'bash $VM_SCRIPTS_DIR/vm-prepare-and-run.sh \ - \"$REF\" \"$SCENARIO\" \"$TENANTS_FILE\" \"$RUN_NAME\" $FORCE_FLAG $EXTRA_FLAGS; \ - echo \$? > $EXIT_CODE_FILE'" + $SSH_CMD "mkdir -p $BENCH_DIR_VM/results/$RUN_NAME && tmux new-session -d -s bench 'bash /tmp/bench-launch.sh $REF $SCENARIO $TENANTS_FILE $RUN_NAME $FORCE_FLAG $EXTRA_FLAGS'" echo "$SEQ tmux session started on VM" # Poll until tmux session ends @@ -120,7 +125,7 @@ for i in "${!REFS[@]}"; do done # Read exit code from the VM - RUN_EXIT=$($SSH_CMD "cat $EXIT_CODE_FILE 2>/dev/null || echo 1" 2>/dev/null) + RUN_EXIT=$($SSH_CMD "cat $BENCH_DIR_VM/results/$RUN_NAME/.exit-code 2>/dev/null || echo 1" 2>/dev/null) RUN_EXIT=$(echo "$RUN_EXIT" | tr -d '[:space:]') if [[ $RUN_EXIT -eq 0 ]]; then From b538c4853b08b1b3f112236d76e59131b98b1829 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 21:33:29 -0800 Subject: [PATCH 16/22] Fix tilde expansion in BENCH_DIR_VM path Use $HOME instead of ~ in double-quoted string to ensure correct path expansion when interpolated into SSH commands. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh index d2a8826c267f..1338aa9a2c31 100755 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-all-refs.sh @@ -93,7 +93,7 @@ for i in "${!REFS[@]}"; do FORCE_FLAG="" [[ "$FORCE_COPY_SCRIPTS" == "true" ]] && FORCE_FLAG="--force-scripts" - BENCH_DIR_VM="~/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark" + BENCH_DIR_VM="\$HOME/azure-sdk-for-java/sdk/cosmos/azure-cosmos-benchmark" # End any previous tmux session gracefully $SSH_CMD 'tmux send-keys -t bench C-c 2>/dev/null; sleep 1; tmux send-keys -t bench exit Enter 2>/dev/null; sleep 1' 2>/dev/null || true From d2fe7bb39b8d0d1175a00c298526616dee729feb Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 21:37:58 -0800 Subject: [PATCH 17/22] Fix MODULE_DIR in run-benchmark.sh for non-repo script locations When run-benchmark.sh is executed from ~/benchmark-scripts/ (SCP'd copy), SCRIPT_DIR/../ doesn't point to the benchmark module. Fall back to PWD if the script's parent doesn't contain a target/ dir. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/cosmos-benchmark-run/scripts/run-benchmark.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh index 6039a1975d34..e10846df147f 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh @@ -18,7 +18,13 @@ EXTRA_ARGS="$*" mkdir -p "$OUTPUT_DIR" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# MODULE_DIR: use the script's parent if it contains a target/ dir (repo layout), +# otherwise use the current working directory (caller is expected to cd to the +# benchmark module before invoking this script). MODULE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +if [[ ! 
-d "$MODULE_DIR/target" && -d "$PWD/target" ]]; then + MODULE_DIR="$PWD" +fi # Git metadata BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") From 33bf001ff864cdb929494a6c87479d69557d157b Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 21:42:26 -0800 Subject: [PATCH 18/22] Fix benchmark main class to com.azure.cosmos.benchmark.Main Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/cosmos-benchmark-run/scripts/run-benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh index e10846df147f..53fbdbb19f33 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh @@ -58,7 +58,7 @@ JVM_OPTS="-Xmx8g -Xms8g -XX:+UseG1GC -XX:MaxDirectMemorySize=2g \ # Start benchmark process (not piped, so we get the real PID) java $JVM_OPTS \ -cp "$BENCHMARK_JAR" \ - com.azure.cosmos.benchmark.BenchmarkOrchestrator \ + com.azure.cosmos.benchmark.Main \ --tenantsFile "$TENANTS_FILE" \ --scenario "$SCENARIO" \ --outputDir "$OUTPUT_DIR" \ From 4278f1849fdedba239e17cb79d370d407fe6874f Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 21:59:53 -0800 Subject: [PATCH 19/22] Fix run-benchmark.sh: use single-dash JCommander flags, add -reportingDirectory - --tenantsFile -> -tenantsFile (JCommander uses single dash) - Remove --scenario and --outputDir (not valid Configuration params) - Add -reportingDirectory for CSV metrics output Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/cosmos-benchmark-run/scripts/run-benchmark.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh index 53fbdbb19f33..909603e68f09 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/scripts/run-benchmark.sh @@ -59,9 +59,8 @@ JVM_OPTS="-Xmx8g -Xms8g -XX:+UseG1GC -XX:MaxDirectMemorySize=2g \ java $JVM_OPTS \ -cp "$BENCHMARK_JAR" \ com.azure.cosmos.benchmark.Main \ - --tenantsFile "$TENANTS_FILE" \ - --scenario "$SCENARIO" \ - --outputDir "$OUTPUT_DIR" \ + -tenantsFile "$TENANTS_FILE" \ + -reportingDirectory "${OUTPUT_DIR}/metrics" \ $EXTRA_ARGS \ > >(tee "${OUTPUT_DIR}/benchmark.log") 2>&1 & JAVA_PID=$! From 24ad95367ba87c253208a1f0149e35bdbb7e6cf6 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 22:08:26 -0800 Subject: [PATCH 20/22] Make post-launch verification mandatory in run skill Replace fire-and-forget async launch with a two-step workflow: Step A: Launch orchestrator with sync mode (initial_wait: 60) Step B: Mandatory verify via check-status.sh within 90s Prevents the agent from telling the user 'it's running' without actually confirming tmux is alive and results directory exists. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/cosmos-benchmark-run/SKILL.md | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md index 2087c797b2af..79f03f0ea1de 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md +++ b/sdk/cosmos/azure-cosmos-benchmark/copilot/skills/cosmos-benchmark-run/SKILL.md @@ -101,25 +101,37 @@ Runs are named `<date>-<scenario>-<name>`, e.g.: 20260302-CHURN-PR-12345 ``` -### Async execution (non-blocking) +### Launch and verify (required two-step) -**Always run the orchestrator in async mode** so the user can continue working while benchmarks run. Use the `bash` tool with `mode="async"`: +**Step A — Launch the orchestrator** using `mode="sync"` with `initial_wait: 60`. This gives enough time to see script copy + tmux launch output while keeping the shell attached for polling: ```bash -# Launches in background, returns a shellId for monitoring bash scripts/run-all-refs.sh \ --config-dir "$CONFIG_DIR" \ --refs "main, fix/telemetry-leak" \ --scenario SIMPLE ``` -After launching, the user can: +**Step B — Verify the run is progressing** (MANDATORY, do not skip). Within 90 seconds of launch, run `check-status.sh` to confirm the tmux session is alive and a new results directory exists: + +```bash +bash scripts/check-status.sh --config-dir "$CONFIG_DIR" +``` + +**What to check**: +- ✅ Tmux session is "Running" +- ✅ A new results directory matching `<date>-<scenario>-<name>` exists +- ✅ Status is "⏳ Starting or build-only" or "🔄 Running" (not "❌ Failed") + +**If any check fails**, investigate immediately — read `benchmark.log`, diagnose the issue, and report to the user before retrying. Do NOT tell the user "it's running" without verifying.
+ +The benchmark itself runs in a **tmux session** (`bench`) on the VM, so it survives SSH disconnections. Even if the local orchestrator process is interrupted, the benchmark continues on the VM. + +After verification, the user can: - **Continue working** on other tasks in the main context - **Check status** at any time (see Monitor progress below) - **Get notified** when the orchestrator reports completion via `read_bash` -The benchmark itself runs in a **tmux session** (`bench`) on the VM, so it survives SSH disconnections. Even if the local orchestrator process is interrupted, the benchmark continues on the VM. - ### Monitor progress **Quick status check** — use the `check-status.sh` script: @@ -148,7 +160,7 @@ Use `read_bash` with the shellId from the async launch. ### Early exit detection and troubleshooting -After launching the orchestrator in async mode, **proactively verify** that the run is progressing. If the async shell exits within a few minutes (expected runtime is 30–90+ min), the run likely failed early. +The mandatory verify step (Step B above) catches most early failures. Additionally, if the orchestrator shell exits unexpectedly during polling (expected runtime is 30–90+ min), investigate immediately. **Detection**: When checking status via `read_bash`, if the shell has already exited or accepts new commands, investigate immediately — do not assume success. From 3639600fa19217a436fbf46aefb58ad17129e8c3 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 22:21:37 -0800 Subject: [PATCH 21/22] Parallelize benchmark client creation across tenants Previously, createBenchmarks() initialized Cosmos clients sequentially in a for loop. With 50 tenants, each taking ~10-15s (connect + create DB/container + populate docs), initialization alone took ~8-10 minutes. Now submits all tenant initializations to the existing ExecutorService in parallel, collecting results via Future.get(). 
With 50 tenants on a 50-thread pool, initialization completes in ~15-20s instead. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../benchmark/BenchmarkOrchestrator.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java index e8f840f0c481..500154eee34f 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java +++ b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java @@ -166,7 +166,7 @@ private void runLifecycleLoop(BenchmarkConfig config, MetricRegistry registry, logger.info("[LIFECYCLE] CYCLE_START cycle={} timestamp={}", cycle, Instant.now()); // 1. Create clients - List> benchmarks = createBenchmarks(config, registry); + List> benchmarks = createBenchmarks(config, registry, executor); reporter.report(); logger.info("[LIFECYCLE] POST_CREATE cycle={} clients={} timestamp={}", cycle, benchmarks.size(), Instant.now()); @@ -219,11 +219,23 @@ private void runLifecycleLoop(BenchmarkConfig config, MetricRegistry registry, totalCycles, durationSec, Instant.now()); } - private List> createBenchmarks(BenchmarkConfig config, MetricRegistry registry) { - List> benchmarks = new ArrayList<>(); - for (TenantWorkloadConfig tenant : config.getTenantWorkloads()) { - benchmarks.add(createBenchmarkForOperation(tenant, registry)); + private List> createBenchmarks(BenchmarkConfig config, MetricRegistry registry, + ExecutorService executor) { + List tenants = config.getTenantWorkloads(); + List>> futures = new ArrayList<>(tenants.size()); + for (TenantWorkloadConfig tenant : tenants) { + futures.add(executor.submit(() -> createBenchmarkForOperation(tenant, registry))); + } + + List> benchmarks = new 
ArrayList<>(tenants.size()); + for (Future> f : futures) { + try { + benchmarks.add(f.get()); + } catch (Exception e) { + throw new RuntimeException("Failed to create benchmark client", e); + } } + logger.info("Created {} benchmark clients in parallel", benchmarks.size()); return benchmarks; } From aa147ec45bf612917aa6ddb447f7e93cf1372640 Mon Sep 17 00:00:00 2001 From: Annie Liang Date: Mon, 2 Mar 2026 22:22:48 -0800 Subject: [PATCH 22/22] Revert "Parallelize benchmark client creation across tenants" This reverts commit 3639600fa19217a436fbf46aefb58ad17129e8c3. --- .../benchmark/BenchmarkOrchestrator.java | 22 +++++-------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java index 500154eee34f..e8f840f0c481 100644 --- a/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java +++ b/sdk/cosmos/azure-cosmos-benchmark/src/main/java/com/azure/cosmos/benchmark/BenchmarkOrchestrator.java @@ -166,7 +166,7 @@ private void runLifecycleLoop(BenchmarkConfig config, MetricRegistry registry, logger.info("[LIFECYCLE] CYCLE_START cycle={} timestamp={}", cycle, Instant.now()); // 1. 
Create clients - List> benchmarks = createBenchmarks(config, registry, executor); + List> benchmarks = createBenchmarks(config, registry); reporter.report(); logger.info("[LIFECYCLE] POST_CREATE cycle={} clients={} timestamp={}", cycle, benchmarks.size(), Instant.now()); @@ -219,23 +219,11 @@ private void runLifecycleLoop(BenchmarkConfig config, MetricRegistry registry, totalCycles, durationSec, Instant.now()); } - private List> createBenchmarks(BenchmarkConfig config, MetricRegistry registry, - ExecutorService executor) { - List tenants = config.getTenantWorkloads(); - List>> futures = new ArrayList<>(tenants.size()); - for (TenantWorkloadConfig tenant : tenants) { - futures.add(executor.submit(() -> createBenchmarkForOperation(tenant, registry))); - } - - List> benchmarks = new ArrayList<>(tenants.size()); - for (Future> f : futures) { - try { - benchmarks.add(f.get()); - } catch (Exception e) { - throw new RuntimeException("Failed to create benchmark client", e); - } + private List> createBenchmarks(BenchmarkConfig config, MetricRegistry registry) { + List> benchmarks = new ArrayList<>(); + for (TenantWorkloadConfig tenant : config.getTenantWorkloads()) { + benchmarks.add(createBenchmarkForOperation(tenant, registry)); } - logger.info("Created {} benchmark clients in parallel", benchmarks.size()); return benchmarks; }