diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a3afb2f6b..ec8d8667a 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -323,6 +323,21 @@ qwen3.5-fp8-mi355x-sglang-agentic:
       search-space:
       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
+qwen3.5-fp8-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+
 qwen3.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -653,10 +668,6 @@ kimik2.5-fp4-mi355x-vllm:
 # its fixed-seq-len sweep is unaffected.
 #   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0'
 kimik2.5-fp4-mi355x-vllm-agentic:
-  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
-  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
-  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
-  # includes all subsequent ROCm offload work.
   image: vllm/vllm-openai-rocm:v0.21.0
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
@@ -669,16 +680,9 @@ kimik2.5-fp4-mi355x-vllm-agentic:
     - duration: 1800
       search-space:
       - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
-      # CPU offload only above the KV cliff. Lower concurrencies fit
-      # entirely on-GPU, so paying the offload-path overhead there would
-      # just slow them down without measuring anything new.
-      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
-      # TP=4 probe: half-node layout doubles per-GPU weight footprint
-      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
-      # cliff-region concurrencies on both offload modes so we can directly
-      # compare TP=4 vs TP=8 at the same conc points.
+      - { tp: 8, offloading: lmcache,  conc-list: [32, 40, 48, 56] }
       - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
+      - { tp: 4, offloading: lmcache,  conc-list: [16, 24, 32, 40] }
 
 kimik2.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
@@ -701,6 +705,22 @@ kimik2.5-fp4-mi355x-atom:
       - { tp: 8, conc-start: 4, conc-end: 128 }
       - { tp: 4, conc-start: 4, conc-end: 128 }
 
+dsv4-fp4-mi355x-vllm-agentic:
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
+
 minimaxm2.5-fp8-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.21.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -1833,6 +1853,29 @@ dsv4-fp4-mi355x-sglang:
       - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
       - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32 }
 
+# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below;
+# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - scenarios: replaced fixed-seq-len with agentic-coding.
+# Image is identical to the base entry (rocm/sgl-dev DSv4 build).
+# CONC ranges mirror the pre-retune dsv4-fp4-b200-vllm-agentic sweep for
+# cross-hardware comparability. Offload sweep is none-only (SGLang has no
+# equivalent of vLLM's SimpleCPUOffloadConnector path that we exercise on b200).
+dsv4-fp4-mi355x-sglang-agentic:
+  image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
+      - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
+
 # DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
 # nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
 # on 2026-05-05, so any nightly built after that includes the
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f8cc486b2..e77a2916f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1726,7 +1726,7 @@ dsv4-fp4-b200-sglang:
   framework: sglang
   multinode: false
   # Two recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-  # are selected inside benchmarks/single_node/dsv4_fp4_b200.sh by DP_ATTENTION:
+  # are selected inside benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh by DP_ATTENTION:
   #   low-latency  (DP_ATTENTION=false): TP-only, flashinfer_mxfp4
   #   DP-attention  (DP_ATTENTION=true):  DP-attn + DeepEP + mega_moe opts
   # The DP-attention recipe covers both "balanced" (conc 64-128) and
@@ -1781,8 +1781,10 @@ dsv4-fp4-b200-vllm:
 # the original dsv4-fp4-b200-vllm entry is left identical to origin/main so
 # its fixed-seq-len sweep is unaffected.
 #   - runner: 'b200-dsv4' -> 'b200-dgxc'
+#   - image:  bumped to a custom v0.21.0 build (cquil/vllm-openai:v0.21.0-dsv4-offloading)
+#     to test SimpleCPUOffloadConnector lazy_offload behavior on a newer vLLM.
 dsv4-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.0-cu130
+  image: cquil/vllm-openai:v0.21.0-dsv4-offloading
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dgxc
@@ -1793,11 +1795,16 @@ dsv4-fp4-b200-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
-      # cpu offload only this iteration — none entries already validated in
-      # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%).
-      # Re-add when investigating regressions in offload=none.
-      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
+      # no-offload curve against the new cc-traces-weka-no-subagents-051826
+      # dataset (98 traces, v5-only + CC ≥ 2.1.139). cpu-offload entries
+      # removed for this iteration; restore from prior commits if revisiting
+      # offload regressions.
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16] }
+      # Disabled: native vLLM CPU offload with HMA enabled. The benchmark script
+      # sizes the aggregate native offload pool to the same 2.8 TB target used
+      # for the blocked LMCache experiment. Uncomment the entry below to re-enable.
+      # - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [12, 16, 24, 32, 48, 64] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [12, 16, 24, 32, 48, 64] }
 
 dsv4-fp4-b200-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
@@ -2013,7 +2020,7 @@ dsv4-fp4-b300-sglang:
   framework: sglang
   multinode: false
   # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-  # are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
+  # are selected inside benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh by CONC:
   #   low-latency    (CONC <= 32):       TP-only
   #   balanced       (32 < CONC <= 128): + DP-attn
   #   max-throughput (CONC > 128):       + DP-attn
@@ -2039,7 +2046,7 @@ dsv4-fp4-b300-sglang:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 
   # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
-  # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
+  # selected inside benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh by
   # DP_ATTENTION:
   #   dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192
   #                     + EAGLE (3,1,4) + mem-fraction 0.90
@@ -2453,6 +2460,21 @@ qwen3.5-fp8-b300-sglang:
       search-space:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
+qwen3.5-fp8-b300-sglang-agentic-hicache:
+  image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: b300
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+
 qwen3.5-fp4-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/Qwen3.5-397B-A17B-NVFP4
@@ -2677,13 +2699,32 @@ kimik2.5-fp4-b200-vllm:
 # Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below;
 # the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so
 # its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2'
+#   - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.21.0'
 #   - runner: 'b200' -> 'b200-dgxc'
 kimik2.5-fp4-b200-vllm-agentic:
-  # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that
-  # cleared the agentic-coding warmup crash on max_model_len=131072 +
-  # prefix caching.
-  image: vllm/vllm-openai:v0.20.2
+  # v0.21.0 ships a newer huggingface_hub that resolves LFS content correctly
+  # in `hf download` (1.14.0 in v0.20.x silently fetched LFS pointer files,
+  # which pyarrow then choked on with "Missing a name for object member" --
+  # see run 26536606210). v0.20.x's flashinfer fix for the agentic-coding
+  # warmup crash on max_model_len=131072 + prefix caching is included.
+  image: vllm/vllm-openai:v0.21.0
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: b200-dgxc
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
+      # - { tp: 8, ep: 1, offloading: cpu,  conc-list: [16, 24, 32, 36] }
+      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
+      # - { tp: 4, ep: 1, offloading: cpu,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
+
+kimik2.5-fp4-b200-vllm-agentic-lmcache:
+  image: vllm/vllm-openai:v0.21.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b200-dgxc
@@ -2695,9 +2736,9 @@ kimik2.5-fp4-b200-vllm-agentic:
     - duration: 1800
       search-space:
       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
-      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [16, 24, 32, 36] }
+      - { tp: 8, ep: 1, offloading: lmcache,  conc-list: [16, 24, 32, 36] }
       - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
-      - { tp: 4, ep: 1, offloading: cpu,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
+      - { tp: 4, ep: 1, offloading: lmcache,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
 
 # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
 # does not have a B300-specific recipe, so this config reuses the existing
@@ -2775,12 +2816,7 @@ dsr1-fp8-b300-sglang-mtp:
 #   - precision: 'fp8' -> 'fp4'
 #   - framework: 'sglang' -> 'vllm'
 kimik2.5-fp4-b300-vllm-agentic:
-  # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM
-  # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the
-  # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted
-  # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the
-  # INT4 B300 sister already uses successfully.
-  image: vllm/vllm-openai:v0.20.0-cu130
+  image: vllm/vllm-openai:v0.21.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b300
@@ -2793,6 +2829,7 @@ kimik2.5-fp4-b300-vllm-agentic:
       search-space:
       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
       - { tp: 8, ep: 1, offloading: cpu,  conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+      - { tp: 8, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
 
 dsr1-fp8-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14
@@ -3044,12 +3081,13 @@ dsv4-fp4-b300-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
-      # cpu offload only this iteration — none entries already validated in
-      # earlier runs. Re-add when investigating regressions in offload=none.
-      - { tp: 4, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
-      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [128, 256, 512] }
+      # no-offload curve against the new cc-traces-weka-no-subagents-051826
+      # dataset (98 traces, v5-only + CC ≥ 2.1.139). cpu-offload entries
+      # removed for this iteration; restore from prior commits if revisiting
+      # offload regressions.
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
 
 dsv4-fp4-b300-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
@@ -4766,7 +4804,7 @@ minimaxm2.5-fp8-h200-vllm:
 # (either main had none or had a different conc/offload sweep).
 # The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main.
 minimaxm2.5-fp8-h200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
+  image: vllm/vllm-openai:v0.21.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: h200
@@ -8756,6 +8794,153 @@ dsv4-fp4-gb300-dynamo-vllm:
           ep: 16
           dp-attn: true
 
+# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons
+# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to
+# origin/main so its fixed-seq-len sweep is unaffected.
+#   - scenarios: replaced fixed-seq-len with agentic-coding; 1p6d at conc=32 and
+#     conc=192 plus 4p1d at conc=4096 (192/4096 mirror the base entry's sweep).
+#   - additional-settings.CONFIG_FILE: points at the new agentic recipe under
+#     recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh
+#     overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC
+#     branch). Local-overlay pattern mirrors the existing 8k1k overlay.
+dsv4-fp4-gb300-dynamo-vllm-agentic:
+  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  # gb300-nv (not generic gb300) — the generic label is shared by both NV
+  # and CW runner pools, so runs-on: gb300 lets CW runners pick up shards.
+  # The gb300-nv label is on NV runners only (per .github/configs/runners.yaml
+  # + actual runner label listings). Pins agentic to the NVIDIA cluster
+  # for initial validation. Drop -nv suffix to widen later.
+  runner: gb300-nv
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # Low-latency: same 1p6d shape as the mid tier but at much lower conc
+      # (32 vs 192). 32/6 ≈ 5 seqs per decode worker — well below saturation,
+      # so each request gets ~6× the per-request decode compute it would get
+      # at conc=192. Reuses the 1p6d recipe; no separate recipe file needed.
+      - spec-decoding: none
+        conc-list: [32]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml"
+        decode:
+          num-worker: 6
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # Mid: 1 prefill (DEP=4) + 6 decode (TP=4). 7 nodes / 28 GPUs.
+      # Mirrors fixed-seq-len conc=192 entry.
+      - spec-decoding: none
+        conc-list: [192]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml"
+        decode:
+          num-worker: 6
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # High-throughput: 4 prefill (DEP=4 each) + 1 decode (DEP=8). 6 nodes /
+      # 24 GPUs. Smallest 4096-class shape in fixed-seq-len; deep_gemm_mega_moe
+      # on both sides. Mirrors fixed-seq-len conc=4096 entry (4p1d variant).
+      - spec-decoding: none
+        conc-list: [4096]
+        prefill:
+          num-worker: 4
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+# CoreWeave sibling of dsv4-fp4-gb300-dynamo-vllm-agentic — same image,
+# recipes, and search space; only `runner` differs (gb300-cw vs gb300-nv).
+# Kept as a separate config (not a label-widening on the -nv entry)
+# because we dispatch NV and CW as independent sweep runs — bundling
+# both SKUs into one `gh workflow run` invocation lets a fault on one
+# cascade-cancel the other (see prior R20–R23 outages). The two sibling
+# configs share recipe files: launch_gb300-cw.sh applies the same IS_AGENTIC
+# overlay (recipes/vllm/deepseek-v4/agentic/) as launch_gb300-nv.sh, so a
+# change to the recipe applies to both clusters with no duplication.
+dsv4-fp4-gb300-cw-dynamo-vllm-agentic:
+  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb300-cw
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # Low-latency: 1p6d at conc=32.
+      - spec-decoding: none
+        conc-list: [32]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml"
+        decode:
+          num-worker: 6
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # Mid: 1p6d at conc=192.
+      - spec-decoding: none
+        conc-list: [192]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml"
+        decode:
+          num-worker: 6
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # High-throughput: 4p1d at conc=4096.
+      - spec-decoding: none
+        conc-list: [4096]
+        prefill:
+          num-worker: 4
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
 dsv4-fp4-gb300-dynamo-sglang:
   image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -9214,6 +9399,31 @@ qwen3.5-fp8-h100-sglang:
       search-space:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
 
+# Diverged from qwen3.5-fp8-h100-sglang (agentic-coding sibling). Reasons below;
+# the original qwen3.5-fp8-h100-sglang entry stays byte-identical to origin/main
+# so its fixed-seq-len sweep is unaffected.
+#   - scenarios: replaced fixed-seq-len with agentic-coding.
+#   - runner: 'h100' -> 'h100-dgxc' (agentic runs need the dgxc-slurm cluster).
+# Image is identical to the base entry (lmsysorg/sglang:v0.5.12-cu130).
+# CONC range conservative for H100's 80 GB HBM3 under the long-ISL with-
+# subagents corpus. The offloading=none arm stops at conc 16; the hicache arm
+# overlaps it from conc 12 and extends to conc 42. The bench script
+# sets WEKA_LOADER_OVERRIDE to the 256k-capped corpus variant.
+qwen3.5-fp8-h100-sglang-agentic:
+  image: lmsysorg/sglang:v0.5.12-cu130
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: h100-dgxc
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 8, offloading: none,    conc-list: [1, 2, 4, 8, 12, 14, 16] }
+      - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }
+
 qwen3.5-fp8-h100-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
index 27d9a098e..eee8405d0 100644
--- a/.github/configs/runners.yaml
+++ b/.github/configs/runners.yaml
@@ -116,19 +116,19 @@ mi325x-disagg:
 - 'mi325x-amds_07'
 - 'mi325x-amds_08'
 mi355x:
-- 'mi355x-amds_0'
-- 'mi355x-amds_1'
-- 'mi355x-amds_2'
-- 'mi355x-amds_3'
-- 'mi355x-amds_4'
-- 'mi355x-amds_5'
-- 'mi355x-amds_6'
-- 'mi355x-amds_7'
-- 'mi355x-amds_8'
+- 'mi355x-amds_00'
+- 'mi355x-amds_01'
+- 'mi355x-amds_02'
+- 'mi355x-amds_03'
+- 'mi355x-amds_04'
+- 'mi355x-amds_05'
+- 'mi355x-amds_06'
+- 'mi355x-amds_07'
+- 'mi355x-amds_08'
 mi355x-disagg:
-- 'mi355x-amds_6'
-- 'mi355x-amds_7'
-- 'mi355x-amds_8'
+- 'mi355x-amds_06'
+- 'mi355x-amds_07'
+- 'mi355x-amds_08'
 gb200:
 - gb200-nv_0
 - gb200-nv_1
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index f901b1ff7..81727ef39 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -139,7 +139,7 @@ env:
   EVAL_ONLY: ${{ inputs.eval-only }}
   EVAL_CONC: ${{ inputs.eval-conc }}
   SCENARIO_TYPE: ${{ inputs.scenario-type }}
-  SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || '' }}
+  SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
   IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
   CONC: ${{ inputs.conc }}
   DURATION: ${{ inputs.duration }}
@@ -291,8 +291,8 @@ jobs:
             LOGS/agentic/benchmark_command.txt
             LOGS/agentic/workload_distribution_summary.txt
             LOGS/agentic/workload_distribution_plots.png
-            LOGS/agentic/trace_replay/detailed_results.csv
-            LOGS/agentic/trace_replay/debug_trace.jsonl
+            LOGS/agentic/aiperf_artifacts/detailed_results.csv
+            LOGS/agentic/aiperf_artifacts/debug_trace.jsonl
           if-no-files-found: ignore
 
       - name: Upload eval results (if any)
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index cca6031c3..2148def36 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -73,7 +73,7 @@ on:
         type: string
         default: 'fixed-seq-len'
       offloading:
-        description: "KV offload backend for agentic scenarios (none/cpu/ssd)"
+        description: "KV offload backend for agentic scenarios (none/cpu/ssd/lmcache/lmcache-mp/hicache)"
         required: false
         type: string
         default: 'none'
@@ -109,7 +109,7 @@ env:
   RUN_EVAL: ${{ inputs.run-eval }}
   EVAL_ONLY: ${{ inputs.eval-only }}
   SCENARIO_TYPE: ${{ inputs.scenario-type }}
-  SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || '' }}
+  SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }}
   IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }}
   OFFLOADING: ${{ inputs.offloading }}
   TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
@@ -151,7 +151,7 @@ jobs:
           fi
 
           # Cleanup results/ from a prior job on this runner. Agentic jobs
-          # write to fixed subpaths (trace_replay/, metrics_*, etc.), so stale
+          # write to fixed subpaths (aiperf_artifacts/, metrics_*, etc.), so stale
           # data from a previous job would otherwise be picked up as this
           # job's output when replay fails early.
           rm -rf "${{ github.workspace }}/results" 2>/dev/null || true
@@ -226,7 +226,7 @@ jobs:
           path: agg_${{ env.RESULT_FILENAME }}.json
 
       - name: Upload agentic aggregated result
-        if: ${{ inputs.scenario-type == 'agentic-coding' }}
+        if: ${{ always() && inputs.scenario-type == 'agentic-coding' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: bmk_agentic_${{ env.RESULT_FILENAME }}
@@ -239,33 +239,36 @@ jobs:
           name: agentic_${{ env.RESULT_FILENAME }}
           path: |
             results/server.log
+            results/lmcache_server.log
             results/benchmark.log
             results/config.yaml
+            results/lmcache_command.txt
+            results/sglang_command.txt
             results/vllm_command.txt
             results/benchmark_command.txt
             results/workload_distribution_summary.txt
             results/workload_distribution_plots.png
             results/metrics_plots.png
-            results/trace_replay/profile_export.jsonl
-            results/trace_replay/profile_export_aiperf.json
-            results/trace_replay/profile_export_aiperf.csv
-            results/trace_replay/profile_export_aiperf_timeslices.json
-            results/trace_replay/profile_export_aiperf_timeslices.csv
-            results/trace_replay/profile_export_aiperf_aggregate.json
-            results/trace_replay/profile_export_aiperf_aggregate.csv
-            results/trace_replay/profile_export_aiperf_collated.json
-            results/trace_replay/server_metrics_export.json
-            results/trace_replay/server_metrics_export.jsonl
-            results/trace_replay/server_metrics_export.csv
-            results/trace_replay/server_metrics_export.parquet
-            results/trace_replay/gpu_telemetry_export.jsonl
-            results/trace_replay/logs/aiperf.log
-            results/trace_replay/logs/*.log
+            results/aiperf_artifacts/profile_export.jsonl
+            results/aiperf_artifacts/profile_export_aiperf.json
+            results/aiperf_artifacts/profile_export_aiperf.csv
+            results/aiperf_artifacts/profile_export_aiperf_timeslices.json
+            results/aiperf_artifacts/profile_export_aiperf_timeslices.csv
+            results/aiperf_artifacts/profile_export_aiperf_aggregate.json
+            results/aiperf_artifacts/profile_export_aiperf_aggregate.csv
+            results/aiperf_artifacts/profile_export_aiperf_collated.json
+            results/aiperf_artifacts/server_metrics_export.json
+            results/aiperf_artifacts/server_metrics_export.jsonl
+            results/aiperf_artifacts/server_metrics_export.csv
+            results/aiperf_artifacts/server_metrics_export.parquet
+            results/aiperf_artifacts/gpu_telemetry_export.jsonl
+            results/aiperf_artifacts/logs/aiperf.log
+            results/aiperf_artifacts/logs/*.log
           # Excluded by design (multi-GB debug artifacts, not consumed by
-          # post-processing): results/trace_replay/inputs.json (pre-formatted
+          # post-processing): results/aiperf_artifacts/inputs.json (pre-formatted
           # request bodies — the mmap'd binary equivalent is rebuilt from
           # --public-dataset + --random-seed) and
-          # results/trace_replay/profile_export_raw.jsonl (full HTTP bodies
+          # results/aiperf_artifacts/profile_export_raw.jsonl (full HTTP bodies
           # per request — recoverable by re-running the same trace).
           if-no-files-found: ignore
 
@@ -274,7 +277,9 @@ jobs:
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }}
-          path: ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }}
+          path: |
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }}
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }}
           if-no-files-found: ignore
 
       - name: Upload GPU metrics
diff --git a/.gitmodules b/.gitmodules
index 03670a881..fb9b1cc76 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,8 +1,4 @@
-[submodule "utils/trace-replay"]
-	path = utils/trace-replay
-	url = https://github.com/callanjfox/kv-cache-tester.git
-	branch = agentx-minimized
 [submodule "utils/aiperf"]
 	path = utils/aiperf
 	url = https://github.com/cquil11/aiperf.git
-	branch = cjq/weka-live-assistant-responses
+	branch = cjq/agentx-v0.3-subagents
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index f5e39b4cf..cb66d75f5 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -9,6 +9,13 @@ export PYTHONDONTWRITEBYTECODE=1
 export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}"
 mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true
 
+# Inference server port shared by every benchmark recipe. Launchers that need
+# a non-default value (e.g. launch_mi355x-amds.sh derives PORT from RUNNER_NAME
+# to avoid collisions across concurrent gh-runners on a shared host) set PORT
+# themselves before sourcing this file; the `:-` fallback only kicks in when
+# nothing upstream set it.
+export PORT="${PORT:-8888}"
+
 # --------------------------------
 # GPU monitoring helpers
 # --------------------------------
@@ -883,9 +890,6 @@ run_eval() {
 INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
 AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}"
 AIPERF_DIR="${AIPERF_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/aiperf}"
-# TRACE_REPLAY_DIR retained for any out-of-tree consumer that still
-# imports the kv-cache-tester scripts. Not used by the helpers below.
-TRACE_REPLAY_DIR="${TRACE_REPLAY_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/trace-replay}"
 
 agentic_pip_install() {
     local pip_install=(python3 -m pip install)
@@ -907,12 +911,27 @@ ensure_hf_cli() {
 }
 
 resolve_trace_source() {
-    local dataset="semianalysisai/cc-traces-weka-no-subagents-051226"
-    # aiperf reads the corpus via its public-dataset registry; the loader
-    # under the hood pulls from semianalysisai/cc-traces-weka-no-subagents-051226
-    # (949 traces, no-subagents variant — see plugins.yaml).
-    TRACE_SOURCE_FLAG="--public-dataset semianalysis_cc_traces_weka"
-    echo "Loading traces via aiperf public-dataset: semianalysis_cc_traces_weka ($dataset)"
+    # Per-recipe override: set WEKA_LOADER_OVERRIDE to one of the aiperf
+    # public-dataset loader names allowed by the inferencex-agentx-mvp
+    # scenario. Used by recipes whose servers have non-default context
+    # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the
+    # unfiltered 052726 corpus and switches to the 256k-capped variant).
+    local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}"
+    local dataset
+    case "$loader" in
+        semianalysis_cc_traces_weka_with_subagents)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-052726"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_256k)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k"
+            ;;
+        *)
+            echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2
+            exit 1
+            ;;
+    esac
+    TRACE_SOURCE_FLAG="--public-dataset $loader"
+    echo "Loading traces via aiperf public-dataset: $loader ($dataset)"
     # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used
     # for model weights) so subsequent runs read from cache instead of
     # re-downloading every job.
@@ -921,6 +940,12 @@ resolve_trace_source() {
 }
 
 install_agentic_deps() {
+    # vllm/vllm-openai container ships without git. pip needs git to
+    # introspect the aiperf source tree on install. Install on demand;
+    # no-op when git is already present (e.g. AMD images that ship it).
+    if ! command -v git >/dev/null 2>&1; then
+        apt-get update && apt-get install -y git
+    fi
     agentic_pip_install --quiet urllib3 requests 2>/dev/null || true
     agentic_pip_install -q -r "$AGENTIC_DIR/requirements.txt"
     # Editable install of aiperf from the submodule — gives us the
@@ -943,22 +968,25 @@ install_agentic_deps() {
 build_replay_cmd() {
     # aiperf invocation for the inferencex-agentx-mvp scenario.
     #
-    # Live-assistant mode is on by default
-    # (AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES=1): the loader emits
-    # user-only deltas and the worker threads the server's live assistant
-    # response back into the session. This preserves cache-hit reuse on
-    # the just-generated KV blocks at the cost of hash-id fidelity past
-    # turn 0 — which is exactly what we want for benchmark numbers.
+    # Pre-canned assistant replay is the default: recorded assistant responses
+    # are used for future prompt construction, and live server responses are
+    # discarded. Set AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES=1 explicitly
+    # to use live-assistant mode, where the loader emits user-only deltas and
+    # the worker threads the server's live assistant response back into the
+    # session.
     #
-    # The scenario plugin locks: --cache-bust first_turn_prefix,
-    # --inter-turn-delay-cap-seconds 60, etc., and auto-injects them — so
-    # we do not pass them. See utils/aiperf/docs/tutorials/agentx-mvp.md.
+    # The scenario plugin locks: --cache-bust first_turn_prefix and
+    # --trace-idle-gap-cap-seconds 60 (per-trace idle-gap compression
+    # against parent + subagent request-start timestamps; supersedes the
+    # legacy --use-think-time-only / --inter-turn-delay-cap-seconds path),
+    # and auto-injects them — so we do not pass them. See
+    # utils/aiperf/docs/tutorials/agentx-mvp.md.
     local result_dir="$1"
-    local duration="${DURATION:-1800}"
+    local duration="$DURATION"
 
-    export AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES=1
+    export AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES="${AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES:-0}"
     # Dataset configuration (load + reconstruct + inputs.json + mmap)
-    # routinely takes 4-5 min for the 949-trace weka corpus on fast /tmp
+    # routinely takes 4-5 min for the Weka corpus on fast /tmp
     # (B300) but can stretch to 14 min on slower /tmp + parallel contention
     # (observed on H200 where all 14 R3 jobs hit aiperf's 900s Configure
     # Profiling timeout simultaneously). Bump to 1800s to absorb 3x
@@ -976,12 +1004,11 @@ build_replay_cmd() {
     REPLAY_CMD+=" --concurrency $CONC"
     REPLAY_CMD+=" --benchmark-duration $duration"
     REPLAY_CMD+=" --random-seed 42"
-    # Abort the run if real-failure rate exceeds 5% after a grace floor of
-    # max(CONC, 10) records. Context-overflow records are dropped from the
-    # failure tally in AGENTIC_REPLAY scenarios (see record_processor_service
-    # in the aiperf submodule), so this threshold measures only real failures
-    # (server 5xx, parse errors, malformed responses).
-    REPLAY_CMD+=" --failed-request-threshold 0.05"
+    # Fail runs once more than 10% of requests error. This keeps known
+    # transient low-rate failures from killing long sweeps while still
+    # catching malformed payloads or server crashes before they get aggregated
+    # as benchmarkable data.
+    REPLAY_CMD+=" --failed-request-threshold 0.10"
     # Sample each trajectory's warmup start position uniformly from
     # [25%, 75%] of the trace's turn count (was hardcoded 0%-70% upstream).
     # Avoids starting trajectories right at turn 0 where the KV cache is
@@ -1002,11 +1029,18 @@ build_replay_cmd() {
     # need trust_remote_code=True to load. Benign for models without
     # custom tokenizer code, so we set it unconditionally.
     REPLAY_CMD+=" --tokenizer-trust-remote-code"
-    # Default --num-dataset-entries is 100; the weka corpus has 949. Cap
-    # at 949 so all unique traces are loaded (the loader treats this as a
-    # ``min(cap, available)`` ceiling, not a target — see
+    # Keep replay inputs inside the same context window used to launch the
+    # server. The WEKA corpus contains a few very long parent/subagent traces;
+    # if we mmap and replay them against a smaller-context server they become
+    # deterministic 4xxs and can still pressure the engine while queued.
+    if [ -n "${MAX_MODEL_LEN:-}" ] && [ "$MAX_MODEL_LEN" != "0" ]; then
+        REPLAY_CMD+=" --max-context-length $MAX_MODEL_LEN"
+    fi
+    # Default --num-dataset-entries is 100; the with-subagents Weka corpus
+    # has 472. Cap at 472 so all unique traces are loaded (the loader treats
+    # this as a ``min(cap, available)`` ceiling, not a target — see
     # semianalysis_cc_traces_weka.py).
-    REPLAY_CMD+=" --num-dataset-entries 949"
+    REPLAY_CMD+=" --num-dataset-entries 472"
     # 1-second timeslices on the server-metrics scrape so the post-run
     # plotter has per-window time series (KV usage, cache hit rate,
     # throughput, etc.). Matches kv-cache-tester's poll_interval=1.0
@@ -1014,7 +1048,7 @@ build_replay_cmd() {
     # Without this, aiperf only emits aggregate stats and the 6x2 panels
     # collapse to flat lines.
     REPLAY_CMD+=" --slice-duration 1.0"
-    REPLAY_CMD+=" --output-artifact-dir $result_dir/trace_replay"
+    REPLAY_CMD+=" --output-artifact-dir $result_dir/aiperf_artifacts"
     # The inferencex-agentx-mvp scenario enforces a 900s minimum
     # benchmark duration. For smoke tests with shorter durations, opt
     # into --unsafe-override (the run's submission_valid will be flagged
@@ -1038,3 +1072,27 @@ write_agentic_result_json() {
     # missing in a stripped-down image). The agg JSON is the success gate.
     python3 "$INFMAX_CONTAINER_WORKSPACE/utils/generate_aiperf_plots.py" "$result_dir" 2>&1 || true
 }
+
+run_agentic_replay_and_write_outputs() {
+    local result_dir="$1"
+    local replay_rc
+
+    echo "$REPLAY_CMD" > "$result_dir/benchmark_command.txt"
+
+    set +e
+    set -x
+    $REPLAY_CMD 2>&1 | tee "$result_dir/benchmark.log"
+    replay_rc=${PIPESTATUS[0]}
+    set +x
+    set -e
+
+    write_agentic_result_json "$result_dir"
+
+    python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
+        "$result_dir/aiperf_artifacts" -o "$result_dir" 2>&1 || true
+
+    if [ "$replay_rc" -ne 0 ]; then
+        echo "ERROR: agentic trace replay exited with code $replay_rc after writing available results" >&2
+        return "$replay_rc"
+    fi
+}
diff --git a/benchmarks/multi_node/agentic_srt.sh b/benchmarks/multi_node/agentic_srt.sh
index 2be99bf58..a0e9e243c 100644
--- a/benchmarks/multi_node/agentic_srt.sh
+++ b/benchmarks/multi_node/agentic_srt.sh
@@ -9,14 +9,9 @@ set -x
 INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/infmax-workspace}"
 source "$INFMAX_CONTAINER_WORKSPACE/benchmarks/benchmark_lib.sh"
 
-check_env_vars MODEL MODEL_PREFIX FRAMEWORK PRECISION CONC RESULT_FILENAME
+check_env_vars MODEL MODEL_PREFIX FRAMEWORK PRECISION CONC RESULT_FILENAME DURATION
 
-PORT="${PORT:-8000}"
 RESULT_DIR="${RESULT_DIR:-/logs/agentic}"
-DURATION="${DURATION:-1800}"
-MAX_DELAY="${MAX_DELAY:-60}"
-ADVANCE_MIN="${ADVANCE_MIN:-0.0}"
-ADVANCE_MAX="${ADVANCE_MAX:-0.7}"
 
 mkdir -p "$RESULT_DIR"
 
@@ -24,18 +19,4 @@ resolve_trace_source
 install_agentic_deps
 
 build_replay_cmd "$RESULT_DIR"
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set +e
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log"
-REPLAY_RC=${PIPESTATUS[0]}
-set -e
-
-write_agentic_result_json "$RESULT_DIR"
-
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
-
-if [ "$REPLAY_RC" -ne 0 ]; then
-    echo "WARNING: agentic trace replay exited with code $REPLAY_RC after writing available results" >&2
-fi
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml
new file mode 100644
index 000000000..fb7b9fd97
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml
@@ -0,0 +1,177 @@
+name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4-agentic"
+
+# Agentic-coding variant of vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml.
+# Topology is identical (1 prefill DEP=4 + 6 decode TP=4, 28 GPUs across 7
+# GB300 nodes + 1 dedicated NATS/etcd infra node) so we can compare against
+# the fixed-seq-len 1p6d baseline at the same concurrency point (192).
+#
+# Divergence vs the 8k1k sibling:
+#   - benchmark.type:        sa-bench -> custom (hands off to agentic_srt.sh)
+#   - max-model-len:         removed (let vLLM derive from model config; agentic
+#                              trajectories blow past any small explicit cap)
+#   - no-enable-prefix-caching: dropped (prefix caching MUST be on for
+#                              trajectory reuse — entire point of agentic)
+# Note: --enable-auto-tool-choice / --tool-call-parser / --reasoning-parser
+# are NOT set on the worker. The dynamo-vllm worker entrypoint doesn't
+# accept them (different arg parser than `vllm serve`). In disagg, chat
+# parsing happens at the dynamo frontend, not at the worker.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 6
+  prefill_workers: 1
+  decode_workers: 6
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+infra:
+  etcd_nats_dedicated_node: true
+  # Raise NATS server max_payload from the 1 MiB default to 32 MiB.
+  # Agentic prompts at 50k-200k DSv4 tokens serialize to JSON at ~10-15
+  # bytes/token, easily clearing 1-3 MB per request. Without this, every
+  # long-prompt prefill RPC gets rejected by the NATS server with
+  # "maximum payload exceeded" (visible in infra.out), and the dynamo
+  # frontend surfaces a misleading "NATS request ... deadline has elapsed"
+  # (it never gets a reply because the publish was rejected). 32 MiB gives
+  # ~10x headroom over the largest observed payload (3.2 MB) and stays under
+  # NATS's 64 MiB hard cap, though it exceeds Dynamo's 16 MiB advisory limit.
+  nats_max_payload_mb: 32
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      attention-config: '{"use_fp4_indexer_cache": true}'
+      moe-backend: "deep_gemm_mega_moe"
+      enforce-eager: true
+      max-num-seqs: 256
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      enable-ep-weight-filter: true
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      no-enable-flashinfer-autotune: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+# sbatch + srun resource grants for clusters without per-GPU defaults.
+#
+# mem=0: allocate all available node memory (~868 GB on CW gb300). Without
+# this, sbatch only requests ntasks × DefMemPerCPU = 8 × 4 GB = 32 GB for
+# the whole job and worker cgroups OOM-kill mid model load (R7-R11 hit
+# this; sacct showed AllocTRES mem=4G per step).
+#
+# cpus-per-task=72: give each task one CW gb300 NUMA socket (144 cores
+# split 2 × 72). Critical for the *infra step* (etcd + nats) which
+# srtctl spawns without --gres=gpu — on CW that means DefMemPerCPU
+# applies and the step gets 1 CPU by default. With 24 dynamo DP ranks
+# all hammering etcd for lease keep-alives, single-CPU etcd can't keep
+# up and dies (R12 hit this; etcd reported max-cpu-set=1, leases
+# deadline-exceeded, infra SIGKILL'd at 16:35:49). 72 CPUs is plenty
+# for both etcd + nats AND for vLLM worker auxiliary threads.
+#
+# nv gb300 doesn't need this because cluster default DefCpuPerGPU=35
+# auto-allocates 4*35=140 CPUs per GPU-bearing task; cw has no per-GPU
+# default. Setting it here is safe on both because the value is ≤ node
+# CPU count.
+#
+# srun_options.mem=0 forces each srun step to use the full node memory
+# (without it, srun steps default back to cpus_per_task × DefMemPerCPU).
+# Docs: docs/config-reference.md#sbatch_directives + #srun_options.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+  # gb300-nv: pyxis maps the calling user (sa-shared) into the container as
+  # uid 345200007. dpkg refuses to run without EUID 0 even though
+  # ENROOT_ROOTFS_WRITABLE=1 makes the rootfs writable, so the agentic_srt
+  # apt-get install git step fails. --container-remap-root asks pyxis to
+  # remap us to uid 0 inside the container, matching the gb300-cw behavior.
+  # No-op on cw (already root). srt-slurm renders empty-string values as
+  # flag-only srun args (see core/slurm.py:250).
+  container-remap-root: ""
+
+benchmark:
+  type: custom
+  command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh
+  env:
+    INFMAX_CONTAINER_WORKSPACE: /infmax-workspace
+    RESULT_DIR: /logs/agentic
+    PORT: "8000"
+    IS_MULTINODE: "true"
+    # Container-side path of the aiperf mmap dataset cache; the host-side
+    # mount is wired via launch_gb300-*.sh's srtslurm.yaml default_mounts.
+    # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files
+    # per dataset on every run.
+    AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-keepalive.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-keepalive.yaml
new file mode 100644
index 000000000..f1bd9b1e9
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-keepalive.yaml
@@ -0,0 +1,136 @@
+name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4-keepalive"
+
+# Keepalive variant of disagg-gb300-1p6d-dep4-tp4-agentic.yaml: same
+# server topology (1P + 6D = 7 vLLM workers + 1 NATS/etcd infra node)
+# but the benchmark stage uses srt-slurm's `type: manual` mode instead of
+# agentic_srt.sh. Brings up the server and parks the orchestrator so
+# you can hammer aiperf from outside without competing with the
+# launcher's own aiperf invocation.
+#
+# Usage:
+#   cd <srt-slurm-clone>
+#   srtctl apply --no-preflight -f \
+#       recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-keepalive.yaml
+#   tail -F outputs/<JOB_ID>/logs/sweep_<JOB_ID>.log
+#   # wait for "Model is ready. Have 4 prefills and 6 decodes."
+#   # then run aiperf against http://<head_node>:8000 from anywhere
+#   # tear down:  scancel <JOB_ID>
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 6
+  prefill_workers: 1
+  decode_workers: 6
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+infra:
+  etcd_nats_dedicated_node: true
+  # See sibling 1p6d agentic recipe for rationale — NATS 1 MiB default
+  # rejects long agentic prompts; 32 MiB gives ~10x headroom.
+  nats_max_payload_mb: 32
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      attention-config: '{"use_fp4_indexer_cache": true}'
+      moe-backend: "deep_gemm_mega_moe"
+      enforce-eager: true
+      max-num-seqs: 256
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      enable-ep-weight-filter: true
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      no-enable-flashinfer-autotune: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+  container-remap-root: ""
+
+# THIS IS THE KEY DIFF vs the agentic sibling: use srt-slurm's
+# first-class `manual` benchmark mode instead of spawning agentic_srt.sh.
+# In manual mode, BenchmarkStageMixin.run_benchmark() (see
+# src/srtctl/cli/mixins/benchmark_stage.py:131-141) brings up workers
+# + frontend, logs "Frontend URL: http://<head_node>:8000", then sleeps
+# in a 5s health-check loop waiting only for worker failures or
+# scancel/Ctrl+C. External clients (your aiperf shell) drive the
+# server directly. No competing benchmark container, no sleep hack.
+benchmark:
+  type: manual
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml
new file mode 100644
index 000000000..bb8fc6df8
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml
@@ -0,0 +1,176 @@
+name: "svf-vllm-disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic"
+
+# Agentic-coding variant of vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml.
+# Max-throughput shape: 4 prefill (DEP=4 each) + 1 decode (DEP=8). 6 GB300
+# nodes (4P + 2D = 24 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra
+# node. Sized for concurrency 4096 with deep_gemm_mega_moe on both workers.
+#
+# Divergence vs the 8k1k sibling:
+#   - benchmark.type:        sa-bench -> custom (hands off to agentic_srt.sh)
+#   - max-model-len:         removed (let vLLM derive from model config; agentic
+#                              trajectories blow past any small explicit cap)
+#   - no-enable-prefix-caching: dropped (prefix caching MUST be on for
+#                              trajectory reuse — entire point of agentic)
+# Note: --enable-auto-tool-choice / --tool-call-parser / --reasoning-parser
+# are NOT set on the worker. The dynamo-vllm worker entrypoint doesn't
+# accept them (different arg parser than `vllm serve`). In disagg, chat
+# parsing happens at the dynamo frontend, not at the worker.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 4
+  decode_nodes: 2
+  prefill_workers: 4
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+infra:
+  etcd_nats_dedicated_node: true
+  # See sibling 1p6d recipe for rationale — NATS 1 MiB default rejects
+  # agentic prompts; 32 MiB gives ~10x headroom over observed payloads.
+  nats_max_payload_mb: 32
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_LOG_STATS_INTERVAL: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_LOG_STATS_INTERVAL: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-flashinfer-autotune: true
+      safetensors-load-strategy: "prefetch"
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      no-disable-hybrid-kv-cache-manager: true
+      no-async-scheduling: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      enable-sleep-mode: true
+      moe-backend: "deep_gemm_mega_moe"
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      trust-remote-code: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      enable-sleep-mode: true
+      moe-backend: "deep_gemm_mega_moe"
+
+# sbatch + srun resource grants for clusters without per-GPU defaults.
+#
+# mem=0: allocate all available node memory (~868 GB on CW gb300). Without
+# this, sbatch only requests ntasks × DefMemPerCPU = 8 × 4 GB = 32 GB for
+# the whole job and worker cgroups OOM-kill mid model load (R7-R11 hit
+# this; sacct showed AllocTRES mem=4G per step).
+#
+# cpus-per-task=72: give each task one CW gb300 NUMA socket (144 cores
+# split 2 × 72). Critical for the *infra step* (etcd + nats) which
+# srtctl spawns without --gres=gpu — on CW that means DefMemPerCPU
+# applies and the step gets 1 CPU by default. With 24 dynamo DP ranks
+# all hammering etcd for lease keep-alives, single-CPU etcd can't keep
+# up and dies (R12 hit this; etcd reported max-cpu-set=1, leases
+# deadline-exceeded, infra SIGKILL'd at 16:35:49). 72 CPUs is plenty
+# for both etcd + nats AND for vLLM worker auxiliary threads.
+#
+# nv gb300 doesn't need this because cluster default DefCpuPerGPU=35
+# auto-allocates 4*35=140 CPUs per GPU-bearing task; cw has no per-GPU
+# default. Setting it here is safe on both because the value is ≤ node
+# CPU count.
+#
+# srun_options.mem=0 forces each srun step to use the full node memory
+# (without it, srun steps default back to cpus_per_task × DefMemPerCPU).
+# Docs: docs/config-reference.md#sbatch_directives + #srun_options.
+sbatch_directives:
+  mem: "0"
+  cpus-per-task: "72"
+srun_options:
+  mem: "0"
+  # gb300-nv: pyxis maps the calling user (sa-shared) into the container as
+  # uid 345200007. dpkg refuses to run without EUID 0 even though
+  # ENROOT_ROOTFS_WRITABLE=1 makes the rootfs writable, so the agentic_srt
+  # apt-get install git step fails. --container-remap-root asks pyxis to
+  # remap us to uid 0 inside the container, matching the gb300-cw behavior.
+  # No-op on cw (already root). srt-slurm renders empty-string values as
+  # flag-only srun args (see core/slurm.py:250).
+  container-remap-root: ""
+
+benchmark:
+  type: custom
+  command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh
+  env:
+    INFMAX_CONTAINER_WORKSPACE: /infmax-workspace
+    RESULT_DIR: /logs/agentic
+    PORT: "8000"
+    IS_MULTINODE: "true"
+    # Container-side path of the aiperf mmap dataset cache; the host-side
+    # mount is wired via launch_gb300-*.sh's srtslurm.yaml default_mounts.
+    # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files
+    # per dataset on every run.
+    AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache"
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
index af275e6ef..f9955adc7 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR
+check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
 SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-5}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
@@ -67,14 +61,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
index f7c7f9ca1..ff76b768d 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR
+check_env_vars MODEL TP CONC RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -60,14 +55,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
index 03dee8dd0..108347479 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
@@ -19,18 +19,17 @@ set -x
 #
 # Required env vars:
 #   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
+#
+# OFFLOADING values:
+#   none        - vLLM GPU KV only, with DSv4 hybrid KV manager enabled.
+#   cpu         - vLLM native OffloadingConnector, with hybrid KV manager enabled.
+#   lmcache-mp  - Temporarily disabled for DSv4. LMCache PR #3261 must merge
+#                 first so LMCacheMPConnector can support HMA block-id tuples.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
-DP_ATTENTION=${DP_ATTENTION:-false}
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=1000000
 fi
@@ -51,45 +50,145 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
 mkdir -p "$RESULT_DIR"
 
-OFFLOAD_ARGS=""
+OFFLOAD_ARGS=()
+HYBRID_KV_ARGS=(--no-disable-hybrid-kv-cache-manager)
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {  # Stop the background LMCache server if one was started and is still alive.
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then  # kill -0 only probes the PID, no signal is delivered
+        kill "$LMCACHE_PID" 2>/dev/null || true  # best-effort: a failure here must not abort the exit path
+        wait "$LMCACHE_PID" 2>/dev/null || true  # wait for the child to finish exiting; errors ignored
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {  # Block until the LMCache HTTP healthcheck succeeds; exit 1 if the server dies or the wait times out.
+    { set +x; } 2>/dev/null  # turn xtrace off without echoing the 'set +x' line itself
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"  # number of healthcheck polls (each followed by sleep 1)
+    local tail_pid="" i  # i is the poll counter; declared local so it does not leak into global scope
+
+    while [ ! -f "$LMCACHE_LOG" ]; do  # wait for the log file to exist before tailing it
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &  # stream the server log from its first line while polling
+    tail_pid=$!
+
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then  # NOTE(review): host is hard-coded to 127.0.0.1, not LMCACHE_HOST; confirm the server always listens on loopback
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then  # server exited: dump its log and abort
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
 case "$OFFLOADING" in
     none) ;;
     cpu)
         # b200-dgxc compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits
-        # individual jobs to a fraction of that. Aim for ~1.5 TB total host
-        # CPU pool across the engine(s).
+        # individual jobs to a fraction of that. Aim for ~1.2 TB total native
+        # CPU offload pool across the engine(s); previously 2.8 TB but every
+        # DP-attn worker stalled for 4+ min during pinned-CPU-tensor allocation
+        # and the shm_broadcast watchdog killed them (run 26246044726). 150 GB
+        # per worker (1.2 TB / 8) completes the alloc within the 60 s window.
         #
-        # SimpleCPUOffloadConnector divides cpu_bytes_to_use by
-        # parallel_config.world_size (= TP*PP, NOT including DP — see
-        # vllm/config/parallel.py and parallel.py docstrings). So:
-        #   - DP-attn=true  → each of $TP DP engines has world_size=1 in
-        #     its parallel_config; the connector does no internal divide,
-        #     and each engine torch.zeros + pin_tensor allocates the full
-        #     --kv_offloading_size value. Pre-divide by $TP here so the
-        #     aggregate host commit ≈ TOTAL_CPU_DRAM_GB.
-        #   - DP-attn=false → single engine with world_size=TP. Pass the
-        #     full TOTAL_CPU_DRAM_GB; the connector's internal divide
-        #     yields TOTAL/TP per rank, and TP-shared mmap (PR #37206)
-        #     keeps the aggregate at TOTAL.
-        TOTAL_CPU_DRAM_GB=1500
+        # Native --kv-offloading-size becomes OffloadingConnector's
+        # cpu_bytes_to_use. For DP-attn there are $TP independent DP engines,
+        # so pre-divide to keep aggregate host commit near TOTAL_CPU_DRAM_GB.
+        # For pure TP, vLLM treats the size as the total across TP ranks.
+        TOTAL_CPU_DRAM_GB=1200
         if [ "$DP_ATTENTION" = "true" ]; then
             PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP))
         else
             PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB
         fi
-        PER_ENGINE_BYTES=$((PER_ENGINE_GB * 1024 * 1024 * 1024))
-        # Use --kv-transfer-config JSON to also pass lazy_offload=true. Eager
-        # mode (default) hits an AssertionError in
-        # vllm/v1/core/kv_cache_utils.py:269 popleft_n at low/mid CONC; lazy
-        # mode defers the store path and clears low/mid CONC at 80-100%.
-        # See SimpleCPUOffloadConnector PR #37160 for the lazy_offload knob.
-        export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":true}}"
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        OFFLOAD_ARGS=(
+            --kv-offloading-backend native
+            --kv-offloading-size "$PER_ENGINE_GB"
+        )
+        ;;
+    lmcache-mp)
+        { set +x; } 2>/dev/null
+        # LMCacheMPConnector needs HMA support before it can run DSv4 with the
+        # hybrid KV manager. Re-enable this path after
+        # https://github.com/LMCache/LMCache/pull/3261 is merged.
+        echo "Error: OFFLOADING=lmcache-mp is disabled for DSv4 until LMCache PR #3261 adds HMA support." >&2
+        exit 1
+
+        # LMCache docs recommend MP mode for production: start an external
+        # `lmcache server`, then point vLLM's LMCacheMPConnector at it. For
+        # vLLM >= 0.20, prefer the LMCache-shipped connector module because it
+        # tracks the latest server protocol ahead of vLLM's vendored copy.
+        #
+        # Important DSv4 caveat: LMCacheMPConnector currently only accepts the
+        # non-hybrid KV block layout. The connector raises if vLLM returns the
+        # hybrid block-id tuple used by the CSA/HCA hybrid KV manager. This
+        # mode therefore disables the hybrid manager; `none` and `cpu` keep it
+        # enabled for the normal B200 DSv4 path.
+        agentic_pip_install --quiet --no-cache-dir lmcache
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        TOTAL_CPU_DRAM_GB=2800
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}"
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-200}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+
+        HYBRID_KV_ARGS=(--disable-hybrid-kv-cache-manager)
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+        )
         ;;
     *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache-mp)" >&2
         exit 1
         ;;
 esac
@@ -120,25 +219,31 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve "$MODEL" \
---host 0.0.0.0 \
---port "$PORT" \
---trust-remote-code \
---kv-cache-dtype fp8 \
---block-size 256 \
-"${PARALLEL_ARGS[@]}" \
-"${EP_ARGS[@]}" \
---compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
---attention_config.use_fp4_indexer_cache=True \
---tokenizer-mode deepseek_v4 \
---tool-call-parser deepseek_v4 \
---enable-auto-tool-choice \
---reasoning-parser deepseek_v4 \
---enable-prefix-caching \
---no-disable-hybrid-kv-cache-manager \
---max-model-len "$MAX_MODEL_LEN" \
---max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS" \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
+{ set +x; } 2>/dev/null  # turn xtrace off without echoing the 'set +x' line itself
+VLLM_CMD=(  # server command built as an array so every argument stays a single word
+    vllm serve "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --trust-remote-code
+    --kv-cache-dtype fp8
+    --block-size 256
+    "${PARALLEL_ARGS[@]}"
+    "${EP_ARGS[@]}"
+    --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
+    --attention_config.use_fp4_indexer_cache=True
+    --tokenizer-mode deepseek_v4
+    --tool-call-parser deepseek_v4
+    --enable-auto-tool-choice
+    --reasoning-parser deepseek_v4
+    --enable-prefix-caching
+    "${HYBRID_KV_ARGS[@]}"  # hybrid KV manager flag chosen by the OFFLOADING case above
+    --max-model-len "$MAX_MODEL_LEN"
+    --max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS"
+    "${OFFLOAD_ARGS[@]}"  # empty for OFFLOADING=none, so it contributes no arguments
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"  # record a shell-quoted copy of the exact command
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &  # launch in the background; stdout and stderr both go to the server log
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 
@@ -147,14 +252,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
index e21b31e7a..f6748a5f8 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -22,15 +22,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
-DP_ATTENTION=${DP_ATTENTION:-false}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
+
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=1000000
 fi
@@ -147,14 +140,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
new file mode 100755
index 000000000..99aec25fe
--- /dev/null
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on MI355X using SGLang.
+# Adapted from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh (fixed-seq-len
+# sibling) with the agentic harness (build_replay_cmd /
+# run_agentic_replay_and_write_outputs) swapped in for run_benchmark_serving.
+#
+# This launcher does NOT support CPU offload. SGLang's KV offload paths are
+# different from vLLM's SimpleCPUOffloadConnector, and the matching agentic
+# config (dsv4-fp4-mi355x-sglang-agentic) only sweeps offloading=none.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE, DP_ATTENTION
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
+
+if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+    MAX_MODEL_LEN=1000000
+fi
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+# ROCR/HIP visibility under slurm cgroups.
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# Reject anything other than none: this launcher has no SGLang CPU-offload
+# wiring (different surface than vLLM's SimpleCPUOffloadConnector).
+case "$OFFLOADING" in
+    none) ;;
+    *)
+        echo "Error: dsv4_fp4_mi355x_sglang.sh only supports OFFLOADING=none (got '$OFFLOADING')" >&2
+        exit 1
+        ;;
+esac
+
+# Transformers in the container doesn't recognize the `deepseek_v4` model_type.
+# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this
+# by writing a patched config to /tmp, but in practice isn't catching the error
+# in this image. Patch config.json directly instead (HF-cache copy for hub IDs,
+# MODEL/config.json for absolute local paths): set model_type to `deepseek_v3`
+# so AutoConfig.from_pretrained succeeds, and keep architectures=['DeepseekV4ForCausalLM']
+# so SGLang uses its native DSv4 class (python/sglang/srt/models/deepseek_v4.py).
+python3 << PYEOF
+import json, os  # os: locate config.json when MODEL is an absolute local path
+from huggingface_hub import hf_hub_download
+path = os.path.join("$MODEL", "config.json") if os.path.isabs("$MODEL") else hf_hub_download(repo_id="$MODEL", filename="config.json")
+with open(path) as f:
+    config = json.load(f)
+if config.get("model_type") == "deepseek_v4":
+    config["model_type"] = "deepseek_v3"
+    with open(path, "w") as f:
+        json.dump(config, f, indent=2)
+    print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3")
+else:
+    print(f"No patch needed: model_type is {config.get('model_type')!r}")
+PYEOF
+
+# DSv4 FP4-experts path. Mirrors the env block in the fixed-seq-len sibling
+# (benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh), which tracks the active
+# block in python/run_dsv4.sh on the amd/deepseek_v4 branch:
+#   SGLANG_DSV4_FP4_EXPERTS=True   -> route experts through FP4 kernels
+#   SGLANG_FORCE_TRITON_MOE_FP8=0  -> dispatch MoE through aiter and apply
+#                                    the swiglu_limit clamp in the triton
+#                                    MoE fallback path.
+export SGLANG_REASONING_EFFORT=max
+export SGLANG_OPT_USE_FUSED_COMPRESS=true
+export SGLANG_OPT_USE_OLD_COMPRESSOR=true
+export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
+export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
+export SGLANG_OPT_USE_FUSED_HASH_TOPK=false
+export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
+export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
+export SGLANG_OPT_USE_TILELANG_MHC_POST=false
+export SGLANG_OPT_USE_AITER_MHC_PRE=true
+export SGLANG_OPT_USE_AITER_MHC_POST=true
+export SGLANG_ENABLE_THINKING=1
+export SGLANG_USE_AITER=1
+export SGLANG_USE_ROCM700A=1
+export SGLANG_TOPK_TRANSFORM_512_TORCH=0
+export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1
+export SGLANG_DSV4_FP4_EXPERTS=True
+export SGLANG_OPT_DPSK_V4_RADIX=0
+export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false
+export SGLANG_OPT_USE_FUSED_STORE_CACHE=false
+export SGLANG_FORCE_TRITON_MOE_FP8=0
+export SGLANG_HACK_FLASHMLA_BACKEND=tilelang
+export SGLANG_OPT_USE_TILELANG_INDEXER=true
+export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+# Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200
+# vllm agentic launcher so the agentic sweep can probe both interactivity and
+# throughput regimes.
+PARALLEL_ARGS=(--tensor-parallel-size "$TP")
+if [ "$DP_ATTENTION" = "true" ]; then
+    PARALLEL_ARGS+=(
+        --dp "$TP"
+        --enable-dp-attention
+        --enable-prefill-delayer
+    )
+fi
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    PARALLEL_ARGS+=(--ep-size "$EP_SIZE")
+fi
+
+# --max-running-requests is per-engine. With DP-attn each DP engine handles
+# only CONC/$TP sequences in steady state (the agentic harness load-balances
+# users across DP ranks), so size the per-engine cap to that.
+# Pure TP is a single engine and sees all CONC sequences itself.
+if [ "$DP_ATTENTION" = "true" ]; then
+    PER_ENGINE_MAX_RUNNING=$(( CONC / TP ))
+    [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1
+else
+    PER_ENGINE_MAX_RUNNING=$CONC
+fi
+
+echo "Starting sglang server..."
+python3 -m sglang.launch_server \
+    --model-path "$MODEL" \
+    --host=0.0.0.0 \
+    --port "$PORT" \
+    "${PARALLEL_ARGS[@]}" \
+    --trust-remote-code \
+    --attention-backend compressed \
+    --max-running-requests "$PER_ENGINE_MAX_RUNNING" \
+    --cuda-graph-max-bs "$PER_ENGINE_MAX_RUNNING" \
+    --page-size 256 \
+    --context-length "$MAX_MODEL_LEN" \
+    --chunked-prefill-size 8192 \
+    --disable-shared-experts-fusion \
+    --tool-call-parser deepseekv4 \
+    --reasoning-parser deepseek-v4 \
+    --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \
+    --watchdog-timeout 1800 > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
index 8049c1082..0a0177983 100755
--- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
@@ -11,13 +11,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR
+check_env_vars MODEL TP CONC RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=800000
 fi
@@ -71,14 +66,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
index 6795086a3..500b456f5 100755
--- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR
+check_env_vars MODEL TP CONC RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -63,7 +58,6 @@ python3 -m sglang.launch_server \
     --nsa-decode-backend tilelang \
     --kv-cache-dtype fp8_e4m3 \
     --tokenizer-worker-num $((TP*2)) \
-    --disable-radix-cache \
     --enable-metrics > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
@@ -73,14 +67,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
index 91c289d7c..259c19586 100755
--- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR
+check_env_vars MODEL TP CONC RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -65,7 +60,6 @@ python3 -m sglang.launch_server \
 --chunked-prefill-size 32768 \
 --max-prefill-tokens 32768 \
 --enable-flashinfer-allreduce-fusion \
---disable-radix-cache \
 --stream-interval 30 \
 --context-length $MAX_MODEL_LEN \
 --enable-metrics \
@@ -78,14 +72,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
index 284bf3be2..6e921db58 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -74,14 +69,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
index dce4f4250..557986b0d 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 # Agentic matrix entries don't set max-model-len, so the workflow passes 0.
 # ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly.
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
@@ -78,14 +73,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
index c8050fe12..1592a8d5c 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 # Agentic matrix entries don't set max-model-len, so the workflow passes 0.
 # ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly.
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
@@ -78,14 +73,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
index 962210577..eb1883ff1 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 # Agentic matrix entries don't set max-model-len, so the workflow passes 0.
 # ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly.
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
@@ -91,14 +86,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
index 38ccac035..99e29c819 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 # Agentic matrix entries don't set max-model-len, so the workflow passes 0.
 # ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly.
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
@@ -90,14 +85,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index a1c95f64a..ad0b4495a 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -5,17 +5,17 @@ set -x
 # Agentic trace replay benchmark for Kimi-K2.5 NVFP4 on B200 using vLLM.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION
+#
+# OFFLOADING values:
+#   none    - vLLM GPU KV only.
+#   cpu     - vLLM native simple CPU offload.
+#   lmcache - LMCache MP server + vLLM LMCacheMPConnector.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -30,9 +30,61 @@ install_agentic_deps
 
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
 mkdir -p "$RESULT_DIR"
 
-OFFLOAD_ARGS=""
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
+        kill "$LMCACHE_PID" 2>/dev/null || true
+        wait "$LMCACHE_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {
+    # Block until LMCache's HTTP healthcheck (probed on the bind host) passes;
+    # exit 1 if the server dies or LMCACHE_READY_ATTEMPTS 1s probes run out.
+    { set +x; } 2>/dev/null
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}" tail_pid="" i
+    local probe_host="${LMCACHE_HOST:-127.0.0.1}"
+    [[ "$probe_host" == "0.0.0.0" ]] && probe_host="127.0.0.1"  # wildcard bind -> loopback
+    local health_url="http://${probe_host}:${LMCACHE_HTTP_PORT}/healthcheck"
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &
+    tail_pid=$!
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "$health_url"; then
+            kill "$tail_pid" 2>/dev/null || true; wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true; wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true; wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
 case "$OFFLOADING" in
     none)
         ;;
@@ -44,10 +96,70 @@ case "$OFFLOADING" in
         # the full eager sweep before.
         TOTAL_CPU_DRAM_GB=2500
         export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_GB"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        agentic_pip_install --quiet --no-cache-dir lmcache
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode
+        # owns that pool in the external LMCache server instead of passing
+        # --kv-offloading-size through vLLM's integrated LMCache convenience
+        # path, which divides the value by TP and then hits a large single-shot
+        # cudaHostAlloc in LMCache 0.4.5's single-process local CPU backend.
+        TOTAL_CPU_DRAM_GB=2500
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        # LMCacheMPConnector builds its ZMQ endpoint by concatenating
+        # lmcache.mp.host and lmcache.mp.port, and its default host already
+        # includes the tcp:// scheme. Keep the server bind host raw, but pass
+        # a ZMQ-style host string to the connector.
+        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}"
+        # Initial allocation is deliberately small; --l1-size-gb above is the
+        # actual pool capacity and grows lazily as the run fills the cache.
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+
+        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+            --disable-hybrid-kv-cache-manager
+        )
         ;;
     *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2
         exit 1
         ;;
 esac
@@ -64,20 +176,27 @@ export PYTHONNOUSERSITE=1
 # unsafe.
 export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
 
-vllm serve $MODEL \
---host 0.0.0.0 \
---port $PORT \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.90 \
---max-num-seqs $CONC \
---reasoning-parser kimi_k2 \
---tool-call-parser kimi_k2 \
---compilation_config.pass_config.fuse_allreduce_rms true \
---kv-cache-dtype fp8 \
---max-cudagraph-capture-size 2048 \
---stream-interval 20 \
---trust-remote-code \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    --gpu-memory-utilization 0.90
+    --max-num-seqs "$CONC"
+    --reasoning-parser kimi_k2
+    --tool-call-parser kimi_k2
+    --compilation_config.pass_config.fuse_allreduce_rms true
+    --kv-cache-dtype fp8
+    --max-cudagraph-capture-size 2048
+    --stream-interval 20
+    --trust-remote-code
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 
@@ -86,14 +205,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
index d5975b1c4..8cebe4f20 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
@@ -6,16 +6,16 @@ set -x
 #
 # Required env vars:
 #   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
+#
+# OFFLOADING values:
+#   none    - vLLM GPU KV only.
+#   cpu     - vLLM native simple CPU offload.
+#   lmcache - in-process LMCacheConnectorV1 via vLLM's lmcache backend.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -32,7 +32,9 @@ install_agentic_deps
 SERVER_LOG="$RESULT_DIR/server.log"
 mkdir -p "$RESULT_DIR"
 
-OFFLOAD_ARGS=""
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+
 case "$OFFLOADING" in
     none) ;;
     cpu)
@@ -43,28 +45,65 @@ case "$OFFLOADING" in
         # inside the cgroup for vLLM worker RSS + page cache.
         TOTAL_CPU_DRAM_GB=2500
         export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_GB"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        agentic_pip_install --quiet --no-cache-dir lmcache
+        python3 -c "import lmcache.integration.vllm.vllm_v1_adapter" >/dev/null
+
+        # B300 NV nodes expose ~2.82 TiB to the job cgroup. Keep the LMCache
+        # CPU pool at 2.5 TB to match the native offload envelope while leaving
+        # headroom for vLLM workers and page cache. vLLM divides this total
+        # across TP ranks for --kv-offloading-backend=lmcache.
+        TOTAL_CPU_DRAM_GB=2500
+        export LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        # Avoid pinning the full 2.5 TB during engine startup. LMCache grows
+        # the CPU allocator as agentic prefixes accumulate in the replay.
+        export LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR="${LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR:-true}"
+        export LMCACHE_LAZY_MEMORY_INITIAL_RATIO="${LMCACHE_LAZY_MEMORY_INITIAL_RATIO:-0.01}"
+        export LMCACHE_LAZY_MEMORY_STEP_RATIO="${LMCACHE_LAZY_MEMORY_STEP_RATIO:-0.02}"
+
+        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
+        OFFLOAD_ARGS=(
+            --kv-offloading-backend lmcache
+            --kv-offloading-size "$TOTAL_CPU_DRAM_GB"
+            --disable-hybrid-kv-cache-manager
+        )
         ;;
-    *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
+    *) echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2; exit 1 ;;
 esac
 
 echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
---host 0.0.0.0 \
---port $PORT \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.90 \
---max-num-seqs $CONC \
---reasoning-parser kimi_k2 \
---tool-call-parser kimi_k2 \
---compilation_config.pass_config.fuse_allreduce_rms true \
---kv-cache-dtype fp8 \
---max-cudagraph-capture-size 2048 \
---stream-interval 20 \
---trust-remote-code \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    --gpu-memory-utilization 0.90
+    --max-num-seqs "$CONC"
+    --reasoning-parser kimi_k2
+    --tool-call-parser kimi_k2
+    --compilation_config.pass_config.fuse_allreduce_rms true
+    --kv-cache-dtype fp8
+    --max-cudagraph-capture-size 2048
+    --stream-interval 20
+    --trust-remote-code
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 
@@ -73,14 +112,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index c72076118..fd0ce3677 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -5,18 +5,24 @@ set -x
 # Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
+#
+# OFFLOADING values:
+#   none    - vLLM GPU KV only.
+#   cpu     - vLLM native CPU offload.
+#   lmcache - LMCache MP server + vLLM LMCacheMPConnector.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
+# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0.
+# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this
+# script we need the concrete value so AgentX filters prompt+max_tokens against
+# the same limit vLLM enforces.
+if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then
+    MAX_MODEL_LEN=262144
+fi
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -43,6 +49,522 @@ if [ "${TP}" -lt 8 ]; then
   export VLLM_ROCM_USE_AITER_RMSNORM=0
 fi
 
+write_lmcache_rocm_mp_patch() {
+    local patch_dir="$1"
+    mkdir -p "$patch_dir"
+    cat > "$patch_dir/sitecustomize.py" <<'PY'
+"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
+
+import os
+import threading
+
+if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
+    import builtins
+    import sys
+
+    _orig_import = builtins.__import__
+
+    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
+        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
+
+        if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
+            return
+
+        _orig_init = _LazyMemoryAllocator.__init__
+        _orig_allocate = _LazyMemoryAllocator.allocate
+        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
+
+        def _expand_to(self, target_size: int) -> None:
+            target_size = min(
+                self._final_size,
+                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
+            )
+            lock = self._agentic_rocm_demand_expand_lock
+            with lock:
+                if target_size <= self._curr_size:
+                    return
+
+                start_size = self._curr_size
+                while self._curr_size < target_size:
+                    commit_start = self._curr_size
+                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
+                    while self._curr_size < commit_target:
+                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
+                        self._curr_size += self.PIN_CHUNK_SIZE
+                    self._commit_expansion(self._curr_size - commit_start)
+
+                self._log_expansion_progress(self._curr_size - start_size)
+
+        def _retry_with_demand_expansion(self, allocate_once):
+            obj = allocate_once()
+            step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
+            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
+
+            while obj is None and self._curr_size < self._final_size:
+                _expand_to(self, self._curr_size + step_bytes)
+                obj = allocate_once()
+
+            return obj
+
+        def _patched_init(self, *args, **kwargs):
+            _orig_init(self, *args, **kwargs)
+            self._agentic_rocm_demand_expand_lock = threading.Lock()
+
+            # LMCache MP's upstream LazyMemoryAllocator currently expands to
+            # the final pinned size in a background thread. On ROCm Kimi TP4,
+            # vLLM reaches KV-cache registration only after that 2.5 TB pool
+            # is fully pinned, and the server-side IPC open path can stall
+            # before acknowledging register_kv_caches. Keep the same final
+            # capacity, but pin/commit extra host memory only when L1
+            # allocations actually need it.
+            self._stop_expand.set()
+            self._expand_thread.join()
+            _lazy_memory_allocator.logger.info(
+                "Agentic ROCm patch: using demand-driven LMCache pinned "
+                "memory expansion; final capacity remains %s MB",
+                self._final_size >> 20,
+            )
+
+        def _patched_allocate(
+            self,
+            shapes,
+            dtypes,
+            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+            allocator_type=None,
+        ):
+            return _retry_with_demand_expansion(
+                self,
+                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
+            )
+
+        def _patched_batched_allocate(
+            self,
+            shapes,
+            dtypes,
+            batch_size,
+            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+            allocator_type=None,
+        ):
+            return _retry_with_demand_expansion(
+                self,
+                lambda: _orig_batched_allocate(
+                    self, shapes, dtypes, batch_size, fmt, allocator_type
+                ),
+            )
+
+        _LazyMemoryAllocator.__init__ = _patched_init
+        _LazyMemoryAllocator.allocate = _patched_allocate
+        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
+        _LazyMemoryAllocator._agentic_rocm_demand_patch = True
+
+    def _patch_l1_memory_manager(_memory_manager) -> None:
+        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
+        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
+        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
+            return
+        if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
+            return
+
+        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
+
+        def _patched_get_memory_usage(self):
+            allocator = getattr(self, "_allocator", None)
+            if isinstance(allocator, _LazyMemoryAllocator):
+                address_manager = allocator.get_address_manager()
+                used_size = (
+                    address_manager.get_heap_size() - address_manager.get_free_size()
+                )
+                return used_size, allocator._final_size
+            return _orig_get_memory_usage(self)
+
+        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
+        _L1MemoryManager._agentic_rocm_final_capacity_patch = True
+
+    def _maybe_patch_lazy_memory_allocator() -> None:
+        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
+        if module is not None and hasattr(module, "LazyMemoryAllocator"):
+            _patch_lazy_memory_allocator(module)
+
+    def _maybe_patch_l1_memory_manager() -> None:
+        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
+        if module is not None and hasattr(module, "L1MemoryManager"):
+            _patch_l1_memory_manager(module)
+
+    def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
+        module = _orig_import(name, globals, locals, fromlist, level)
+        if name == "lmcache.v1.lazy_memory_allocator" or (
+            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
+        ):
+            _maybe_patch_lazy_memory_allocator()
+        if name == "lmcache.v1.distributed.memory_manager" or (
+            name.startswith("lmcache")
+            and "lmcache.v1.distributed.memory_manager" in sys.modules
+        ):
+            _maybe_patch_l1_memory_manager()
+        return module
+
+    builtins.__import__ = _agentic_rocm_import
+    _maybe_patch_lazy_memory_allocator()
+    _maybe_patch_l1_memory_manager()
+
+if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
+    import torch
+    import lmcache.non_cuda_equivalents as lmc
+
+    if not hasattr(lmc, "multi_layer_block_kv_transfer"):
+        _DTYPE_BY_NAME = {
+            "bfloat16": torch.bfloat16,
+            "float16": torch.float16,
+            "float32": torch.float32,
+        }
+
+        def _dtype_from_env() -> torch.dtype:
+            name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
+            try:
+                return _DTYPE_BY_NAME[name]
+            except KeyError as exc:
+                raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
+
+        def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+            block_stride = shape_desc.block_stride_elems or (
+                shape_desc.bs * shape_desc.nh * shape_desc.hs
+            )
+            base = lmc._tensor_from_ptr(
+                ptr,
+                (shape_desc.nb * block_stride,),
+                dtype,
+                device,
+            )
+            return torch.as_strided(
+                base,
+                (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
+                (block_stride, shape_desc.nh * shape_desc.hs, 1),
+            )
+
+        def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+            return lmc._tensor_from_ptr(
+                ptr,
+                (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
+                dtype,
+                device,
+            )
+
+        def multi_layer_block_kv_transfer(
+            group_kv_pointers,
+            tmp_buffer_ptrs,
+            block_ids,
+            paged_memory_device,
+            direction,
+            shape_desc,
+            lmcache_chunk_size,
+            gpu_kv_format,
+            skip_blocks=0,
+        ) -> None:
+            # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
+            # shape [num_blocks, block_size, hidden_size]. LMCache's Python
+            # fallback has no block-transfer entrypoint yet, so implement the
+            # same gather/scatter contract with torch indexing on ROCm.
+            if shape_desc.kv_size != 1:
+                raise NotImplementedError(
+                    "ROCm LMCache MP block fallback currently supports MLA KV caches only"
+                )
+
+            dtype = _dtype_from_env()
+            device = (
+                paged_memory_device
+                if isinstance(paged_memory_device, torch.device)
+                else torch.device(paged_memory_device)
+            )
+            num_layers = int(group_kv_pointers.numel())
+            blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
+            direction_name = getattr(direction, "name", str(direction))
+
+            for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
+                start = chunk_idx * blocks_per_chunk
+                end = start + blocks_per_chunk
+                chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
+
+                dest_slot_offset = 0
+                if skip_blocks and chunk_idx == 0:
+                    chunk_blocks = chunk_blocks[int(skip_blocks):]
+                    dest_slot_offset = int(skip_blocks) * shape_desc.bs
+                if chunk_blocks.numel() == 0:
+                    continue
+
+                num_slots = int(chunk_blocks.numel()) * shape_desc.bs
+                tmp = _tmp_view(
+                    int(tmp_ptr),
+                    shape_desc,
+                    num_layers,
+                    lmcache_chunk_size,
+                    dtype,
+                    device,
+                )
+
+                for layer_idx in range(num_layers):
+                    paged = _paged_view(
+                        int(group_kv_pointers[layer_idx].item()),
+                        shape_desc,
+                        dtype,
+                        device,
+                    )
+                    tmp_slice = tmp[
+                        0,
+                        layer_idx,
+                        dest_slot_offset : dest_slot_offset + num_slots,
+                        :,
+                    ]
+                    if direction_name == "D2H":
+                        gathered = paged.index_select(0, chunk_blocks).reshape(
+                            num_slots, shape_desc.nh * shape_desc.hs
+                        )
+                        tmp_slice.copy_(gathered)
+                    elif direction_name == "H2D":
+                        src = tmp_slice.reshape(
+                            int(chunk_blocks.numel()),
+                            shape_desc.bs,
+                            shape_desc.nh * shape_desc.hs,
+                        )
+                        paged.index_copy_(0, chunk_blocks, src)
+                    else:
+                        raise ValueError(f"Unsupported transfer direction: {direction}")
+
+        lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
+
+# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ----
+if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0":
+    import chunked_connector_patch  # noqa: F401
+
+# ---- vLLM scheduler assertion fix (stale KV transfer notifications) ----
+import scheduler_assertion_patch  # noqa: F401
+PY
+}
+
+write_chunked_connector_patch() {
+    local patch_dir="$1"
+    mkdir -p "$patch_dir"
+    cat > "$patch_dir/chunked_connector_patch.py" <<'PY'
+"""
+Monkey-patch for LMCacheMPConnector to add chunked KV loading.
+
+Fixes GPU block exhaustion deadlock at high concurrency by capping
+the number of external tokens reported AND retrieved per scheduling step.
+
+Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD=<tokens> and import this
+module from sitecustomize.py before LMCache is loaded.
+"""
+
+import logging
+import os
+import sys
+import builtins
+
+logger = logging.getLogger("chunked_lmcache_patch")
+
+_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768"))
+
+# Per-request chunk tracking (module-level, survives across calls)
+_chunk_state: dict[str, dict] = {}
+
+
+def _apply_patch():
+    """Patch LMCacheMPConnector in-place."""
+    mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector")
+    if mod is None:
+        return
+    cls = getattr(mod, "LMCacheMPConnector", None)
+    if cls is None or getattr(cls, "_chunked_patch_applied", False):
+        return
+
+    LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None)
+    _orig_get_matched = cls.get_num_new_matched_tokens
+    _orig_get_finished = cls.get_finished
+
+    def _get_blocks_per_chunk(self):
+        block_size = getattr(self, "block_size", 1)
+        return max(1, _MAX_TOKENS // block_size)
+
+    def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens):
+        """Report at most one chunk of external KV hits per scheduling call.
+
+        vLLM's connector API returns ``(num_tokens | None, load_async)``; a bare
+        int is tolerated too, and the reply mirrors the shape received.
+        """
+        result = _orig_get_matched(self, request, num_computed_tokens)
+        is_pair = isinstance(result, tuple)
+        full_match = result[0] if is_pair else result
+        # A pending lookup (None) or a non-positive hit passes through untouched.
+        if not isinstance(full_match, int) or full_match <= 0 or _MAX_TOKENS <= 0:
+            return result
+
+        def _reply(num_tokens):
+            # Nothing to load means there is nothing to load asynchronously.
+            return (num_tokens, bool(num_tokens) and result[1]) if is_pair else num_tokens
+
+        req_id = request.request_id
+        block_size = getattr(self, "block_size", 1)
+        blocks_per_chunk = _get_blocks_per_chunk(self)
+        full_match_blocks = full_match // block_size
+
+        state = _chunk_state.get(req_id)
+        if state is None or state.get("num_computed_at_start") != num_computed_tokens:
+            # First sighting, or the request advanced: restart chunk accounting.
+            state = {"full_match_blocks": full_match_blocks, "chunk_end_blocks": 0,
+                     "num_computed_at_start": num_computed_tokens, "lookup_done": False}
+            _chunk_state[req_id] = state
+
+        remaining = state["full_match_blocks"] - state["chunk_end_blocks"]
+        if state["lookup_done"] or remaining <= 0:
+            state["lookup_done"] = True
+            return _reply(0)
+
+        this_chunk = min(remaining, blocks_per_chunk)
+        state["chunk_end_blocks"] += this_chunk
+        state["lookup_done"] = state["chunk_end_blocks"] >= state["full_match_blocks"]
+
+        capped = this_chunk * block_size
+        if capped < full_match:
+            logger.debug("Chunked LMCache: req %s capped %d -> %d tokens (chunk %d/%d blocks)",
+                         req_id, full_match, capped, this_chunk, full_match_blocks)
+
+        # Cap the tracker's hit blocks to match what we report
+        tracker = getattr(request, "kv_transfer_params", None)
+        if tracker is not None and getattr(tracker, "num_lmcache_hit_blocks", 0) > this_chunk:
+            tracker.num_lmcache_hit_blocks = this_chunk
+
+        return _reply(capped)
+
+    def _patched_get_finished(self, scheduler_output):
+        result = _orig_get_finished(self, scheduler_output)
+        # Clean up chunk state for finished requests.
+        # vLLM passes scheduler_output as a set of request-ID strings
+        # (not a SchedulerOutput object), so iterate directly when it
+        # is a set/frozenset; fall back to the attribute path for
+        # forward compatibility.
+        if isinstance(scheduler_output, (set, frozenset)):
+            finished = scheduler_output
+        else:
+            finished = getattr(scheduler_output, "finished_req_ids", [])
+        for req in finished:
+            _chunk_state.pop(req, None)
+        return result
+
+    cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens
+    cls.get_finished = _patched_get_finished
+    cls._chunked_patch_applied = True
+    logger.info(
+        "Chunked LMCache connector patch applied "
+        "(max_tokens_per_load=%d)", _MAX_TOKENS,
+    )
+
+
+_orig_import = builtins.__import__
+
+
+def _patching_import(name, *args, **kwargs):
+    module = _orig_import(name, *args, **kwargs)
+    if (
+        name == "lmcache.integration.vllm.lmcache_mp_connector"
+        or (
+            name.startswith("lmcache")
+            and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules
+        )
+    ):
+        _apply_patch()
+    return module
+
+
+builtins.__import__ = _patching_import
+_apply_patch()
+PY
+}
+
+write_scheduler_assertion_patch() {
+    local patch_dir="$1"
+    mkdir -p "$patch_dir"
+    cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY'
+"""
+Patch vLLM scheduler to handle stale finished_recving gracefully.
+
+The assertion at scheduler.py crashes when a KV transfer reports
+"finished recving" but the request is already in RUNNING state.
+This happens when transfers complete asynchronously and the scheduler
+has already moved the request forward.
+
+Fix: Instead of asserting, log a warning and skip.
+"""
+
+import logging
+import sys
+import builtins
+
+logger = logging.getLogger("scheduler_assertion_patch")
+
+
+def _apply_patch():
+    """Patch vLLM scheduler's _update_from_kv_xfer_finished."""
+    sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler")
+    if sched_mod is None:
+        return
+    req_mod = sys.modules.get("vllm.v1.request")
+    if req_mod is None:
+        return
+    Scheduler = getattr(sched_mod, "Scheduler", None)
+    RequestStatus = getattr(req_mod, "RequestStatus", None)
+    if Scheduler is None or RequestStatus is None:
+        return
+    if getattr(Scheduler, "_kv_xfer_patch_applied", False):
+        return
+
+    _orig_update = Scheduler._update_from_kv_xfer_finished
+
+    def _patched_update(self, kv_connector_output):
+        if self.connector is not None:
+            self.connector.update_connector_output(kv_connector_output)
+        for req_id in kv_connector_output.finished_recving or ():
+            if req_id not in self.requests:
+                continue
+            req = self.requests[req_id]
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                self.finished_recving_kv_req_ids.add(req_id)
+            elif RequestStatus.is_finished(req.status):
+                self._free_blocks(self.requests[req_id])
+            else:
+                logger.warning(
+                    "Stale finished_recving for req %s in status %s; skipping.",
+                    req_id, req.status.name,
+                )
+        for req_id in kv_connector_output.finished_sending or ():
+            if req_id not in self.requests:
+                continue
+            self._free_blocks(self.requests[req_id])
+
+    Scheduler._update_from_kv_xfer_finished = _patched_update
+    Scheduler._kv_xfer_patch_applied = True
+    logger.info("Scheduler KV transfer assertion patch applied")
+
+
+_orig_import = builtins.__import__
+
+
+def _patching_import(name, *args, **kwargs):
+    module = _orig_import(name, *args, **kwargs)
+    if (
+        name == "vllm.v1.core.sched.scheduler"
+        or (
+            name.startswith("vllm")
+            and "vllm.v1.core.sched.scheduler" in sys.modules
+        )
+    ):
+        _apply_patch()
+    return module
+
+
+builtins.__import__ = _patching_import
+_apply_patch()
+PY
+}
+
 # Workaround for MEC FW <177 RCCL memory reclaim issue
 version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
 if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
@@ -54,47 +576,233 @@ export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
 mkdir -p "$RESULT_DIR"
 
-OFFLOAD_ARGS=""
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {
+    # EXIT-trap handler: stop the background LMCache server if it is still alive.
+    [[ -n "$LMCACHE_PID" ]] || return 0
+    kill -0 "$LMCACHE_PID" 2>/dev/null || return 0
+    { kill "$LMCACHE_PID" 2>/dev/null; wait "$LMCACHE_PID" 2>/dev/null; } || true
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {
+    # Stream the LMCache log until the HTTP healthcheck passes. Exits the script
+    # with status 1 if the server dies or LMCACHE_READY_ATTEMPTS (default 120) run out.
+    { set +x; } 2>/dev/null
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
+    # Probe the host the HTTP endpoint binds to (--http-host "$LMCACHE_HOST").
+    local health_url="http://${LMCACHE_HOST:-127.0.0.1}:${LMCACHE_HTTP_PORT}/healthcheck"
+    local tail_pid="" failure="" i  # i is local so the loop counter does not leak
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &
+    tail_pid=$!
+    failure="Timed out waiting for LMCache server healthcheck. Log follows:"
+    for ((i = 1; i <= attempts; i++)); do
+        # --max-time bounds each probe so a hung connection cannot stall the loop.
+        if curl --output /dev/null --silent --fail --max-time 5 "$health_url"; then
+            failure=""; break
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            failure="LMCache server died before becoming healthy. Log follows:"; break
+        fi
+        sleep 1
+    done
+    # One teardown path for the log follower, shared by success and failure.
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    if [[ -z "$failure" ]]; then return 0; fi
+    echo "$failure" >&2
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
 case "$OFFLOADING" in
     none) ;;
     cpu)
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
         # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
-        # reserve 2.5 TB for the simple CPU offload connector (leaves
-        # ~200 GB headroom for worker RSS / page cache / slurm cgroup).
+        # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
+        # worker RSS / page cache / slurm cgroup).
         TOTAL_CPU_DRAM_GB=2500
-        # Pure TP (no DP-attn): single engine, world_size=TP.
-        # SimpleCPUOffloadConnector internally divides cpu_bytes_to_use by
-        # world_size, so pass the full TOTAL_CPU_DRAM_GB.
-        PER_ENGINE_BYTES=$((TOTAL_CPU_DRAM_GB * 1024 * 1024 * 1024))
-        # JSON form (rather than --kv_offloading_backend native shortcut) so
-        # we can pass lazy_offload=true. Eager mode (the shortcut default)
-        # can hit a popleft_n AssertionError in vllm/v1/core/kv_cache_utils.py
-        # at low/mid CONC; lazy defers the store path. Matches the H200
-        # Kimi int4 launcher which cleared 17/17 with this pattern.
-        export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":true}}"
+        # Use vLLM's regular native KV-offload path (OffloadingConnector),
+        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
+        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
+        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
+        # that env var UNSET here so the regular OffloadingConnector path is
+        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
+        # form constructs the KVTransferConfig at engine startup
+        # (vllm/config/vllm.py:662).
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_GB"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        agentic_pip_install --quiet --no-cache-dir lmcache
+        # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
+        # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
+        # during Kimi fused-MoE model inspection it imports nixl_ep whenever
+        # that module is importable, even when this run is not using EP/NIXL
+        # kernels. The CUDA extension then fails immediately on AMD nodes with
+        # "ImportError: libcuda.so.1".
+        #
+        # LMCache MP also uses CuPy stream APIs while registering vLLM's KV
+        # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
+        # with cudaErrorInsufficientDriver when LMCache touches the stream. Use
+        # the ROCm 7 CuPy wheel so the same API dispatches through HIP.
+        python3 -m pip uninstall -y \
+            nixl nixl-cu12 nixl-cu13 nixl_ep \
+            >/dev/null 2>&1 || true
+        python3 -m pip uninstall -y \
+            cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
+            >/dev/null 2>&1 || true
+        agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
+        python3 - <<'PY'
+import importlib.util
+import sys
+
+spec = importlib.util.find_spec("nixl_ep")
+if spec is not None:
+    locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
+    print(
+        "Error: nixl_ep is still importable after LMCache install; "
+        "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
+        f"location={locations}",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+
+try:
+    from cupy_backends.cuda.api import runtime as cupy_runtime
+except Exception as exc:
+    print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
+    sys.exit(1)
+
+if not getattr(cupy_runtime, "is_hip", False):
+    print(
+        "Error: CuPy is still using the CUDA backend after installing "
+        "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+PY
+        LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
+        write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
+        write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR"
+        write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR"
+        export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1
+        export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
+        export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1
+        # Cap external KV tokens loaded per scheduling step to prevent GPU
+        # block exhaustion deadlock at high concurrency (c>=32).  Default
+        # 32768 keeps peak block demand within the GPU KV pool.  Set to 0 to
+        # disable chunking (only safe at low concurrency).
+        export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}"
+        export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
+        # pool, but let the external MP server own that pool so vLLM does not
+        # split --kv-offloading-size across TP ranks through the integrated
+        # LMCache backend.
+        TOTAL_CPU_DRAM_GB=2500
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
+        # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
+        # ZMQ-style host string.
+        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}"
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
+        # LMCache read locks are leases on chunks that lookup has promised
+        # vLLM can retrieve. The default 300s TTL is too short for this
+        # long-context agentic queue: TP8/conc32 can spend >300s between
+        # lookup and retrieve while GPU KV is saturated, which leaves the
+        # object present in L1 but no longer readable. Keep the 2.5 TB pool
+        # size unchanged and only extend the lookup-to-retrieve lease.
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+
+        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+            --disable-hybrid-kv-cache-manager
+        )
         ;;
     *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
 esac
 
-if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel"; else EP=" "; fi
+# Expert parallelism is opt-in: the flag is only added when EP_SIZE > 1.
+# An array (not a string) means an unset flag adds no word to the argv.
+EP_ARGS=()
+if [ "$EP_SIZE" -gt 1 ]; then EP_ARGS=(--enable-expert-parallel); fi
 
 echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
---host 0.0.0.0 \
---port $PORT \
---tensor-parallel-size=$TP \
-$EP \
---gpu-memory-utilization 0.90 \
---block-size=1 \
---trust-remote-code \
---max-num-seqs $CONC \
---mm-encoder-tp-mode data \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
+{ set +x; } 2>/dev/null
+# Build the server argv as an array so each value stays a single word;
+# the optional flag arrays contribute nothing when they are empty.
+VLLM_CMD=(
+    vllm serve "$MODEL"
+    --host 0.0.0.0 --port "$PORT"
+    --tensor-parallel-size="$TP"
+    "${EP_ARGS[@]}"
+    --gpu-memory-utilization 0.90
+    --block-size=1 --trust-remote-code
+    --max-model-len "$MAX_MODEL_LEN"
+    --max-num-seqs "$CONC"
+    --mm-encoder-tp-mode data
+    "${PREFIX_CACHE_ARGS[@]}" "${OFFLOAD_ARGS[@]}"
+)
+# Print the shell-quoted command and save the same text beside the results.
+{ printf '%q ' "${VLLM_CMD[@]}"; printf '\n'; } | tee "$RESULT_DIR/vllm_command.txt"
+# Start the server in the background; stdout and stderr go to SERVER_LOG.
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 
@@ -103,14 +811,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
index 9ebe02ae8..697d3fa45 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -69,14 +64,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
index a69669c07..2fd3b381c 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -70,14 +65,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
index e8b7e49fe..97929e43e 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
@@ -9,13 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -80,14 +75,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
index 1fcbfb4ba..38ef72b56 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
@@ -9,15 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
-DP_ATTENTION=${DP_ATTENTION:-false}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
+
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -30,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
 resolve_trace_source
 install_agentic_deps
 
@@ -80,14 +78,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
index fa9c91a80..4ce131cba 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -29,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
 resolve_trace_source
 install_agentic_deps
 
@@ -85,14 +84,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
index 2516656e2..9f2d83a0b 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -29,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
 resolve_trace_source
 install_agentic_deps
 
@@ -85,14 +84,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
index b339be956..d21690da6 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -29,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
 resolve_trace_source
 install_agentic_deps
 
@@ -79,14 +78,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
index 2e5f96d4f..ed59991cb 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -29,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
 resolve_trace_source
 install_agentic_deps
 
@@ -79,14 +78,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
index 82343bae9..260bbdc68 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -35,6 +29,11 @@ rocm-smi || true
 amd-smi || true
 
 # ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
 resolve_trace_source
 install_agentic_deps
 
@@ -86,14 +85,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
index 509070bf1..edac27a45 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -35,6 +29,11 @@ rocm-smi || true
 amd-smi || true
 
 # ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
 resolve_trace_source
 install_agentic_deps
 
@@ -83,14 +82,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
index 316b35f63..39dd63293 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -35,6 +29,11 @@ rocm-smi || true
 amd-smi || true
 
 # ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
 resolve_trace_source
 install_agentic_deps
 
@@ -87,14 +86,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
index d3c5df245..4ba87976b 100755
--- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR
+check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
 SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-10}
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
@@ -59,7 +53,6 @@ python3 -m sglang.launch_server \
 --chunked-prefill-size 32768 \
 --max-prefill-tokens 32768 \
 --context-length $MAX_MODEL_LEN \
---disable-radix-cache \
 --attention-backend trtllm_mha \
 --moe-runner-backend flashinfer_trtllm \
 --enable-flashinfer-allreduce-fusion \
@@ -75,14 +68,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
index 30b5f8cb9..3432af5c9 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR
+check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
 SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-10}
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
@@ -59,7 +53,6 @@ python3 -m sglang.launch_server \
 --chunked-prefill-size 32768 \
 --max-prefill-tokens 32768 \
 --context-length $MAX_MODEL_LEN \
---disable-radix-cache \
 --attention-backend trtllm_mha \
 --moe-runner-backend flashinfer_trtllm \
 --enable-flashinfer-allreduce-fusion \
@@ -75,14 +68,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
new file mode 100755
index 000000000..9d9c1d7d5
--- /dev/null
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for Qwen3.5 FP8 on B300 using SGLang.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV only; RadixAttention prefix cache stays enabled.
+#   hicache - SGLang HiCache with local CPU hierarchical cache.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-10}
+if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+    MAX_MODEL_LEN=131072
+fi
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+nvidia-smi
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+CACHE_ARGS=()
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # HiCache extends RadixAttention, so do not pass --disable-radix-cache.
+        # B300 nodes have about 2 TB of usable CPU DRAM. Qwen3.5's hybrid
+        # GDN/Mamba path allocates two HiCache host pools per TP rank: one for
+        # hierarchical KV cache and one for hierarchical Mamba cache. Keep this
+        # local to the script because the workflow currently passes a generic
+        # default for TOTAL_CPU_DRAM_GB, not a platform-specific value.
+        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}"
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # SGLang --hicache-size is per rank per host pool, while the workflow
+        # input is a node-total DRAM budget. Divide by TP and the number of
+        # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning.
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size 64
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend kernel
+            --hicache-mem-layout page_first
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+echo "Starting SGLang server..."
+export TORCH_CUDA_ARCH_LIST="10.0"
+export PYTHONNOUSERSITE=1
+export NCCL_NVLS_ENABLE=1
+export SGL_ENABLE_JIT_DEEPGEMM=false
+export SGLANG_ENABLE_FLASHINFER_GEMM=true
+
+{ set +x; } 2>/dev/null
+SGLANG_CMD=(
+    python3 -m sglang.launch_server
+    --model-path="$MODEL"
+    --host=0.0.0.0
+    --port="$PORT"
+    --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8"
+    --trust-remote-code
+    --tensor-parallel-size="$TP"
+    --data-parallel-size=1
+    --expert-parallel-size="$EP_SIZE"
+    --enable-symm-mem
+    --quantization fp8
+    --kv-cache-dtype fp8_e4m3
+    --mamba-ssm-dtype bfloat16
+    --attention-backend trtllm_mha
+    --moe-runner-backend flashinfer_trtllm
+    --cuda-graph-max-bs "$CONC"
+    --max-running-requests "$CONC"
+    --max-prefill-tokens 16384
+    --chunked-prefill-size 16384
+    --mem-fraction-static 0.80
+    --stream-interval 50
+    --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"
+    --tokenizer-worker-num 6
+    --tokenizer-path "$MODEL"
+    --context-length "$MAX_MODEL_LEN"
+    --enable-metrics
+    "${CACHE_ARGS[@]}"
+)
+printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt"
+"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
new file mode 100755
index 000000000..95f0397a0
--- /dev/null
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
@@ -0,0 +1,137 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for Qwen3.5 FP8 on H100 using SGLang.
+#
+# H100 has 80 GB HBM3 (vs B300's 288 GB), so weights + KV fit tighter.
+# Mem-fraction-static lowered to 0.75; chunked-prefill-size 8192 (mirrors
+# fixed_seq_len/qwen3.5_fp8_h100.sh) is currently commented out below.
+# Attention backend is flashinfer (sm_90); trtllm_mha is Blackwell-only.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV only (RadixAttention prefix cache stays on —
+#             agentic workloads rely on >95% theoretical hit rate).
+#   hicache - SGLang HiCache with local CPU hierarchical cache.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-10}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+nvidia-smi
+
+# ---- Resolve traces and install deps ----------------------------------------
+# H100 max_model_len caps at 131k (HBM-bound). The unfiltered with-subagents
+# corpus has requests up to ~1M proxy tokens that the server would reject.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k); even
+# at 131k context, the rejection rate is much lower than against the
+# unfiltered corpus.
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+CACHE_ARGS=()
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # HiCache extends RadixAttention, so do not pass --disable-radix-cache.
+        # H100 nodes typically expose ~1.5-2 TB usable CPU DRAM; Qwen3.5's
+        # hybrid GDN/Mamba path allocates two HiCache host pools per TP rank
+        # (one KV, one Mamba). Workflow passes a generic TOTAL_CPU_DRAM_GB, so
+        # keep the per-rank-per-pool conversion local to this script.
+        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-1500}"
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size 64
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend kernel
+            --hicache-mem-layout page_first
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1
+
+SGLANG_MULTI_TOKENIZER=/sgl-workspace/sglang/python/sglang/srt/managers/multi_tokenizer_mixin.py
+if ! sed -n '/elif isinstance(output, BatchStrOutput):/,/input_token_logprobs_val=_extract_field_by_index/p' "$SGLANG_MULTI_TOKENIZER" \
+    | grep -q 'cached_tokens_details=_extract_field_by_index'; then
+    sed -i '/elif isinstance(output, BatchStrOutput):/,/input_token_logprobs_val=_extract_field_by_index/ {
+        /cached_tokens=_extract_field_by_index(output, "cached_tokens", i),/a\
+            cached_tokens_details=_extract_field_by_index(\
+                output, "cached_tokens_details", i\
+            ),
+    }' "$SGLANG_MULTI_TOKENIZER"
+fi
+
+{ set +x; } 2>/dev/null
+SGLANG_CMD=(
+    python3 -m sglang.launch_server
+    --model-path="$MODEL"
+    --host=0.0.0.0
+    --port="$PORT"
+    --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8"
+    --trust-remote-code
+    --tensor-parallel-size="$TP"
+    --data-parallel-size=1
+    --expert-parallel-size="$EP_SIZE"
+    --quantization fp8
+    --kv-cache-dtype fp8_e4m3
+    --mamba-ssm-dtype bfloat16
+    --attention-backend flashinfer
+    --enable-flashinfer-allreduce-fusion
+    # --cuda-graph-max-bs "$CONC"
+    # --max-running-requests "$CONC"
+    # --max-prefill-tokens 8192
+    # --chunked-prefill-size 8192
+    --mem-fraction-static 0.75
+    --stream-interval 50
+    --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"
+    --tokenizer-worker-num 6
+    --tokenizer-path "$MODEL"
+    --enable-metrics
+    "${CACHE_ARGS[@]}"
+)
+printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt"
+"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index 13efe215e..aef9650ca 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -9,14 +9,8 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR
+check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-MAX_DELAY=${MAX_DELAY:-60}
-ADVANCE_MIN=${ADVANCE_MIN:-0.0}
-ADVANCE_MAX=${ADVANCE_MAX:-0.7}
-EP_SIZE=${EP_SIZE:-1}
 if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
     MAX_MODEL_LEN=131072
 fi
@@ -52,7 +46,6 @@ python3 -m sglang.launch_server \
     --enable-aiter-allreduce-fusion \
     --cuda-graph-max-bs $CONC \
     --max-running-requests $CONC \
-    --disable-radix-cache \
     --max-prefill-tokens 32768 \
     --scheduler-recv-interval 30 \
     --mem-fraction-static 0.8 \
@@ -66,14 +59,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt"
-
-set -x
-$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true
-set +x
-
-write_agentic_result_json "$RESULT_DIR"
-
-# ---- Post-processing --------------------------------------------------------
-python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
-    "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
new file mode 100755
index 000000000..5427d0d31
--- /dev/null
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
@@ -0,0 +1,142 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV only; RadixAttention prefix cache stays enabled.
+#   hicache - SGLang HiCache with local CPU hierarchical cache.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
+if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+    MAX_MODEL_LEN=131072
+fi
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid
+        # GDN/Mamba path allocates two HiCache host pools per TP rank: one for
+        # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB
+        # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per
+        # host pool, not 250 GB. Keep overrides for one-off tuning.
+        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}"
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on
+        # MI355X, which requires page_size=1. The kernel/page_first HiCache
+        # transfer path faults on first prefill in this mode on ROCm, so keep
+        # the default on the safer direct/layer_first copy path. These remain
+        # env-overridable for future SGLang/ROCm fixes.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        # SGLang --hicache-size is per rank per host pool, while the workflow
+        # input is a node-total DRAM budget. Divide by TP and the number of
+        # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning.
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness, but SGLang's internal warmup
+        # request has timed out after 600s on this Qwen MI355X path. Let aiperf
+        # own benchmark traffic instead of blocking server readiness on it.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Keep request concurrency as the swept variable, but do not force
+        # HiCache runs to capture ROCm graphs at every high concurrency point.
+        # The conc=32 HiCache job crashed after startup readiness, before any
+        # aiperf traffic, while conc=16 is the highest known-good capture size
+        # for this model/server path. Requests above the capture size can still
+        # run; they just do not require a larger captured graph at startup.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1
+
+{ set +x; } 2>/dev/null
+SGLANG_CMD=(
+    python3 -m sglang.launch_server
+    --attention-backend triton
+    --model-path "$MODEL"
+    --host=0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size "$TP"
+    --ep-size "$EP_SIZE"
+    --trust-remote-code
+    --tokenizer-worker-num 6
+    --enable-aiter-allreduce-fusion
+    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
+    --max-running-requests "$CONC"
+    --max-prefill-tokens 32768
+    --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"
+    --mem-fraction-static 0.8
+    --context-length "$MAX_MODEL_LEN"
+    --enable-metrics
+    "${CACHE_ARGS[@]}"
+    "${WARMUP_ARGS[@]}"
+)
+printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt"
+"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp4_b200.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200.sh
index 76bfabaf1..fa1fd407f 100644
--- a/benchmarks/single_node/dsr1_fp4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -21,7 +21,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
 if [[ $CONC -ge 16 ]]; then
diff --git a/benchmarks/single_node/dsr1_fp4_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp4_b200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh
index a47abbf21..4a76a82d4 100755
--- a/benchmarks/single_node/dsr1_fp4_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh
@@ -4,7 +4,7 @@
 # Mirrors dsr1_fp4_b200.sh and adds the speculative-* flags from
 # dsr1_fp8_b200_mtp.sh (the production B200 sglang MTP template).
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -31,7 +31,6 @@ if [[ $TP -ne 8 ]]; then
 fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [[ $CONC -ge 16 ]]; then
   SCHEDULER_RECV_INTERVAL=30
diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt.sh
similarity index 98%
rename from benchmarks/single_node/dsr1_fp4_b200_trt.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt.sh
index d57dc72cb..d2186df2c 100644
--- a/benchmarks/single_node/dsr1_fp4_b200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -41,7 +41,6 @@ fi
 echo "MOE_BACKEND set to '$MOE_BACKEND'"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsr1-fp4.yml"
 
 cat > $EXTRA_CONFIG_FILE << EOF
diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt_mtp.sh
similarity index 98%
rename from benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt_mtp.sh
index e4f8b50e7..15d93458a 100644
--- a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -37,7 +37,6 @@ fi
 echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsr1-fp4-mtp.yml"
 
 cat > $EXTRA_CONFIG_FILE << EOF
diff --git a/benchmarks/single_node/dsr1_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b300.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp4_b300.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_b300.sh
index 917f4f5f3..334203123 100644
--- a/benchmarks/single_node/dsr1_fp4_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b300.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # DSR1 FP4 B200 SGLang recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -25,7 +25,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
 if [[ $CONC -ge 16 ]]; then
diff --git a/benchmarks/single_node/dsr1_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh
similarity index 96%
rename from benchmarks/single_node/dsr1_fp4_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh
index a062726df..bb6ce75cb 100644
--- a/benchmarks/single_node/dsr1_fp4_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -28,7 +28,6 @@ if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
 fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/dsr1_fp4_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom.sh
index 31554fc22..6ae8f92ba 100644
--- a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom_mtp.sh
similarity index 96%
rename from benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom_mtp.sh
index 1d557684e..8447a8b2a 100644
--- a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_mtp.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp4_mi355x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_mtp.sh
index a505b65d0..4499736e2 100755
--- a/benchmarks/single_node/dsr1_fp4_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_mtp.sh
@@ -3,7 +3,7 @@
 # DeepSeek-R1-0528 MXFP4 on MI355X with EAGLE/MTP speculative decoding.
 # Mirrors dsr1_fp4_mi355x.sh and adds the speculative-* flags.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -34,7 +34,6 @@ if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
 fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200.sh
similarity index 98%
rename from benchmarks/single_node/dsr1_fp8_b200.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200.sh
index abfecfe44..8a016bb2a 100644
--- a/benchmarks/single_node/dsr1_fp8_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -23,7 +23,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
 if [[ $TP -eq 8 ]]; then
diff --git a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_mtp.sh
similarity index 98%
rename from benchmarks/single_node/dsr1_fp8_b200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_mtp.sh
index 45cfccc3e..1ad0c9041 100755
--- a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -23,7 +23,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 export SGLANG_ENABLE_JIT_DEEPGEMM=false
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # MTP only supports TP=8 for now
 if [[ $TP -ne 8 ]]; then
diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt.sh
similarity index 98%
rename from benchmarks/single_node/dsr1_fp8_b200_trt.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt.sh
index b593535f3..b0457614e 100644
--- a/benchmarks/single_node/dsr1_fp8_b200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -49,7 +49,6 @@ fi
 echo "MOE_BACKEND set to '$MOE_BACKEND'"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsr1-fp8.yml"
 
 cat > $EXTRA_CONFIG_FILE << EOF
diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt_mtp.sh
similarity index 98%
rename from benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt_mtp.sh
index e51b73384..16f13710e 100644
--- a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -55,7 +55,6 @@ fi
 echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml"
 
 cat > $EXTRA_CONFIG_FILE << EOF
diff --git a/benchmarks/single_node/dsr1_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300.sh
similarity index 98%
rename from benchmarks/single_node/dsr1_fp8_b300.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300.sh
index 2d475bc0b..2599b7126 100644
--- a/benchmarks/single_node/dsr1_fp8_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # DSR1 FP8 B200 SGLang recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -27,7 +27,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
 if [[ $TP -eq 8 ]]; then
diff --git a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300_mtp.sh
similarity index 98%
rename from benchmarks/single_node/dsr1_fp8_b300_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300_mtp.sh
index d16cbcf8e..b60971ae5 100755
--- a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300_mtp.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # DSR1 FP8 B200 SGLang MTP recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -27,7 +27,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 export SGLANG_ENABLE_JIT_DEEPGEMM=false
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # MTP only supports TP=8 for now
 if [[ $TP -ne 8 ]]; then
diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp8_h200.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200.sh
index 2c05e8d14..db846b4d2 100644
--- a/benchmarks/single_node/dsr1_fp8_h200.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -19,7 +19,6 @@ pip3 install --user --break-system-packages sentencepiece
 
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
diff --git a/benchmarks/single_node/dsr1_fp8_h200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_mtp.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp8_h200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_mtp.sh
index 7929a0904..611f600f6 100755
--- a/benchmarks/single_node/dsr1_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_mtp.sh
@@ -6,7 +6,7 @@
 # Keeps the H200's flashinfer attention backend (no trtllm_mla path on
 # H200 for this image).
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -33,7 +33,6 @@ if [[ $TP -ne 8 ]]; then
 fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding
 SPECULATIVE_NUM_STEPS=2
diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp8_h200_trt.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt.sh
index 0a62abc90..c59eb8625 100644
--- a/benchmarks/single_node/dsr1_fp8_h200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -28,7 +28,6 @@ MOE_BACKEND="CUTLASS"
 echo "MOE_BACKEND set to '$MOE_BACKEND'"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsr1-fp8.yml"
 
 cat > $EXTRA_CONFIG_FILE << EOF
diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt_mtp.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt_mtp.sh
index fcea69e3d..c544af6ed 100644
--- a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -34,7 +34,6 @@ fi
 echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml"
 
 # If ISL=8192 and DP_ATTENTION=true, export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8192
diff --git a/benchmarks/single_node/dsr1_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi300x.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp8_mi300x.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi300x.sh
index b9d46225e..da95c0e7a 100644
--- a/benchmarks/single_node/dsr1_fp8_mi300x.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi300x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -34,7 +34,6 @@ export SGLANG_USE_AITER=1
 export SGLANG_AITER_MLA_PERSIST=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp8_mi325x.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x.sh
index a06a206d2..6b1c50265 100644
--- a/benchmarks/single_node/dsr1_fp8_mi325x.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/bash
 
 # Source benchmark utilities early
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
diff --git a/benchmarks/single_node/dsr1_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x_mtp.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp8_mi325x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x_mtp.sh
index d792bc7e9..8251c169a 100755
--- a/benchmarks/single_node/dsr1_fp8_mi325x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x_mtp.sh
@@ -3,7 +3,7 @@
 # DeepSeek-R1-0528 FP8 on MI325X with EAGLE/MTP speculative decoding.
 # Mirrors dsr1_fp8_mi325x.sh and adds the speculative-* flags.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x.sh
similarity index 96%
rename from benchmarks/single_node/dsr1_fp8_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x.sh
index ea9ecefe8..d8b596826 100644
--- a/benchmarks/single_node/dsr1_fp8_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -25,7 +25,6 @@ export RCCL_MSCCL_ENABLE=0
 export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/dsr1_fp8_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom.sh
index 31554fc22..6ae8f92ba 100644
--- a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh
similarity index 96%
rename from benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh
index 69179cec0..e4943488f 100644
--- a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_mtp.sh
similarity index 97%
rename from benchmarks/single_node/dsr1_fp8_mi355x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_mtp.sh
index c1d43f153..d8fc1590b 100755
--- a/benchmarks/single_node/dsr1_fp8_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_mtp.sh
@@ -3,7 +3,7 @@
 # DeepSeek-R1-0528 FP8 on MI355X with EAGLE/MTP speculative decoding.
 # Mirrors dsr1_fp8_mi355x.sh and adds the speculative-* flags.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -31,7 +31,6 @@ export RCCL_MSCCL_ENABLE=0
 export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Keep server-side speculative decoding capacity aligned with the matrix row.
 MAX_RUNNING_REQUESTS="${MAX_RUNNING_REQUESTS:-$CONC}"
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh
similarity index 98%
rename from benchmarks/single_node/dsv4_fp4_b200.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh
index 070e987a0..e1d031854 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -35,7 +35,6 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 # Drop the runner conditional once lmsys moves sglang back out of /workspace.
 
 SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
 
 echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
similarity index 98%
rename from benchmarks/single_node/dsv4_fp4_b200_trt.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
index 40669cd15..e4a24dea2 100644
--- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh
@@ -4,7 +4,7 @@
 # already contains a TensorRT-LLM DeepSeek-V4 build; do not build TRTLLM at
 # runtime from this benchmark path.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -54,7 +54,6 @@ fi
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"
 
 MOE_BACKEND="TRTLLM"
diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
similarity index 98%
rename from benchmarks/single_node/dsv4_fp4_b200_trt_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
index d7308bbf5..9e5c88212 100644
--- a/benchmarks/single_node/dsv4_fp4_b200_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh
@@ -3,7 +3,7 @@
 # DeepSeek-V4-Pro B200 TensorRT-LLM MTP variant. The configured image already
 # contains the DeepSeek-V4 TRTLLM build; this path only toggles speculative MTP.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -53,7 +53,6 @@ fi
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"
 
 MOE_BACKEND="TRTLLM"
diff --git a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh
similarity index 98%
rename from benchmarks/single_node/dsv4_fp4_b200_vllm.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh
index 312d41472..1ef273224 100755
--- a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh
@@ -4,7 +4,7 @@
 # sweep. TP mode (dp-attn=false) runs without expert parallel; DP mode
 # (dp-attn=true) enables expert parallel (EP_SIZE=TP value = DP size).
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # DeepSeek-V4-Pro weights are large; engine startup can exceed the default
 # 600s. Give it an hour to load.
diff --git a/benchmarks/single_node/dsv4_fp4_b200_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh
similarity index 98%
rename from benchmarks/single_node/dsv4_fp4_b200_vllm_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh
index 21b40eeb8..6846223e8 100755
--- a/benchmarks/single_node/dsv4_fp4_b200_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh
@@ -5,7 +5,7 @@
 # routes prompts through chat-formatted encoding via --dsv4 (required for
 # meaningful MTP acceptance numbers).
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -27,7 +27,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # DeepSeek-V4-Pro weights are large; engine startup can exceed the default
 # 600s. Give it an hour to load.
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
similarity index 99%
rename from benchmarks/single_node/dsv4_fp4_b300_sglang.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
index 8f43ea8a3..6d406f2eb 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -40,7 +40,6 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 # /workspace.
 
 SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
 
 echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
similarity index 98%
rename from benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
index 03102778d..dc6af5c76 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 # Tuning inputs from the matrix (all required):
 #   TP            -- tensor parallel size                       -> --tp
@@ -51,7 +51,6 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 # /workspace.
 
 SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
 
 echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
similarity index 98%
rename from benchmarks/single_node/dsv4_fp4_b300_trt.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
index 754846912..db27b4f7a 100644
--- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh
@@ -4,7 +4,7 @@
 # already contains a TensorRT-LLM DeepSeek-V4 build; do not build TRTLLM at
 # runtime from this benchmark path.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -54,7 +54,6 @@ fi
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"
 
 MOE_BACKEND="TRTLLM"
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
similarity index 98%
rename from benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
index 8aa9d0e78..c725f350e 100644
--- a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh
@@ -3,7 +3,7 @@
 # DeepSeek-V4-Pro B300 TensorRT-LLM MTP variant. The configured image already
 # contains the DeepSeek-V4 TRTLLM build; this path only toggles speculative MTP.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -53,7 +53,6 @@ fi
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
 EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml"
 
 MOE_BACKEND="TRTLLM"
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
similarity index 97%
rename from benchmarks/single_node/dsv4_fp4_b300_vllm.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
index 92d4bf4ad..947d16a6d 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh
@@ -4,7 +4,7 @@
 # pareto sweep. TP mode (dp-attn=false) runs without expert parallel; DP mode
 # (dp-attn=true) enables expert parallel (EP_SIZE=TP value = DP size).
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # DeepSeek-V4-Pro weights are large; engine startup can exceed the default
 # 600s. Give it an hour to load.
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh
similarity index 97%
rename from benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh
index cb41a9eb1..279e3693a 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -22,7 +22,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/dsv4_fp4_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 4307f9605..6771c1788 100644
--- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "$EP_SIZE" -gt 1 ]; then
   EP=" --enable-expert-parallel"
diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh
similarity index 97%
rename from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh
index a4976bdb0..b02a09489 100755
--- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -83,7 +83,6 @@ export SGLANG_OPT_USE_FUSED_PAGED_COMPRESS=true
 export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=0
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -124,7 +123,7 @@ python3 -m sglang.launch_server \
     --disable-shared-experts-fusion \
     --tool-call-parser deepseekv4 \
     --reasoning-parser deepseek-v4 \
-    --chat-template "$(dirname "$0")/chat_templates/deepseek_v4_thinking.jinja" \
+    --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \
     --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh
similarity index 97%
rename from benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh
index 83d807c6e..dc8989b3e 100755
--- a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh
@@ -21,7 +21,7 @@ set -eo pipefail
 # --compilation-config mode=3 with FULL_AND_PIECEWISE cudagraph mode
 # enables full CUDA graph capture for improved throughput on MI355X.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -47,7 +47,6 @@ fi
 export VLLM_ROCM_USE_AITER=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200.sh
similarity index 97%
rename from benchmarks/single_node/dsv4_fp8_h200.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200.sh
index 51e4a72d2..274dee995 100644
--- a/benchmarks/single_node/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200.sh
@@ -4,7 +4,7 @@
 # the cu129 image and omits the FP4 indexer cache flag (H200 has no FP4
 # path). Max-model-len is pinned at 800k per the recipe.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # DeepSeek-V4-Pro weights are large; engine startup can exceed the default
 # 600s. Give it an hour to load.
diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_mtp.sh
similarity index 98%
rename from benchmarks/single_node/dsv4_fp8_h200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_mtp.sh
index 0446ac6d9..bf37eb2d0 100755
--- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_mtp.sh
@@ -6,7 +6,7 @@
 # routes prompts through chat-formatted encoding via --dsv4 (required for
 # meaningful MTP acceptance numbers per AGENTS.md).
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -28,7 +28,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # DeepSeek-V4-Pro weights are large; engine startup can exceed the default
 # 600s. Give it an hour to load.
diff --git a/benchmarks/single_node/dsv4_fp8_h200_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang.sh
similarity index 96%
rename from benchmarks/single_node/dsv4_fp8_h200_sglang.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang.sh
index bf5c6f7b2..3e7132ebe 100644
--- a/benchmarks/single_node/dsv4_fp8_h200_sglang.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
diff --git a/benchmarks/single_node/dsv4_fp8_h200_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh
similarity index 97%
rename from benchmarks/single_node/dsv4_fp8_h200_sglang_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh
index bcba41543..788eff5b8 100644
--- a/benchmarks/single_node/dsv4_fp8_h200_sglang_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
diff --git a/benchmarks/single_node/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x.sh
similarity index 97%
rename from benchmarks/single_node/glm5.1_fp4_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x.sh
index c280f3c4f..aada63d56 100644
--- a/benchmarks/single_node/glm5.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -x
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -24,7 +24,6 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 export SAFETENSORS_FAST_GPU=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 32))
 
 EVAL_CONTEXT_ARGS=""
diff --git a/benchmarks/single_node/glm5.1_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/glm5.1_fp4_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x_atom.sh
index 036346af3..b1d1b61c8 100644
--- a/benchmarks/single_node/glm5.1_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/glm5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b200.sh
similarity index 96%
rename from benchmarks/single_node/glm5_fp4_b200.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp4_b200.sh
index 53cb8afee..a1ae27021 100755
--- a/benchmarks/single_node/glm5_fp4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -21,7 +21,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
diff --git a/benchmarks/single_node/glm5_fp4_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b200_mtp.sh
similarity index 97%
rename from benchmarks/single_node/glm5_fp4_b200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp4_b200_mtp.sh
index ecd5ca0af..7181ae9bc 100755
--- a/benchmarks/single_node/glm5_fp4_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b200_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -25,7 +25,6 @@ export SGL_ENABLE_JIT_DEEPGEMM=1
 export SGLANG_ENABLE_SPEC_V2=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL"
diff --git a/benchmarks/single_node/glm5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b300.sh
similarity index 97%
rename from benchmarks/single_node/glm5_fp4_b300.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp4_b300.sh
index b751ddf7a..10c8a0e4c 100755
--- a/benchmarks/single_node/glm5_fp4_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b300.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # GLM-5 FP4 B200 SGLang recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -25,7 +25,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
diff --git a/benchmarks/single_node/glm5_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b300_mtp.sh
similarity index 97%
rename from benchmarks/single_node/glm5_fp4_b300_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp4_b300_mtp.sh
index db586dad8..bdea441a8 100755
--- a/benchmarks/single_node/glm5_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b300_mtp.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # GLM5 FP8 B200 SGLang recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -29,7 +29,6 @@ export SGL_ENABLE_JIT_DEEPGEMM=1
 export SGLANG_ENABLE_SPEC_V2=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL"
diff --git a/benchmarks/single_node/glm5_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b200.sh
similarity index 97%
rename from benchmarks/single_node/glm5_fp8_b200.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_b200.sh
index ccaa87b98..2e32a567c 100755
--- a/benchmarks/single_node/glm5_fp8_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -24,7 +24,6 @@ pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
 export SGL_ENABLE_JIT_DEEPGEMM=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL"
diff --git a/benchmarks/single_node/glm5_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b200_mtp.sh
similarity index 97%
rename from benchmarks/single_node/glm5_fp8_b200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_b200_mtp.sh
index 5e4f98533..2c1f6e934 100755
--- a/benchmarks/single_node/glm5_fp8_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b200_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -25,7 +25,6 @@ export SGL_ENABLE_JIT_DEEPGEMM=1
 export SGLANG_ENABLE_SPEC_V2=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL"
diff --git a/benchmarks/single_node/glm5_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b300.sh
similarity index 97%
rename from benchmarks/single_node/glm5_fp8_b300.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_b300.sh
index 730cc3950..b9fe1c351 100644
--- a/benchmarks/single_node/glm5_fp8_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b300.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # GLM5 FP8 B200 SGLang recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -32,7 +32,6 @@ pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
 export SGL_ENABLE_JIT_DEEPGEMM=0
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL"
diff --git a/benchmarks/single_node/glm5_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b300_mtp.sh
similarity index 97%
rename from benchmarks/single_node/glm5_fp8_b300_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_b300_mtp.sh
index 0d4290dd3..5389e6a08 100755
--- a/benchmarks/single_node/glm5_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b300_mtp.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # GLM5 FP8 B200 SGLang recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -33,7 +33,6 @@ export SGL_ENABLE_JIT_DEEPGEMM=0
 export SGLANG_ENABLE_SPEC_V2=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL"
diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_h200.sh
similarity index 96%
rename from benchmarks/single_node/glm5_fp8_h200.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_h200.sh
index 410c66942..266587de9 100644
--- a/benchmarks/single_node/glm5_fp8_h200.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_h200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/glm5_fp8_h200_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_h200_mtp.sh
similarity index 96%
rename from benchmarks/single_node/glm5_fp8_h200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_h200_mtp.sh
index ea7eaccde..133d757dc 100755
--- a/benchmarks/single_node/glm5_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_h200_mtp.sh
@@ -6,7 +6,7 @@
 # nsa/trtllm-mha) since those backends are Blackwell-specific and not
 # applicable to Hopper.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/glm5_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x.sh
similarity index 96%
rename from benchmarks/single_node/glm5_fp8_mi325x.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x.sh
index c1d24b76d..0564ef8d8 100755
--- a/benchmarks/single_node/glm5_fp8_mi325x.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -18,7 +18,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/glm5_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x_mtp.sh
similarity index 97%
rename from benchmarks/single_node/glm5_fp8_mi325x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x_mtp.sh
index 5e771e74e..fb77d84c2 100755
--- a/benchmarks/single_node/glm5_fp8_mi325x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x_mtp.sh
@@ -3,7 +3,7 @@
 # GLM-5 FP8 on MI325X with EAGLE / MTP speculative decoding.
 # Mirrors glm5_fp8_mi325x.sh and adds the speculative-* flags.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -22,7 +22,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/glm5_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x.sh
similarity index 96%
rename from benchmarks/single_node/glm5_fp8_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x.sh
index cd99536b9..21defe90c 100755
--- a/benchmarks/single_node/glm5_fp8_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -23,7 +23,6 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 export SAFETENSORS_FAST_GPU=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/glm5_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/glm5_fp8_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_atom.sh
index 036346af3..b1d1b61c8 100644
--- a/benchmarks/single_node/glm5_fp8_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_mtp.sh
similarity index 97%
rename from benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_mtp.sh
index 49561dcde..90fa04f5d 100755
--- a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_mtp.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -x
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -25,7 +25,6 @@ export SAFETENSORS_FAST_GPU=1
 export SGLANG_ENABLE_SPEC_V2=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 32))
 
 EVAL_CONTEXT_ARGS=""
diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200.sh
similarity index 97%
rename from benchmarks/single_node/gptoss_fp4_b200.sh
rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200.sh
index 8ff373b63..743974df3 100644
--- a/benchmarks/single_node/gptoss_fp4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -48,7 +48,6 @@ export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
diff --git a/benchmarks/single_node/gptoss_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200_trt.sh
similarity index 98%
rename from benchmarks/single_node/gptoss_fp4_b200_trt.sh
rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200_trt.sh
index 60bc9eb71..ced9162f9 100644
--- a/benchmarks/single_node/gptoss_fp4_b200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200_trt.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 # Source benchmark utilities early
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h100.sh
similarity index 96%
rename from benchmarks/single_node/gptoss_fp4_h100.sh
rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_h100.sh
index 7208e1b19..dfd842a88 100644
--- a/benchmarks/single_node/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h100.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -34,7 +34,6 @@ EOF
 export PYTHONNOUSERSITE=1
 export VLLM_MXFP4_USE_MARLIN=1
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200.sh
similarity index 96%
rename from benchmarks/single_node/gptoss_fp4_h200.sh
rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200.sh
index 0c1b03bbb..b65c86782 100644
--- a/benchmarks/single_node/gptoss_fp4_h200.sh
+++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -47,7 +47,6 @@ EOF
 
 SERVER_LOG=/workspace/server.log
 export TORCH_CUDA_ARCH_LIST="9.0"
-PORT=${PORT:-8888}
 
 export VLLM_MXFP4_USE_MARLIN=1
 
diff --git a/benchmarks/single_node/gptoss_fp4_h200_trt.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200_trt.sh
similarity index 96%
rename from benchmarks/single_node/gptoss_fp4_h200_trt.sh
rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200_trt.sh
index 3da862a0d..02dd05bc9 100644
--- a/benchmarks/single_node/gptoss_fp4_h200_trt.sh
+++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200_trt.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 set +x
 
diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi300x.sh
similarity index 97%
rename from benchmarks/single_node/gptoss_fp4_mi300x.sh
rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi300x.sh
index 572d6b279..c18a5a3ee 100644
--- a/benchmarks/single_node/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi300x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -40,7 +40,6 @@ ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
 FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi325x.sh
similarity index 97%
rename from benchmarks/single_node/gptoss_fp4_mi325x.sh
rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi325x.sh
index 572d6b279..c18a5a3ee 100644
--- a/benchmarks/single_node/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi325x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -40,7 +40,6 @@ ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
 FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh
similarity index 97%
rename from benchmarks/single_node/gptoss_fp4_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh
index 3db687e22..14dedb141 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -41,7 +41,6 @@ ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
 FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/gptoss_fp4_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x_atom.sh
index ee0810e8f..d3a8a66a1 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/kimik2.5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh
similarity index 97%
rename from benchmarks/single_node/kimik2.5_fp4_b200.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh
index b4e85c14f..59b55c90c 100644
--- a/benchmarks/single_node/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -24,7 +24,6 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
similarity index 96%
rename from benchmarks/single_node/kimik2.5_fp4_b300.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
index 44a06ebd5..7526e57c2 100755
--- a/benchmarks/single_node/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -28,7 +28,6 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x.sh
similarity index 97%
rename from benchmarks/single_node/kimik2.5_fp4_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x.sh
index 56e927efc..d4616143a 100755
--- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -29,7 +29,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/kimik2.5_fp4_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x_atom.sh
index ca84f8228..6730aded2 100755
--- a/benchmarks/single_node/kimik2.5_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/kimik2.5_int4_b200.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b200.sh
similarity index 96%
rename from benchmarks/single_node/kimik2.5_int4_b200.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b200.sh
index 6dd4998ca..cbef22d67 100755
--- a/benchmarks/single_node/kimik2.5_int4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -24,7 +24,6 @@ export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/kimik2.5_int4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b300.sh
similarity index 96%
rename from benchmarks/single_node/kimik2.5_int4_b300.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b300.sh
index 6674ad8dd..432f97299 100755
--- a/benchmarks/single_node/kimik2.5_int4_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b300.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -28,7 +28,6 @@ export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/kimik2.5_int4_h200.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_h200.sh
similarity index 96%
rename from benchmarks/single_node/kimik2.5_int4_h200.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_h200.sh
index 1c25d791a..1f18032ff 100755
--- a/benchmarks/single_node/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_h200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -23,7 +23,6 @@ nvidia-smi
 export PYTHONNOUSERSITE=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/kimik2.5_int4_mi300x.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi300x.sh
similarity index 96%
rename from benchmarks/single_node/kimik2.5_int4_mi300x.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi300x.sh
index bb653a7b6..bb5145a66 100755
--- a/benchmarks/single_node/kimik2.5_int4_mi300x.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi300x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -24,7 +24,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # following AMD andy luo's recipe
 # https://x.com/linluo77/status/2017024513595301985
diff --git a/benchmarks/single_node/kimik2.5_int4_mi325x.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi325x.sh
similarity index 96%
rename from benchmarks/single_node/kimik2.5_int4_mi325x.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi325x.sh
index bb653a7b6..bb5145a66 100755
--- a/benchmarks/single_node/kimik2.5_int4_mi325x.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi325x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -24,7 +24,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # following AMD andy luo's recipe
 # https://x.com/linluo77/status/2017024513595301985
diff --git a/benchmarks/single_node/kimik2.5_int4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi355x.sh
similarity index 96%
rename from benchmarks/single_node/kimik2.5_int4_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi355x.sh
index 24685a7e3..5c6b8c73a 100755
--- a/benchmarks/single_node/kimik2.5_int4_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -24,7 +24,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b200.sh
similarity index 96%
rename from benchmarks/single_node/minimaxm2.5_fp4_b200.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b200.sh
index 27aef1cc9..fc7877a1c 100755
--- a/benchmarks/single_node/minimaxm2.5_fp4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -23,7 +23,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
diff --git a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b300.sh
similarity index 97%
rename from benchmarks/single_node/minimaxm2.5_fp4_b300.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b300.sh
index a2861b441..1253c116d 100755
--- a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b300.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -27,7 +27,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
diff --git a/benchmarks/single_node/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh
similarity index 97%
rename from benchmarks/single_node/minimaxm2.5_fp4_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh
index 4d8fbc9ed..28677ae1e 100755
--- a/benchmarks/single_node/minimaxm2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -36,7 +36,6 @@ EXTRA_VLLM_ARGS=""
 # fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/minimaxm2.5_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/minimaxm2.5_fp4_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x_atom.sh
index ca84f8228..6730aded2 100644
--- a/benchmarks/single_node/minimaxm2.5_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b200.sh
similarity index 96%
rename from benchmarks/single_node/minimaxm2.5_fp8_b200.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b200.sh
index 19b62239d..9897afca3 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -22,7 +22,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b300.sh
similarity index 96%
rename from benchmarks/single_node/minimaxm2.5_fp8_b300.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b300.sh
index 30821961f..d5b03b59a 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b300.sh
@@ -4,7 +4,7 @@
 # does not have a B300-specific recipe, so this script reuses the existing
 # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h100.sh
similarity index 96%
rename from benchmarks/single_node/minimaxm2.5_fp8_h100.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h100.sh
index 258ec7dc1..012c8b535 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h100.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -23,7 +23,6 @@ nvidia-smi
 export PYTHONNOUSERSITE=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h200.sh
similarity index 97%
rename from benchmarks/single_node/minimaxm2.5_fp8_h200.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h200.sh
index 2e87cd828..eab6e6087 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi300x.sh
similarity index 96%
rename from benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi300x.sh
index 65cb8ee8e..8a95dc138 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi300x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ fi
 export VLLM_ROCM_USE_AITER=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi325x.sh
similarity index 96%
rename from benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi325x.sh
index 13867ce7e..06ad39726 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi325x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -29,7 +29,6 @@ fi
 export VLLM_ROCM_USE_AITER=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "$EP_SIZE" -gt 1 ]; then
   EP=" --enable-expert-parallel"
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x.sh
similarity index 98%
rename from benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x.sh
index 56bae46f0..5093a56d6 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -65,7 +65,6 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
 fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/minimaxm2.5_fp8_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x_atom.sh
index 2a8c67da0..325c97726 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/qwen3.5_bf16_b200.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_bf16_b200.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200.sh
index 4087d7973..3f7c6a314 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true
 export PYTHONUNBUFFERED=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
 if [[ $CONC -ge 16 ]]; then
diff --git a/benchmarks/single_node/qwen3.5_bf16_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200_mtp.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_bf16_b200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200_mtp.sh
index 319d39f58..be314c872 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true
 export PYTHONUNBUFFERED=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
 if [[ $CONC -ge 16 ]]; then
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_bf16_b300.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300.sh
index f1056c896..48dc98fa9 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true
 export PYTHONUNBUFFERED=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
 if [[ $CONC -ge 16 ]]; then
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300_mtp.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300_mtp.sh
index 705ca9775..774ca8a3c 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -26,7 +26,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true
 export PYTHONUNBUFFERED=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
 if [[ $CONC -ge 16 ]]; then
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi300x.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_bf16_mi300x.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi300x.sh
index 644b6db8c..32fe60a73 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi300x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -18,7 +18,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_bf16_mi325x.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x.sh
index 644b6db8c..32fe60a73 100644
--- a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -18,7 +18,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x_mtp.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_bf16_mi325x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x_mtp.sh
index 65e68e9c8..e9df93c7d 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi325x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -19,7 +19,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_bf16_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x.sh
index d149e7a40..1661df465 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -19,7 +19,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x_mtp.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_bf16_mi355x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x_mtp.sh
index 87605fa80..38230cc88 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -19,7 +19,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/qwen3.5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp4_b200.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200.sh
index 76dbf5e0f..638bc85ec 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -21,7 +21,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/qwen3.5_fp4_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_mtp.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_fp4_b200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_mtp.sh
index 55e1bd723..5da51d974 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -21,7 +21,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/qwen3.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_fp4_b300.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300.sh
index 18b6cda09..84205cf51 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300.sh
@@ -3,7 +3,7 @@
 # Follows the SGLang cookbook recipe at
 # https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -29,7 +29,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true
 export PYTHONUNBUFFERED=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC >= 16, relax to ~30 requests between scheduler recv polls.
 if [[ $CONC -ge 16 ]]; then
diff --git a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300_mtp.sh
similarity index 98%
rename from benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300_mtp.sh
index 9cb5d5464..0cac9bef7 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300_mtp.sh
@@ -3,7 +3,7 @@
 # Follows the SGLang cookbook recipe at
 # https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -29,7 +29,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true
 export PYTHONUNBUFFERED=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # Default: recv every ~10 requests; if CONC >= 16, relax to ~30 requests between scheduler recv polls.
 if [[ $CONC -ge 16 ]]; then
diff --git a/benchmarks/single_node/qwen3.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp4_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh
index 2a0976f8d..e400729ff 100644
--- a/benchmarks/single_node/qwen3.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 export SGLANG_USE_AITER=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/qwen3.5_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp4_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_atom.sh
index 2a8c67da0..325c97726 100644
--- a/benchmarks/single_node/qwen3.5_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/qwen3.5_fp4_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp4_mi355x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
index 425fbfcc6..e98dec2db 100755
--- a/benchmarks/single_node/qwen3.5_fp4_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ hf download "$MODEL"
 export SGLANG_USE_AITER=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8}
 
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/qwen3.5_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp8_b200.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200.sh
index 2450493be..4b9005eb8 100755
--- a/benchmarks/single_node/qwen3.5_fp8_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -21,7 +21,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200_mtp.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200_mtp.sh
index f6ef90864..a7093d4b8 100755
--- a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -21,7 +21,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp8_b300.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300.sh
index cbceb6f1b..6644c1320 100644
--- a/benchmarks/single_node/qwen3.5_fp8_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -19,7 +19,6 @@ fi
 nvidia-smi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300_mtp.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300_mtp.sh
index ca3b87120..7e799875c 100644
--- a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -19,7 +19,6 @@ fi
 nvidia-smi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 if [ "${EVAL_ONLY}" = "true" ]; then
diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_fp8_h100.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100.sh
index 4c70657aa..daf03a05d 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h100.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100.sh
@@ -7,7 +7,7 @@
 # chunked-prefill-size from 16384 → 8192 to leave more headroom.
 # Sweep tops out at conc=32 instead of 64 for the same reason.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -28,7 +28,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 MAX_SEQ_LEN=$((ISL + OSL + 20))
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100_mtp.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100_mtp.sh
index 86b35f5e7..faa666f8b 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100_mtp.sh
@@ -4,7 +4,7 @@
 # Mirrors qwen3.5_fp8_h100.sh; adds the speculative-* flags + SGLANG_ENABLE_SPEC_V2=1
 # and passes --use-chat-template per AGENTS.md.
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -27,7 +27,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 export SGLANG_ENABLE_SPEC_V2=1
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 MAX_SEQ_LEN=$((ISL + OSL + 20))
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/qwen3.5_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp8_h200.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200.sh
index a8071c520..07ce08a58 100644
--- a/benchmarks/single_node/qwen3.5_fp8_h200.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -21,7 +21,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 MAX_SEQ_LEN=$((ISL + OSL + 20))
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200_mtp.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200_mtp.sh
index b68c9d060..98c1ec9db 100644
--- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -22,7 +22,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding
 SPECULATIVE_NUM_STEPS=3
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi300x.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp8_mi300x.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi300x.sh
index 760f01403..e1607860d 100755
--- a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi300x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -18,7 +18,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp8_mi325x.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x.sh
index 760f01403..e1607860d 100755
--- a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -18,7 +18,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x_mtp.sh
similarity index 97%
rename from benchmarks/single_node/qwen3.5_fp8_mi325x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x_mtp.sh
index 3a8c1d3dd..a8e04064b 100755
--- a/benchmarks/single_node/qwen3.5_fp8_mi325x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -19,7 +19,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp8_mi355x.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh
index d149e7a40..1661df465 100644
--- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -19,7 +19,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp8_mi355x_atom.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom.sh
index 2a8c67da0..325c97726 100644
--- a/benchmarks/single_node/qwen3.5_fp8_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom_mtp.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp8_mi355x_atom_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom_mtp.sh
index 50d90f380..29351cf33 100644
--- a/benchmarks/single_node/qwen3.5_fp8_mi355x_atom_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -20,7 +20,6 @@ fi
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 
 export OMP_NUM_THREADS=1
 
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh
similarity index 96%
rename from benchmarks/single_node/qwen3.5_fp8_mi355x_mtp.sh
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh
index 87605fa80..38230cc88 100755
--- a/benchmarks/single_node/qwen3.5_fp8_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-source "$(dirname "$0")/../benchmark_lib.sh"
+source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
@@ -19,7 +19,6 @@ fi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
 CONTEXT_LENGTH=$((ISL + OSL + 20))
 MAX_PREFILL_TOKENS=32768
 
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 25e7f4db5..875cbcdd5 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -5,13 +5,35 @@
 # the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). The SGLang
 # recipes are copied exactly from the pinned srt-slurm commit below.
 
-set -x
+# -e: abort on any unhandled error. -o pipefail: pipeline fails if any
+# stage fails. Without these, errors like a bad `git checkout SHA` get
+# silently swallowed and the script continues with broken state. R5 of
+# dsv4-fp4-gb300-dynamo-vllm-agentic caught this — a bad checkout left
+# the cw shards on origin/HEAD (which happened to be the right commit),
+# masking the bug entirely until upstream main moves and breaks us.
+set -exo pipefail
 
 if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
     # Weights staged on shared storage; avoid node-local /scratch symlink drift.
     export MODEL_PATH="/mnt/vast/models/dsv4"
 
-    if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
+    if [[ "$IS_AGENTIC" == "1" ]]; then
+        # Agentic multi-node uses upstream NVIDIA/srt-slurm at a commit pinned
+        # from main (see REF below), which has every schema feature we need:
+        #   - BenchmarkType.CUSTOM + benchmark.command + benchmark.env
+        #     (the hook that hands off to benchmarks/multi_node/agentic_srt.sh)
+        #   - DynamoConfig.wheel (so our vllm recipes can pin the same
+        #     ai-dynamo wheel as the fixed-seq-len path)
+        #   - default_bash_preamble (no more "Unknown field" warning)
+        # Per-worker --mem=0 is set via `srun_options:` in the recipe yaml
+        # (a documented top-level field that srtctl threads through to
+        # start_srun_process → see docs/config-reference.md#srun_options).
+        # Pin to HEAD as of when this landed; bump as upstream evolves.
+        SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git"
+        SRT_SLURM_RECIPES_REF="127597c2926467db06e6707e0aa9227261c6c02a"
+        SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic"
+        SRT_RECIPE_DST="recipes/vllm/deepseek-v4/agentic"
+    elif [[ $FRAMEWORK == "dynamo-sglang" ]]; then
         SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git"
         SRT_SLURM_RECIPES_REF="main"
         SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4"
@@ -45,6 +67,15 @@ export SLURM_ACCOUNT="cw-sup"
 export NVIDIA_VISIBLE_DEVICES=all
 export NVIDIA_DRIVER_CAPABILITIES=compute,utility
 
+# Host-side directory holding aiperf's content-addressed dataset mmap cache.
+# Bind-mounted into worker containers at /aiperf_mmap_cache via the
+# default_mounts: block in srtslurm.yaml below; aiperf reads it via
+# AIPERF_DATASET_MMAP_CACHE_DIR (set in each agentic recipe's benchmark.env).
+# Without it, every run re-tokenizes and re-writes ~65 GB of mmap files
+# per dataset on first use. The directory is expected to be mode 777 so
+# that all gharunner_X SLURM users can write to it.
+export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/ai-perf-cache"
+
 NGINX_IMAGE="nginx:1.27.4"
 
 # Squash files live alongside models on /mnt/vast (shared across nodes).
@@ -132,7 +163,11 @@ if [ -e "$HOME/.local/bin/uv" ]; then
     exit 1
 fi
 
-uv venv
+# --seed installs pip+setuptools+wheel into the venv. Without it, the
+# upstream prefetch-ai-dynamo-wheel.sh script (called by srtctl when a
+# recipe has dynamo.wheel set) fails with "No module named pip" because
+# uv venv defaults to no-pip.
+uv venv --seed
 source .venv/bin/activate
 uv pip install -e .
 
@@ -173,6 +208,7 @@ srtctl_root: "${SRTCTL_ROOT}"
 
 default_mounts:
   ${DYNAMO_WHEELS_CACHE_HOST}: /configs/dynamo-wheels
+  ${AIPERF_MMAP_CACHE_HOST_PATH}: /aiperf_mmap_cache
 
 model_paths:
   dspro: "${MODEL_PATH}"
@@ -243,6 +279,23 @@ echo "Extracted JOB_ID: $JOB_ID"
 LOGS_DIR="outputs/$JOB_ID/logs"
 LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log"
 
+# Snapshot worker logs whenever this script exits: normal completion,
+# an error exit, or SIGTERM (what `gh run cancel` sends to the
+# launcher). A SIGKILL of this shell itself cannot be trapped, so that
+# case is not covered. Without the trap, a manual `gh run cancel`
+# during the tail wait skips the snapshot entirely and the
+# `Upload server logs` workflow step finds nothing to upload.
+# The main flow at the bottom of this script no longer copies or tars;
+# it only logs that this trap will produce the artifact. `cp -rT` keeps
+# reruns idempotent: an existing LOGS dir is refreshed, not nested into.
+_snapshot_server_logs() {
+    if [ -n "${LOGS_DIR:-}" ] && [ -d "$LOGS_DIR" ] && [ -n "${GITHUB_WORKSPACE:-}" ]; then
+        cp -rT "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" 2>/dev/null || true
+        tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 2>/dev/null || true
+    fi
+}
+trap _snapshot_server_logs EXIT
+
 while ! ls "$LOG_FILE" &>/dev/null; do
     if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
         echo "ERROR: Job $JOB_ID failed before creating log file"
@@ -273,8 +326,9 @@ echo "Collecting results..."
 
 if [ -d "$LOGS_DIR" ]; then
     echo "Found logs directory: $LOGS_DIR"
-    cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
-    tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+    # Tarball + LOGS copy are produced by the EXIT trap defined near
+    # JOB_ID extraction (so cancel paths also get them); just log here.
+    echo "multinode_server_logs.tar.gz will be (re)produced on script EXIT."
 else
     echo "Warning: Logs directory not found at $LOGS_DIR"
 fi
diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh
index 5248e63ed..ec8c1adb7 100644
--- a/runners/launch_gb300-nv.sh
+++ b/runners/launch_gb300-nv.sh
@@ -8,6 +8,15 @@ export SLURM_PARTITION="batch_1"
 export SLURM_ACCOUNT="benchmark"
 export ENROOT_ROOTFS_WRITABLE=1
 
+# Host-side directory holding aiperf's content-addressed dataset mmap cache.
+# Bind-mounted into worker containers at /aiperf_mmap_cache via the
+# default_mounts: block in srtslurm.yaml below; aiperf reads it via
+# AIPERF_DATASET_MMAP_CACHE_DIR (set in each agentic recipe's benchmark.env).
+# Without it, every run re-tokenizes and re-writes ~65 GB of mmap files
+# per dataset on first use. 777 mode so all gharunner_X SLURM users can
+# write to it.
+export AIPERF_MMAP_CACHE_HOST_PATH="/data/home/sa-shared/gharunners/ai-perf-cache"
+
 export MODEL_PATH=$MODEL
 
 if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
@@ -19,6 +28,15 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
     export MODEL_PATH=/scratch/models/DeepSeek-R1-0528
     export SRT_SLURM_MODEL_PREFIX="dsr1-fp8"
 elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
+    # Use the node-local /scratch SSD for the 806 GB DSv4-Pro
+    # checkpoint. Faster than the Vast NFS path, but this dir only
+    # exists on compute nodes — the GHA runner pod does NOT have
+    # /scratch/models, so srtctl preflight (which stats the path from
+    # the runner pod) may fail with "Model alias resolved to
+    # /scratch/models/DeepSeek-V4-Pro, but that path is unavailable."
+    # Agentic runs sidestep this via `srtctl apply --no-preflight`
+    # below; for other runs, either (a) patch srt-slurm to add a
+    # skip_model_preflight field, or (b) symlink the NFS copy on the pod.
     export MODEL_PATH=/scratch/models/DeepSeek-V4-Pro
     export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
 elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then
@@ -31,8 +49,14 @@ fi
 
 NGINX_IMAGE="nginx:1.27.4"
 
-SQUASH_FILE="/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-NGINX_SQUASH_FILE="/home/sa-shared/gharunners/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+# Squash files live on the Vast NFS storage; use the /data/ mount
+# (not /home/sa-shared/) — both are the same backing storage but the
+# /home/sa-shared/ mount has a chronic ELOOP / "Too many levels of
+# symbolic links" bug from workflow worker NFS sessions on lockfiles
+# AND data files. /data/ has a separate NFS client cache that isn't
+# poisoned. See feedback_gb300_nfs_eloop_workaround for diagnosis.
+SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+NGINX_SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 
 # Run the import on a compute node via srun, not on the login node:
 # the login node is x86_64 while the compute nodes are aarch64, so the
@@ -65,7 +89,40 @@ RUN_KEY=$(printf "%s" "${RESULT_FILENAME:-${RUNNER_NAME:-gb300-nv}}" | sha1sum |
 SRT_REPO_DIR="${GITHUB_WORKSPACE}/srt-slurm-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}"
 rm -rf "$SRT_REPO_DIR"
 
-if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then
+if [[ "$IS_AGENTIC" == "1" ]]; then
+    # Agentic multi-node uses cquil11/srt-slurm-nv@cam/no-preflight-flag,
+    # a thin branch off NVIDIA/srt-slurm@127597c (pinned by SHA below).
+    # Its main addition is `srtctl apply --no-preflight`, needed because:
+    #
+    #   - We want MODEL_PATH=/scratch/models/DeepSeek-V4-Pro (node-local
+    #     NVMe, fast) instead of the NFS path under /data/home/sa-shared.
+    #   - /scratch only exists on GB300 compute nodes; it is NOT mounted
+    #     on the GHA runner pod that invokes srtctl.
+    #   - srtctl's pre-submit model check (_preflight_model in
+    #     src/srtctl/core/validation.py) does a Path.is_dir() in-process
+    #     on the invoking node — so it fails before sbatch is ever
+    #     called with "Model alias 'X' resolved to '/scratch/...',
+    #     but that path is unavailable".
+    #   - --no-preflight skips just the optional Python-level FS check.
+    #     vLLM still fails loudly at runtime if the path is genuinely
+    #     missing on the compute node.
+    #
+    # All other upstream schema features we need are inherited from
+    # NVIDIA HEAD:
+    #   - BenchmarkType.CUSTOM + benchmark.command + benchmark.env
+    #     (hook that hands off to benchmarks/multi_node/agentic_srt.sh)
+    #   - DynamoConfig.wheel (so vllm recipes can pin the ai-dynamo wheel)
+    #   - sbatch_directives / srun_options (top-level recipe fields)
+    git clone https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR"
+    cd "$SRT_REPO_DIR"
+    # 854b3fd = --no-preflight flag
+    # 6e34b8b = benchmark_stage propagates srun_options (needed for
+    #           container-remap-root to reach the agentic_srt.sh srun)
+    git checkout 6e34b8b83229634d732e41a4e2d6595f46ef60b5
+    mkdir -p recipes/vllm/deepseek-v4/agentic
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic" \
+        recipes/vllm/deepseek-v4/agentic
+elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
     git checkout aflowers/gb200-dsv4-recipes
@@ -95,7 +152,11 @@ export PATH="$UV_INSTALL_DIR:$PATH"
 
 VENV_DIR="${GITHUB_WORKSPACE}/.venv-srt-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}"
 rm -rf "$VENV_DIR"
-uv venv "$VENV_DIR"
+# --seed installs pip+setuptools+wheel into the venv. Without it, the
+# upstream prefetch-ai-dynamo-wheel.sh script (called by srtctl when a
+# recipe has dynamo.wheel set) fails with "No module named pip" because
+# uv venv defaults to no-pip.
+uv venv --seed "$VENV_DIR"
 source "$VENV_DIR/bin/activate"
 uv pip install -e .
 
@@ -124,6 +185,13 @@ network_interface: ""
 # Path to srtctl repo root (where the configs live)
 srtctl_root: "${SRTCTL_ROOT}"
 
+# Cluster-level bind mounts applied to every worker container
+# (see srtctl/core/runtime.py — get_srtslurm_setting("default_mounts")).
+# Used here for aiperf's persistent mmap cache so the dataset isn't
+# re-tokenized + re-written every job.
+default_mounts:
+  "${AIPERF_MMAP_CACHE_HOST_PATH}": "/aiperf_mmap_cache"
+
 # Model path aliases
 model_paths:
   "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}"
@@ -155,7 +223,18 @@ fi
 # Override the job name in the config file with the runner name
 sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
 
-SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
+# --no-preflight is only safe on the agentic path, where the recipe
+# resolves model.path to /scratch (compute-node-only NVMe) and the
+# srtctl process running on the GHA runner pod can't see it. Fixed-
+# seq-len recipes still resolve model.path to an NFS-visible location
+# where the precheck is a useful sanity guard, so keep enforcement on
+# for them.
+PREFLIGHT_FLAG=""
+if [[ "$IS_AGENTIC" == "1" ]]; then
+    PREFLIGHT_FLAG="--no-preflight"
+fi
+
+SRTCTL_OUTPUT=$(srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
 echo "$SRTCTL_OUTPUT"
 
 JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+')
@@ -174,6 +253,26 @@ echo "Extracted JOB_ID: $JOB_ID"
 LOGS_DIR="outputs/$JOB_ID/logs"
 LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log"
 
+# Snapshot worker logs whenever this script exits: normal completion,
+# an error exit, or SIGTERM (what `gh run cancel` sends to the
+# launcher). A SIGKILL of this shell itself cannot be trapped, so that
+# case is not covered. Without the trap, a manual `gh run cancel`
+# during the tail wait skips the snapshot entirely and the
+# `Upload server logs` workflow step finds nothing to upload.
+# The main flow below no longer copies or tars on its own; it calls this
+# function once before cleanup. `cp -rT` keeps repeat calls idempotent:
+# an existing LOGS dir is refreshed in place instead of nested into.
+_snapshot_server_logs() {
+    if [ -n "${LOGS_DIR:-}" ] && [ -d "$LOGS_DIR" ] && [ -n "${GITHUB_WORKSPACE:-}" ]; then
+        # Copy + tar are independent best-effort; an in-flight write
+        # from a worker .out file at SIGTERM time would otherwise abort
+        # the whole script before either succeeds.
+        cp -rT "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" 2>/dev/null || true
+        tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 2>/dev/null || true
+    fi
+}
+trap _snapshot_server_logs EXIT
+
 # Wait for log file to appear (also check job is still alive)
 while ! ls "$LOG_FILE" &>/dev/null; do
     if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
@@ -207,8 +306,9 @@ echo "Collecting results..."
 
 if [ -d "$LOGS_DIR" ]; then
     echo "Found logs directory: $LOGS_DIR"
-    cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
-    tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+    # Tarball + LOGS copy are produced by the EXIT trap defined near
+    # JOB_ID extraction (so cancel paths also get them); just log here.
+    echo "multinode_server_logs.tar.gz will be (re)produced on script EXIT."
 else
     echo "Warning: Logs directory not found at $LOGS_DIR"
 fi
@@ -281,6 +381,12 @@ if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then
     fi
 fi
 
+# Snapshot logs to GITHUB_WORKSPACE BEFORE cleanup. The EXIT trap only
+# fires after the rm below, by which point its `[ -d "$LOGS_DIR" ]`
+# guard is already false and it would copy nothing.
+# Without this inline call, R25 lost both 1p6d shards' logs.
+_snapshot_server_logs
+
 # Clean up srt-slurm outputs to prevent NFS silly-rename lock files
 # from blocking the next job's checkout on this runner
 echo "Cleaning up srt-slurm outputs..."
diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh
index b4f594d51..988addedd 100644
--- a/runners/launch_h100-dgxc-slurm.sh
+++ b/runners/launch_h100-dgxc-slurm.sh
@@ -280,6 +280,7 @@ EOF
 else
 
     HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/"
+    AIPERF_MMAP_CACHE_HOST_PATH="/mnt/nfs/sa-shared/gharunners/ai-perf-cache"
     SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     LOCK_FILE="${SQUASH_FILE}.lock"
 
@@ -306,10 +307,10 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \
         --no-container-mount-home \
         --container-workdir=/workspace/ \
-        --no-container-entrypoint --export=ALL,PORT=8888 \
+        --no-container-entrypoint --export=ALL,PORT=8888,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \
         bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_h100.sh
 
     scancel $JOB_ID
diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh
index 1486c4fa6..684721497 100644
--- a/runners/launch_h200-cw.sh
+++ b/runners/launch_h200-cw.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 
 export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache"
+export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/gharunner/ai-perf-cache"
 export PORT=8888
 
 MODEL_CODE="${EXP_NAME%%_*}"
@@ -40,10 +41,10 @@ fi
 
 srun --jobid=$JOB_ID \
 --container-image=$CONTAINER_IMAGE \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \
 --container-mount-home \
 --container-workdir=/workspace/ \
---no-container-entrypoint --export=ALL \
+--no-container-entrypoint --export=ALL,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \
 bash benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 
 rmdir $SAGEMAKER_SHM_PATH
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index b701d65a6..572056956 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -315,6 +315,7 @@ else
         --container-image=$SQUASH_FILE \
         --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \
         --no-container-mount-home \
+        --container-remap-root \
         --container-workdir=$CONTAINER_MOUNT_DIR/ \
         --no-container-entrypoint --export=ALL,PORT=8888,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \
         bash $BENCH_SCRIPT
diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh
index 158c30792..23d8d816b 100644
--- a/runners/launch_h200-nb.sh
+++ b/runners/launch_h200-nb.sh
@@ -1,6 +1,7 @@
 #!/usr/bin/bash
 
 export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/"
+export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/data/gharunners/ai-perf-cache"
 export PORT=8888
 
 MODEL_CODE="${EXP_NAME%%_*}"
@@ -13,10 +14,10 @@ set -x
 srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \
 --container-image=$IMAGE \
 --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \
 --container-remap-root \
 --container-writable \
 --container-mount-home \
 --container-workdir=/workspace/ \
---no-container-entrypoint --export=ALL \
+--no-container-entrypoint --export=ALL,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \
 bash benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index a8033847e..7a54e3848 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -59,7 +59,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then
         BENCHMARK_SUBDIR="multi_node"
     else
-        BENCHMARK_SUBDIR="single_node"
+        BENCHMARK_SUBDIR="single_node/fixed_seq_len"
     fi
     JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}")
 
@@ -223,8 +223,8 @@ else
     fi
 
     SCRIPT_BASE="${EXP_NAME%%_*}_${PRECISION}_mi355x"
-    SCRIPT_FW="benchmarks/single_node/${SCENARIO_SUBDIR:-}${SCRIPT_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
-    SCRIPT_FALLBACK="benchmarks/single_node/${SCENARIO_SUBDIR:-}${SCRIPT_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
+    SCRIPT_FW="benchmarks/single_node/${SCENARIO_SUBDIR:-fixed_seq_len/}${SCRIPT_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
+    SCRIPT_FALLBACK="benchmarks/single_node/${SCENARIO_SUBDIR:-fixed_seq_len/}${SCRIPT_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
     if [[ -f "$SCRIPT_FW" ]]; then
         BENCHMARK_SCRIPT="$SCRIPT_FW"
     else
diff --git a/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py b/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py
index aa4b639ca..78925636f 100644
--- a/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py
+++ b/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
-"""Analyze ISL/OSL/turn distributions from AIPerf benchmark results.
+"""Analyze ISL/OSL distributions from AIPerf benchmark results.
 
-Reads profile_export.jsonl and produces summary stats + distribution plots
-to verify the benchmark workload matches the intended Qwen trace profile.
+Reads profile_export.jsonl and produces mean/median/p75/p90/p95 summary stats
+plus all-requests ISL and OSL histograms.
 
 Usage:
     python analyze_benchmark_distributions.py path/to/aiperf_artifacts/ -o output_dir/
@@ -12,8 +12,6 @@
 
 import argparse
 import json
-import math
-from collections import Counter, defaultdict
 from pathlib import Path
 
 
@@ -29,331 +27,124 @@ def load_records(artifacts_dir: Path) -> list[dict]:
     return records
 
 
-def load_trace_replay_records(trace_replay_dir: Path) -> list[dict]:
-    """Load per-request records from trace_replay detailed_results.csv.
+def _stats(values: list[int]) -> dict[str, float]:  # count, mean and order statistics of `values`
+    sv = sorted(values)
+    n = len(sv)
+    return {
+        "n": n,
+        "mean": sum(sv) / n,  # raises ZeroDivisionError when `values` is empty
+        "median": sv[n // 2],  # upper-middle element when n is even (no averaging)
+        "p75": sv[int(n * 0.75)],  # percentiles: element at index floor(n * q), no interpolation
+        "p90": sv[int(n * 0.90)],
+        "p95": sv[int(n * 0.95)],
+    }
 
-    Converts to the same format as AIPerf JSONL records so the analyze()
-    function can process both formats identically.
-    """
-    import csv
-    import sys
-    csv.field_size_limit(sys.maxsize)
 
-    csv_path = trace_replay_dir / "detailed_results.csv"
-    records = []
-    with open(csv_path) as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            if row.get("success") != "True":
-                continue
-            records.append({
-                "metadata": {
-                    "x_correlation_id": row["trace_id"],
-                    "conversation_id": row["trace_id"],
-                    "turn_index": int(row["request_idx"]),
-                    "benchmark_phase": "profiling",
-                },
-                "metrics": {
-                    "input_sequence_length": {"value": int(row["input_tokens"])},
-                    "output_sequence_length": {"value": int(row["output_tokens_actual"])},
-                },
-            })
-    return records
+def _fmt(s: dict[str, float]) -> str:  # render a _stats() dict as one indented summary line
+    return (
+        f"  n={s['n']:,}  mean={s['mean']:,.0f}  median={s['median']:,}  "
+        f"p75={s['p75']:,}  p90={s['p90']:,}  p95={s['p95']:,}"
+    )
 
 
 def analyze(records: list[dict], output_dir: Path) -> None:
-    """Run distribution analysis and save results."""
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    # Group by conversation
-    convos: dict[str, list[dict]] = defaultdict(list)
+    all_isl: list[int] = []
+    all_osl: list[int] = []
     for r in records:
         metrics = r.get("metrics", {})
         if "input_sequence_length" not in metrics or "output_sequence_length" not in metrics:
             continue
-        # Use x_correlation_id (unique per session) not conversation_id (template, reused)
-        cid = r["metadata"].get("x_correlation_id") or r["metadata"]["conversation_id"]
-        ti = r["metadata"]["turn_index"]
-        isl = metrics["input_sequence_length"]["value"]
-        osl = metrics["output_sequence_length"]["value"]
-        convos[cid].append({"turn": ti, "isl": isl, "osl": osl})
-
-    # Sort turns within each conversation
-    for v in convos.values():
-        v.sort(key=lambda x: x["turn"])
-
-    # Turn count distribution
-    turn_counts = Counter(len(v) for v in convos.values())
-    total_convos = len(convos)
-    total_requests = len(records)
-
-    lines = []
-    lines.append("=" * 70)
-    lines.append("BENCHMARK WORKLOAD DISTRIBUTION ANALYSIS")
-    lines.append("=" * 70)
-    lines.append(f"Total conversations: {total_convos:,}")
-    lines.append(f"Total requests: {total_requests:,}")
-    lines.append(f"Avg turns/conv: {total_requests / total_convos:.2f}")
-    lines.append("")
-
-    lines.append("TURN COUNT DISTRIBUTION:")
-    lines.append(f"  {'Turns':>5s}  {'Count':>6s}  {'Pct':>6s}   Target")
-    target = {1: 59, 2: 20, 3: 10, 4: 5, 5: 3, 6: 2, 7: 1}
-    for k in sorted(turn_counts.keys()):
-        pct = 100 * turn_counts[k] / total_convos
-        tgt = f"{target.get(k, 0):.0f}%" if k in target else ""
-        lines.append(f"  {k:5d}  {turn_counts[k]:6,}  {pct:5.1f}%   {tgt}")
-
-    # ISL/OSL by turn index
-    lines.append("")
-    lines.append("ISL BY TURN INDEX:")
-    lines.append(
-        f"  {'Turn':>4s}  {'N':>6s}  {'Mean':>8s}  {'Median':>8s}  {'Std':>8s}  {'P5':>8s}  {'P95':>8s}"
-    )
-    max_turn = max(t["turn"] for v in convos.values() for t in v)
-    for ti in range(max_turn + 1):
-        vals = sorted(t["isl"] for v in convos.values() for t in v if t["turn"] == ti)
-        if not vals:
-            continue
-        n = len(vals)
-        mean = sum(vals) / n
-        std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n)
-        median = vals[n // 2]
-        p5 = vals[int(n * 0.05)]
-        p95 = vals[int(n * 0.95)]
-        lines.append(
-            f"  {ti:4d}  {n:6,}  {mean:8.0f}  {median:8.0f}  {std:8.0f}  {p5:8.0f}  {p95:8.0f}"
-        )
-
-    lines.append("")
-    lines.append("OSL BY TURN INDEX:")
-    lines.append(
-        f"  {'Turn':>4s}  {'N':>6s}  {'Mean':>8s}  {'Median':>8s}  {'Std':>8s}  {'P5':>8s}  {'P95':>8s}"
-    )
-    for ti in range(max_turn + 1):
-        vals = sorted(t["osl"] for v in convos.values() for t in v if t["turn"] == ti)
-        if not vals:
-            continue
-        n = len(vals)
-        mean = sum(vals) / n
-        std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n)
-        median = vals[n // 2]
-        p5 = vals[int(n * 0.05)]
-        p95 = vals[int(n * 0.95)]
-        lines.append(
-            f"  {ti:4d}  {n:6,}  {mean:8.0f}  {median:8.0f}  {std:8.0f}  {p5:8.0f}  {p95:8.0f}"
-        )
-
-    # Overall ISL/OSL stats
-    all_isl = sorted(t["isl"] for v in convos.values() for t in v)
-    all_osl = sorted(t["osl"] for v in convos.values() for t in v)
-    n = len(all_isl)
-    isl_mean = sum(all_isl) / n
-    osl_mean = sum(all_osl) / n
-    lines.append("")
-    lines.append("ALL REQUESTS ISL:")
-    lines.append(
-        f"  n={n:,}  mean={isl_mean:.0f}  median={all_isl[n//2]}  "
-        f"p5={all_isl[int(n*0.05)]}  p95={all_isl[int(n*0.95)]}"
-    )
-    lines.append("ALL REQUESTS OSL:")
-    lines.append(
-        f"  n={n:,}  mean={osl_mean:.0f}  median={all_osl[n//2]}  "
-        f"p5={all_osl[int(n*0.05)]}  p95={all_osl[int(n*0.95)]}"
-    )
-
-    # Per-conversation stats
-    conv_max_isl = sorted(max(t["isl"] for t in v) for v in convos.values())
-    conv_total_osl = sorted(sum(t["osl"] for t in v) for v in convos.values())
-    nc = len(conv_max_isl)
-    lines.append("")
-    lines.append("PER-CONVERSATION MAX ISL (final context size):")
-    lines.append(
-        f"  n={nc:,}  mean={sum(conv_max_isl)/nc:.0f}  median={conv_max_isl[nc//2]}  "
-        f"p5={conv_max_isl[int(nc*0.05)]}  p95={conv_max_isl[int(nc*0.95)]}"
-    )
-    lines.append("PER-CONVERSATION TOTAL OSL:")
-    lines.append(
-        f"  n={nc:,}  mean={sum(conv_total_osl)/nc:.0f}  median={conv_total_osl[nc//2]}  "
-        f"p5={conv_total_osl[int(nc*0.05)]}  p95={conv_total_osl[int(nc*0.95)]}"
-    )
+        all_isl.append(metrics["input_sequence_length"]["value"])
+        all_osl.append(metrics["output_sequence_length"]["value"])
 
-    # ISL context growth (shows accumulation across turns)
-    lines.append("")
-    lines.append("ISL CONTEXT GROWTH (sample multi-turn conversations):")
-    multi = [(cid, v) for cid, v in convos.items() if len(v) >= 3][:10]
-    for cid, turns in multi:
-        isls = " -> ".join(str(t["isl"]) for t in turns)
-        lines.append(f"  {cid}: {isls}")
-
-    lines.append("=" * 70)
+    if not all_isl:
+        print("No records with ISL/OSL metrics found.")
+        return
 
+    isl_stats = _stats(all_isl)
+    osl_stats = _stats(all_osl)
+
+    lines = [
+        "=" * 70,
+        "BENCHMARK WORKLOAD DISTRIBUTION ANALYSIS",
+        "=" * 70,
+        f"Total requests: {len(records):,}",
+        "",
+        "ALL REQUESTS ISL:",
+        _fmt(isl_stats),
+        "ALL REQUESTS OSL:",
+        _fmt(osl_stats),
+        "=" * 70,
+    ]
     summary_text = "\n".join(lines)
     print(summary_text)
-
-    # Save summary
     (output_dir / "workload_distribution_summary.txt").write_text(summary_text)
 
-    # Try to generate plots (matplotlib may not be available)
     try:
-        _generate_plots(convos, records, output_dir)
+        _generate_plots(all_isl, all_osl, isl_stats, osl_stats, output_dir)
     except ImportError:
         print("matplotlib not available, skipping plots")
 
 
 def _generate_plots(
-    convos: dict[str, list[dict]], records: list[dict], output_dir: Path
+    all_isl: list[int],
+    all_osl: list[int],
+    isl_stats: dict[str, float],
+    osl_stats: dict[str, float],
+    output_dir: Path,
 ) -> None:
-    """Generate distribution plots."""
     import matplotlib
 
     matplotlib.use("Agg")
     import matplotlib.pyplot as plt
 
-    fig, axes = plt.subplots(3, 3, figsize=(18, 15))
+    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
     fig.suptitle("Benchmark Workload Distribution Analysis", fontsize=14)
 
-    # (0,0) Turn count distribution
-    ax = axes[0, 0]
-    turn_counts = Counter(len(v) for v in convos.values())
-    turns = sorted(turn_counts.keys())
-    counts = [turn_counts[t] for t in turns]
-    total = sum(counts)
-    bars = ax.bar(turns, [100 * c / total for c in counts], edgecolor="black", alpha=0.7)
-    for bar, t in zip(bars, turns):
-        ax.text(
-            bar.get_x() + bar.get_width() / 2,
-            bar.get_height(),
-            f"{bar.get_height():.0f}%",
-            ha="center",
-            va="bottom",
-            fontsize=8,
-        )
-    ax.set_xlabel("Number of Turns")
-    ax.set_ylabel("% of Conversations")
-    ax.set_title(f"Turn Count Distribution (n={total:,})")
-    ax.grid(True, alpha=0.3, axis="y")
-
-    # (0,1) All requests ISL histogram
-    ax = axes[0, 1]
-    all_isl = [t["isl"] for v in convos.values() for t in v]
-    clip = int(sorted(all_isl)[int(len(all_isl) * 0.99)] * 1.2)
-    ax.hist([v for v in all_isl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="steelblue")
-    all_isl_sorted = sorted(all_isl)
-    median_isl = all_isl_sorted[len(all_isl) // 2]
-    mean_isl = sum(all_isl) / len(all_isl)
-    ax.axvline(median_isl, color="red", linestyle="--", label=f"Median: {median_isl:,}")
-    ax.axvline(mean_isl, color="orange", linestyle="--", label=f"Mean: {mean_isl:,.0f}")
-    ax.set_xlabel("Input Sequence Length")
-    ax.set_ylabel("Count")
-    ax.set_title(f"All Requests ISL (n={len(all_isl):,})")
-    ax.legend(fontsize=8)
-    ax.grid(True, alpha=0.3, axis="y")
-
-    # (0,2) All requests OSL histogram
-    ax = axes[0, 2]
-    all_osl = [t["osl"] for v in convos.values() for t in v]
-    clip = min(3000, int(sorted(all_osl)[int(len(all_osl) * 0.99)] * 1.2))
-    ax.hist([v for v in all_osl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="coral")
-    all_osl_sorted = sorted(all_osl)
-    median_osl = all_osl_sorted[len(all_osl) // 2]
-    mean_osl = sum(all_osl) / len(all_osl)
-    ax.axvline(median_osl, color="red", linestyle="--", label=f"Median: {median_osl:,}")
-    ax.axvline(mean_osl, color="orange", linestyle="--", label=f"Mean: {mean_osl:,.0f}")
-    ax.set_xlabel("Output Sequence Length")
-    ax.set_ylabel("Count")
-    ax.set_title(f"All Requests OSL (n={len(all_osl):,})")
-    ax.legend(fontsize=8)
-    ax.grid(True, alpha=0.3, axis="y")
-
-    # (1,0) Average new prefill tokens by turn index (ISL delta per turn)
-    ax = axes[1, 0]
-    # Collect deltas grouped by turn index
-    deltas_by_turn: dict[int, list[int]] = defaultdict(list)
-    for v in convos.values():
-        for i, t in enumerate(v):
-            if i == 0:
-                deltas_by_turn[t["turn"]].append(t["isl"])
-            else:
-                deltas_by_turn[t["turn"]].append(max(0, t["isl"] - v[i - 1]["isl"]))
-    if deltas_by_turn:
-        turn_indices = sorted(deltas_by_turn.keys())
-        means = [sum(deltas_by_turn[ti]) / len(deltas_by_turn[ti]) for ti in turn_indices]
-        ns = [len(deltas_by_turn[ti]) for ti in turn_indices]
-        ax.plot(turn_indices, means, marker="o", markersize=3, linewidth=1, color="mediumseagreen")
-        ax.fill_between(turn_indices, 0, means, alpha=0.2, color="mediumseagreen")
-        # Label first and last points
-        if len(turn_indices) > 0:
-            ax.annotate(f"{means[0]:,.0f}", (turn_indices[0], means[0]), fontsize=7, ha="left", va="bottom")
-        if len(turn_indices) > 1:
-            ax.annotate(f"{means[-1]:,.0f}\n(n={ns[-1]})", (turn_indices[-1], means[-1]), fontsize=7, ha="right", va="bottom")
-    # Overall mean/median across all deltas
-    all_deltas = [d for dlist in deltas_by_turn.values() for d in dlist]
-    if all_deltas:
-        overall_mean = sum(all_deltas) / len(all_deltas)
-        all_deltas_sorted = sorted(all_deltas)
-        overall_median = all_deltas_sorted[len(all_deltas) // 2]
-        ax.axhline(overall_mean, color="orange", linestyle="--", linewidth=1, label=f"Mean: {overall_mean:,.0f}")
-        ax.axhline(overall_median, color="red", linestyle="--", linewidth=1, label=f"Median: {overall_median:,}")
-        ax.legend(fontsize=7)
-    ax.set_xlabel("Turn Index")
-    ax.set_ylabel("Mean New Prefill Tokens")
-    ax.set_title("Avg New Prefill Tokens by Turn")
-    ax.grid(True, alpha=0.3)
-
-    # (1,1) ISL vs OSL scatter
-    ax = axes[1, 1]
-    ax.scatter(all_isl, all_osl, alpha=0.15, s=3, c="purple")
-    ax.set_xlabel("ISL (tokens)")
-    ax.set_ylabel("OSL (tokens)")
-    ax.set_title("ISL vs OSL (all requests)")
-    ax.grid(True, alpha=0.3)
-
-    # (1,2) Per-conversation max ISL vs num turns scatter
-    ax = axes[1, 2]
-    conv_turns = [len(v) for v in convos.values()]
-    conv_max_isl_list = [max(t["isl"] for t in v) for v in convos.values()]
-    ax.scatter(conv_turns, conv_max_isl_list, alpha=0.3, s=8, c="steelblue")
-    ax.set_xlabel("Number of Turns")
-    ax.set_ylabel("Max ISL (tokens)")
-    ax.set_title("Final Context Size vs Turn Count")
-    ax.grid(True, alpha=0.3)
-
-    # (2,0) Per-conversation max ISL (final context size per conversation)
-    ax = axes[2, 0]
-    conv_max_isl = [max(t["isl"] for t in v) for v in convos.values()]
-    clip = int(sorted(conv_max_isl)[int(len(conv_max_isl) * 0.99)] * 1.2)
-    ax.hist([v for v in conv_max_isl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="steelblue")
-    conv_max_isl_sorted = sorted(conv_max_isl)
-    median_max = conv_max_isl_sorted[len(conv_max_isl) // 2]
-    mean_max = sum(conv_max_isl) / len(conv_max_isl)
-    ax.axvline(median_max, color="red", linestyle="--", label=f"Median: {median_max:,}")
-    ax.axvline(mean_max, color="orange", linestyle="--", label=f"Mean: {mean_max:,.0f}")
-    ax.set_xlabel("Max ISL per Conversation (tokens)")
+    # ISL histogram
+    ax = axes[0]
+    isl_sorted = sorted(all_isl)
+    clip = int(isl_sorted[int(len(isl_sorted) * 0.99)] * 1.2)
+    ax.hist(
+        [v for v in all_isl if v <= clip],
+        bins=80,
+        edgecolor="black",
+        alpha=0.7,
+        color="steelblue",
+    )
+    ax.axvline(isl_stats["median"], color="red", linestyle="--", label=f"Median: {isl_stats['median']:,}")
+    ax.axvline(isl_stats["mean"], color="orange", linestyle="--", label=f"Mean: {isl_stats['mean']:,.0f}")
+    ax.axvline(isl_stats["p90"], color="green", linestyle=":", label=f"P90: {isl_stats['p90']:,}")
+    ax.axvline(isl_stats["p95"], color="purple", linestyle=":", label=f"P95: {isl_stats['p95']:,}")
+    ax.set_xlabel("Input Sequence Length (tokens)")
     ax.set_ylabel("Count")
-    ax.set_title(f"Per-Conversation Final Context Size (n={len(conv_max_isl):,})")
+    ax.set_title(f"All Requests ISL (n={isl_stats['n']:,})")
     ax.legend(fontsize=8)
     ax.grid(True, alpha=0.3, axis="y")
 
-    # (3,1) Per-conversation total OSL (sum of all output tokens across turns)
-    ax = axes[2, 1]
-    conv_total_osl = [sum(t["osl"] for t in v) for v in convos.values()]
-    clip = int(sorted(conv_total_osl)[int(len(conv_total_osl) * 0.99)] * 1.2)
-    ax.hist([v for v in conv_total_osl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="coral")
-    conv_total_osl_sorted = sorted(conv_total_osl)
-    median_tosl = conv_total_osl_sorted[len(conv_total_osl) // 2]
-    mean_tosl = sum(conv_total_osl) / len(conv_total_osl)
-    ax.axvline(median_tosl, color="red", linestyle="--", label=f"Median: {median_tosl:,}")
-    ax.axvline(mean_tosl, color="orange", linestyle="--", label=f"Mean: {mean_tosl:,.0f}")
-    ax.set_xlabel("Total OSL per Conversation (tokens)")
+    # OSL histogram
+    ax = axes[1]
+    osl_sorted = sorted(all_osl)
+    clip = min(3000, int(osl_sorted[int(len(osl_sorted) * 0.99)] * 1.2))
+    ax.hist(
+        [v for v in all_osl if v <= clip],
+        bins=80,
+        edgecolor="black",
+        alpha=0.7,
+        color="coral",
+    )
+    ax.axvline(osl_stats["median"], color="red", linestyle="--", label=f"Median: {osl_stats['median']:,}")
+    ax.axvline(osl_stats["mean"], color="orange", linestyle="--", label=f"Mean: {osl_stats['mean']:,.0f}")
+    ax.axvline(osl_stats["p90"], color="green", linestyle=":", label=f"P90: {osl_stats['p90']:,}")
+    ax.axvline(osl_stats["p95"], color="purple", linestyle=":", label=f"P95: {osl_stats['p95']:,}")
+    ax.set_xlabel("Output Sequence Length (tokens)")
     ax.set_ylabel("Count")
-    ax.set_title(f"Per-Conversation Total Output Tokens (n={len(conv_total_osl):,})")
+    ax.set_title(f"All Requests OSL (n={osl_stats['n']:,})")
     ax.legend(fontsize=8)
     ax.grid(True, alpha=0.3, axis="y")
 
-    # (2,2) is empty — already placed scatter at (1,2)
-    axes[2, 2].axis("off")
-
     plt.tight_layout()
     out = output_dir / "workload_distribution_plots.png"
     plt.savefig(out, dpi=150, bbox_inches="tight")
@@ -362,32 +153,27 @@ def _generate_plots(
 
 
 def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Analyze benchmark workload distributions"
-    )
-    parser.add_argument("artifacts_dir", help="Path to aiperf_artifacts/ or trace_replay/ directory")
+    parser = argparse.ArgumentParser(description="Analyze benchmark workload distributions")
+    parser.add_argument("artifacts_dir", help="Path to aiperf_artifacts/ directory")
     parser.add_argument(
-        "-o", "--output", default=None, help="Output directory (default: same as artifacts_dir)"
+        "-o",
+        "--output",
+        default=None,
+        help="Output directory (default: same as artifacts_dir)",
     )
     args = parser.parse_args()
 
     artifacts_dir = Path(args.artifacts_dir)
     output_dir = Path(args.output) if args.output else artifacts_dir
 
-    # Auto-detect format
-    trace_replay_csv = artifacts_dir / "detailed_results.csv"
     aiperf_jsonl = artifacts_dir / "profile_export.jsonl"
-
-    if trace_replay_csv.exists():
-        records = load_trace_replay_records(artifacts_dir)
-        print(f"Loaded {len(records):,} records from {artifacts_dir} (trace replay)")
-    elif aiperf_jsonl.exists():
-        records = load_records(artifacts_dir)
-        print(f"Loaded {len(records):,} records from {artifacts_dir} (AIPerf)")
-    else:
-        print(f"No recognized data files in {artifacts_dir}")
+    if not aiperf_jsonl.exists():
+        print(f"No profile_export.jsonl found in {artifacts_dir}")
         return
 
+    records = load_records(artifacts_dir)
+    print(f"Loaded {len(records):,} records from {artifacts_dir}")
+
     analyze(records, output_dir)
 
 
diff --git a/utils/agentic-benchmark/scripts/collect_sweep_results.py b/utils/agentic-benchmark/scripts/collect_sweep_results.py
index a7c6111ad..8206385b3 100644
--- a/utils/agentic-benchmark/scripts/collect_sweep_results.py
+++ b/utils/agentic-benchmark/scripts/collect_sweep_results.py
@@ -100,51 +100,12 @@ def scalar_val(metric_name):
     }
 
 
-def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None:
-    """Load per-request metrics from trace_replay detailed_results.csv."""
-    df = pd.read_csv(csv_path)
-    if len(df) == 0:
-        return None
-
-    # Filter to successful requests only
-    df = df[df["success"] == True].copy()
-    if len(df) == 0:
-        return None
-
-    # Convert to the same schema as _load_aiperf_jsonl
-    latency_s = df["request_complete_time"] - df["request_start_time"]
-    return pd.DataFrame({
-        "start_time_ms": df["request_start_time"] * 1000,
-        "ttft_ms": df["ttft"] * 1000,
-        "tpot_ms": df["itl"] * 1000,
-        "latency_ms": latency_s * 1000,
-        "input_num_tokens": df["input_tokens"],
-        "output_num_tokens": df["output_tokens_actual"],
-    })
-
-
 def load_experiment(exp_dir: Path) -> dict | None:
     """Load metrics from a single experiment artifact directory."""
     client_csv = exp_dir / "metrics_client_metrics.csv"
     server_csv = exp_dir / "metrics_server_metrics.csv"
 
-    # No more status.txt: an experiment is considered SUCCESS iff its
-    # trace_replay/detailed_results.csv has at least one successful row.
-    # Failed / missing jobs show up as FAILED in the summary.
-    trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv"
-    status = "FAILED"
-    if trace_replay_csv.exists():
-        try:
-            import csv as _csv
-            import sys as _sys
-            _csv.field_size_limit(_sys.maxsize)
-            with open(trace_replay_csv) as _f:
-                if any(r.get('success') == 'True' for r in _csv.DictReader(_f)):
-                    status = "SUCCESS"
-        except Exception:
-            pass
-
-    # Check for aiperf summary CSV (preferred) or per-record JSONL (fallback)
+    # An experiment is considered SUCCESS iff aiperf produced a summary CSV.
     aiperf_summary_csv = None
     aiperf_artifacts = exp_dir / "aiperf_artifacts"
     if aiperf_artifacts.exists():
@@ -152,10 +113,9 @@ def load_experiment(exp_dir: Path) -> dict | None:
         if candidate.exists():
             aiperf_summary_csv = candidate
 
-    # Check for trace replay output
-    trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv"
+    status = "SUCCESS" if aiperf_summary_csv is not None else "FAILED"
 
-    if not client_csv.exists() and aiperf_summary_csv is None and not trace_replay_csv.exists():
+    if not client_csv.exists() and aiperf_summary_csv is None:
         return None
 
     # Parse experiment name from directory.
@@ -165,7 +125,10 @@ def load_experiment(exp_dir: Path) -> dict | None:
     #   agentic_{model}_tp{N}_conc{M}_offload{mode}_{extra...}
     import re
     name = exp_dir.name
-    match = re.search(r'tp(\d+)_conc(\d+)_offload(none|cpu|ssd)', name)
+    match = re.search(
+        r'tp(\d+)_conc(\d+)_offload(none|cpu|ssd|lmcache-mp|lmcache|hicache)',
+        name,
+    )
     if not match:
         print(f"Warning: cannot parse experiment name '{exp_dir.name}', skipping")
         return None
@@ -186,7 +149,7 @@ def load_experiment(exp_dir: Path) -> dict | None:
         return result
 
     try:
-        # Determine data source: aiperf summary CSV (preferred), custom client CSV, or trace replay CSV
+        # Determine data source: aiperf summary CSV (preferred) or custom client CSV
         if aiperf_summary_csv is not None:
             aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv)
             if aiperf_metrics is None:
@@ -215,48 +178,6 @@ def load_experiment(exp_dir: Path) -> dict | None:
             if total_time_sec <= 0:
                 total_time_sec = df["latency_ms"].sum() / 1000
 
-            num_requests = len(df)
-            result.update({
-                "num_requests": num_requests,
-                "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0,
-                "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0,
-                "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0,
-                "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0,
-                "mean_ttft_ms": df["ttft_ms"].mean(),
-                "p50_ttft_ms": df["ttft_ms"].median(),
-                "p90_ttft_ms": df["ttft_ms"].quantile(0.9),
-                "p99_ttft_ms": df["ttft_ms"].quantile(0.99),
-                "mean_tpot_ms": df["tpot_ms"].mean(),
-                "p50_tpot_ms": df["tpot_ms"].median(),
-                "p90_tpot_ms": df["tpot_ms"].quantile(0.9),
-                "p99_tpot_ms": df["tpot_ms"].quantile(0.99),
-                "mean_latency_ms": df["latency_ms"].mean(),
-                "p50_latency_ms": df["latency_ms"].median(),
-                "p90_latency_ms": df["latency_ms"].quantile(0.9),
-                "p99_latency_ms": df["latency_ms"].quantile(0.99),
-            })
-        elif trace_replay_csv.exists():
-            df = _load_trace_replay_csv(trace_replay_csv)
-            if df is None or len(df) == 0:
-                return result
-
-            metadata_file = exp_dir / "benchmark_metadata.json"
-            total_time_sec = None
-            if metadata_file.exists():
-                try:
-                    with open(metadata_file) as f:
-                        metadata = json.load(f)
-                    total_time_sec = metadata.get("benchmark_runtime_sec")
-                except Exception:
-                    pass
-
-            if not total_time_sec or total_time_sec <= 0:
-                first_start_ms = df["start_time_ms"].min()
-                last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max()
-                total_time_sec = (last_finish_ms - first_start_ms) / 1000.0
-            if total_time_sec <= 0:
-                total_time_sec = df["latency_ms"].sum() / 1000
-
             num_requests = len(df)
             result.update({
                 "num_requests": num_requests,
diff --git a/utils/aiperf b/utils/aiperf
index 7d880a1ef..8473e1545 160000
--- a/utils/aiperf
+++ b/utils/aiperf
@@ -1 +1 @@
-Subproject commit 7d880a1ef1ef3d045ca8f8d5c95e142b5bcdf6c2
+Subproject commit 8473e1545476c1d91932aa2402b642b416a23df6
diff --git a/utils/generate_aiperf_plots.py b/utils/generate_aiperf_plots.py
new file mode 100755
index 000000000..baefa7db2
--- /dev/null
+++ b/utils/generate_aiperf_plots.py
@@ -0,0 +1,780 @@
+#!/usr/bin/env python3
+"""Generate metrics_plots.png matching kv-cache-tester's 6x2 layout.
+
+Reads aiperf's per-record JSONL + server-metrics JSON (with timeslices
+enabled via ``--slice-duration``) and emits a PNG with the same panels
+the legacy kv-cache-tester pipeline produced. The launchers feed this
+$RESULT_DIR after each run so downstream tooling and humans see the
+same visual.
+
+Layout (6 rows x 2 cols, suptitle "vLLM Server Metrics During Benchmark"):
+    (0,0) KV Cache Utilization Over Time (HBM + External)
+    (0,1) Request Queue Depth (running / waiting / total)
+    (1,0) Prefix Cache Hit Rate Per Interval (GPU / External / Combined)
+    (1,1) Throughput (Total & Decode) with running average
+    (2,0) KV Offload Transfer Rate (GPU↔CPU MB/s)
+    (2,1) Cumulative Prefill Token Source Breakdown (stackplot)
+    (3,0) KV Offload GPU→CPU (Cumulative GB)
+    (3,1) KV Offload CPU→GPU (Cumulative GB)
+    (4,0) TTFT vs Time (scatter + rolling avg)
+    (4,1) Request Latency vs Time (scatter + rolling avg)
+    (5,0) Interactivity 1/TPOT vs Time (scatter + rolling avg)
+    (5,1) Preemptions Over Time (rate + cumulative)
+
+Time-series data comes from server_metrics_export.json's per-series
+``timeslices`` array (populated when ``--slice-duration`` is set on the
+aiperf CLI). Per-record TTFT / Latency / ITL come from
+profile_export.jsonl. Panels with no data still render so the output
+shape is constant across run configs.
+
+Usage:
+    python3 generate_aiperf_plots.py <result_dir>
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+try:
+    import matplotlib
+
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+except ImportError:
+    print("ERROR: matplotlib not installed; cannot generate plots", file=sys.stderr)
+    sys.exit(1)
+
+
+# ---- Loaders --------------------------------------------------------------
+
+
+def load_jsonl_records(path: Path) -> list[dict]:
+    """Read JSONL records, skipping blank, malformed, non-object and errored lines."""
+    records: list[dict] = []
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if isinstance(obj, dict) and not obj.get("error"):  # non-dict JSON has no .get()
+                records.append(obj)
+    return records
+
+
+def load_server_metrics(path: Path) -> dict:
+    """Parse the server-metrics JSON at ``path``; an absent file yields ``{}``."""
+    if not path.exists():
+        return {}
+    return json.loads(path.read_text())
+
+
+def metric_value(record: dict, key: str) -> float | None:
+    """Return ``record["metrics"][key]`` (bare or ``{"value": ...}``) as a float, else None."""
+    raw = record.get("metrics", {}).get(key)
+    if isinstance(raw, dict):
+        raw = raw.get("value")
+    if raw is None:
+        return None
+    try:
+        return float(raw)
+    except (TypeError, ValueError):
+        return None
+
+
+# ---- Server-metrics helpers ----------------------------------------------
+
+
+def first_update_ns(server_metrics: dict) -> int | None:
+    """Earliest ``first_update_ns`` over all endpoints in the summary, or None."""
+    endpoints = (server_metrics.get("summary") or {}).get("endpoint_info") or {}
+    stamps = []
+    for info in endpoints.values():
+        stamp = info.get("first_update_ns") if isinstance(info, dict) else None
+        if stamp is not None:
+            stamps.append(stamp)
+    return min(stamps, default=None)
+
+
+def metric_entry(server_metrics: dict, name: str) -> dict | None:
+    """Return the dict stored under ``metrics[name]``, or None if missing/not a dict."""
+    found = (server_metrics.get("metrics") or {}).get(name)
+    return found if isinstance(found, dict) else None
+
+
+def all_series(entry: dict | None) -> list[dict]:
+    """Return ``entry["series"]`` when it is a list; otherwise an empty list."""
+    if entry is not None and isinstance(entry.get("series"), list):
+        return entry["series"]
+    return []
+
+
+def series_with_label(
+    entry: dict | None, label_key: str, label_value: str
+) -> dict | None:
+    """Return the first series of ``entry`` whose labels[label_key] equals label_value."""
+    matches = (
+        candidate for candidate in all_series(entry)
+        if (candidate.get("labels") or {}).get(label_key) == label_value
+    )
+    return next(matches, None)
+
+
+def timeseries_from_series(
+    series: dict | None, t0_ns: int | None, value_key_priority=("avg", "rate", "total", "max")
+) -> tuple[list[float], list[float]]:
+    """Extract parallel (relative-time-s, value) lists; first usable priority key wins per slice."""
+    if series is None or t0_ns is None:
+        return [], []
+    times: list[float] = []
+    values: list[float] = []
+    for ts in series.get("timeslices") or []:
+        start = ts.get("start_ns")
+        if start is None:
+            continue
+        for k in value_key_priority:
+            if k in ts and ts[k] is not None:
+                try:  # convert both before appending so the two lists never drift apart
+                    value, rel_s = float(ts[k]), (start - t0_ns) / 1e9
+                except (TypeError, ValueError):
+                    continue
+                values.append(value)
+                times.append(rel_s)
+                break
+    return times, values
+
+
+def aggregate_timeseries(
+    server_metrics: dict, name: str, t0_ns: int | None,
+    *,
+    aggregator=sum,
+    value_key_priority=("avg", "rate", "total", "max"),
+) -> tuple[list[float], list[float]]:
+    """Aggregate each slice start's values across all series of metric ``name`` (sum by default)."""
+    entry = metric_entry(server_metrics, name)
+    if entry is None or t0_ns is None:
+        return [], []
+    bucket: dict[int, list[float]] = defaultdict(list)
+    for s in all_series(entry):
+        for ts in s.get("timeslices") or []:
+            start = ts.get("start_ns")
+            if start is None:
+                continue
+            for k in value_key_priority:
+                if k in ts and ts[k] is not None:
+                    try:  # convert first: a failed float() must not leave an empty bucket
+                        slot, value = int(start), float(ts[k])
+                    except (TypeError, ValueError):
+                        continue
+                    bucket[slot].append(value)
+                    break
+    if not bucket:
+        return [], []
+    # One aggregated point per distinct slice start, in ascending time order.
+    ordered = sorted(bucket)
+    times = [(start_ns - t0_ns) / 1e9 for start_ns in ordered]
+    values = [aggregator(bucket[start_ns]) for start_ns in ordered]
+    return times, values
+
+
+def rolling_average(values: list[float], window: int) -> list[float]:
+    """Trailing mean over at most ``window`` samples (current one included); copy if window <= 1."""
+    if window <= 1 or not values:
+        return list(values)
+    # The slice ends at i + 1, so it has to start at i - window + 1 to hold exactly
+    # ``window`` samples; starting at i - window averaged window + 1 of them.
+    chunks = (values[max(0, i - window + 1) : i + 1] for i in range(len(values)))
+    return [sum(chunk) / len(chunk) for chunk in chunks]
+
+
+def rolling_window(n: int, max_window: int = 50) -> int:
+    """Window for ``n`` samples: 1 if n <= 10, else n // 10 capped at ``max_window``."""
+    tenth = max(1, n // 10)
+    return 1 if n <= 10 else min(max_window, tenth)
+
+
+# ---- Panels --------------------------------------------------------------
+
+
+def panel_kv_cache_usage(ax, server_metrics: dict, t0_ns: int | None) -> None:  # Plot GPU and external KV-cache usage (%) vs. time on ``ax``.
+    times, values = aggregate_timeseries(
+        server_metrics, "vllm:kv_cache_usage_perc", t0_ns, aggregator=max
+    )  # one point per slice start: the max across all series
+    cpu_times, cpu_values = aggregate_timeseries(
+        server_metrics, "vllm:cpu_kv_cache_usage_perc", t0_ns, aggregator=max
+    )
+
+    def _norm(v: float) -> float:
+        return v * 100.0 if 0 <= v <= 1.0 else v  # values in [0, 1] are read as fractions
+
+    if values:
+        gpu_pct = [min(_norm(v), 100.0) for v in values]  # clamp to 100 %
+        ax.scatter(times, gpu_pct, alpha=0.15, s=2, c="blue")
+        win = rolling_window(len(gpu_pct))
+        if win > 1:
+            ax.plot(
+                times,
+                rolling_average(gpu_pct, win),
+                "b-",
+                linewidth=2,
+                label=f"GPU (avg n={win})",
+            )
+        else:
+            ax.plot(times, gpu_pct, "b-", linewidth=2, label="GPU")
+    if cpu_values:
+        cpu_pct = [_norm(v) for v in cpu_values]  # external series is neither clamped nor smoothed
+        ax.plot(cpu_times, cpu_pct, "r--", linewidth=1.5, label="External")
+    if values or cpu_values:
+        ax.legend(fontsize=8)
+    ax.set_xlabel("Time (s)")  # axis decoration is unconditional, so an empty panel still renders
+    ax.set_ylabel("KV Cache Usage (%)")
+    ax.set_title("KV Cache Utilization Over Time")
+    ax.set_ylim(0, 105)
+    ax.grid(True, alpha=0.3)
+
+
+def panel_queue_depth(ax, server_metrics: dict, t0_ns: int | None) -> None:  # Plot running / waiting / total request counts vs. time on ``ax``.
+    rt, rv = aggregate_timeseries(
+        server_metrics, "vllm:num_requests_running", t0_ns, aggregator=max
+    )
+    wt, wv = aggregate_timeseries(
+        server_metrics, "vllm:num_requests_waiting", t0_ns, aggregator=max
+    )
+    if rt:
+        win = rolling_window(len(rv))
+        running = rolling_average(rv, win) if win > 1 else rv
+        ax.plot(rt, running, "g-", label=f"Running (avg n={win})", linewidth=1.5)  # label reads n=1 when unsmoothed
+    if wt:
+        win = rolling_window(len(wv))
+        waiting = rolling_average(wv, win) if win > 1 else wv
+        ax.plot(wt, waiting, "r-", label=f"Waiting (avg n={win})", linewidth=1.5)
+    if rt and wt and len(rt) == len(wt):  # total is drawn only when both series have equal length
+        total = [r + w for r, w in zip(rv, wv)]  # paired by index; assumes matching slice starts — TODO confirm
+        win = rolling_window(len(total))
+        smoothed = rolling_average(total, win) if win > 1 else total
+        ax.plot(rt, smoothed, "b-", label=f"Total (avg n={win})", linewidth=1.5)
+    ax.set_xlabel("Time (s)")
+    ax.set_ylabel("Requests")
+    ax.set_title("Request Queue Depth")
+    if rt or wt:
+        ax.legend(fontsize=8)
+    ax.grid(True, alpha=0.3)
+
+
+def _hit_rate_intervals(
+    server_metrics: dict,
+    hits_name: str,
+    queries_name: str,
+    t0_ns: int | None,
+) -> tuple[list[float], list[float]]:
+    """Per-interval hit rate (%): each slice's ``total`` is used as that interval's counter delta."""
+    ht, hv = aggregate_timeseries(
+        server_metrics, hits_name, t0_ns, value_key_priority=("total",)
+    )
+    qt, qv = aggregate_timeseries(
+        server_metrics, queries_name, t0_ns, value_key_priority=("total",)
+    )
+    if not ht or not qt or len(ht) != len(qt):  # both series must exist and have equal length
+        return [], []
+    times: list[float] = []
+    rates: list[float] = []
+    last = 0.0  # most recent computed rate; starts at 0 until a slice has queries
+    for i in range(len(ht)):
+        dh = hv[i]
+        dq = qv[i]
+        if dq > 0:
+            last = 100.0 * dh / dq
+        rates.append(last)  # slices without queries repeat the previous rate
+        times.append(ht[i])
+    return times, rates
+
+
+def panel_prefix_cache_hit_rate(ax, server_metrics: dict, t0_ns: int | None) -> None:  # Plot GPU / external / combined prefix-cache hit rate (%) per interval.
+    gpu_t, gpu_r = _hit_rate_intervals(
+        server_metrics,
+        "vllm:prefix_cache_hits",
+        "vllm:prefix_cache_queries",
+        t0_ns,
+    )
+    ext_t, ext_r = _hit_rate_intervals(
+        server_metrics,
+        "vllm:external_prefix_cache_hits",
+        "vllm:external_prefix_cache_queries",
+        t0_ns,
+    )
+    if gpu_t:
+        ax.scatter(gpu_t, gpu_r, alpha=0.3, s=5, c="purple", label="GPU (HBM)")
+        win = rolling_window(len(gpu_r))
+        if win > 1:
+            ax.plot(
+                gpu_t,
+                rolling_average(gpu_r, win),
+                "purple",
+                linewidth=1.5,
+                label=f"GPU avg (n={win})",
+            )
+    has_ext = bool(ext_t and any(r > 0 for r in ext_r))  # external is drawn only if some interval is non-zero
+    if has_ext:
+        ax.scatter(ext_t, ext_r, alpha=0.3, s=5, c="orange", label="External")
+        win = rolling_window(len(ext_r))
+        if win > 1:
+            ax.plot(
+                ext_t,
+                rolling_average(ext_r, win),
+                "orange",
+                linewidth=1.5,
+                label=f"External avg (n={win})",
+            )
+        # Combined (only meaningful when external exists).
+        if gpu_t and len(gpu_t) == len(ext_t):
+            combined = [  # arithmetic mean of the two rates, not a token-weighted rate
+                (g + e) / 2.0 if (g or e) else 0.0 for g, e in zip(gpu_r, ext_r)
+            ]
+            ax.scatter(gpu_t, combined, alpha=0.2, s=3, c="green", label="Combined")
+            win = rolling_window(len(combined))
+            if win > 1:
+                ax.plot(
+                    gpu_t,
+                    rolling_average(combined, win),
+                    "green",
+                    linewidth=2,
+                    label=f"Combined avg (n={win})",
+                )
+    if gpu_t or has_ext:
+        ax.legend(loc="best", fontsize=8)
+    ax.set_xlabel("Time (s)")
+    ax.set_ylabel("Hit Rate (%)")
+    ax.set_title("Prefix Cache Hit Rate Per Interval (tokens hit / tokens queried)")
+    ax.set_ylim(0, 105)
+    ax.grid(True, alpha=0.3)
+
+
+def panel_throughput(ax, server_metrics: dict, t0_ns: int | None) -> None:  # Plot total and decode token throughput plus a running average.
+    gen_t, gen_v = aggregate_timeseries(
+        server_metrics, "vllm:generation_tokens", t0_ns, value_key_priority=("rate",)
+    )
+    prompt_t, prompt_v = aggregate_timeseries(
+        server_metrics, "vllm:prompt_tokens", t0_ns, value_key_priority=("rate",)
+    )
+    if gen_t and prompt_t and len(gen_t) == len(prompt_t):  # nothing is drawn unless both series align in length
+        total = [g + p for g, p in zip(gen_v, prompt_v)]  # prompt + generation rate, paired by index
+        win = rolling_window(len(total))
+        if win > 1:
+            ax.plot(
+                gen_t,
+                rolling_average(total, win),
+                "steelblue",
+                linewidth=1.5,
+                label=f"Total (avg n={win})",
+            )
+            ax.plot(
+                gen_t,
+                rolling_average(gen_v, win),
+                "orange",
+                linewidth=1.5,
+                label=f"Decode (avg n={win})",
+            )
+        else:
+            ax.plot(gen_t, total, "steelblue", linewidth=1, alpha=0.8, label="Total")
+            ax.plot(gen_t, gen_v, "orange", linewidth=1, alpha=0.8, label="Decode")
+        # Cumulative running average: cumsum tokens / elapsed.
+        if gen_t:
+            cumulative_total = []
+            t0 = gen_t[0]
+            running = 0.0
+            for i, t in enumerate(gen_t):
+                # rate = tokens/s in that window; multiply by window width.
+                width = (gen_t[i] - gen_t[i - 1]) if i > 0 else 0.0  # gap to the previous sample; 0 for the first
+                running += total[i] * width
+                elapsed = t - t0 if t > t0 else 1e-9  # tiny epsilon avoids dividing by zero at the first sample
+                cumulative_total.append(running / elapsed if elapsed > 0 else 0.0)
+            ax.plot(gen_t, cumulative_total, "red", linewidth=2, label="Total Running Avg")
+        ax.legend(fontsize=8)
+    ax.set_xlabel("Time (s)")
+    ax.set_ylabel("Tokens/sec")
+    ax.set_title("Throughput (Total & Decode)")
+    ax.grid(True, alpha=0.3)
+
+
+def panel_kv_offload_transfer_rate(
+    ax, server_metrics: dict, t0_ns: int | None
+) -> None:  # Plot GPU→CPU and CPU→GPU KV-offload transfer rates vs. time.
+    g2c_t, g2c_v = aggregate_timeseries(
+        server_metrics,
+        "vllm:kv_offload_bytes_gpu_to_cpu",
+        t0_ns,
+        value_key_priority=("rate",),
+    )
+    c2g_t, c2g_v = aggregate_timeseries(
+        server_metrics,
+        "vllm:kv_offload_bytes_cpu_to_gpu",
+        t0_ns,
+        value_key_priority=("rate",),
+    )
+    has_data = (g2c_t and any(v > 0 for v in g2c_v)) or (  # at least one direction has a non-zero rate
+        c2g_t and any(v > 0 for v in c2g_v)
+    )
+    if has_data:
+        if g2c_t:
+            mb = [v / 1e6 for v in g2c_v]  # presumably bytes/s -> MB/s, per the axis label
+            ax.scatter(g2c_t, mb, alpha=0.15, s=3, c="blue")
+            win = rolling_window(len(mb))
+            if win > 1:
+                ax.plot(
+                    g2c_t,
+                    rolling_average(mb, win),
+                    "b-",
+                    linewidth=1.5,
+                    label=f"GPU→CPU (avg n={win})",
+                )
+            else:
+                ax.plot(g2c_t, mb, "b-", linewidth=1, alpha=0.8, label="GPU→CPU")
+        if c2g_t:
+            mb = [v / 1e6 for v in c2g_v]  # same scaling for the opposite direction
+            ax.scatter(c2g_t, mb, alpha=0.15, s=3, c="red")
+            win = rolling_window(len(mb))
+            if win > 1:
+                ax.plot(
+                    c2g_t,
+                    rolling_average(mb, win),
+                    "r-",
+                    linewidth=1.5,
+                    label=f"CPU→GPU (avg n={win})",
+                )
+            else:
+                ax.plot(c2g_t, mb, "r-", linewidth=1, alpha=0.8, label="CPU→GPU")
+        ax.legend(fontsize=8)
+    ax.set_xlabel("Time (s)")
+    ax.set_ylabel("Transfer Rate (MB/s)")
+    ax.set_title("KV Offload Transfer Rate")
+    ax.grid(True, alpha=0.3)
+
+
+def _prompt_token_source_series(
+    server_metrics: dict, source_label: str, t0_ns: int | None
+) -> tuple[list[float], list[float]]:
+    """Per-slice ``total`` series of vllm:prompt_tokens_by_source for one ``source`` label value."""
+    by_source = metric_entry(server_metrics, name="vllm:prompt_tokens_by_source")
+    picked = series_with_label(by_source, label_key="source", label_value=source_label)
+    return timeseries_from_series(picked, t0_ns, value_key_priority=("total",))
+
+
+def panel_prefill_source_breakdown(
+    ax, server_metrics: dict, t0_ns: int | None
+) -> None:  # Stack-plot the cumulative share of prompt tokens per source label.
+    c_t, c_v = _prompt_token_source_series(server_metrics, "local_compute", t0_ns)
+    h_t, h_v = _prompt_token_source_series(server_metrics, "local_cache_hit", t0_ns)
+    e_t, e_v = _prompt_token_source_series(
+        server_metrics, "external_kv_transfer", t0_ns
+    )
+    # No samples for any source: emit the empty, labelled panel and stop.
+    if not (c_t or h_t or e_t):
+        ax.set_xlabel("Time (s)")
+        ax.set_ylabel("% of Prefill Tokens")
+        ax.set_title("Cumulative Prefill Token Source Breakdown")
+        ax.set_ylim(0, 105)
+        ax.grid(True, alpha=0.3)
+        return
+    # Build per-timestamp cumulative values. The ``total`` read from each
+    # timeslice is used as that slice's own count, so the running sum is
+    # accumulated here.
+    samples = sorted(set(c_t) | set(h_t) | set(e_t))  # union of all sample timestamps
+
+    def _cum_at(times: list[float], values: list[float]) -> dict:
+        d: dict[float, float] = {}  # timestamp -> running sum at that timestamp
+        running = 0.0
+        for t, v in zip(times, values):
+            running += v
+            d[t] = running
+        # Forward-fill for missing samples.
+        out: dict[float, float] = {}
+        last = 0.0  # value used before this source's first sample
+        for t in samples:
+            if t in d:
+                last = d[t]
+            out[t] = last
+        return out
+
+    cum_c = _cum_at(c_t, c_v)
+    cum_h = _cum_at(h_t, h_v)
+    cum_e = _cum_at(e_t, e_v)
+    pct_c: list[float] = []
+    pct_h: list[float] = []
+    pct_e: list[float] = []
+    for t in samples:
+        c = cum_c[t]
+        h = cum_h[t]
+        e = cum_e[t]
+        total = c + h + e
+        if total > 0:  # convert running sums to shares that add up to 100 %
+            pct_c.append(100.0 * c / total)
+            pct_h.append(100.0 * h / total)
+            pct_e.append(100.0 * e / total)
+        else:
+            pct_c.append(0.0)
+            pct_h.append(0.0)
+            pct_e.append(0.0)
+    ax.stackplot(
+        samples,
+        pct_c,
+        pct_h,
+        pct_e,
+        labels=["Prefill", "HBM Cache Hit", "Offload Cache Hit"],
+        colors=["coral", "steelblue", "mediumseagreen"],
+        alpha=0.8,
+    )
+    ax.legend(fontsize=8, loc="lower left")
+    ax.set_xlabel("Time (s)")
+    ax.set_ylabel("% of Prefill Tokens")
+    ax.set_title("Cumulative Prefill Token Source Breakdown")
+    ax.set_ylim(0, 105)
+    ax.grid(True, alpha=0.3)
+
+
+def panel_kv_offload_cumulative(
+    ax,
+    server_metrics: dict,
+    metric_name: str,
+    title: str,
+    color: str,
+    t0_ns: int | None,
+) -> None:  # Plot the running sum of ``metric_name``'s per-slice ``total`` as a filled curve.
+    times, values = aggregate_timeseries(
+        server_metrics, metric_name, t0_ns, value_key_priority=("total",)
+    )
+    if times and any(v > 0 for v in values):  # skip the curve when every slice is zero
+        cumulative: list[float] = []
+        running = 0.0
+        for v in values:
+            running += v
+            cumulative.append(running / 1e9)  # GB
+        ax.plot(times, cumulative, f"{color}-", linewidth=1.5)  # fmt string; assumes a single-letter colour code — TODO confirm caller
+        ax.fill_between(times, cumulative, alpha=0.2, color=color)
+    ax.set_xlabel("Time (s)")
+    ax.set_ylabel("Cumulative Transfer (GB)")
+    ax.set_title(title)
+    ax.grid(True, alpha=0.3)
+
+
+def panel_per_record_metric(
+    ax,
+    request_times_s: list[float],
+    values: list[float],
+    *,
+    color: str,
+    ylabel: str,
+    title: str,
+) -> None:  # Scatter per-record ``values`` against ``request_times_s`` with a rolling average.
+    if not values:  # no records: emit the empty, labelled panel
+        ax.set_xlabel("Time (s)")
+        ax.set_ylabel(ylabel)
+        ax.set_title(title)
+        ax.grid(True, alpha=0.3)
+        return
+    ax.scatter(request_times_s, values, alpha=0.3, s=5, c=color)  # NOTE(review): presumably both lists must have equal length — confirm caller
+    win = rolling_window(len(values))
+    if win > 1:
+        ax.plot(
+            request_times_s,
+            rolling_average(values, win),
+            "r-",  # the average line is always red, independent of ``color``
+            linewidth=1.5,
+            label=f"Rolling avg (n={win})",
+        )
+        ax.legend(loc="best", fontsize=8)
+    ax.set_xlabel("Time (s)")
+    ax.set_ylabel(ylabel)
+    ax.set_title(title)
+    ax.grid(True, alpha=0.3)
+
+
+def panel_preemptions(ax, server_metrics: dict, t0_ns: int | None) -> None:  # Plot preemption rate (left axis) and cumulative count (right axis).
+    times, values = aggregate_timeseries(
+        server_metrics, "vllm:num_preemptions", t0_ns, value_key_priority=("total",)
+    )
+    if not times:  # no samples: emit the empty, labelled panel
+        ax.set_xlabel("Time (s)")
+        ax.set_ylabel("Preemptions/sec")
+        ax.set_title("Preemptions Over Time")
+        ax.grid(True, alpha=0.3)
+        return
+    # ``total`` is the per-slice delta; convert to rate by dividing by slice
+    # width (assume uniform: median diff between consecutive starts).
+    if len(times) >= 2:
+        diffs = [times[i] - times[i - 1] for i in range(1, len(times))]
+        slice_w = max(1e-9, statistics.median(diffs))  # floor keeps the division below finite
+    else:
+        slice_w = 1.0  # single sample: width unknown, fall back to 1 s
+    rates = [v / slice_w for v in values]
+    if any(r > 0 for r in rates):  # curves are drawn only if a preemption occurred
+        ax.scatter(times, rates, alpha=0.15, s=3, c="red")
+        win = rolling_window(len(rates), max_window=30)
+        if win > 1:
+            ax.plot(
+                times,
+                rolling_average(rates, win),
+                "r-",
+                linewidth=1.5,
+                label=f"Rolling avg (n={win})",
+            )
+        # Cumulative on twin axis.
+        cumulative: list[float] = []
+        running = 0.0
+        for v in values:
+            running += v
+            cumulative.append(running)
+        ax2 = ax.twinx()
+        ax2.plot(times, cumulative, "b--", linewidth=1, alpha=0.5, label="Cumulative")
+        ax2.set_ylabel("Cumulative Preemptions", color="blue")
+        ax2.tick_params(axis="y", labelcolor="blue")
+    ax.set_xlabel("Time (s)")
+    ax.set_ylabel("Preemptions/sec", color="red")
+    ax.tick_params(axis="y", labelcolor="red")
+    ax.set_title("Preemptions Over Time")
+    ax.grid(True, alpha=0.3)
+
+
+# ---- Main ----------------------------------------------------------------
+
+
+def main(argv: list[str]) -> int:
+    """Render ``metrics_plots.png`` for one benchmark result directory.
+
+    ``argv`` holds the CLI arguments without the program name. Returns 0
+    on success and 1 when no ``profile_export.jsonl`` can be found.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate metrics_plots.png from aiperf artifacts (kv-cache-tester layout)"
+    )
+    parser.add_argument(
+        "result_dir",
+        type=Path,
+        help="Result dir containing aiperf_artifacts/ (or legacy trace_replay/)",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=Path,
+        default=None,
+        help="Output PNG path (default: <result_dir>/metrics_plots.png)",
+    )
+    args = parser.parse_args(argv)
+
+    # benchmark_lib.sh writes aiperf output to <result_dir>/aiperf_artifacts/
+    # (--output-artifact-dir). Older runs used trace_replay/, kept as fallback.
+    artifact = args.result_dir / "aiperf_artifacts"
+    if not (artifact / "profile_export.jsonl").exists():
+        legacy = args.result_dir / "trace_replay"
+        if (legacy / "profile_export.jsonl").exists():
+            artifact = legacy
+    jsonl_path = artifact / "profile_export.jsonl"
+    server_metrics_path = artifact / "server_metrics_export.json"
+
+    if not jsonl_path.exists() and artifact.is_dir():
+        for child in sorted(artifact.iterdir()):
+            if child.is_dir() and (child / "profile_export.jsonl").is_file():
+                jsonl_path = child / "profile_export.jsonl"
+                server_metrics_path = child / "server_metrics_export.json"
+                break
+
+    if not jsonl_path.exists():
+        print(f"ERROR: {jsonl_path} not found", file=sys.stderr)
+        return 1
+
+    records = load_jsonl_records(jsonl_path)
+    server_metrics = load_server_metrics(server_metrics_path)
+    t0_ns = first_update_ns(server_metrics)
+
+    # One slot per record in every list so they stay index-aligned; a start
+    # of None marks a record that cannot be placed on the time axis.
+    starts_ns: list = []
+    ttfts_ms: list[float] = []
+    e2es_ms: list[float] = []
+    interactivities: list[float] = []
+    for r in records:
+        raw_start = (r.get("metadata") or {}).get("request_start_ns")
+        starts_ns.append(int(raw_start) if raw_start else None)
+        ttft = metric_value(r, "time_to_first_token")
+        e2e = metric_value(r, "request_latency")
+        itl = metric_value(r, "inter_token_latency")
+        ttfts_ms.append(ttft if ttft is not None else 0.0)
+        e2es_ms.append(e2e if e2e is not None else 0.0)
+        # Interactivity: tokens/sec from per-token latency (ms).
+        interactivities.append(1000.0 / itl if itl and itl > 0 else 0.0)
+
+    known_starts = [s for s in starts_ns if s is not None]
+    first_record_start = min(known_starts, default=0)
+    request_times_s = [(s - first_record_start) / 1e9 for s in known_starts]
+
+    fig, axes = plt.subplots(6, 2, figsize=(14, 24))
+    fig.suptitle("vLLM Server Metrics During Benchmark", fontsize=14)
+
+    panel_kv_cache_usage(axes[0, 0], server_metrics, t0_ns)
+    panel_queue_depth(axes[0, 1], server_metrics, t0_ns)
+    panel_prefix_cache_hit_rate(axes[1, 0], server_metrics, t0_ns)
+    panel_throughput(axes[1, 1], server_metrics, t0_ns)
+    panel_kv_offload_transfer_rate(axes[2, 0], server_metrics, t0_ns)
+    panel_prefill_source_breakdown(axes[2, 1], server_metrics, t0_ns)
+    panel_kv_offload_cumulative(
+        axes[3, 0],
+        server_metrics,
+        "vllm:kv_offload_bytes_gpu_to_cpu",
+        "KV Offload: GPU → CPU (Cumulative)",
+        "b",
+        t0_ns,
+    )
+    panel_kv_offload_cumulative(
+        axes[3, 1],
+        server_metrics,
+        "vllm:kv_offload_bytes_cpu_to_gpu",
+        "KV Offload: CPU → GPU (Cumulative)",
+        "r",
+        t0_ns,
+    )
+    # Per-request panels share request_times_s as their x axis, so each
+    # series keeps only the records that have a start timestamp.
+    per_record_panels = [
+        (axes[4, 0], ttfts_ms, "blue", "TTFT (ms)", "Time to First Token vs Time"),
+        (axes[4, 1], e2es_ms, "green", "Latency (ms)", "Request Latency vs Time"),
+        (
+            axes[5, 0],
+            interactivities,
+            "purple",
+            "Interactivity (tokens/sec)",
+            "Decode Speed (1/TPOT) vs Time",
+        ),
+    ]
+    for ax, values, color, ylabel, title in per_record_panels:
+        plotted = [v for s, v in zip(starts_ns, values) if s is not None]
+        panel_per_record_metric(
+            ax, request_times_s, plotted, color=color, ylabel=ylabel, title=title
+        )
+    panel_preemptions(axes[5, 1], server_metrics, t0_ns)
+
+    plt.tight_layout()
+    out_path = args.output or (args.result_dir / "metrics_plots.png")
+    plt.savefig(out_path, dpi=150)
+    plt.close(fig)
+    print(f"Saved {out_path}")
+    if records:
+        ttft_clean = [v for v in ttfts_ms if v > 0]
+        e2e_clean = [v for v in e2es_ms if v > 0]
+        if ttft_clean and e2e_clean:
+            print(
+                f"  Records: {len(records)} | "
+                f"TTFT median {statistics.median(ttft_clean):.0f}ms | "
+                f"E2E median {statistics.median(e2e_clean):.0f}ms"
+            )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index 9f38292f4..53efcca9f 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -832,44 +832,45 @@ def generate_test_config_sweep(args, all_config_data, runner_data=None):
                     continue
 
                 for conc in conc_values:
-                    if is_multinode:
-                        entry = {
-                            Fields.IMAGE.value: image,
-                            Fields.MODEL.value: model,
-                            Fields.MODEL_PREFIX.value: model_code,
-                            Fields.PRECISION.value: precision,
-                            Fields.FRAMEWORK.value: framework,
-                            Fields.RUNNER.value: runner,
-                            Fields.SPEC_DECODING.value: spec_decoding,
-                            Fields.PREFILL.value: prefill,
-                            Fields.DECODE.value: decode,
-                            Fields.CONC.value: conc,
-                            Fields.DURATION.value: duration,
-                            Fields.EXP_NAME.value: (
-                                f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}"
-                                f"_d{decode[Fields.NUM_WORKER.value]}x{decode[Fields.TP.value]}_conc{conc}"
-                            ),
-                            Fields.DISAGG.value: disagg,
-                            Fields.SCENARIO_TYPE.value: "agentic-coding",
-                        }
-                    else:
-                        entry = {
-                            Fields.IMAGE.value: image,
-                            Fields.MODEL.value: model,
-                            Fields.MODEL_PREFIX.value: model_code,
-                            Fields.PRECISION.value: precision,
-                            Fields.FRAMEWORK.value: framework,
-                            Fields.RUNNER.value: runner,
-                            Fields.TP.value: tp,
-                            Fields.EP.value: ep if ep is not None else 1,
-                            Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False,
-                            Fields.CONC.value: conc,
-                            Fields.OFFLOADING.value: offloading,
-                            Fields.DURATION.value: duration,
-                            Fields.EXP_NAME.value: f"{model_code}_tp{tp}_conc{conc}_offload{offloading}",
-                            Fields.SCENARIO_TYPE.value: "agentic-coding",
-                        }
-                    matrix_values.append(validate_agentic_matrix_entry(entry))
+                    for runner_value in runners_for_entry:
+                        if is_multinode:
+                            entry = {
+                                Fields.IMAGE.value: image,
+                                Fields.MODEL.value: model,
+                                Fields.MODEL_PREFIX.value: model_code,
+                                Fields.PRECISION.value: precision,
+                                Fields.FRAMEWORK.value: framework,
+                                Fields.RUNNER.value: runner_value,
+                                Fields.SPEC_DECODING.value: spec_decoding,
+                                Fields.PREFILL.value: prefill,
+                                Fields.DECODE.value: decode,
+                                Fields.CONC.value: conc,
+                                Fields.DURATION.value: duration,
+                                Fields.EXP_NAME.value: (
+                                    f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}"
+                                    f"_d{decode[Fields.NUM_WORKER.value]}x{decode[Fields.TP.value]}_conc{conc}"
+                                ),
+                                Fields.DISAGG.value: disagg,
+                                Fields.SCENARIO_TYPE.value: "agentic-coding",
+                            }
+                        else:
+                            entry = {
+                                Fields.IMAGE.value: image,
+                                Fields.MODEL.value: model,
+                                Fields.MODEL_PREFIX.value: model_code,
+                                Fields.PRECISION.value: precision,
+                                Fields.FRAMEWORK.value: framework,
+                                Fields.RUNNER.value: runner_value,
+                                Fields.TP.value: tp,
+                                Fields.EP.value: ep if ep is not None else 1,
+                                Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False,
+                                Fields.CONC.value: conc,
+                                Fields.OFFLOADING.value: offloading,
+                                Fields.DURATION.value: duration,
+                                Fields.EXP_NAME.value: f"{model_code}_tp{tp}_conc{conc}_offload{offloading}",
+                                Fields.SCENARIO_TYPE.value: "agentic-coding",
+                            }
+                        matrix_values.append(validate_agentic_matrix_entry(entry))
 
     return matrix_values
 
diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py
index 297e57524..9bb473896 100644
--- a/utils/matrix_logic/test_generate_sweep_configs.py
+++ b/utils/matrix_logic/test_generate_sweep_configs.py
@@ -1619,6 +1619,48 @@ def test_runner_node_filter_no_match_skips_config(self, sample_multinode_config,
 
         assert result == []
 
+    def test_runner_node_filter_expands_agentic_config_runner(self, sample_runner_config):
+        """A node filter pins an agentic config's entry to one concrete runner."""
+        # One search-space point with a single concurrency value.
+        search_space_entry = {
+            "tp": 8,
+            "ep": 1,
+            "offloading": "hicache",
+            "conc-list": [64],
+        }
+        agentic_scenario = {
+            "duration": 1800,
+            "search-space": [search_space_entry],
+        }
+        config_key = "qwen-agentic-hicache"
+        config = {
+            config_key: {
+                "image": "sglang-rocm",
+                "model": "Qwen/Qwen3.5-397B-A17B-FP8",
+                "model-prefix": "qwen3.5",
+                "precision": "fp8",
+                "framework": "sglang",
+                "runner": "mi300x",
+                "multinode": False,
+                "scenarios": {"agentic-coding": [agentic_scenario]},
+            }
+        }
+        args = argparse.Namespace(
+            config_keys=[config_key],
+            seq_lens=None,
+            conc=None,
+            scenario_type=["agentic-coding"],
+            runner_node_filter="mi300x-amd_1",
+        )
+
+        result = generate_test_config_sweep(args, config, sample_runner_config)
+
+        # The entry must carry the filtered node name, not the "mi300x" type.
+        assert len(result) == 1
+        (only_entry,) = result
+        assert only_entry["runner"] == "mi300x-amd_1"
+        assert only_entry["scenario-type"] == "agentic-coding"
+
 
 # =============================================================================
 # Test apply_node_type_defaults
@@ -1970,4 +2012,3 @@ def test_prefill_entries_never_in_single_or_evals(self, mixed_entries):
         assert all('prefill' in x for x in multi)
         assert all('prefill' not in x for x in single)
         assert all('prefill' not in x for x in evals)
-
diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py
index 1274fd86a..c385017b1 100644
--- a/utils/matrix_logic/test_validation.py
+++ b/utils/matrix_logic/test_validation.py
@@ -3,9 +3,11 @@
 from validation import (
     Fields,
     SingleNodeMatrixEntry,
+    SingleNodeAgenticMatrixEntry,
     MultiNodeMatrixEntry,
     WorkerConfig,
     SingleNodeSearchSpaceEntry,
+    AgenticCodingSearchSpaceEntry,
     MultiNodeSearchSpaceEntry,
     SingleNodeSeqLenConfig,
     MultiNodeSeqLenConfig,
@@ -305,6 +307,61 @@ def test_extra_field_forbidden(self, valid_single_node_matrix_entry):
             SingleNodeMatrixEntry(**valid_single_node_matrix_entry)
 
 
+# =============================================================================
+# Test Agentic Matrix Entries
+# =============================================================================
+
+class TestAgenticMatrixEntries:
+    """Offloading-backend acceptance checks for the agentic validation models."""
+
+    def test_lmcache_mp_offloading_is_valid_for_single_node_agentic_entry(self):
+        """A single-node agentic matrix entry accepts LMCache MP offloading."""
+        # Only the offloading value is asserted; the remaining fields are
+        # there so the model can be constructed.
+        fields = {
+            "image": "cquil/vllm-openai:v0.21.0-8813c92",
+            "model": "deepseek-ai/DeepSeek-V4-Pro",
+            "model-prefix": "dsv4",
+            "precision": "fp4",
+            "framework": "vllm",
+            "runner": "b200-dgxc",
+            "tp": 8,
+            "ep": 1,
+            "dp-attn": False,
+            "conc": 1,
+            "offloading": "lmcache-mp",
+            "duration": 1800,
+            "exp-name": "dsv4_tp8_conc1_offloadlmcache-mp",
+            "scenario-type": "agentic-coding",
+        }
+        entry = SingleNodeAgenticMatrixEntry(**fields)
+        assert entry.offloading == "lmcache-mp"
+
+    def test_lmcache_mp_offloading_is_valid_for_agentic_search_space(self):
+        """Agentic search-space entries accept LMCache MP offloading."""
+        backend = "lmcache-mp"
+        entry = AgenticCodingSearchSpaceEntry(
+            **{"tp": 8, "offloading": backend, "conc-list": [1, 2]}
+        )
+        assert entry.offloading == backend
+
+    def test_lmcache_offloading_is_valid_for_agentic_search_space(self):
+        """Agentic search-space entries accept in-process LMCache offloading."""
+        backend = "lmcache"
+        entry = AgenticCodingSearchSpaceEntry(
+            **{"tp": 8, "offloading": backend, "conc-list": [1, 2]}
+        )
+        assert entry.offloading == backend
+
+    def test_hicache_offloading_is_valid_for_agentic_search_space(self):
+        """Agentic search-space entries accept SGLang HiCache offloading."""
+        backend = "hicache"
+        entry = AgenticCodingSearchSpaceEntry(
+            **{"tp": 8, "offloading": backend, "conc-list": [1, 2]}
+        )
+        assert entry.offloading == backend
+
+
 # =============================================================================
 # Test MultiNodeMatrixEntry
 # =============================================================================
diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py
index dd245aec7..4e3f0bbd7 100644
--- a/utils/matrix_logic/validation.py
+++ b/utils/matrix_logic/validation.py
@@ -156,7 +156,9 @@ class SingleNodeAgenticMatrixEntry(BaseModel):
     ep: int
     dp_attn: bool = Field(alias=Fields.DP_ATTN.value)
     conc: int
-    offloading: Literal["none", "cpu", "ssd"] = Field(alias=Fields.OFFLOADING.value)
+    offloading: Literal["none", "cpu", "ssd", "lmcache", "lmcache-mp", "hicache"] = Field(
+        alias=Fields.OFFLOADING.value
+    )
     duration: int = Field(default=1800, alias=Fields.DURATION.value)
     exp_name: str = Field(alias=Fields.EXP_NAME.value)
     scenario_type: str = Field(alias=Fields.SCENARIO_TYPE.value)
@@ -338,7 +340,9 @@ class AgenticCodingSearchSpaceEntry(BaseModel):
         default="none", alias=Fields.SPEC_DECODING.value)
     prefill: Optional[WorkerConfig] = None
     decode: Optional[WorkerConfig] = None
-    offloading: Literal["none", "cpu", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value)
+    offloading: Literal["none", "cpu", "ssd", "lmcache", "lmcache-mp", "hicache"] = Field(
+        default="none", alias=Fields.OFFLOADING.value
+    )
     conc_start: Optional[int] = Field(default=None, alias=Fields.CONC_START.value)
     conc_end: Optional[int] = Field(default=None, alias=Fields.CONC_END.value)
     conc_list: Optional[List[int]] = Field(default=None, alias=Fields.CONC_LIST.value)
diff --git a/utils/process_agentic_result.py b/utils/process_agentic_result.py
index 10aaff80e..3c4015ce6 100644
--- a/utils/process_agentic_result.py
+++ b/utils/process_agentic_result.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """Process aiperf agentic-replay output into the InferenceX agg_*.json shape.
 
-Reads aiperf's three artifact files from $RESULT_DIR/trace_replay/ and emits
+Reads aiperf's three artifact files from $RESULT_DIR/aiperf_artifacts/ and emits
 $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json with the same key schema fixed-seq-len
 and the legacy kv-cache-tester pipeline produce, so utils/summarize.py and
 sibling aggregators keep working without changes.
@@ -37,7 +37,7 @@
 # Trace metadata lookup: conversation_id (= trace id) -> per-turn dict with
 # ``hash_ids`` and ``output_length``. Built lazily from the HF dataset cache.
 _TRACE_METADATA_CACHE: dict[str, list[dict]] | None = None
-_HF_DATASET = "semianalysisai/cc-traces-weka-042026"
+_HF_DATASET = "semianalysisai/cc-traces-weka-with-subagents-051926"
 
 
 # ---- helpers ---------------------------------------------------------------
@@ -626,11 +626,11 @@ def _resolve_artifact_dir(result_dir: Path) -> Path:
 
     aiperf accepts ``--output-artifact-dir`` and writes directly into it when
     ``--num-profile-runs == 1`` (our default), but creates a per-run subdir
-    when that flag is > 1. Handle both: prefer ``result_dir/trace_replay``
+    when that flag is > 1. Handle both: prefer ``result_dir/aiperf_artifacts``
     when it has the export files, else descend into the first child dir
     that does.
     """
-    base = result_dir / "trace_replay"
+    base = result_dir / "aiperf_artifacts"
     if (base / "profile_export.jsonl").is_file():
         return base
     if base.is_dir():
diff --git a/utils/proxy_to_weka.py b/utils/proxy_to_weka.py
new file mode 100644
index 000000000..3b5a28afb
--- /dev/null
+++ b/utils/proxy_to_weka.py
@@ -0,0 +1,514 @@
+#!/usr/bin/env python3
+"""Convert flat per-session JSONL dumps into weka-format trace JSON.
+
+Reads <in-dir>/<session_id>.jsonl produced by `sample_proxy_traces.py`
+and writes <in-dir>/../<out-dir>/<session_id>.json in the v1 weka trace
+format consumed by the kv-cache-tester replayer (see
+utils/aiperf/src/aiperf/dataset/loader/weka_trace_models.py).
+
+Subagent grouping mirrors the conversation-view algorithm from the
+SemiAnalysis claude-code-proxy:
+
+  1. Walk session rows chronologically.
+  2. A row with `subagent_label IS NULL` is a parent (main-agent) turn.
+  3. A run of consecutive non-null-label rows is a "stretch". The
+     stretch ends as soon as a NULL-label row appears.
+  4. Inside the stretch, group by `subagent_label`. Each label group
+     becomes one WekaSubagentEntry with its label rows as inner
+     WekaNormalRequest entries (in chronological order).
+  5. Different labels inside the same stretch produce sibling entries
+     (the dashboard renders parallel groups for each).
+
+Hash IDs (24-char hex strings in the proxy DB) are remapped to small
+per-trace ints so we can emit `hash_id_scope: "local"`. The mapping is
+session-scoped: first-seen hash gets 0, second 1, etc.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Any
+
+
+def _dump_trace_inline_hash_ids(trace: dict, path: Path) -> None:
+    """Serialize ``trace`` to ``path`` as 2-space-indented JSON in which
+    every ``hash_ids`` list stays on a single line.
+
+    ``json.dumps(..., indent=2)`` puts each array element on its own
+    line, so long ``hash_ids`` lists would drown out the structure. To
+    avoid that, the tree is copied with each ``hash_ids`` list swapped
+    for a numbered marker string, dumped, and every quoted marker in
+    the resulting text is then replaced by the compact encoding of its
+    list. The file ends with a single newline.
+    """
+    stashed: list[list[Any]] = []
+
+    def _mask(node):
+        # Lists are rebuilt element by element; scalars pass through.
+        if isinstance(node, list):
+            return [_mask(item) for item in node]
+        if not isinstance(node, dict):
+            return node
+        masked: dict[str, Any] = {}
+        for key, value in node.items():
+            if key == "hash_ids" and isinstance(value, list):
+                # The marker's number is the list's index in ``stashed``.
+                masked[key] = f"@@HASHIDS_{len(stashed)}@@"
+                stashed.append(value)
+            else:
+                masked[key] = _mask(value)
+        return masked
+
+    def _expand(match):
+        return json.dumps(stashed[int(match.group(1))], separators=(", ", ": "))
+
+    # json.dumps wraps each marker in double quotes; the pattern consumes
+    # those quotes so the list replaces the whole JSON string.
+    rendered = json.dumps(_mask(trace), indent=2)
+    rendered = re.sub(r'"@@HASHIDS_(\d+)@@"', _expand, rendered)
+    with path.open("w") as handle:
+        handle.write(rendered + "\n")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the process's command-line arguments.
+
+    Both ``--in-dir`` and ``--out-dir`` are required and parsed as ``Path``.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=__doc__,
+    )
+    in_help = "Directory containing <session_id>.jsonl files (the output of sample_proxy_traces.py)."
+    out_help = "Directory to write <session_id>.json weka traces into."
+    parser.add_argument("--in-dir", "-i", type=Path, required=True, help=in_help)
+    parser.add_argument("--out-dir", "-o", type=Path, required=True, help=out_help)
+    return parser.parse_args()
+
+
+_SLUG_RE = re.compile(r"[^a-z0-9]+")  # one or more non-slug characters
+
+
+def slugify(label: str) -> str:  # "Web Search" -> "web_search"; "" -> "subagent"
+    return "_".join(filter(None, _SLUG_RE.split(label.lower()))) or "subagent"
+
+
+def load_session_rows(path: Path) -> list[dict]:
+    """Load one session's JSONL rows, sorted by timestamp and de-duplicated.
+
+    Blank lines are skipped. The file is decoded as UTF-8 explicitly so
+    parsing does not depend on the platform's default encoding.
+
+    The proxy occasionally records the same logical request twice
+    (observed at ~1.5% of subagent inner rows on the v5 + CC>=2.1.139
+    pool, concentrated in heavy-fanout subagents). Left in, the copies
+    inflate token / request counts and look "concurrent with itself"
+    during grouping. Rows count as duplicates when they share
+    (timestamp, model, input_tokens, output_tokens, duration_ms,
+    agent_id); distinct requests agreeing on all six are unlikely
+    enough that collapsing them is safe. The first copy is kept and
+    the number dropped is reported on stderr.
+    """
+    with path.open(encoding="utf-8") as f:
+        rows: list[dict] = [json.loads(s) for s in map(str.strip, f) if s]
+    rows.sort(key=lambda r: r["timestamp"])
+
+    seen: set[tuple] = set()
+    deduped: list[dict] = []
+    for r in rows:
+        fp = (
+            r.get("timestamp"),
+            r.get("model"),
+            r.get("input_tokens"),
+            r.get("output_tokens"),
+            r.get("duration_ms"),
+            r.get("agent_id") or "",
+        )
+        if fp in seen:
+            continue
+        seen.add(fp)
+        deduped.append(r)
+    n_dropped = len(rows) - len(deduped)
+    if n_dropped:
+        print(
+            f"  dedup: dropped {n_dropped} exact-duplicate row(s) from {path.name}",
+            file=sys.stderr,
+        )
+    return deduped
+
+
+def remap_hash(h: str, m: dict[str, int]) -> int:
+    """Return the int id of ``h`` in ``m``, assigning the next unused
+    index (``len(m)``) the first time ``h`` is seen."""
+    return m.setdefault(h, len(m))
+
+
+def infer_block_size(rows: list[dict]) -> int:
+    """Return the KV-cache block size in tokens, which is always 64.
+
+    ``rows`` is ignored on purpose: Anthropic's KV cache uses a constant
+    64-token block, and ``hash_token_count / len(hash_ids)`` is unreliable
+    when a trailing partial block is not hashed (53 for a 377-token call).
+    """
+    return 64
+
+
+def effective_input_length(row: dict, block_size: int = 64) -> int:
+    """Pick the ``in`` token count to emit for ``row``.
+
+    The replayed prompt should cover exactly what the proxy hashed: any
+    unhashed tail (typically the turn's volatile user message) would be
+    synthesized filler at replay time. Sources, in priority order:
+
+      1. ``hash_token_count`` when positive: the proxy's own count,
+         which already handles a partial last block (e.g. 212 rather
+         than 256 for 4 blocks).
+      2. ``len(hash_ids) * block_size`` when hashes exist but no count
+         was recorded.
+      3. ``input_tokens + cache_read_input_tokens + cache_write_tokens``
+         when the row has no hash coverage at all.
+
+    Args:
+        row: One proxy row; missing or null fields count as 0 / empty.
+        block_size: Tokens per hash block, used only by source 2.
+    """
+    hashed_tokens = row.get("hash_token_count") or 0
+    if hashed_tokens > 0:
+        return hashed_tokens
+    # No recorded count: treat every hashed block as full.
+    n_blocks = len(row.get("hash_ids") or [])
+    if n_blocks:
+        return n_blocks * block_size
+    # No hash coverage: fall back to the total prompt length.
+    total_fields = ("input_tokens", "cache_read_input_tokens", "cache_write_tokens")
+    return sum(row.get(name) or 0 for name in total_fields)
+
+
+def build_normal_request(
+    row: dict, hash_map: dict[str, int], think_time: float | None
+) -> dict:
+    """Inner subagent request: always ``type: "n"``; ``think_time`` only if given."""
+    hash_ids = row.get("hash_ids") or []
+    duration_ms = row.get("duration_ms") or 0
+    request = {
+        "t": row["t_sec"],
+        "type": "n",
+        "model": row["model"],
+        "in": effective_input_length(row),
+        "out": row.get("output_tokens") or 0,
+        "hash_ids": [remap_hash(h, hash_map) for h in hash_ids],
+        "api_time": duration_ms / 1000.0,
+    }
+    return request if think_time is None else {**request, "think_time": think_time}
+
+
+def build_top_request(
+    row: dict, hash_map: dict[str, int], think_time: float | None
+) -> dict:
+    """Top-level main-agent request: ``"s"`` (with ``ttft`` in seconds when
+    ``ttft_ms`` is recorded) if ``is_streaming`` is truthy, else ``"n"``."""
+    request = {
+        "t": row["t_sec"],
+        "model": row["model"],
+        "in": effective_input_length(row),
+        "out": row.get("output_tokens") or 0,
+        "hash_ids": [remap_hash(h, hash_map) for h in (row.get("hash_ids") or [])],
+        "api_time": (row.get("duration_ms") or 0) / 1000.0,
+    }
+    if think_time is not None:
+        request["think_time"] = think_time
+    if not row.get("is_streaming"):
+        request["type"] = "n"
+        return request
+    request["type"] = "s"
+    if row.get("ttft_ms") is not None:
+        request["ttft"] = row["ttft_ms"] / 1000.0
+    return request
+
+
+def compute_think_times(rows: list[dict]) -> list[float | None]:
+    """Per-row idle gap, in seconds, since the previous row finished.
+
+    A row's end is ``t_sec`` plus ``duration_ms`` (missing/null -> 0)
+    converted to seconds. The first row has no predecessor and gets
+    None. A row that starts before the previous one ended gets 0.0
+    rather than a negative gap.
+    """
+    gaps: list[float | None] = []
+    # previous_end stays None only until the first row has been seen.
+    previous_end = None
+    for row in rows:
+        start = row["t_sec"]
+        gaps.append(None if previous_end is None else max(0.0, start - previous_end))
+        previous_end = start + (row.get("duration_ms") or 0) / 1000.0
+    return gaps
+
+
+
+# Claude CLI version at which `x-claude-code-agent-id` became the
+# canonical sub-agent signal. On rows >= this version, a labelled row
+# without a header id is treated as a utility call (Title Generation,
+# Statusline Agent, …), demoted to a main turn instead of getting its
+# own SubagentEntry. Diverges intentionally from the dashboard, which
+# still renders those as subagents — we want clean weka traces.
+MIN_CLI_FOR_HEADER_AS_TRUTH = (2, 1, 139)
+
+
+def _parse_cli_version(s: str | None) -> tuple[int, int, int] | None:
+    """Parse ``"X.Y.Z"`` into an int triple; None for empty or malformed input."""
+    if not s:
+        return None
+    # A wrong component count and a non-integer part both raise ValueError.
+    try:
+        major, minor, patch = (int(part) for part in s.split("."))
+    except ValueError:
+        return None
+    return (major, minor, patch)
+
+
+def _is_utility_label_only(row: dict) -> bool:
+    """Whether ``row`` is a labelled utility call rather than a real sub-agent.
+
+    True only when all of these hold: the row has a ``subagent_label``,
+    it has neither an ``agent_id`` nor a ``thread_id``, and its
+    ``cli_version`` parses to at least ``MIN_CLI_FOR_HEADER_AS_TRUTH``.
+    On those CLI versions the header id is authoritative, so a labelled
+    row without one (Title Generation / Name Generation / Statusline)
+    should be emitted as a regular main turn.
+    """
+    labelled = bool(row.get("subagent_label"))
+    has_header_id = bool(row.get("agent_id") or row.get("thread_id"))
+    if not labelled or has_header_id:
+        return False
+    version = _parse_cli_version(row.get("cli_version"))
+    return version is not None and version >= MIN_CLI_FOR_HEADER_AS_TRUTH
+
+
+def _id_group_key(row: dict) -> str | None:
+    """Grouping key mirroring ``idGroupKey`` in subagent-runs.ts.
+
+    None unless the row is labelled and carries ``agent_id`` (preferred)
+    or ``thread_id``.
+    """
+    label = row.get("subagent_label")
+    agent_id, thread_id = row.get("agent_id"), row.get("thread_id")
+    if label and agent_id:
+        return f"cc-agent::{agent_id}"
+    if label and thread_id:
+        return f"{label}::thread::{thread_id}"
+    return None
+
+
+def build_subagent_entry(
+    label: str,
+    instance_idx: int,
+    items: list[tuple[dict, float | None]],
+    hash_map: dict[str, int],
+) -> dict:
+    """Wrap one sub-agent's rows in a ``type: "subagent"`` entry.
+
+    ``items`` pairs each row with its think time and must be non-empty.
+    The entry starts at the first row's ``t_sec``; ``duration_ms`` runs
+    from there to the end (start + duration) of the last row.
+    """
+    source_rows = [row for row, _ in items]
+    requests = [build_normal_request(row, hash_map, tt) for row, tt in items]
+    head, tail = source_rows[0], source_rows[-1]
+    finished_at = tail["t_sec"] + (tail.get("duration_ms") or 0) / 1000.0
+    # Id shape: <slug>_<zero-padded instance>, plus the last 8 chars of the
+    # Claude Code agent-id (preferred) or the Codex thread-id if present.
+    agent_id = f"{slugify(label)}_{instance_idx:03d}"
+    header_id = head.get("agent_id") or head.get("thread_id")
+    if header_id:
+        agent_id = f"{agent_id}_{header_id[-8:]}"
+    return {
+        "t": head["t_sec"],
+        "type": "subagent",
+        "agent_id": agent_id,
+        "subagent_type": label,
+        "duration_ms": int(round((finished_at - head["t_sec"]) * 1000)),
+        "total_tokens": sum(req["in"] + req["out"] for req in requests),
+        # tool_use_count is not tracked in the proxy DB; leave as None
+        # (the model field defaults to None).
+        "tool_use_count": None,
+        "status": "completed",
+        "requests": requests,
+        "models": sorted({row["model"] for row in source_rows}),
+    }
+
+
+def session_to_weka(session_id: str, rows: list[dict]) -> dict:
+    """Convert one session's rows into a trace dict.
+
+    The result has the keys ``id``, ``models``, ``block_size``,
+    ``hash_id_scope`` and ``requests``. ``requests`` mixes main-turn
+    entries (from ``build_top_request``) with sub-agent entries (from
+    ``build_subagent_entry``), in the order they are emitted below.
+
+    Args:
+        session_id: Copied verbatim into the output ``id`` field.
+        rows: Row dicts for the session. The pass-2 walk treats list
+            order as chronological order (presumably the loader sorts
+            them; confirm against the caller). Demotion works on
+            shallow copies, so the caller's row dicts are not modified
+            by it.
+
+    Returns:
+        The trace dict. For empty ``rows`` a skeleton with empty
+        ``models``/``requests`` and a hard-coded ``block_size`` of 64
+        is returned.
+    """
+    # Nothing to convert: skip inference and return an empty trace.
+    if not rows:
+        return {
+            "id": session_id,
+            "models": [],
+            "block_size": 64,
+            "hash_id_scope": "local",
+            "requests": [],
+        }
+
+    # Demote utility-labelled rows (no header id) on new CLI versions
+    # so they appear as main turns instead of 1-inner SubagentEntries.
+    # We work on a shallow copy that nulls out subagent_label on those
+    # rows; everything else is unchanged.
+    n_demoted = 0
+    demoted_rows: list[dict] = []
+    for r in rows:
+        if _is_utility_label_only(r):
+            r = {**r, "subagent_label": None}
+            n_demoted += 1
+        demoted_rows.append(r)
+    if n_demoted:
+        print(
+            f"  demoted {n_demoted} utility-labelled row(s) to main turns "
+            f"(no x-claude-code-agent-id on CLI >= "
+            f"{'.'.join(str(x) for x in MIN_CLI_FOR_HEADER_AS_TRUTH)})",
+            file=sys.stderr,
+        )
+    rows = demoted_rows
+
+    # Everything below sees the demoted rows. think_times is read as
+    # think_times[i] alongside rows[i], i.e. it is index-aligned with rows.
+    think_times = compute_think_times(rows)
+    # One dict shared by every build_* call of this session.
+    # NOTE(review): presumably maps content hashes to session-local ids
+    # (cf. hash_id_scope "local") — confirm in the helpers.
+    hash_map: dict[str, int] = {}
+    block_size = infer_block_size(rows)
+
+    out_requests: list[dict] = []
+    # Per-label running counter; its current value is handed to
+    # build_subagent_entry as the instance number for that label.
+    instance_count: dict[str, int] = {}
+    models_seen: set[str] = set()
+
+    # Pass 1: pre-collect ALL rows belonging to each header-keyed group
+    # across the entire session, not just within contiguous label
+    # stretches. A sub-agent running in the background while the user
+    # makes more main-agent requests would otherwise get fragmented
+    # into one entry per stretch. The agent-id / thread-id header is
+    # stable across fragments — collapse them. Mirrors the pass-1 logic
+    # in subagent-runs.ts:buildRequestRuns.
+    #
+    # NOTE(review): this pass does not look at subagent_label. If
+    # _id_group_key can return a key for a null-label row, that row
+    # would be emitted both as a main turn in pass 2 and inside its
+    # group — confirm that _id_group_key excludes such rows.
+    id_groups: dict[str, list[tuple[dict, float | None]]] = {}
+    for r, tt in zip(rows, think_times):
+        key = _id_group_key(r)
+        if key is None:
+            continue
+        id_groups.setdefault(key, []).append((r, tt))
+
+    # Pass 2: walk chronologically and emit:
+    #   - main turn (null label)           → emit at its position
+    #   - id-keyed sub-agent, first sight  → emit FULL collected group
+    #   - id-keyed sub-agent, already seen → skip (already grouped)
+    #   - label-only sub-agent (no header) → fall back to old stretch-
+    #                                        based grouping
+    #
+    # For agent-id (Claude Code ≥ 2.1.139) groups, the per-request label
+    # drifts arbitrarily across the agent's life (e.g. General Agent ↔
+    # Web Search Agent). We follow the dashboard and use a flat
+    # 'Subagent' label for those. For thread-id (Codex) groups, the
+    # label is stable so we keep the original.
+    emitted: set[str] = set()
+    i = 0
+    while i < len(rows):
+        row = rows[i]
+        # Main turn: emitted in place with its own think time.
+        if row.get("subagent_label") is None:
+            out_requests.append(build_top_request(row, hash_map, think_times[i]))
+            models_seen.add(row["model"])
+            i += 1
+            continue
+
+        key = _id_group_key(row)
+        if key is not None:
+            if key not in emitted:
+                emitted.add(key)
+                items = id_groups[key]
+                # Claude Code agent-id groups use the flat 'Subagent'
+                # label since per-request system-prompt labels drift.
+                # The label is decided from the first row seen for the
+                # group, not from every row in it.
+                use_label = (
+                    "Subagent" if row.get("agent_id") else row["subagent_label"]
+                )
+                instance_count[use_label] = instance_count.get(use_label, 0) + 1
+                entry = build_subagent_entry(
+                    use_label, instance_count[use_label], items, hash_map
+                )
+                out_requests.append(entry)
+                models_seen.update(entry["models"])
+            # Advance one row whether the group was just emitted or had
+            # already been emitted by an earlier row.
+            i += 1
+            continue
+
+        # Legacy contiguous-stretch fallback for label-only sub-agents
+        # (pre-2.1.139 Claude Code or rows with no header coverage).
+        # Same algorithm as before: collect consecutive same-label rows
+        # bounded by main-agent turns, group by label.
+        # The stretch also ends at the first id-keyed row, so the inner
+        # loop always consumes at least the current row.
+        stretch_rows: list[tuple[dict, float | None]] = []
+        while (i < len(rows)
+               and rows[i].get("subagent_label") is not None
+               and _id_group_key(rows[i]) is None):
+            stretch_rows.append((rows[i], think_times[i]))
+            i += 1
+        # dict insertion order means groups are emitted in the order in
+        # which each label first appears within the stretch.
+        groups: dict[str, list[tuple[dict, float | None]]] = {}
+        for r, tt in stretch_rows:
+            groups.setdefault(r["subagent_label"], []).append((r, tt))
+        for label, items in groups.items():
+            instance_count[label] = instance_count.get(label, 0) + 1
+            entry = build_subagent_entry(
+                label, instance_count[label], items, hash_map
+            )
+            out_requests.append(entry)
+            models_seen.update(entry["models"])
+
+    # models is sorted so the output does not depend on set ordering.
+    return {
+        "id": session_id,
+        "models": sorted(models_seen),
+        "block_size": block_size,
+        "hash_id_scope": "local",
+        "requests": out_requests,
+    }
+
+
+def main() -> int:
+    args = parse_args()
+
+    in_files = sorted(p for p in args.in_dir.glob("*.jsonl"))
+    if not in_files:
+        sys.exit(f"ERROR: no .jsonl files in {args.in_dir}")
+
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    n_traces = 0
+    n_top = 0
+    n_subagent_entries = 0
+    n_inner = 0
+    for src in in_files:
+        session_id = src.stem
+        rows = load_session_rows(src)
+        trace = session_to_weka(session_id, rows)
+
+        out_path = args.out_dir / f"{session_id}.json"
+        _dump_trace_inline_hash_ids(trace, out_path)
+
+        n_traces += 1
+        for entry in trace["requests"]:
+            if entry.get("type") == "subagent":
+                n_subagent_entries += 1
+                n_inner += len(entry["requests"])
+            else:
+                n_top += 1
+
+        print(
+            f"{session_id}: {len(rows)} row(s) -> "
+            f"{len(trace['requests'])} entries "
+            f"({sum(1 for e in trace['requests'] if e.get('type') == 'subagent')} subagent groups)"
+            f" -> {out_path}",
+            file=sys.stderr,
+        )
+
+    print(
+        f"\nWrote {n_traces} trace(s): "
+        f"{n_top} main turns, "
+        f"{n_subagent_entries} subagent groups ({n_inner} inner requests)",
+        file=sys.stderr,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/utils/test_process_agentic_result.py b/utils/test_process_agentic_result.py
index c54e79736..38477b62a 100644
--- a/utils/test_process_agentic_result.py
+++ b/utils/test_process_agentic_result.py
@@ -1,6 +1,6 @@
 """Smoke tests for process_agentic_result.py against synthetic aiperf output.
 
-The processor consumes three files in $RESULT_DIR/trace_replay/:
+The processor consumes three files in $RESULT_DIR/aiperf_artifacts/:
 profile_export.jsonl, profile_export_aiperf.json, and
 (optionally) server_metrics_export.json. It writes one
 $RESULT_FILENAME.json under $AGENTIC_OUTPUT_DIR. We build a minimal
@@ -94,7 +94,7 @@ def _make_record(
 def _write_fixture(tmp_path: Path) -> Path:
     """Build a $RESULT_DIR with aiperf-shaped artifacts. Returns RESULT_DIR."""
     result_dir = tmp_path / "results"
-    artifact = result_dir / "trace_replay"
+    artifact = result_dir / "aiperf_artifacts"
     artifact.mkdir(parents=True)
 
     # 5 records across 2 conversations; turn indices grow within each.
@@ -264,7 +264,7 @@ def test_processor_response_cache_hit_rate_populated_when_cached_tokens_present(
     tmp_path: Path,
 ):
     result_dir = tmp_path / "results"
-    artifact = result_dir / "trace_replay"
+    artifact = result_dir / "aiperf_artifacts"
     artifact.mkdir(parents=True)
     rec = _make_record(
         conv_id="trace-A",
@@ -301,7 +301,7 @@ def test_processor_parses_real_server_metrics_schema(tmp_path: Path):
     iterated the metrics dict like a list.
     """
     result_dir = _write_fixture(tmp_path)
-    artifact = result_dir / "trace_replay"
+    artifact = result_dir / "aiperf_artifacts"
     server_metrics = {
         "schema_version": "1.0",
         "summary": {
@@ -368,7 +368,7 @@ def test_processor_parses_real_server_metrics_schema(tmp_path: Path):
 def test_processor_aggregates_across_multiple_series(tmp_path: Path):
     """Counters with multiple series (multi-endpoint) sum across them."""
     result_dir = _write_fixture(tmp_path)
-    artifact = result_dir / "trace_replay"
+    artifact = result_dir / "aiperf_artifacts"
     server_metrics = {
         "metrics": {
             "vllm:prefix_cache_hits": {
@@ -468,7 +468,7 @@ def test_processor_loads_traces_jsonl_for_theoretical_cache(tmp_path: Path):
 def test_processor_supports_per_run_subdir_layout(tmp_path: Path):
     """When --num-profile-runs > 1, aiperf writes into a per-run subdir."""
     result_dir = tmp_path / "results"
-    artifact = result_dir / "trace_replay" / "run_0"
+    artifact = result_dir / "aiperf_artifacts" / "run_0"
     artifact.mkdir(parents=True)
     rec = _make_record(
         conv_id="trace-A",
diff --git a/utils/trace-replay b/utils/trace-replay
deleted file mode 160000
index 9074e186d..000000000
--- a/utils/trace-replay
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 9074e186da47998c0171a6053aecc70b24625b3b