diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a3afb2f6b..ec8d8667a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -323,6 +323,21 @@ qwen3.5-fp8-mi355x-sglang-agentic: search-space: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } +qwen3.5-fp8-mi355x-sglang-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + qwen3.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -653,10 +668,6 @@ kimik2.5-fp4-mi355x-vllm: # its fixed-seq-len sweep is unaffected. # - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' kimik2.5-fp4-mi355x-vllm-agentic: - # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin - # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm - # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and - # includes all subsequent ROCm offload work. image: vllm/vllm-openai-rocm:v0.21.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 @@ -669,16 +680,9 @@ kimik2.5-fp4-mi355x-vllm-agentic: - duration: 1800 search-space: - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } - # CPU offload only above the KV cliff. Lower concurrencies fit - # entirely on-GPU, so paying the offload-path overhead there would - # just slow them down without measuring anything new. - - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } - # TP=4 probe: half-node layout doubles per-GPU weight footprint - # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). 
Restrict to - # cliff-region concurrencies on both offload modes so we can directly - # compare TP=4 vs TP=8 at the same conc points. + - { tp: 8, offloading: lmcache, conc-list: [32, 40, 48, 56] } - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } + - { tp: 4, offloading: lmcache, conc-list: [16, 24, 32, 40] } kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 @@ -701,6 +705,22 @@ kimik2.5-fp4-mi355x-atom: - { tp: 8, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } +dsv4-fp4-mi355x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.21.0 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4] } + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } + minimaxm2.5-fp8-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.21.0 model: MiniMaxAI/MiniMax-M2.5 @@ -1833,6 +1853,29 @@ dsv4-fp4-mi355x-sglang: - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 } - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32 } +# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below; +# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - scenarios: replaced fixed-seq-len with agentic-coding. +# Image is identical to the base entry (rocm/sgl-dev DSv4 build). +# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware +# comparability. Offload sweep is none-only (SGLang has no equivalent of +# vLLM's SimpleCPUOffloadConnector path that we exercise on b200). 
+dsv4-fp4-mi355x-sglang-agentic: + image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [16, 32, 64] } + - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } + # DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm # nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged # on 2026-05-05, so any nightly built after that includes the diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f8cc486b2..e77a2916f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1726,7 +1726,7 @@ dsv4-fp4-b200-sglang: framework: sglang multinode: false # Two recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 - # are selected inside benchmarks/single_node/dsv4_fp4_b200.sh by DP_ATTENTION: + # are selected inside benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh by DP_ATTENTION: # low-latency (DP_ATTENTION=false): TP-only, flashinfer_mxfp4 # DP-attention (DP_ATTENTION=true): DP-attn + DeepEP + mega_moe opts # The DP-attention recipe covers both "balanced" (conc 64-128) and @@ -1781,8 +1781,10 @@ dsv4-fp4-b200-vllm: # the original dsv4-fp4-b200-vllm entry is left identical to origin/main so # its fixed-seq-len sweep is unaffected. # - runner: 'b200-dsv4' -> 'b200-dgxc' +# - image: bumped to a custom v0.21.0 build (cquil/vllm-openai:v0.21.0-dsv4-offloading) +# to test SimpleCPUOffloadConnector lazy_offload behavior on a newer vLLM. 
dsv4-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 + image: cquil/vllm-openai:v0.21.0-dsv4-offloading model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dgxc @@ -1793,11 +1795,16 @@ dsv4-fp4-b200-vllm-agentic: agentic-coding: - duration: 1800 search-space: - # cpu offload only this iteration — none entries already validated in - # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%). - # Re-add when investigating regressions in offload=none. - - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } + # no-offload curve against the new cc-traces-weka-no-subagents-051826 + # dataset (98 traces, v5-only + CC ≥ 2.1.139). cpu-offload entries + # removed for this iteration; restore from prior commits if revisiting + # offload regressions. + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16] } + # Native vLLM CPU offload with HMA enabled. The benchmark script sizes + # the aggregate native offload pool to the same 2.8 TB target used for + # the blocked LMCache experiment. 
+ # - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [12, 16, 24, 32, 48, 64] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [12, 16, 24, 32, 48, 64] } dsv4-fp4-b200-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 @@ -2013,7 +2020,7 @@ dsv4-fp4-b300-sglang: framework: sglang multinode: false # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 - # are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC: + # are selected inside benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh by CONC: # low-latency (CONC <= 32): TP-only # balanced (32 < CONC <= 128): + DP-attn # max-throughput (CONC > 128): + DP-attn @@ -2039,7 +2046,7 @@ dsv4-fp4-b300-sglang: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is - # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by + # selected inside benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh by # DP_ATTENTION: # dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192 # + EAGLE (3,1,4) + mem-fraction 0.90 @@ -2453,6 +2460,21 @@ qwen3.5-fp8-b300-sglang: search-space: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } +qwen3.5-fp8-b300-sglang-agentic-hicache: + image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: b300 + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + qwen3.5-fp4-b300-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: nvidia/Qwen3.5-397B-A17B-NVFP4 @@ -2677,13 +2699,32 @@ kimik2.5-fp4-b200-vllm: # Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). 
Reasons below; # the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so # its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2' +# - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.21.0' # - runner: 'b200' -> 'b200-dgxc' kimik2.5-fp4-b200-vllm-agentic: - # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that - # cleared the agentic-coding warmup crash on max_model_len=131072 + - # prefix caching. - image: vllm/vllm-openai:v0.20.2 + # v0.21.0 ships a newer huggingface_hub that resolves LFS content correctly + # in `hf download` (1.14.0 in v0.20.x silently fetched LFS pointer files, + # which pyarrow then choked on with "Missing a name for object member" -- + # see run 26536606210). v0.20.x's flashinfer fix for the agentic-coding + # warmup crash on max_model_len=131072 + prefix caching is included. + image: vllm/vllm-openai:v0.21.0 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b200-dgxc + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } + # - { tp: 8, ep: 1, offloading: cpu, conc-list: [16, 24, 32, 36] } + - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } + # - { tp: 4, ep: 1, offloading: cpu, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } + +kimik2.5-fp4-b200-vllm-agentic-lmcache: + image: vllm/vllm-openai:v0.21.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b200-dgxc @@ -2695,9 +2736,9 @@ kimik2.5-fp4-b200-vllm-agentic: - duration: 1800 search-space: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } - - { tp: 8, ep: 1, offloading: cpu, conc-list: [16, 24, 32, 36] } + - { tp: 8, ep: 1, offloading: lmcache, conc-list: [16, 24, 32, 36] } - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } - - { tp: 4, ep: 1, offloading: cpu, 
conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } + - { tp: 4, ep: 1, offloading: lmcache, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html # does not have a B300-specific recipe, so this config reuses the existing @@ -2775,12 +2816,7 @@ dsr1-fp8-b300-sglang-mtp: # - precision: 'fp8' -> 'fp4' # - framework: 'sglang' -> 'vllm' kimik2.5-fp4-b300-vllm-agentic: - # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM - # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the - # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted - # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the - # INT4 B300 sister already uses successfully. - image: vllm/vllm-openai:v0.20.0-cu130 + image: vllm/vllm-openai:v0.21.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b300 @@ -2793,6 +2829,7 @@ kimik2.5-fp4-b300-vllm-agentic: search-space: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - { tp: 8, ep: 1, offloading: cpu, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + - { tp: 8, ep: 1, offloading: lmcache, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14 @@ -3044,12 +3081,13 @@ dsv4-fp4-b300-vllm-agentic: agentic-coding: - duration: 1800 search-space: - # cpu offload only this iteration — none entries already validated in - # earlier runs. Re-add when investigating regressions in offload=none. - - { tp: 4, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } + # no-offload curve against the new cc-traces-weka-no-subagents-051826 + # dataset (98 traces, v5-only + CC ≥ 2.1.139). 
cpu-offload entries + # removed for this iteration; restore from prior commits if revisiting + # offload regressions. + - { tp: 8, offloading: none, conc-list: [1, 2, 4] } + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } dsv4-fp4-b300-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 @@ -4766,7 +4804,7 @@ minimaxm2.5-fp8-h200-vllm: # (either main had none or had a different conc/offload sweep). # The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.21.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h200 @@ -8756,6 +8794,153 @@ dsv4-fp4-gb300-dynamo-vllm: ep: 16 dp-attn: true +# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons +# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to +# origin/main so its fixed-seq-len sweep is unaffected. +# - scenarios: replaced fixed-seq-len with agentic-coding; three tiers: 1p6d at +# conc 32 and 192, plus 4p1d at conc 4096 (192 and 4096 mirror base-entry points). +# - additional-settings.CONFIG_FILE: points at the new agentic recipes under +# recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh +# overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC +# branch). Local-overlay pattern mirrors the existing 8k1k overlay. +dsv4-fp4-gb300-dynamo-vllm-agentic: + image: vllm/vllm-openai:v0.21.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + # gb300-nv (not generic gb300) — the generic label is shared by both NV + # and CW runner pools, so runs-on: gb300 lets CW runners pick up shards. + # The gb300-nv label is on NV runners only (per .github/configs/runners.yaml + # + actual runner label listings). Pins agentic to the NVIDIA cluster + # for initial validation. 
Drop -nv suffix to widen later. + runner: gb300-nv + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # Low-latency: same 1p6d shape as the mid tier but at much lower conc + # (32 vs 192). 32/6 ≈ 5 seqs per decode worker — well below saturation, + # so each request gets ~6× the per-request decode compute it would get + # at conc=192. Reuses the 1p6d recipe; no separate recipe file needed. + - spec-decoding: none + conc-list: [32] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + # Mid: 1 prefill (DEP=4) + 6 decode (TP=4). 7 nodes / 28 GPUs. + # Mirrors fixed-seq-len conc=192 entry. + - spec-decoding: none + conc-list: [192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + # High-throughput: 4 prefill (DEP=4 each) + 1 decode (DEP=8). 6 nodes / + # 24 GPUs. Smallest 4096-class shape in fixed-seq-len; deep_gemm_mega_moe + # on both sides. Mirrors fixed-seq-len conc=4096 entry (4p1d variant). + - spec-decoding: none + conc-list: [4096] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + +# CoreWeave sibling of dsv4-fp4-gb300-dynamo-vllm-agentic — same image, +# recipes, and search space; only `runner` differs (gb300-cw vs gb300-nv). 
+# Kept as a separate config (not a label-widening on the -nv entry) +# because we dispatch NV and CW as independent sweep runs — bundling +# both SKUs into one `gh workflow run` invocation lets a fault on one +# cascade-cancel the other (see prior R20–R23 outages). The two sibling +# configs share recipe files via the same launch_gb300-cw.sh IS_AGENTIC +# overlay (recipes/vllm/deepseek-v4/agentic/), so a change to the recipe +# applies to both clusters with no duplication. +dsv4-fp4-gb300-cw-dynamo-vllm-agentic: + image: vllm/vllm-openai:v0.21.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # Low-latency: 1p6d at conc=32. + - spec-decoding: none + conc-list: [32] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + # Mid: 1p6d at conc=192. + - spec-decoding: none + conc-list: [192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + # High-throughput: 4p1d at conc=4096. 
+ - spec-decoding: none + conc-list: [4096] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + dsv4-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd model: deepseek-ai/DeepSeek-V4-Pro @@ -9214,6 +9399,31 @@ qwen3.5-fp8-h100-sglang: search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } +# Diverged from qwen3.5-fp8-h100-sglang (agentic-coding sibling). Reasons below; +# the original qwen3.5-fp8-h100-sglang entry stays byte-identical to origin/main +# so its fixed-seq-len sweep is unaffected. +# - scenarios: replaced fixed-seq-len with agentic-coding. +# - runner: 'h100' -> 'h100-dgxc' (agentic runs need the dgxc-slurm cluster). +# Image is identical to the base entry (lmsysorg/sglang:v0.5.12-cu130). +# CONC range conservative for H100's 80 GB HBM3 under the long-ISL with- +# subagents corpus. none arm stops at conc 16 (covers the cliff); hicache arm +# runs conc 12-42, though high-conc + hicache tends to flake on first runs. +# The bench script sets WEKA_LOADER_OVERRIDE to the 256k-capped corpus variant. 
+qwen3.5-fp8-h100-sglang-agentic: + image: lmsysorg/sglang:v0.5.12-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: h100-dgxc + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] } + - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] } + qwen3.5-fp8-h100-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 27d9a098e..eee8405d0 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -116,19 +116,19 @@ mi325x-disagg: - 'mi325x-amds_07' - 'mi325x-amds_08' mi355x: -- 'mi355x-amds_0' -- 'mi355x-amds_1' -- 'mi355x-amds_2' -- 'mi355x-amds_3' -- 'mi355x-amds_4' -- 'mi355x-amds_5' -- 'mi355x-amds_6' -- 'mi355x-amds_7' -- 'mi355x-amds_8' +- 'mi355x-amds_00' +- 'mi355x-amds_01' +- 'mi355x-amds_02' +- 'mi355x-amds_03' +- 'mi355x-amds_04' +- 'mi355x-amds_05' +- 'mi355x-amds_06' +- 'mi355x-amds_07' +- 'mi355x-amds_08' mi355x-disagg: -- 'mi355x-amds_6' -- 'mi355x-amds_7' -- 'mi355x-amds_8' +- 'mi355x-amds_06' +- 'mi355x-amds_07' +- 'mi355x-amds_08' gb200: - gb200-nv_0 - gb200-nv_1 diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index f901b1ff7..81727ef39 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -139,7 +139,7 @@ env: EVAL_ONLY: ${{ inputs.eval-only }} EVAL_CONC: ${{ inputs.eval-conc }} SCENARIO_TYPE: ${{ inputs.scenario-type }} - SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || '' }} + SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }} IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }} CONC: ${{ inputs.conc }} DURATION: 
${{ inputs.duration }} @@ -291,8 +291,8 @@ jobs: LOGS/agentic/benchmark_command.txt LOGS/agentic/workload_distribution_summary.txt LOGS/agentic/workload_distribution_plots.png - LOGS/agentic/trace_replay/detailed_results.csv - LOGS/agentic/trace_replay/debug_trace.jsonl + LOGS/agentic/aiperf_artifacts/detailed_results.csv + LOGS/agentic/aiperf_artifacts/debug_trace.jsonl if-no-files-found: ignore - name: Upload eval results (if any) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index cca6031c3..2148def36 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -73,7 +73,7 @@ on: type: string default: 'fixed-seq-len' offloading: - description: "KV offload backend for agentic scenarios (none/cpu/ssd)" + description: "KV offload backend for agentic scenarios (none/cpu/ssd/lmcache/lmcache-mp/hicache)" required: false type: string default: 'none' @@ -109,7 +109,7 @@ env: RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} SCENARIO_TYPE: ${{ inputs.scenario-type }} - SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || '' }} + SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || 'fixed_seq_len/' }} IS_AGENTIC: ${{ inputs.scenario-type == 'agentic-coding' && '1' || '0' }} OFFLOADING: ${{ inputs.offloading }} TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }} @@ -151,7 +151,7 @@ jobs: fi # Cleanup results/ from a prior job on this runner. Agentic jobs - # write to fixed subpaths (trace_replay/, metrics_*, etc.), so stale + # write to fixed subpaths (aiperf_artifacts/, metrics_*, etc.), so stale # data from a previous job would otherwise be picked up as this # job's output when replay fails early. 
rm -rf "${{ github.workspace }}/results" 2>/dev/null || true @@ -226,7 +226,7 @@ jobs: path: agg_${{ env.RESULT_FILENAME }}.json - name: Upload agentic aggregated result - if: ${{ inputs.scenario-type == 'agentic-coding' }} + if: ${{ always() && inputs.scenario-type == 'agentic-coding' }} uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: bmk_agentic_${{ env.RESULT_FILENAME }} @@ -239,33 +239,36 @@ jobs: name: agentic_${{ env.RESULT_FILENAME }} path: | results/server.log + results/lmcache_server.log results/benchmark.log results/config.yaml + results/lmcache_command.txt + results/sglang_command.txt results/vllm_command.txt results/benchmark_command.txt results/workload_distribution_summary.txt results/workload_distribution_plots.png results/metrics_plots.png - results/trace_replay/profile_export.jsonl - results/trace_replay/profile_export_aiperf.json - results/trace_replay/profile_export_aiperf.csv - results/trace_replay/profile_export_aiperf_timeslices.json - results/trace_replay/profile_export_aiperf_timeslices.csv - results/trace_replay/profile_export_aiperf_aggregate.json - results/trace_replay/profile_export_aiperf_aggregate.csv - results/trace_replay/profile_export_aiperf_collated.json - results/trace_replay/server_metrics_export.json - results/trace_replay/server_metrics_export.jsonl - results/trace_replay/server_metrics_export.csv - results/trace_replay/server_metrics_export.parquet - results/trace_replay/gpu_telemetry_export.jsonl - results/trace_replay/logs/aiperf.log - results/trace_replay/logs/*.log + results/aiperf_artifacts/profile_export.jsonl + results/aiperf_artifacts/profile_export_aiperf.json + results/aiperf_artifacts/profile_export_aiperf.csv + results/aiperf_artifacts/profile_export_aiperf_timeslices.json + results/aiperf_artifacts/profile_export_aiperf_timeslices.csv + results/aiperf_artifacts/profile_export_aiperf_aggregate.json + results/aiperf_artifacts/profile_export_aiperf_aggregate.csv + 
results/aiperf_artifacts/profile_export_aiperf_collated.json + results/aiperf_artifacts/server_metrics_export.json + results/aiperf_artifacts/server_metrics_export.jsonl + results/aiperf_artifacts/server_metrics_export.csv + results/aiperf_artifacts/server_metrics_export.parquet + results/aiperf_artifacts/gpu_telemetry_export.jsonl + results/aiperf_artifacts/logs/aiperf.log + results/aiperf_artifacts/logs/*.log # Excluded by design (multi-GB debug artifacts, not consumed by - # post-processing): results/trace_replay/inputs.json (pre-formatted + # post-processing): results/aiperf_artifacts/inputs.json (pre-formatted # request bodies — the mmap'd binary equivalent is rebuilt from # --public-dataset + --random-seed) and - # results/trace_replay/profile_export_raw.jsonl (full HTTP bodies + # results/aiperf_artifacts/profile_export_raw.jsonl (full HTTP bodies # per request — recoverable by re-running the same trace). if-no-files-found: ignore @@ -274,7 +277,9 @@ jobs: uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }} - path: ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }} + path: | + ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }} + ${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }} if-no-files-found: ignore - name: Upload GPU metrics diff --git a/.gitmodules b/.gitmodules index 03670a881..fb9b1cc76 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,8 +1,4 @@ -[submodule "utils/trace-replay"] - path = utils/trace-replay - url = https://github.com/callanjfox/kv-cache-tester.git - branch = agentx-minimized [submodule "utils/aiperf"] path = utils/aiperf url = https://github.com/cquil11/aiperf.git - branch = cjq/weka-live-assistant-responses + branch = cjq/agentx-v0.3-subagents diff --git a/benchmarks/benchmark_lib.sh 
b/benchmarks/benchmark_lib.sh index f5e39b4cf..cb66d75f5 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -9,6 +9,13 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}" mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true +# Inference server port shared by every benchmark recipe. Launchers that need +# a non-default value (e.g. launch_mi355x-amds.sh derives PORT from RUNNER_NAME +# to avoid collisions across concurrent gh-runners on a shared host) set PORT +# themselves before sourcing this file; the `:-` fallback only kicks in when +# nothing upstream set it. +export PORT="${PORT:-8888}" + # -------------------------------- # GPU monitoring helpers # -------------------------------- @@ -883,9 +890,6 @@ run_eval() { INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}" AIPERF_DIR="${AIPERF_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/aiperf}" -# TRACE_REPLAY_DIR retained for any out-of-tree consumer that still -# imports the kv-cache-tester scripts. Not used by the helpers below. -TRACE_REPLAY_DIR="${TRACE_REPLAY_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/trace-replay}" agentic_pip_install() { local pip_install=(python3 -m pip install) @@ -907,12 +911,27 @@ ensure_hf_cli() { } resolve_trace_source() { - local dataset="semianalysisai/cc-traces-weka-no-subagents-051226" - # aiperf reads the corpus via its public-dataset registry; the loader - # under the hood pulls from semianalysisai/cc-traces-weka-no-subagents-051226 - # (949 traces, no-subagents variant — see plugins.yaml). - TRACE_SOURCE_FLAG="--public-dataset semianalysis_cc_traces_weka" - echo "Loading traces via aiperf public-dataset: semianalysis_cc_traces_weka ($dataset)" + # Per-recipe override: set WEKA_LOADER_OVERRIDE to one of the aiperf + # public-dataset loader names allowed by the inferencex-agentx-mvp + # scenario. 
Used by recipes whose servers have non-default context + # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the + # unfiltered 052726 corpus and switches to the 256k-capped variant). + local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}" + local dataset + case "$loader" in + semianalysis_cc_traces_weka_with_subagents) + dataset="semianalysisai/cc-traces-weka-with-subagents-052726" + ;; + semianalysis_cc_traces_weka_with_subagents_256k) + dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k" + ;; + *) + echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2 + exit 1 + ;; + esac + TRACE_SOURCE_FLAG="--public-dataset $loader" + echo "Loading traces via aiperf public-dataset: $loader ($dataset)" # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used # for model weights) so subsequent runs read from cache instead of # re-downloading every job. @@ -921,6 +940,12 @@ resolve_trace_source() { } install_agentic_deps() { + # vllm/vllm-openai container ships without git. pip needs git to + # introspect the aiperf source tree on install. Install on demand; + # no-op when git is already present (e.g. AMD images that ship it). + if ! command -v git >/dev/null 2>&1; then + apt-get update && apt-get install -y git + fi agentic_pip_install --quiet urllib3 requests 2>/dev/null || true agentic_pip_install -q -r "$AGENTIC_DIR/requirements.txt" # Editable install of aiperf from the submodule — gives us the @@ -943,22 +968,25 @@ install_agentic_deps() { build_replay_cmd() { # aiperf invocation for the inferencex-agentx-mvp scenario. # - # Live-assistant mode is on by default - # (AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES=1): the loader emits - # user-only deltas and the worker threads the server's live assistant - # response back into the session. 
This preserves cache-hit reuse on - # the just-generated KV blocks at the cost of hash-id fidelity past - # turn 0 — which is exactly what we want for benchmark numbers. + # Pre-canned assistant replay is the default: recorded assistant responses + # are used for future prompt construction, and live server responses are + # discarded. Set AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES=1 explicitly + # to use live-assistant mode, where the loader emits user-only deltas and + # the worker threads the server's live assistant response back into the + # session. # - # The scenario plugin locks: --cache-bust first_turn_prefix, - # --inter-turn-delay-cap-seconds 60, etc., and auto-injects them — so - # we do not pass them. See utils/aiperf/docs/tutorials/agentx-mvp.md. + # The scenario plugin locks: --cache-bust first_turn_prefix and + # --trace-idle-gap-cap-seconds 60 (per-trace idle-gap compression + # against parent + subagent request-start timestamps; supersedes the + # legacy --use-think-time-only / --inter-turn-delay-cap-seconds path), + # and auto-injects them — so we do not pass them. See + # utils/aiperf/docs/tutorials/agentx-mvp.md. local result_dir="$1" - local duration="${DURATION:-1800}" + local duration="$DURATION" - export AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES=1 + export AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES="${AIPERF_DATASET_WEKA_LIVE_ASSISTANT_RESPONSES:-0}" # Dataset configuration (load + reconstruct + inputs.json + mmap) - # routinely takes 4-5 min for the 949-trace weka corpus on fast /tmp + # routinely takes 4-5 min for the Weka corpus on fast /tmp # (B300) but can stretch to 14 min on slower /tmp + parallel contention # (observed on H200 where all 14 R3 jobs hit aiperf's 900s Configure # Profiling timeout simultaneously). 
Bump to 1800s to absorb 3x @@ -976,12 +1004,11 @@ build_replay_cmd() { REPLAY_CMD+=" --concurrency $CONC" REPLAY_CMD+=" --benchmark-duration $duration" REPLAY_CMD+=" --random-seed 42" - # Abort the run if real-failure rate exceeds 5% after a grace floor of - # max(CONC, 10) records. Context-overflow records are dropped from the - # failure tally in AGENTIC_REPLAY scenarios (see record_processor_service - # in the aiperf submodule), so this threshold measures only real failures - # (server 5xx, parse errors, malformed responses). - REPLAY_CMD+=" --failed-request-threshold 0.05" + # Fail runs once more than 10% of requests error. This keeps known + # transient low-rate failures from killing long sweeps while still + # catching malformed payloads or server crashes before they get aggregated + # as benchmarkable data. + REPLAY_CMD+=" --failed-request-threshold 0.10" # Sample each trajectory's warmup start position uniformly from # [25%, 75%] of the trace's turn count (was hardcoded 0%-70% upstream). # Avoids starting trajectories right at turn 0 where the KV cache is @@ -1002,11 +1029,18 @@ build_replay_cmd() { # need trust_remote_code=True to load. Benign for models without # custom tokenizer code, so we set it unconditionally. REPLAY_CMD+=" --tokenizer-trust-remote-code" - # Default --num-dataset-entries is 100; the weka corpus has 949. Cap - # at 949 so all unique traces are loaded (the loader treats this as a - # ``min(cap, available)`` ceiling, not a target — see + # Keep replay inputs inside the same context window used to launch the + # server. The WEKA corpus contains a few very long parent/subagent traces; + # if we mmap and replay them against a smaller-context server they become + # deterministic 4xxs and can still pressure the engine while queued. + if [ -n "${MAX_MODEL_LEN:-}" ] && [ "$MAX_MODEL_LEN" != "0" ]; then + REPLAY_CMD+=" --max-context-length $MAX_MODEL_LEN" + fi + # Default --num-dataset-entries is 100; the with-subagents Weka corpus + # has 472. 
Cap at 472 so all unique traces are loaded (the loader treats + # this as a ``min(cap, available)`` ceiling, not a target — see # semianalysis_cc_traces_weka.py). - REPLAY_CMD+=" --num-dataset-entries 949" + REPLAY_CMD+=" --num-dataset-entries 472" # 1-second timeslices on the server-metrics scrape so the post-run # plotter has per-window time series (KV usage, cache hit rate, # throughput, etc.). Matches kv-cache-tester's poll_interval=1.0 @@ -1014,7 +1048,7 @@ build_replay_cmd() { # Without this, aiperf only emits aggregate stats and the 6x2 panels # collapse to flat lines. REPLAY_CMD+=" --slice-duration 1.0" - REPLAY_CMD+=" --output-artifact-dir $result_dir/trace_replay" + REPLAY_CMD+=" --output-artifact-dir $result_dir/aiperf_artifacts" # The inferencex-agentx-mvp scenario enforces a 900s minimum # benchmark duration. For smoke tests with shorter durations, opt # into --unsafe-override (the run's submission_valid will be flagged @@ -1038,3 +1072,27 @@ write_agentic_result_json() { # missing in a stripped-down image). The agg JSON is the success gate. 
python3 "$INFMAX_CONTAINER_WORKSPACE/utils/generate_aiperf_plots.py" "$result_dir" 2>&1 || true } + +run_agentic_replay_and_write_outputs() { + local result_dir="$1" + local replay_rc + + echo "$REPLAY_CMD" > "$result_dir/benchmark_command.txt" + + set +e + set -x + $REPLAY_CMD 2>&1 | tee "$result_dir/benchmark.log" + replay_rc=${PIPESTATUS[0]} + set +x + set -e + + write_agentic_result_json "$result_dir" + + python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$result_dir/aiperf_artifacts" -o "$result_dir" 2>&1 || true + + if [ "$replay_rc" -ne 0 ]; then + echo "ERROR: agentic trace replay exited with code $replay_rc after writing available results" >&2 + return "$replay_rc" + fi +} diff --git a/benchmarks/multi_node/agentic_srt.sh b/benchmarks/multi_node/agentic_srt.sh index 2be99bf58..a0e9e243c 100644 --- a/benchmarks/multi_node/agentic_srt.sh +++ b/benchmarks/multi_node/agentic_srt.sh @@ -9,14 +9,9 @@ set -x INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/infmax-workspace}" source "$INFMAX_CONTAINER_WORKSPACE/benchmarks/benchmark_lib.sh" -check_env_vars MODEL MODEL_PREFIX FRAMEWORK PRECISION CONC RESULT_FILENAME +check_env_vars MODEL MODEL_PREFIX FRAMEWORK PRECISION CONC RESULT_FILENAME DURATION -PORT="${PORT:-8000}" RESULT_DIR="${RESULT_DIR:-/logs/agentic}" -DURATION="${DURATION:-1800}" -MAX_DELAY="${MAX_DELAY:-60}" -ADVANCE_MIN="${ADVANCE_MIN:-0.0}" -ADVANCE_MAX="${ADVANCE_MAX:-0.7}" mkdir -p "$RESULT_DIR" @@ -24,18 +19,4 @@ resolve_trace_source install_agentic_deps build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set +e -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" -REPLAY_RC=${PIPESTATUS[0]} -set -e - -write_agentic_result_json "$RESULT_DIR" - -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true - -if [ "$REPLAY_RC" -ne 0 ]; then - echo "WARNING: agentic trace replay exited with code $REPLAY_RC after 
writing available results" >&2 -fi +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml new file mode 100644 index 000000000..fb7b9fd97 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml @@ -0,0 +1,177 @@ +name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4-agentic" + +# Agentic-coding variant of vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml. +# Topology is identical (1 prefill DEP=4 + 6 decode TP=4, 28 GPUs across 7 +# GB300 nodes + 1 dedicated NATS/etcd infra node) so we can compare against +# the fixed-seq-len 1p6d baseline at the same concurrency point (192). +# +# Divergence vs the 8k1k sibling: +# - benchmark.type: sa-bench -> custom (hands off to agentic_srt.sh) +# - max-model-len: removed (let vLLM derive from model config; agentic +# trajectories blow past any small explicit cap) +# - no-enable-prefix-caching: dropped (prefix caching MUST be on for +# trajectory reuse — entire point of agentic) +# Note: --enable-auto-tool-choice / --tool-call-parser / --reasoning-parser +# are NOT set on the worker. The dynamo-vllm worker entrypoint doesn't +# accept them (different arg parser than `vllm serve`). In disagg, chat +# parsing happens at the dynamo frontend, not at the worker. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.21.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 6 + prefill_workers: 1 + decode_workers: 6 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +infra: + etcd_nats_dedicated_node: true + # Raise NATS server max_payload from the 1 MiB default to 32 MiB. + # Agentic prompts at 50k-200k DSv4 tokens serialize to JSON at ~10-15 + # bytes/token, easily clearing 1-3 MB per request. Without this, every + # long-prompt prefill RPC gets rejected by the NATS server with + # "maximum payload exceeded" (visible in infra.out), and the dynamo + # frontend surfaces a misleading "NATS request ... deadline has elapsed" + # (it never gets a reply because the publish was rejected). 32 MiB gives + # ~10x headroom over the largest observed payload (3.2 MB) and stays + # under NATS's 64 MiB hard cap (but above Dynamo's 16 MiB advisory limit). 
+ nats_max_payload_mb: 32 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + enforce-eager: true + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +# sbatch + srun resource grants for 
clusters without per-GPU defaults. +# +# mem=0: allocate all available node memory (~868 GB on CW gb300). Without +# this, sbatch only requests ntasks × DefMemPerCPU = 8 × 4 GB = 32 GB for +# the whole job and worker cgroups OOM-kill mid model load (R7-R11 hit +# this; sacct showed AllocTRES mem=4G per step). +# +# cpus-per-task=72: give each task one CW gb300 NUMA socket (144 cores +# split 2 × 72). Critical for the *infra step* (etcd + nats) which +# srtctl spawns without --gres=gpu — on CW that means DefMemPerCPU +# applies and the step gets 1 CPU by default. With 24 dynamo DP ranks +# all hammering etcd for lease keep-alives, single-CPU etcd can't keep +# up and dies (R12 hit this; etcd reported max-cpu-set=1, leases +# deadline-exceeded, infra SIGKILL'd at 16:35:49). 72 CPUs is plenty +# for both etcd + nats AND for vLLM worker auxiliary threads. +# +# nv gb300 doesn't need this because cluster default DefCpuPerGPU=35 +# auto-allocates 4*35=140 CPUs per GPU-bearing task; cw has no per-GPU +# default. Setting it here is safe on both because the value is ≤ node +# CPU count. +# +# srun_options.mem=0 forces each srun step to use the full node memory +# (without it, srun steps default back to cpus_per_task × DefMemPerCPU). +# Docs: docs/config-reference.md#sbatch_directives + #srun_options. +sbatch_directives: + mem: "0" + cpus-per-task: "72" +srun_options: + mem: "0" + # gb300-nv: pyxis maps the calling user (sa-shared) into the container as + # uid 345200007. dpkg refuses to run without EUID 0 even though + # ENROOT_ROOTFS_WRITABLE=1 makes the rootfs writable, so the agentic_srt + # apt-get install git step fails. --container-remap-root asks pyxis to + # remap us to uid 0 inside the container, matching the gb300-cw behavior. + # No-op on cw (already root). srt-slurm renders empty-string values as + # flag-only srun args (see core/slurm.py:250). 
+ container-remap-root: "" + +benchmark: + type: custom + command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh + env: + INFMAX_CONTAINER_WORKSPACE: /infmax-workspace + RESULT_DIR: /logs/agentic + PORT: "8000" + IS_MULTINODE: "true" + # Container-side path of the aiperf mmap dataset cache; the host-side + # mount is wired via launch_gb300-*.sh's srtslurm.yaml default_mounts. + # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files + # per dataset on every run. + AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-keepalive.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-keepalive.yaml new file mode 100644 index 000000000..f1bd9b1e9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-keepalive.yaml @@ -0,0 +1,136 @@ +name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4-keepalive" + +# Keepalive variant of disagg-gb300-1p6d-dep4-tp4-agentic.yaml: same +# server topology (1P + 6D = 7 vLLM workers + 1 NATS/etcd infra node) +# but the benchmark stage uses srt-slurm's `manual` mode instead of +# agentic_srt.sh. Brings up the server and parks the orchestrator so +# you can hammer aiperf from outside without competing with the +# launcher's own aiperf invocation. +# +# Usage: +# cd +# srtctl apply --no-preflight -f \ +# recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-keepalive.yaml +# tail -F outputs//logs/sweep_.log +# # wait for "Model is ready. Have 4 prefills and 6 decodes." 
+# # then run aiperf against http://:8000 from anywhere +# # tear down: scancel + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.21.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 6 + prefill_workers: 1 + decode_workers: 6 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +infra: + etcd_nats_dedicated_node: true + # See sibling 1p6d agentic recipe for rationale — NATS 1 MiB default + # rejects long agentic prompts; 32 MiB gives ~10x headroom. + nats_max_payload_mb: 32 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + enforce-eager: true + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + 
decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +sbatch_directives: + mem: "0" + cpus-per-task: "72" +srun_options: + mem: "0" + container-remap-root: "" + +# THIS IS THE KEY DIFF vs the agentic sibling: use srt-slurm's +# first-class `manual` benchmark mode instead of spawning agentic_srt.sh. +# In manual mode, BenchmarkStageMixin.run_benchmark() (see +# src/srtctl/cli/mixins/benchmark_stage.py:131-141) brings up workers +# + frontend, logs "Frontend URL: http://:8000", then sleeps +# in a 5s health-check loop waiting only for worker failures or +# scancel/Ctrl+C. External clients (your aiperf shell) drive the +# server directly. No competing benchmark container, no sleep hack. +benchmark: + type: manual diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml new file mode 100644 index 000000000..bb8fc6df8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml @@ -0,0 +1,176 @@ +name: "svf-vllm-disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic" + +# Agentic-coding variant of vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml. 
+# Max-throughput shape: 4 prefill (DEP=4 each) + 1 decode (DEP=8). 6 GB300 +# nodes (4P + 2D = 24 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra +# node. Sized for concurrency 4096 with deep_gemm_mega_moe on both workers. +# +# Divergence vs the 8k1k sibling: +# - benchmark.type: sa-bench -> custom (hands off to agentic_srt.sh) +# - max-model-len: removed (let vLLM derive from model config; agentic +# trajectories blow past any small explicit cap) +# - no-enable-prefix-caching: dropped (prefix caching MUST be on for +# trajectory reuse — entire point of agentic) +# Note: --enable-auto-tool-choice / --tool-call-parser / --reasoning-parser +# are NOT set on the worker. The dynamo-vllm worker entrypoint doesn't +# accept them (different arg parser than `vllm serve`). In disagg, chat +# parsing happens at the dynamo frontend, not at the worker. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.21.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + # See sibling 1p6d recipe for rationale — NATS 1 MiB default rejects + # agentic prompts; 32 MiB gives ~10x headroom over observed payloads. 
+ nats_max_payload_mb: 32 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: 
"deep_gemm_mega_moe" + +# sbatch + srun resource grants for clusters without per-GPU defaults. +# +# mem=0: allocate all available node memory (~868 GB on CW gb300). Without +# this, sbatch only requests ntasks × DefMemPerCPU = 8 × 4 GB = 32 GB for +# the whole job and worker cgroups OOM-kill mid model load (R7-R11 hit +# this; sacct showed AllocTRES mem=4G per step). +# +# cpus-per-task=72: give each task one CW gb300 NUMA socket (144 cores +# split 2 × 72). Critical for the *infra step* (etcd + nats) which +# srtctl spawns without --gres=gpu — on CW that means DefMemPerCPU +# applies and the step gets 1 CPU by default. With 24 dynamo DP ranks +# all hammering etcd for lease keep-alives, single-CPU etcd can't keep +# up and dies (R12 hit this; etcd reported max-cpu-set=1, leases +# deadline-exceeded, infra SIGKILL'd at 16:35:49). 72 CPUs is plenty +# for both etcd + nats AND for vLLM worker auxiliary threads. +# +# nv gb300 doesn't need this because cluster default DefCpuPerGPU=35 +# auto-allocates 4*35=140 CPUs per GPU-bearing task; cw has no per-GPU +# default. Setting it here is safe on both because the value is ≤ node +# CPU count. +# +# srun_options.mem=0 forces each srun step to use the full node memory +# (without it, srun steps default back to cpus_per_task × DefMemPerCPU). +# Docs: docs/config-reference.md#sbatch_directives + #srun_options. +sbatch_directives: + mem: "0" + cpus-per-task: "72" +srun_options: + mem: "0" + # gb300-nv: pyxis maps the calling user (sa-shared) into the container as + # uid 345200007. dpkg refuses to run without EUID 0 even though + # ENROOT_ROOTFS_WRITABLE=1 makes the rootfs writable, so the agentic_srt + # apt-get install git step fails. --container-remap-root asks pyxis to + # remap us to uid 0 inside the container, matching the gb300-cw behavior. + # No-op on cw (already root). srt-slurm renders empty-string values as + # flag-only srun args (see core/slurm.py:250). 
+ container-remap-root: "" + +benchmark: + type: custom + command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh + env: + INFMAX_CONTAINER_WORKSPACE: /infmax-workspace + RESULT_DIR: /logs/agentic + PORT: "8000" + IS_MULTINODE: "true" + # Container-side path of the aiperf mmap dataset cache; the host-side + # mount is wired via launch_gb300-*.sh's srtslurm.yaml default_mounts. + # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files + # per dataset on every run. + AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh index af275e6ef..f9955adc7 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR +check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-5} if [[ -n "${SLURM_JOB_ID:-}" ]]; then @@ -67,14 +61,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh 
index f7c7f9ca1..ff76b768d 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR +check_env_vars MODEL TP CONC RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -60,14 +55,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 03dee8dd0..108347479 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -19,18 +19,17 @@ set -x # # Required env vars: # MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR +# +# OFFLOADING values: +# none - vLLM GPU KV only, with DSv4 hybrid KV manager enabled. +# cpu - vLLM native OffloadingConnector, with hybrid KV manager enabled. +# lmcache-mp - Temporarily disabled for DSv4. LMCache PR #3261 must merge +# first so LMCacheMPConnector can support HMA block-id tuples. 
source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} -DP_ATTENTION=${DP_ATTENTION:-false} if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=1000000 fi @@ -51,45 +50,145 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600 # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" +LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" mkdir -p "$RESULT_DIR" -OFFLOAD_ARGS="" +OFFLOAD_ARGS=() +HYBRID_KV_ARGS=(--no-disable-hybrid-kv-cache-manager) +LMCACHE_PID="" + +cleanup_lmcache_server() { + if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then + kill "$LMCACHE_PID" 2>/dev/null || true + wait "$LMCACHE_PID" 2>/dev/null || true + fi +} + +trap cleanup_lmcache_server EXIT + +wait_for_lmcache_ready() { + { set +x; } 2>/dev/null + local attempts="${LMCACHE_READY_ATTEMPTS:-120}" + local tail_pid="" + + while [ ! -f "$LMCACHE_LOG" ]; do + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before creating log file. Exiting." >&2 + exit 1 + fi + sleep 1 + done + + tail -f -n +1 "$LMCACHE_LOG" & + tail_pid=$! + + for ((i = 1; i <= attempts; i++)); do + if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + return 0 + fi + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before becoming healthy. 
Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 + fi + sleep 1 + done + + echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 +} + case "$OFFLOADING" in none) ;; cpu) # b200-dgxc compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits - # individual jobs to a fraction of that. Aim for ~1.5 TB total host - # CPU pool across the engine(s). + # individual jobs to a fraction of that. Aim for ~1.2 TB total native + # CPU offload pool across the engine(s); previously 2.8 TB but every + # DP-attn worker stalled for 4+ min during pinned-CPU-tensor allocation + # and the shm_broadcast watchdog killed them (run 26246044726). 150 GB + # per worker (1.2 TB / 8) completes the alloc within the 60 s window. # - # SimpleCPUOffloadConnector divides cpu_bytes_to_use by - # parallel_config.world_size (= TP*PP, NOT including DP — see - # vllm/config/parallel.py and parallel.py docstrings). So: - # - DP-attn=true → each of $TP DP engines has world_size=1 in - # its parallel_config; the connector does no internal divide, - # and each engine torch.zeros + pin_tensor allocates the full - # --kv_offloading_size value. Pre-divide by $TP here so the - # aggregate host commit ≈ TOTAL_CPU_DRAM_GB. - # - DP-attn=false → single engine with world_size=TP. Pass the - # full TOTAL_CPU_DRAM_GB; the connector's internal divide - # yields TOTAL/TP per rank, and TP-shared mmap (PR #37206) - # keeps the aggregate at TOTAL. - TOTAL_CPU_DRAM_GB=1500 + # Native --kv-offloading-size becomes OffloadingConnector's + # cpu_bytes_to_use. For DP-attn there are $TP independent DP engines, + # so pre-divide to keep aggregate host commit near TOTAL_CPU_DRAM_GB. + # For pure TP, vLLM treats the size as the total across TP ranks. 
+ TOTAL_CPU_DRAM_GB=1200 if [ "$DP_ATTENTION" = "true" ]; then PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP)) else PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB fi - PER_ENGINE_BYTES=$((PER_ENGINE_GB * 1024 * 1024 * 1024)) - # Use --kv-transfer-config JSON to also pass lazy_offload=true. Eager - # mode (default) hits an AssertionError in - # vllm/v1/core/kv_cache_utils.py:269 popleft_n at low/mid CONC; lazy - # mode defers the store path and clears low/mid CONC at 80-100%. - # See SimpleCPUOffloadConnector PR #37160 for the lazy_offload knob. - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":true}}" + unset VLLM_USE_SIMPLE_KV_OFFLOAD + OFFLOAD_ARGS=( + --kv-offloading-backend native + --kv-offloading-size "$PER_ENGINE_GB" + ) + ;; + lmcache-mp) + { set +x; } 2>/dev/null + # LMCacheMPConnector needs HMA support before it can run DSv4 with the + # hybrid KV manager. Re-enable this path after + # https://github.com/LMCache/LMCache/pull/3261 is merged. + echo "Error: OFFLOADING=lmcache-mp is disabled for DSv4 until LMCache PR #3261 adds HMA support." >&2 + exit 1 + + # LMCache docs recommend MP mode for production: start an external + # `lmcache server`, then point vLLM's LMCacheMPConnector at it. For + # vLLM >= 0.20, prefer the LMCache-shipped connector module because it + # tracks the latest server protocol ahead of vLLM's vendored copy. + # + # Important DSv4 caveat: LMCacheMPConnector currently only accepts the + # non-hybrid KV block layout. The connector raises if vLLM returns the + # hybrid block-id tuple used by the CSA/HCA hybrid KV manager. This + # mode therefore disables the hybrid manager; `none` and `cpu` keep it + # enabled for the normal B200 DSv4 path. 
+ agentic_pip_install --quiet --no-cache-dir lmcache + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null + + TOTAL_CPU_DRAM_GB=2800 + LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" + LMCACHE_PORT="${LMCACHE_PORT:-5555}" + LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" + LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-200}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" + + echo "Starting LMCache MP server..." + LMCACHE_CMD=( + lmcache server + --host "$LMCACHE_HOST" + --port "$LMCACHE_PORT" + --http-host "$LMCACHE_HOST" + --http-port "$LMCACHE_HTTP_PORT" + --l1-size-gb "$LMCACHE_L1_SIZE_GB" + --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" + --chunk-size "$LMCACHE_CHUNK_SIZE" + --max-workers "$LMCACHE_MAX_WORKERS" + --eviction-policy LRU + ) + printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" + printf '\n' >> "$RESULT_DIR/lmcache_command.txt" + "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & + LMCACHE_PID=$! 
+ echo "LMCache server PID: $LMCACHE_PID" + wait_for_lmcache_ready + + HYBRID_KV_ARGS=(--disable-hybrid-kv-cache-manager) + OFFLOAD_ARGS=( + --kv-transfer-config + "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + ) ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache-mp)" >&2 exit 1 ;; esac @@ -120,25 +219,31 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve "$MODEL" \ ---host 0.0.0.0 \ ---port "$PORT" \ ---trust-remote-code \ ---kv-cache-dtype fp8 \ ---block-size 256 \ -"${PARALLEL_ARGS[@]}" \ -"${EP_ARGS[@]}" \ ---compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \ ---attention_config.use_fp4_indexer_cache=True \ ---tokenizer-mode deepseek_v4 \ ---tool-call-parser deepseek_v4 \ ---enable-auto-tool-choice \ ---reasoning-parser deepseek_v4 \ ---enable-prefix-caching \ ---no-disable-hybrid-kv-cache-manager \ ---max-model-len "$MAX_MODEL_LEN" \ ---max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS" \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --trust-remote-code + --kv-cache-dtype fp8 + --block-size 256 + "${PARALLEL_ARGS[@]}" + "${EP_ARGS[@]}" + --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + --attention_config.use_fp4_indexer_cache=True + --tokenizer-mode deepseek_v4 + --tool-call-parser deepseek_v4 + --enable-auto-tool-choice + --reasoning-parser deepseek_v4 + --enable-prefix-caching + "${HYBRID_KV_ARGS[@]}" + --max-model-len "$MAX_MODEL_LEN" + --max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS" + "${OFFLOAD_ARGS[@]}" +) 
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" @@ -147,14 +252,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index e21b31e7a..f6748a5f8 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -22,15 +22,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} -DP_ATTENTION=${DP_ATTENTION:-false} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=1000000 fi @@ -147,14 +140,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee 
"$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh new file mode 100755 index 000000000..99aec25fe --- /dev/null +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on MI355X using SGLang. +# Adapted from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh (fixed-seq-len +# sibling) with the agentic harness (build_replay_cmd / write_agentic_result_json +# / analyze_benchmark_distributions) swapped in for run_benchmark_serving. +# +# This launcher does NOT support CPU offload. SGLang's KV offload paths are +# different from vLLM's SimpleCPUOffloadConnector, and the matching agentic +# config (dsv4-fp4-mi355x-sglang-agentic) only sweeps offloading=none. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=1000000 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ROCR/HIP visibility under slurm cgroups. 
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# Reject anything other than none: this launcher has no SGLang CPU-offload +# wiring (different surface than vLLM's SimpleCPUOffloadConnector). +case "$OFFLOADING" in + none) ;; + *) + echo "Error: dsv4_fp4_mi355x_sglang.sh only supports OFFLOADING=none (got '$OFFLOADING')" >&2 + exit 1 + ;; +esac + +# Transformers in the container doesn't recognize the `deepseek_v4` model_type. +# PR #23608's fallback in hf_transformers_utils.get_config tries to handle this +# by writing a patched config to /tmp, but in practice isn't catching the error +# in this image. Patch the cached config.json directly instead: set model_type +# to `deepseek_v3` so AutoConfig.from_pretrained succeeds, and keep +# architectures=['DeepseekV4ForCausalLM'] so SGLang dispatches to its native +# DSv4 model class (python/sglang/srt/models/deepseek_v4.py). +python3 << PYEOF +import json +from huggingface_hub import hf_hub_download +path = hf_hub_download(repo_id="$MODEL", filename="config.json") +with open(path) as f: + config = json.load(f) +if config.get("model_type") == "deepseek_v4": + config["model_type"] = "deepseek_v3" + with open(path, "w") as f: + json.dump(config, f, indent=2) + print(f"Patched {path}: model_type deepseek_v4 -> deepseek_v3") +else: + print(f"No patch needed: model_type is {config.get('model_type')!r}") +PYEOF + +# DSv4 FP4-experts path. 
Mirrors the env block in the fixed-seq-len sibling +# (benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh), which tracks the active +# block in python/run_dsv4.sh on the amd/deepseek_v4 branch: +# SGLANG_DSV4_FP4_EXPERTS=True -> route experts through FP4 kernels +# SGLANG_FORCE_TRITON_MOE_FP8=0 -> dispatch MoE through aiter and apply +# the swiglu_limit clamp in the triton +# MoE fallback path. +export SGLANG_REASONING_EFFORT=max +export SGLANG_OPT_USE_FUSED_COMPRESS=true +export SGLANG_OPT_USE_OLD_COMPRESSOR=true +export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false +export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false +export SGLANG_OPT_USE_FUSED_HASH_TOPK=false +export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false +export SGLANG_OPT_USE_TILELANG_MHC_PRE=false +export SGLANG_OPT_USE_TILELANG_MHC_POST=false +export SGLANG_OPT_USE_AITER_MHC_PRE=true +export SGLANG_OPT_USE_AITER_MHC_POST=true +export SGLANG_ENABLE_THINKING=1 +export SGLANG_USE_AITER=1 +export SGLANG_USE_ROCM700A=1 +export SGLANG_TOPK_TRANSFORM_512_TORCH=0 +export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 +export SGLANG_DSV4_FP4_EXPERTS=True +export SGLANG_OPT_DPSK_V4_RADIX=0 +export SGLANG_OPT_USE_OVERLAP_STORE_CACHE=false +export SGLANG_OPT_USE_FUSED_STORE_CACHE=false +export SGLANG_FORCE_TRITON_MOE_FP8=0 +export SGLANG_HACK_FLASHMLA_BACKEND=tilelang +export SGLANG_OPT_USE_TILELANG_INDEXER=true +export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +# Parallelism: pure TP, TP+EP, or DEP (DP-attn + EP). Matches the dsv4 b200 +# vllm agentic launcher so the agentic sweep can probe both interactivity and +# throughput regimes. 
+PARALLEL_ARGS=(--tensor-parallel-size "$TP") +if [ "$DP_ATTENTION" = "true" ]; then + PARALLEL_ARGS+=( + --dp "$TP" + --enable-dp-attention + --enable-prefill-delayer + ) +fi +if [ "${EP_SIZE:-1}" -gt 1 ]; then + PARALLEL_ARGS+=(--ep-size "$EP_SIZE") +fi + +# --max-running-requests is per-engine. With DP-attn each DP engine handles +# only CONC/$TP sequences in steady state (the agentic harness load-balances +# users across DP ranks), so size the per-engine cap to that. +# Pure TP is a single engine and sees all CONC sequences itself. +if [ "$DP_ATTENTION" = "true" ]; then + PER_ENGINE_MAX_RUNNING=$(( CONC / TP )) + [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 +else + PER_ENGINE_MAX_RUNNING=$CONC +fi + +echo "Starting sglang server..." +python3 -m sglang.launch_server \ + --model-path "$MODEL" \ + --host=0.0.0.0 \ + --port "$PORT" \ + "${PARALLEL_ARGS[@]}" \ + --trust-remote-code \ + --attention-backend compressed \ + --max-running-requests "$PER_ENGINE_MAX_RUNNING" \ + --cuda-graph-max-bs "$PER_ENGINE_MAX_RUNNING" \ + --page-size 256 \ + --context-length "$MAX_MODEL_LEN" \ + --chunked-prefill-size 8192 \ + --disable-shared-experts-fusion \ + --tool-call-parser deepseekv4 \ + --reasoning-parser deepseek-v4 \ + --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \ + --watchdog-timeout 1800 > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh index 8049c1082..0a0177983 100755 --- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh @@ -11,13 +11,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR +check_env_vars MODEL TP CONC RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=800000 fi @@ -71,14 +66,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh index 6795086a3..500b456f5 100755 --- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR +check_env_vars 
MODEL TP CONC RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -63,7 +58,6 @@ python3 -m sglang.launch_server \ --nsa-decode-backend tilelang \ --kv-cache-dtype fp8_e4m3 \ --tokenizer-worker-num $((TP*2)) \ - --disable-radix-cache \ --enable-metrics > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" @@ -73,14 +67,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh index 91c289d7c..259c19586 100755 --- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR +check_env_vars MODEL TP CONC RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -65,7 +60,6 @@ python3 -m sglang.launch_server \ --chunked-prefill-size 32768 \ --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion \ 
---disable-radix-cache \ --stream-interval 30 \ --context-length $MAX_MODEL_LEN \ --enable-metrics \ @@ -78,14 +72,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh index 284bf3be2..6e921db58 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -74,14 +69,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 
"$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh index dce4f4250..557986b0d 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} # Agentic matrix entries don't set max-model-len, so the workflow passes 0. # ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then @@ -78,14 +73,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh index c8050fe12..1592a8d5c 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} # Agentic matrix entries don't set max-model-len, so the workflow passes 0. # ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then @@ -78,14 +73,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh index 962210577..eb1883ff1 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} # Agentic matrix entries don't set max-model-len, so the workflow passes 0. # ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. 
if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then @@ -91,14 +86,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh index 38ccac035..99e29c819 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} # Agentic matrix entries don't set max-model-len, so the workflow passes 0. # ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. 
if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then @@ -90,14 +85,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index a1c95f64a..ad0b4495a 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -5,17 +5,17 @@ set -x # Agentic trace replay benchmark for Kimi-K2.5 NVFP4 on B200 using vLLM. # # Required env vars: -# MODEL, TP, CONC, RESULT_DIR +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR +# +# OFFLOADING values: +# none - vLLM GPU KV only. +# cpu - vLLM native simple CPU offload. +# lmcache - LMCache MP server + vLLM LMCacheMPConnector. 
source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -30,9 +30,61 @@ install_agentic_deps # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" +LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" mkdir -p "$RESULT_DIR" -OFFLOAD_ARGS="" +OFFLOAD_ARGS=() +PREFIX_CACHE_ARGS=() +LMCACHE_PID="" + +cleanup_lmcache_server() { + if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then + kill "$LMCACHE_PID" 2>/dev/null || true + wait "$LMCACHE_PID" 2>/dev/null || true + fi +} + +trap cleanup_lmcache_server EXIT + +wait_for_lmcache_ready() { + { set +x; } 2>/dev/null + local attempts="${LMCACHE_READY_ATTEMPTS:-120}" + local tail_pid="" + + while [ ! -f "$LMCACHE_LOG" ]; do + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before creating log file. Exiting." >&2 + exit 1 + fi + sleep 1 + done + + tail -f -n +1 "$LMCACHE_LOG" & + tail_pid=$! + + for ((i = 1; i <= attempts; i++)); do + if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + return 0 + fi + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before becoming healthy. Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 + fi + sleep 1 + done + + echo "Timed out waiting for LMCache server healthcheck. 
Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 +} + case "$OFFLOADING" in none) ;; @@ -44,10 +96,70 @@ case "$OFFLOADING" in # the full eager sweep before. TOTAL_CPU_DRAM_GB=2500 export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + lmcache) + { set +x; } 2>/dev/null + unset VLLM_USE_SIMPLE_KV_OFFLOAD + + agentic_pip_install --quiet --no-cache-dir lmcache + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null + + # Keep the semantic CPU KV pool at 2.5 TB for every TP shape. MP mode + # owns that pool in the external LMCache server instead of passing + # --kv-offloading-size through vLLM's integrated LMCache convenience + # path, which divides the value by TP and then hits a large single-shot + # cudaHostAlloc in LMCache 0.4.5's single-process local CPU backend. + TOTAL_CPU_DRAM_GB=2500 + LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" + LMCACHE_PORT="${LMCACHE_PORT:-5555}" + LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" + # LMCacheMPConnector builds its ZMQ endpoint by concatenating + # lmcache.mp.host and lmcache.mp.port, and its default host already + # includes the tcp:// scheme. Keep the server bind host raw, but pass + # a ZMQ-style host string to the connector. + LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" + # Initial allocation is deliberately small; --l1-size-gb above is the + # actual pool capacity and grows lazily as the run fills the cache. 
+ LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" + export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + + echo "Starting LMCache MP server..." + LMCACHE_CMD=( + lmcache server + --host "$LMCACHE_HOST" + --port "$LMCACHE_PORT" + --http-host "$LMCACHE_HOST" + --http-port "$LMCACHE_HTTP_PORT" + --l1-size-gb "$LMCACHE_L1_SIZE_GB" + --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" + --chunk-size "$LMCACHE_CHUNK_SIZE" + --max-workers "$LMCACHE_MAX_WORKERS" + --eviction-policy LRU + ) + printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" + printf '\n' >> "$RESULT_DIR/lmcache_command.txt" + "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & + LMCACHE_PID=$! + echo "LMCache server PID: $LMCACHE_PID" + wait_for_lmcache_ready + + PREFIX_CACHE_ARGS=(--enable-prefix-caching) + OFFLOAD_ARGS=( + --kv-transfer-config + "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + --disable-hybrid-kv-cache-manager + ) ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 exit 1 ;; esac @@ -64,20 +176,27 @@ export PYTHONNOUSERSITE=1 # unsafe. 
export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 -vllm serve $MODEL \ ---host 0.0.0.0 \ ---port $PORT \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.90 \ ---max-num-seqs $CONC \ ---reasoning-parser kimi_k2 \ ---tool-call-parser kimi_k2 \ ---compilation_config.pass_config.fuse_allreduce_rms true \ ---kv-cache-dtype fp8 \ ---max-cudagraph-capture-size 2048 \ ---stream-interval 20 \ ---trust-remote-code \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + --gpu-memory-utilization 0.90 + --max-num-seqs "$CONC" + --reasoning-parser kimi_k2 + --tool-call-parser kimi_k2 + --compilation_config.pass_config.fuse_allreduce_rms true + --kv-cache-dtype fp8 + --max-cudagraph-capture-size 2048 + --stream-interval 20 + --trust-remote-code + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! 
echo "Server PID: $SERVER_PID" @@ -86,14 +205,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh index d5975b1c4..8cebe4f20 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh @@ -6,16 +6,16 @@ set -x # # Required env vars: # MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR +# +# OFFLOADING values: +# none - vLLM GPU KV only. +# cpu - vLLM native simple CPU offload. +# lmcache - in-process LMCacheConnectorV1 via vLLM's lmcache backend. source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -32,7 +32,9 @@ install_agentic_deps SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" -OFFLOAD_ARGS="" +OFFLOAD_ARGS=() +PREFIX_CACHE_ARGS=() + case "$OFFLOADING" in none) ;; cpu) @@ -43,28 +45,65 @@ case "$OFFLOADING" in # inside the cgroup for vLLM worker RSS + page cache. 
TOTAL_CPU_DRAM_GB=2500 export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + lmcache) + { set +x; } 2>/dev/null + unset VLLM_USE_SIMPLE_KV_OFFLOAD + + agentic_pip_install --quiet --no-cache-dir lmcache + python3 -c "import lmcache.integration.vllm.vllm_v1_adapter" >/dev/null + + # B300 NV nodes expose ~2.82 TiB to the job cgroup. Keep the LMCache + # CPU pool at 2.5 TB to match the native offload envelope while leaving + # headroom for vLLM workers and page cache. vLLM divides this total + # across TP ranks for --kv-offloading-backend=lmcache. + TOTAL_CPU_DRAM_GB=2500 + export LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + # Avoid pinning the full 2.5 TB during engine startup. LMCache grows + # the CPU allocator as agentic prefixes accumulate in the replay. + export LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR="${LMCACHE_ENABLE_LAZY_MEMORY_ALLOCATOR:-true}" + export LMCACHE_LAZY_MEMORY_INITIAL_RATIO="${LMCACHE_LAZY_MEMORY_INITIAL_RATIO:-0.01}" + export LMCACHE_LAZY_MEMORY_STEP_RATIO="${LMCACHE_LAZY_MEMORY_STEP_RATIO:-0.02}" + + PREFIX_CACHE_ARGS=(--enable-prefix-caching) + OFFLOAD_ARGS=( + --kv-offloading-backend lmcache + --kv-offloading-size "$TOTAL_CPU_DRAM_GB" + --disable-hybrid-kv-cache-manager + ) ;; - *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; + *) echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2; exit 1 ;; esac echo "Starting vllm server..." 
export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ ---host 0.0.0.0 \ ---port $PORT \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.90 \ ---max-num-seqs $CONC \ ---reasoning-parser kimi_k2 \ ---tool-call-parser kimi_k2 \ ---compilation_config.pass_config.fuse_allreduce_rms true \ ---kv-cache-dtype fp8 \ ---max-cudagraph-capture-size 2048 \ ---stream-interval 20 \ ---trust-remote-code \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + --gpu-memory-utilization 0.90 + --max-num-seqs "$CONC" + --reasoning-parser kimi_k2 + --tool-call-parser kimi_k2 + --compilation_config.pass_config.fuse_allreduce_rms true + --kv-cache-dtype fp8 + --max-cudagraph-capture-size 2048 + --stream-interval 20 + --trust-remote-code + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! 
echo "Server PID: $SERVER_PID" @@ -73,14 +112,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index c72076118..fd0ce3677 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -5,18 +5,24 @@ set -x # Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM. # # Required env vars: -# MODEL, TP, CONC, RESULT_DIR +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - vLLM GPU KV only. +# cpu - vLLM native CPU offload. +# lmcache - LMCache MP server + vLLM LMCacheMPConnector. source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} +# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0. +# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this +# script we need the concrete value so AgentX filters prompt+max_tokens against +# the same limit vLLM enforces. 
+if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then + MAX_MODEL_LEN=262144 +fi if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -43,6 +49,522 @@ if [ "${TP}" -lt 8 ]; then export VLLM_ROCM_USE_AITER_RMSNORM=0 fi +write_lmcache_rocm_mp_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/sitecustomize.py" <<'PY' +"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" + +import os +import threading + +if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": + import builtins + import sys + + _orig_import = builtins.__import__ + + def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: + _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator + + if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): + return + + _orig_init = _LazyMemoryAllocator.__init__ + _orig_allocate = _LazyMemoryAllocator.allocate + _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate + + def _expand_to(self, target_size: int) -> None: + target_size = min( + self._final_size, + _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), + ) + lock = self._agentic_rocm_demand_expand_lock + with lock: + if target_size <= self._curr_size: + return + + start_size = self._curr_size + while self._curr_size < target_size: + commit_start = self._curr_size + commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) + while self._curr_size < commit_target: + self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) + self._curr_size += self.PIN_CHUNK_SIZE + self._commit_expansion(self._curr_size - commit_start) + + self._log_expansion_progress(self._curr_size - start_size) + + def _retry_with_demand_expansion(self, allocate_once): + obj = allocate_once() + step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) + step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) + + while obj is None and 
self._curr_size < self._final_size: + _expand_to(self, self._curr_size + step_bytes) + obj = allocate_once() + + return obj + + def _patched_init(self, *args, **kwargs): + _orig_init(self, *args, **kwargs) + self._agentic_rocm_demand_expand_lock = threading.Lock() + + # LMCache MP's upstream LazyMemoryAllocator currently expands to + # the final pinned size in a background thread. On ROCm Kimi TP4, + # vLLM reaches KV-cache registration only after that 2.5 TB pool + # is fully pinned, and the server-side IPC open path can stall + # before acknowledging register_kv_caches. Keep the same final + # capacity, but pin/commit extra host memory only when L1 + # allocations actually need it. + self._stop_expand.set() + self._expand_thread.join() + _lazy_memory_allocator.logger.info( + "Agentic ROCm patch: using demand-driven LMCache pinned " + "memory expansion; final capacity remains %s MB", + self._final_size >> 20, + ) + + def _patched_allocate( + self, + shapes, + dtypes, + fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, + allocator_type=None, + ): + return _retry_with_demand_expansion( + self, + lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), + ) + + def _patched_batched_allocate( + self, + shapes, + dtypes, + batch_size, + fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, + allocator_type=None, + ): + return _retry_with_demand_expansion( + self, + lambda: _orig_batched_allocate( + self, shapes, dtypes, batch_size, fmt, allocator_type + ), + ) + + _LazyMemoryAllocator.__init__ = _patched_init + _LazyMemoryAllocator.allocate = _patched_allocate + _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate + _LazyMemoryAllocator._agentic_rocm_demand_patch = True + + def _patch_l1_memory_manager(_memory_manager) -> None: + _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) + _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) + if _L1MemoryManager is None or _LazyMemoryAllocator is None: + return + 
if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): + return + + _orig_get_memory_usage = _L1MemoryManager.get_memory_usage + + def _patched_get_memory_usage(self): + allocator = getattr(self, "_allocator", None) + if isinstance(allocator, _LazyMemoryAllocator): + address_manager = allocator.get_address_manager() + used_size = ( + address_manager.get_heap_size() - address_manager.get_free_size() + ) + return used_size, allocator._final_size + return _orig_get_memory_usage(self) + + _L1MemoryManager.get_memory_usage = _patched_get_memory_usage + _L1MemoryManager._agentic_rocm_final_capacity_patch = True + + def _maybe_patch_lazy_memory_allocator() -> None: + module = sys.modules.get("lmcache.v1.lazy_memory_allocator") + if module is not None and hasattr(module, "LazyMemoryAllocator"): + _patch_lazy_memory_allocator(module) + + def _maybe_patch_l1_memory_manager() -> None: + module = sys.modules.get("lmcache.v1.distributed.memory_manager") + if module is not None and hasattr(module, "L1MemoryManager"): + _patch_l1_memory_manager(module) + + def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): + module = _orig_import(name, globals, locals, fromlist, level) + if name == "lmcache.v1.lazy_memory_allocator" or ( + name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules + ): + _maybe_patch_lazy_memory_allocator() + if name == "lmcache.v1.distributed.memory_manager" or ( + name.startswith("lmcache") + and "lmcache.v1.distributed.memory_manager" in sys.modules + ): + _maybe_patch_l1_memory_manager() + return module + + builtins.__import__ = _agentic_rocm_import + _maybe_patch_lazy_memory_allocator() + _maybe_patch_l1_memory_manager() + +if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": + import torch + import lmcache.non_cuda_equivalents as lmc + + if not hasattr(lmc, "multi_layer_block_kv_transfer"): + _DTYPE_BY_NAME = { + "bfloat16": torch.bfloat16, + "float16": torch.float16, + 
"float32": torch.float32, + } + + def _dtype_from_env() -> torch.dtype: + name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") + try: + return _DTYPE_BY_NAME[name] + except KeyError as exc: + raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc + + def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + block_stride = shape_desc.block_stride_elems or ( + shape_desc.bs * shape_desc.nh * shape_desc.hs + ) + base = lmc._tensor_from_ptr( + ptr, + (shape_desc.nb * block_stride,), + dtype, + device, + ) + return torch.as_strided( + base, + (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), + (block_stride, shape_desc.nh * shape_desc.hs, 1), + ) + + def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + return lmc._tensor_from_ptr( + ptr, + (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), + dtype, + device, + ) + + def multi_layer_block_kv_transfer( + group_kv_pointers, + tmp_buffer_ptrs, + block_ids, + paged_memory_device, + direction, + shape_desc, + lmcache_chunk_size, + gpu_kv_format, + skip_blocks=0, + ) -> None: + # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with + # shape [num_blocks, block_size, hidden_size]. LMCache's Python + # fallback has no block-transfer entrypoint yet, so implement the + # same gather/scatter contract with torch indexing on ROCm. 
+ if shape_desc.kv_size != 1: + raise NotImplementedError( + "ROCm LMCache MP block fallback currently supports MLA KV caches only" + ) + + dtype = _dtype_from_env() + device = ( + paged_memory_device + if isinstance(paged_memory_device, torch.device) + else torch.device(paged_memory_device) + ) + num_layers = int(group_kv_pointers.numel()) + blocks_per_chunk = lmcache_chunk_size // shape_desc.bs + direction_name = getattr(direction, "name", str(direction)) + + for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): + start = chunk_idx * blocks_per_chunk + end = start + blocks_per_chunk + chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) + + dest_slot_offset = 0 + if skip_blocks and chunk_idx == 0: + chunk_blocks = chunk_blocks[int(skip_blocks):] + dest_slot_offset = int(skip_blocks) * shape_desc.bs + if chunk_blocks.numel() == 0: + continue + + num_slots = int(chunk_blocks.numel()) * shape_desc.bs + tmp = _tmp_view( + int(tmp_ptr), + shape_desc, + num_layers, + lmcache_chunk_size, + dtype, + device, + ) + + for layer_idx in range(num_layers): + paged = _paged_view( + int(group_kv_pointers[layer_idx].item()), + shape_desc, + dtype, + device, + ) + tmp_slice = tmp[ + 0, + layer_idx, + dest_slot_offset : dest_slot_offset + num_slots, + :, + ] + if direction_name == "D2H": + gathered = paged.index_select(0, chunk_blocks).reshape( + num_slots, shape_desc.nh * shape_desc.hs + ) + tmp_slice.copy_(gathered) + elif direction_name == "H2D": + src = tmp_slice.reshape( + int(chunk_blocks.numel()), + shape_desc.bs, + shape_desc.nh * shape_desc.hs, + ) + paged.index_copy_(0, chunk_blocks, src) + else: + raise ValueError(f"Unsupported transfer direction: {direction}") + + lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer + +# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ---- +if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0": + import chunked_connector_patch # noqa: F401 + +# ---- vLLM 
scheduler assertion fix (stale KV transfer notifications) ---- +import scheduler_assertion_patch # noqa: F401 +PY +} + +write_chunked_connector_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/chunked_connector_patch.py" <<'PY' +""" +Monkey-patch for LMCacheMPConnector to add chunked KV loading. + +Fixes GPU block exhaustion deadlock at high concurrency by capping +the number of external tokens reported AND retrieved per scheduling step. + +Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD=N (max tokens per load) and import this +module from sitecustomize.py before LMCache is loaded. +""" + +import logging +import os +import sys +import builtins + +logger = logging.getLogger("chunked_lmcache_patch") + +_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768")) + +# Per-request chunk tracking (module-level, survives across calls) +_chunk_state: dict[str, dict] = {} + + +def _apply_patch(): + """Patch LMCacheMPConnector in-place.""" + mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector") + if mod is None: + return + cls = getattr(mod, "LMCacheMPConnector", None) + if cls is None or getattr(cls, "_chunked_patch_applied", False): + return + + LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None) + _orig_get_matched = cls.get_num_new_matched_tokens + _orig_get_finished = cls.get_finished + + def _get_blocks_per_chunk(self): + block_size = getattr(self, "block_size", 1) + return max(1, _MAX_TOKENS // block_size) + + def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens): + full_match = _orig_get_matched(self, request, num_computed_tokens) + if full_match <= 0 or _MAX_TOKENS <= 0: + return full_match + + req_id = request.request_id + block_size = getattr(self, "block_size", 1) + blocks_per_chunk = _get_blocks_per_chunk(self) + full_match_blocks = full_match // block_size + + state = _chunk_state.get(req_id) + if state is None or state.get("num_computed_at_start") != num_computed_tokens: + state = 
{ + "full_match_blocks": full_match_blocks, + "chunk_end_blocks": 0, + "num_computed_at_start": num_computed_tokens, + "lookup_done": False, + } + _chunk_state[req_id] = state + + if state["lookup_done"]: + return 0 + + remaining = state["full_match_blocks"] - state["chunk_end_blocks"] + if remaining <= 0: + state["lookup_done"] = True + return 0 + + this_chunk = min(remaining, blocks_per_chunk) + state["chunk_end_blocks"] += this_chunk + if state["chunk_end_blocks"] >= state["full_match_blocks"]: + state["lookup_done"] = True + + capped = this_chunk * block_size + if capped < full_match: + logger.debug( + "Chunked LMCache: req %s capped %d -> %d tokens " + "(chunk %d/%d blocks)", + req_id, full_match, capped, this_chunk, full_match_blocks, + ) + + # Cap the tracker's hit blocks to match what we report + tracker = getattr(request, "kv_transfer_params", None) + if tracker is not None: + orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0) + if orig_hits > this_chunk: + tracker.num_lmcache_hit_blocks = this_chunk + + return capped + + def _patched_get_finished(self, scheduler_output): + result = _orig_get_finished(self, scheduler_output) + # Clean up chunk state for finished requests. + # vLLM passes scheduler_output as a set of request-ID strings + # (not a SchedulerOutput object), so iterate directly when it + # is a set/frozenset; fall back to the attribute path for + # forward compatibility. 
+ if isinstance(scheduler_output, (set, frozenset)): + finished = scheduler_output + else: + finished = getattr(scheduler_output, "finished_req_ids", []) + for req in finished: + _chunk_state.pop(req, None) + return result + + cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens + cls.get_finished = _patched_get_finished + cls._chunked_patch_applied = True + logger.info( + "Chunked LMCache connector patch applied " + "(max_tokens_per_load=%d)", _MAX_TOKENS, + ) + + +_orig_import = builtins.__import__ + + +def _patching_import(name, *args, **kwargs): + module = _orig_import(name, *args, **kwargs) + if ( + name == "lmcache.integration.vllm.lmcache_mp_connector" + or ( + name.startswith("lmcache") + and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules + ) + ): + _apply_patch() + return module + + +builtins.__import__ = _patching_import +_apply_patch() +PY +} + +write_scheduler_assertion_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY' +""" +Patch vLLM scheduler to handle stale finished_recving gracefully. + +The assertion at scheduler.py crashes when a KV transfer reports +"finished recving" but the request is already in RUNNING state. +This happens when transfers complete asynchronously and the scheduler +has already moved the request forward. + +Fix: Instead of asserting, log a warning and skip. 
+""" + +import logging +import sys +import builtins + +logger = logging.getLogger("scheduler_assertion_patch") + + +def _apply_patch(): + """Patch vLLM scheduler's _update_from_kv_xfer_finished.""" + sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler") + if sched_mod is None: + return + req_mod = sys.modules.get("vllm.v1.request") + if req_mod is None: + return + Scheduler = getattr(sched_mod, "Scheduler", None) + RequestStatus = getattr(req_mod, "RequestStatus", None) + if Scheduler is None or RequestStatus is None: + return + if getattr(Scheduler, "_kv_xfer_patch_applied", False): + return + + _orig_update = Scheduler._update_from_kv_xfer_finished + + def _patched_update(self, kv_connector_output): + if self.connector is not None: + self.connector.update_connector_output(kv_connector_output) + for req_id in kv_connector_output.finished_recving or (): + if req_id not in self.requests: + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + self._free_blocks(self.requests[req_id]) + else: + logger.warning( + "Stale finished_recving for req %s in status %s; skipping.", + req_id, req.status.name, + ) + for req_id in kv_connector_output.finished_sending or (): + if req_id not in self.requests: + continue + self._free_blocks(self.requests[req_id]) + + Scheduler._update_from_kv_xfer_finished = _patched_update + Scheduler._kv_xfer_patch_applied = True + logger.info("Scheduler KV transfer assertion patch applied") + + +_orig_import = builtins.__import__ + + +def _patching_import(name, *args, **kwargs): + module = _orig_import(name, *args, **kwargs) + if ( + name == "vllm.v1.core.sched.scheduler" + or ( + name.startswith("vllm") + and "vllm.v1.core.sched.scheduler" in sys.modules + ) + ): + _apply_patch() + return module + + +builtins.__import__ = _patching_import +_apply_patch() +PY +} + # Workaround for MEC FW <177 RCCL memory 
reclaim issue version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then @@ -54,47 +576,233 @@ export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" +LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" mkdir -p "$RESULT_DIR" -OFFLOAD_ARGS="" +OFFLOAD_ARGS=() +PREFIX_CACHE_ARGS=() +LMCACHE_PID="" + +cleanup_lmcache_server() { + if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then + kill "$LMCACHE_PID" 2>/dev/null || true + wait "$LMCACHE_PID" 2>/dev/null || true + fi +} + +trap cleanup_lmcache_server EXIT + +wait_for_lmcache_ready() { + { set +x; } 2>/dev/null + local attempts="${LMCACHE_READY_ATTEMPTS:-120}" + local tail_pid="" + + while [ ! -f "$LMCACHE_LOG" ]; do + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before creating log file. Exiting." >&2 + exit 1 + fi + sleep 1 + done + + tail -f -n +1 "$LMCACHE_LOG" & + tail_pid=$! + + for ((i = 1; i <= attempts; i++)); do + if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + return 0 + fi + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before becoming healthy. Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 + fi + sleep 1 + done + + echo "Timed out waiting for LMCache server healthcheck. 
Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 +} + case "$OFFLOADING" in none) ;; cpu) + unset VLLM_USE_SIMPLE_KV_OFFLOAD # MI355X nodes have ~2.7 TiB of host DRAM available for offload; - # reserve 2.5 TB for the simple CPU offload connector (leaves - # ~200 GB headroom for worker RSS / page cache / slurm cgroup). + # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for + # worker RSS / page cache / slurm cgroup). TOTAL_CPU_DRAM_GB=2500 - # Pure TP (no DP-attn): single engine, world_size=TP. - # SimpleCPUOffloadConnector internally divides cpu_bytes_to_use by - # world_size, so pass the full TOTAL_CPU_DRAM_GB. - PER_ENGINE_BYTES=$((TOTAL_CPU_DRAM_GB * 1024 * 1024 * 1024)) - # JSON form (rather than --kv_offloading_backend native shortcut) so - # we can pass lazy_offload=true. Eager mode (the shortcut default) - # can hit a popleft_n AssertionError in vllm/v1/core/kv_cache_utils.py - # at low/mid CONC; lazy defers the store path. Matches the H200 - # Kimi int4 launcher which cleared 17/17 with this pattern. - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":true}}" + # Use vLLM's regular native KV-offload path (OffloadingConnector), + # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to + # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 + # would switch it to SimpleCPUOffloadConnector. We intentionally leave + # that env var UNSET here so the regular OffloadingConnector path is + # used. The shortcut --kv_offloading_backend native + --kv_offloading_size + # form constructs the KVTransferConfig at engine startup + # (vllm/config/vllm.py:662). 
+ OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + lmcache) + { set +x; } 2>/dev/null + unset VLLM_USE_SIMPLE_KV_OFFLOAD + + agentic_pip_install --quiet --no-cache-dir lmcache + # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and + # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and + # during Kimi fused-MoE model inspection it imports nixl_ep whenever + # that module is importable, even when this run is not using EP/NIXL + # kernels. The CUDA extension then fails immediately on AMD nodes with + # "ImportError: libcuda.so.1". + # + # LMCache MP also uses CuPy stream APIs while registering vLLM's KV + # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime + # with cudaErrorInsufficientDriver when LMCache touches the stream. Use + # the ROCm 7 CuPy wheel so the same API dispatches through HIP. + python3 -m pip uninstall -y \ + nixl nixl-cu12 nixl-cu13 nixl_ep \ + >/dev/null 2>&1 || true + python3 -m pip uninstall -y \ + cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ + >/dev/null 2>&1 || true + agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 + python3 - <<'PY' +import importlib.util +import sys + +spec = importlib.util.find_spec("nixl_ep") +if spec is not None: + locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) + print( + "Error: nixl_ep is still importable after LMCache install; " + "this ROCm Kimi run would import a CUDA-only nixl_ep module. 
" + f"location={locations}", + file=sys.stderr, + ) + sys.exit(1) + +try: + from cupy_backends.cuda.api import runtime as cupy_runtime +except Exception as exc: + print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) + sys.exit(1) + +if not getattr(cupy_runtime, "is_hip", False): + print( + "Error: CuPy is still using the CUDA backend after installing " + "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", + file=sys.stderr, + ) + sys.exit(1) +PY + LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" + write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" + write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR" + write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR" + export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1 + export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 + export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1 + # Cap external KV tokens loaded per scheduling step to prevent GPU + # block exhaustion deadlock at high concurrency (c>=32). Default + # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to + # disable chunking (only safe at low concurrency). + export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}" + export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null + + # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV + # pool, but let the external MP server own that pool so vLLM does not + # split --kv-offloading-size across TP ranks through the integrated + # LMCache backend. + TOTAL_CPU_DRAM_GB=2500 + LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" + LMCACHE_PORT="${LMCACHE_PORT:-5555}" + LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" + # LMCacheMPConnector concatenates lmcache.mp.host and port into the + # ZMQ endpoint. Bind the server to a raw host, but pass the connector a + # ZMQ-style host string. 
+ LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" + LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" + # LMCache read locks are leases on chunks that lookup has promised + # vLLM can retrieve. The default 300s TTL is too short for this + # long-context agentic queue: TP8/conc32 can spend >300s between + # lookup and retrieve while GPU KV is saturated, which leaves the + # object present in L1 but no longer readable. Keep the 2.5 TB pool + # size unchanged and only extend the lookup-to-retrieve lease. + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" + export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + + echo "Starting LMCache MP server..." + LMCACHE_CMD=( + lmcache server + --host "$LMCACHE_HOST" + --port "$LMCACHE_PORT" + --http-host "$LMCACHE_HOST" + --http-port "$LMCACHE_HTTP_PORT" + --l1-size-gb "$LMCACHE_L1_SIZE_GB" + --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" + --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS" + --chunk-size "$LMCACHE_CHUNK_SIZE" + --max-workers "$LMCACHE_MAX_WORKERS" + --eviction-policy LRU + ) + printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" + printf '\n' >> "$RESULT_DIR/lmcache_command.txt" + "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & + LMCACHE_PID=$! 
+ echo "LMCache server PID: $LMCACHE_PID" + wait_for_lmcache_ready + + PREFIX_CACHE_ARGS=(--enable-prefix-caching) + OFFLOAD_ARGS=( + --kv-transfer-config + "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + --disable-hybrid-kv-cache-manager + ) ;; *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; esac -if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel"; else EP=" "; fi +EP_ARGS=() +if [ "$EP_SIZE" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi echo "Starting vllm server..." export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ ---host 0.0.0.0 \ ---port $PORT \ ---tensor-parallel-size=$TP \ -$EP \ ---gpu-memory-utilization 0.90 \ ---block-size=1 \ ---trust-remote-code \ ---max-num-seqs $CONC \ ---mm-encoder-tp-mode data \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + "${EP_ARGS[@]}" + --gpu-memory-utilization 0.90 + --block-size=1 + --trust-remote-code + --max-model-len "$MAX_MODEL_LEN" + --max-num-seqs "$CONC" + --mm-encoder-tp-mode data + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! 
echo "Server PID: $SERVER_PID" @@ -103,14 +811,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh index 9ebe02ae8..697d3fa45 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -69,14 +64,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o 
"$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh index a69669c07..2fd3b381c 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -70,14 +65,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh index e8b7e49fe..97929e43e 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh @@ -9,13 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION -PORT=${PORT:-8888} 
-DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -80,14 +75,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh index 1fcbfb4ba..38ef72b56 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh @@ -9,15 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} -DP_ATTENTION=${DP_ATTENTION:-false} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -30,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be 
rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + resolve_trace_source install_agentic_deps @@ -80,14 +78,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh index fa9c91a80..4ce131cba 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -29,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. 
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + resolve_trace_source install_agentic_deps @@ -85,14 +84,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh index 2516656e2..9f2d83a0b 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -29,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + resolve_trace_source install_agentic_deps @@ -85,14 +84,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh index b339be956..d21690da6 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -29,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + resolve_trace_source install_agentic_deps @@ -79,14 +78,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh index 2e5f96d4f..ed59991cb 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -29,6 +23,11 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + resolve_trace_source install_agentic_deps @@ -79,14 +78,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index 82343bae9..260bbdc68 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -35,6 +29,11 @@ rocm-smi || true amd-smi || true # ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + resolve_trace_source install_agentic_deps @@ -86,14 +85,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index 509070bf1..edac27a45 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -35,6 +29,11 @@ rocm-smi || true amd-smi || true # ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + resolve_trace_source install_agentic_deps @@ -83,14 +82,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index 316b35f63..39dd63293 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -35,6 +29,11 @@ rocm-smi || true amd-smi || true # ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + resolve_trace_source install_agentic_deps @@ -87,14 +86,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh index d3c5df245..4ba87976b 100755 --- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR +check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-10} if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 @@ -59,7 +53,6 @@ python3 -m sglang.launch_server \ --chunked-prefill-size 32768 \ --max-prefill-tokens 32768 \ --context-length $MAX_MODEL_LEN \ ---disable-radix-cache \ --attention-backend trtllm_mha \ --moe-runner-backend flashinfer_trtllm \ --enable-flashinfer-allreduce-fusion \ @@ -75,14 +68,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- 
build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh index 30b5f8cb9..3432af5c9 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR +check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-10} if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 @@ -59,7 +53,6 @@ python3 -m sglang.launch_server \ --chunked-prefill-size 32768 \ --max-prefill-tokens 32768 \ --context-length $MAX_MODEL_LEN \ ---disable-radix-cache \ --attention-backend trtllm_mha \ --moe-runner-backend flashinfer_trtllm \ --enable-flashinfer-allreduce-fusion \ @@ -75,14 +68,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3
"$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh new file mode 100755 index 000000000..9d9c1d7d5 --- /dev/null +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for Qwen3.5 FP8 on B300 using SGLang. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV only; RadixAttention prefix cache stays on. +# hicache - SGLang HiCache with local CPU hierarchical cache. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-10} +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +nvidia-smi + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # HiCache extends RadixAttention, so do not pass --disable-radix-cache. + # B300 nodes have about 2 TB of usable CPU DRAM.
Qwen3.5's hybrid + # GDN/Mamba path allocates two HiCache host pools per TP rank: one for + # hierarchical KV cache and one for hierarchical Mamba cache. Keep this + # local to the script because the workflow currently passes a generic + # default for TOTAL_CPU_DRAM_GB, not a platform-specific value. + TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}" + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # SGLang --hicache-size is per rank per host pool, while the workflow + # input is a node-total DRAM budget. Divide by TP and the number of + # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning. + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size 64 + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend kernel + --hicache-mem-layout page_first + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +echo "Starting SGLang server..." 
+export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 +export NCCL_NVLS_ENABLE=1 +export SGL_ENABLE_JIT_DEEPGEMM=false +export SGLANG_ENABLE_FLASHINFER_GEMM=true + +{ set +x; } 2>/dev/null +SGLANG_CMD=( + python3 -m sglang.launch_server + --model-path="$MODEL" + --host=0.0.0.0 + --port="$PORT" + --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" + --trust-remote-code + --tensor-parallel-size="$TP" + --data-parallel-size=1 + --expert-parallel-size="$EP_SIZE" + --enable-symm-mem + --quantization fp8 + --kv-cache-dtype fp8_e4m3 + --mamba-ssm-dtype bfloat16 + --attention-backend trtllm_mha + --moe-runner-backend flashinfer_trtllm + --cuda-graph-max-bs "$CONC" + --max-running-requests "$CONC" + --max-prefill-tokens 16384 + --chunked-prefill-size 16384 + --mem-fraction-static 0.80 + --stream-interval 50 + --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" + --tokenizer-worker-num 6 + --tokenizer-path "$MODEL" + --context-length "$MAX_MODEL_LEN" + --enable-metrics + "${CACHE_ARGS[@]}" +) +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" +"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh new file mode 100755 index 000000000..95f0397a0 --- /dev/null +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for Qwen3.5 FP8 on H100 using SGLang. +# +# H100 has 80 GB HBM3 (vs B300's 192 GB), so weights + KV fit tighter. 
+# Mem-fraction-static is lowered to 0.75; the chunked-prefill-size 8192 +# override (mirroring fixed_seq_len/qwen3.5_fp8_h100.sh) is commented out below. +# Attention backend is flashinfer (sm_90); trtllm_mha is Blackwell-only. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV only (RadixAttention prefix cache stays on — +# agentic workloads rely on >95% theoretical hit rate). +# hicache - SGLang HiCache with local CPU hierarchical cache. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-10} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +nvidia-smi + +# ---- Resolve traces and install deps ---------------------------------------- +# H100 max_model_len caps at 131k (HBM-bound). The unfiltered with-subagents +# corpus has requests up to ~1M proxy tokens that the server would reject. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k); even +# at 131k context, the rejection rate is much lower than against the +# unfiltered corpus. +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k + +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # HiCache extends RadixAttention, so do not pass --disable-radix-cache.
+ # H100 nodes typically expose ~1.5-2 TB usable CPU DRAM; Qwen3.5's + # hybrid GDN/Mamba path allocates two HiCache host pools per TP rank + # (one KV, one Mamba). Workflow passes a generic TOTAL_CPU_DRAM_GB, so + # keep the per-rank-per-pool conversion local to this script. + TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-1500}" + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size 64 + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend kernel + --hicache-mem-layout page_first + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +echo "Starting SGLang server..." +export PYTHONNOUSERSITE=1 + +SGLANG_MULTI_TOKENIZER=/sgl-workspace/sglang/python/sglang/srt/managers/multi_tokenizer_mixin.py +if ! 
sed -n '/elif isinstance(output, BatchStrOutput):/,/input_token_logprobs_val=_extract_field_by_index/p' "$SGLANG_MULTI_TOKENIZER" \ + | grep -q 'cached_tokens_details=_extract_field_by_index'; then + sed -i '/elif isinstance(output, BatchStrOutput):/,/input_token_logprobs_val=_extract_field_by_index/ { + /cached_tokens=_extract_field_by_index(output, "cached_tokens", i),/a\ + cached_tokens_details=_extract_field_by_index(\ + output, "cached_tokens_details", i\ + ), + }' "$SGLANG_MULTI_TOKENIZER" +fi + +{ set +x; } 2>/dev/null +SGLANG_CMD=( + python3 -m sglang.launch_server + --model-path="$MODEL" + --host=0.0.0.0 + --port="$PORT" + --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" + --trust-remote-code + --tensor-parallel-size="$TP" + --data-parallel-size=1 + --expert-parallel-size="$EP_SIZE" + --quantization fp8 + --kv-cache-dtype fp8_e4m3 + --mamba-ssm-dtype bfloat16 + --attention-backend flashinfer + --enable-flashinfer-allreduce-fusion + # --cuda-graph-max-bs "$CONC" + # --max-running-requests "$CONC" + # --max-prefill-tokens 8192 + # --chunked-prefill-size 8192 + --mem-fraction-static 0.75 + --stream-interval 50 + --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" + --tokenizer-worker-num 6 + --tokenizer-path "$MODEL" + --enable-metrics + "${CACHE_ARGS[@]}" +) +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" +"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index 13efe215e..aef9650ca 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -9,14 +9,8 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR +check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -MAX_DELAY=${MAX_DELAY:-60} -ADVANCE_MIN=${ADVANCE_MIN:-0.0} -ADVANCE_MAX=${ADVANCE_MAX:-0.7} -EP_SIZE=${EP_SIZE:-1} if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then MAX_MODEL_LEN=131072 fi @@ -52,7 +46,6 @@ python3 -m sglang.launch_server \ --enable-aiter-allreduce-fusion \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ - --disable-radix-cache \ --max-prefill-tokens 32768 \ --scheduler-recv-interval 30 \ --mem-fraction-static 0.8 \ @@ -66,14 +59,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" - -set -x -$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true -set +x - -write_agentic_result_json "$RESULT_DIR" - -# ---- Post-processing -------------------------------------------------------- -python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ - "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh 
b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh new file mode 100755 index 000000000..5427d0d31 --- /dev/null +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh @@ -0,0 +1,142 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV only; RadixAttention prefix cache stays on. +# hicache - SGLang HiCache with local CPU hierarchical cache. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid + # GDN/Mamba path allocates two HiCache host pools per TP rank: one for + # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB + # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per + # host pool, not 250 GB. Keep overrides for one-off tuning.
+ TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}" + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on + # MI355X, which requires page_size=1. The kernel/page_first HiCache + # transfer path faults on first prefill in this mode on ROCm, so keep + # the default on the safer direct/layer_first copy path. These remain + # env-overridable for future SGLang/ROCm fixes. + HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + # SGLang --hicache-size is per rank per host pool, while the workflow + # input is a node-total DRAM budget. Divide by TP and the number of + # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning. + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size "$HICACHE_PAGE_SIZE" + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # HiCache startup reaches API readiness, but SGLang's internal warmup + # request has timed out after 600s on this Qwen MI355X path. 
Let aiperf + # own benchmark traffic instead of blocking server readiness on it. + WARMUP_ARGS=(--skip-server-warmup) + # Keep request concurrency as the swept variable, but do not force + # HiCache runs to capture ROCm graphs at every high concurrency point. + # The conc=32 HiCache job crashed after startup readiness, before any + # aiperf traffic, while conc=16 is the highest known-good capture size + # for this model/server path. Requests above the capture size can still + # run; they just do not require a larger captured graph at startup. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +echo "Starting SGLang server..." +export PYTHONNOUSERSITE=1 + +{ set +x; } 2>/dev/null +SGLANG_CMD=( + python3 -m sglang.launch_server + --attention-backend triton + --model-path "$MODEL" + --host=0.0.0.0 + --port "$PORT" + --tensor-parallel-size "$TP" + --ep-size "$EP_SIZE" + --trust-remote-code + --tokenizer-worker-num 6 + --enable-aiter-allreduce-fusion + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" + --max-running-requests "$CONC" + --max-prefill-tokens 32768 + --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" + --mem-fraction-static 0.8 + --context-length "$MAX_MODEL_LEN" + --enable-metrics + "${CACHE_ARGS[@]}" + "${WARMUP_ARGS[@]}" +) +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" +"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp4_b200.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200.sh index 76bfabaf1..fa1fd407f 100644 --- a/benchmarks/single_node/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -21,7 +21,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then diff --git a/benchmarks/single_node/dsr1_fp4_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp4_b200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh index a47abbf21..4a76a82d4 100755 --- a/benchmarks/single_node/dsr1_fp4_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_mtp.sh @@ -4,7 +4,7 @@ # Mirrors dsr1_fp4_b200.sh and adds the speculative-* flags from # dsr1_fp8_b200_mtp.sh (the production B200 sglang MTP template). 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -31,7 +31,6 @@ if [[ $TP -ne 8 ]]; then fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [[ $CONC -ge 16 ]]; then SCHEDULER_RECV_INTERVAL=30 diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt.sh similarity index 98% rename from benchmarks/single_node/dsr1_fp4_b200_trt.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt.sh index d57dc72cb..d2186df2c 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -41,7 +41,6 @@ fi echo "MOE_BACKEND set to '$MOE_BACKEND'" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsr1-fp4.yml" cat > $EXTRA_CONFIG_FILE << EOF diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt_mtp.sh similarity index 98% rename from benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt_mtp.sh index e4f8b50e7..15d93458a 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200_trt_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -37,7 +37,6 @@ fi echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsr1-fp4-mtp.yml" cat > $EXTRA_CONFIG_FILE << EOF diff --git a/benchmarks/single_node/dsr1_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b300.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp4_b300.sh rename to 
benchmarks/single_node/fixed_seq_len/dsr1_fp4_b300.sh index 917f4f5f3..334203123 100644 --- a/benchmarks/single_node/dsr1_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_b300.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # DSR1 FP4 B200 SGLang recipe as-is until B300-specific tuning is available. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -25,7 +25,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then diff --git a/benchmarks/single_node/dsr1_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh similarity index 96% rename from benchmarks/single_node/dsr1_fp4_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh index a062726df..bb6ce75cb 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -28,7 +28,6 @@ if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/dsr1_fp4_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom.sh index 31554fc22..6ae8f92ba 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" 
+source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom_mtp.sh similarity index 96% rename from benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom_mtp.sh index 1d557684e..8447a8b2a 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_atom_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_mtp.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp4_mi355x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_mtp.sh index a505b65d0..4499736e2 100755 --- a/benchmarks/single_node/dsr1_fp4_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x_mtp.sh @@ -3,7 +3,7 @@ # DeepSeek-R1-0528 MXFP4 on MI355X with EAGLE/MTP speculative decoding. # Mirrors dsr1_fp4_mi355x.sh and adds the speculative-* flags. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -34,7 +34,6 @@ if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200.sh similarity index 98% rename from benchmarks/single_node/dsr1_fp8_b200.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200.sh index abfecfe44..8a016bb2a 100644 --- a/benchmarks/single_node/dsr1_fp8_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -23,7 +23,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. 
if [[ $TP -eq 8 ]]; then diff --git a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_mtp.sh similarity index 98% rename from benchmarks/single_node/dsr1_fp8_b200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_mtp.sh index 45cfccc3e..1ad0c9041 100755 --- a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -23,7 +23,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGLANG_ENABLE_JIT_DEEPGEMM=false SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # MTP only supports TP=8 for now if [[ $TP -ne 8 ]]; then diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt.sh similarity index 98% rename from benchmarks/single_node/dsr1_fp8_b200_trt.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt.sh index b593535f3..b0457614e 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -49,7 +49,6 @@ fi echo "MOE_BACKEND set to '$MOE_BACKEND'" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsr1-fp8.yml" cat > $EXTRA_CONFIG_FILE << EOF diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt_mtp.sh similarity index 98% rename from benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt_mtp.sh index e51b73384..16f13710e 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b200_trt_mtp.sh @@ 
-1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -55,7 +55,6 @@ fi echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml" cat > $EXTRA_CONFIG_FILE << EOF diff --git a/benchmarks/single_node/dsr1_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300.sh similarity index 98% rename from benchmarks/single_node/dsr1_fp8_b300.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300.sh index 2d475bc0b..2599b7126 100644 --- a/benchmarks/single_node/dsr1_fp8_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # DSR1 FP8 B200 SGLang recipe as-is until B300-specific tuning is available. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -27,7 +27,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $TP -eq 8 ]]; then diff --git a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300_mtp.sh similarity index 98% rename from benchmarks/single_node/dsr1_fp8_b300_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300_mtp.sh index d16cbcf8e..b60971ae5 100755 --- a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_b300_mtp.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # DSR1 FP8 B200 SGLang MTP recipe as-is until B300-specific tuning is available. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -27,7 +27,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGLANG_ENABLE_JIT_DEEPGEMM=false SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # MTP only supports TP=8 for now if [[ $TP -ne 8 ]]; then diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp8_h200.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200.sh index 2c05e8d14..db846b4d2 100644 --- a/benchmarks/single_node/dsr1_fp8_h200.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -19,7 +19,6 @@ pip3 install --user --break-system-packages sentencepiece if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor diff --git a/benchmarks/single_node/dsr1_fp8_h200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_mtp.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp8_h200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_mtp.sh index 7929a0904..611f600f6 100755 --- a/benchmarks/single_node/dsr1_fp8_h200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_mtp.sh @@ -6,7 +6,7 @@ # Keeps the H200's flashinfer attention backend (no trtllm_mla path on # H200 for this image). 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -33,7 +33,6 @@ if [[ $TP -ne 8 ]]; then fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding SPECULATIVE_NUM_STEPS=2 diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp8_h200_trt.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt.sh index 0a62abc90..c59eb8625 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -28,7 +28,6 @@ MOE_BACKEND="CUTLASS" echo "MOE_BACKEND set to '$MOE_BACKEND'" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsr1-fp8.yml" cat > $EXTRA_CONFIG_FILE << EOF diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt_mtp.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt_mtp.sh index fcea69e3d..c544af6ed 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_h200_trt_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -34,7 +34,6 @@ fi echo "MOE_BACKEND='$MOE_BACKEND', MTP='$MTP'" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsr1-fp8-mtp.yml" # If ISL=8192 and DP_ATTENTION=true, export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8192 diff --git a/benchmarks/single_node/dsr1_fp8_mi300x.sh 
b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi300x.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp8_mi300x.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi300x.sh index b9d46225e..da95c0e7a 100644 --- a/benchmarks/single_node/dsr1_fp8_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi300x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -34,7 +34,6 @@ export SGLANG_USE_AITER=1 export SGLANG_AITER_MLA_PERSIST=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp8_mi325x.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x.sh index a06a206d2..6b1c50265 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x.sh @@ -1,7 +1,7 @@ #!/usr/bin/bash # Source benchmark utilities early -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ diff --git a/benchmarks/single_node/dsr1_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x_mtp.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp8_mi325x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x_mtp.sh index d792bc7e9..8251c169a 100755 --- a/benchmarks/single_node/dsr1_fp8_mi325x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi325x_mtp.sh @@ -3,7 +3,7 @@ # DeepSeek-R1-0528 FP8 on MI325X with EAGLE/MTP speculative decoding. # Mirrors dsr1_fp8_mi325x.sh and adds the speculative-* flags. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x.sh similarity index 96% rename from benchmarks/single_node/dsr1_fp8_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x.sh index ea9ecefe8..d8b596826 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -25,7 +25,6 @@ export RCCL_MSCCL_ENABLE=0 export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/dsr1_fp8_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom.sh index 31554fc22..6ae8f92ba 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh similarity index 96% rename from benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh index 69179cec0..e4943488f 100644 --- 
a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_mtp.sh similarity index 97% rename from benchmarks/single_node/dsr1_fp8_mi355x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_mtp.sh index c1d43f153..d8fc1590b 100755 --- a/benchmarks/single_node/dsr1_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_mtp.sh @@ -3,7 +3,7 @@ # DeepSeek-R1-0528 FP8 on MI355X with EAGLE/MTP speculative decoding. # Mirrors dsr1_fp8_mi355x.sh and adds the speculative-* flags. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -31,7 +31,6 @@ export RCCL_MSCCL_ENABLE=0 export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Keep server-side speculative decoding capacity aligned with the matrix row. 
MAX_RUNNING_REQUESTS="${MAX_RUNNING_REQUESTS:-$CONC}" diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh similarity index 98% rename from benchmarks/single_node/dsv4_fp4_b200.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh index 070e987a0..e1d031854 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -35,7 +35,6 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # Drop the runner conditional once lmsys moves sglang back out of /workspace. SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh similarity index 98% rename from benchmarks/single_node/dsv4_fp4_b200_trt.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh index 40669cd15..e4a24dea2 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt.sh @@ -4,7 +4,7 @@ # already contains a TensorRT-LLM DeepSeek-V4 build; do not build TRTLLM at # runtime from this benchmark path. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -54,7 +54,6 @@ fi nvidia-smi SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" MOE_BACKEND="TRTLLM" diff --git a/benchmarks/single_node/dsv4_fp4_b200_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh similarity index 98% rename from benchmarks/single_node/dsv4_fp4_b200_trt_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh index d7308bbf5..9e5c88212 100644 --- a/benchmarks/single_node/dsv4_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_trt_mtp.sh @@ -3,7 +3,7 @@ # DeepSeek-V4-Pro B200 TensorRT-LLM MTP variant. The configured image already # contains the DeepSeek-V4 TRTLLM build; this path only toggles speculative MTP. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -53,7 +53,6 @@ fi nvidia-smi SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml" MOE_BACKEND="TRTLLM" diff --git a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh similarity index 98% rename from benchmarks/single_node/dsv4_fp4_b200_vllm.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh index 312d41472..1ef273224 100755 --- a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm.sh @@ -4,7 +4,7 @@ # sweep. TP mode (dp-attn=false) runs without expert parallel; DP mode # (dp-attn=true) enables expert parallel (EP_SIZE=TP value = DP size). 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # DeepSeek-V4-Pro weights are large; engine startup can exceed the default # 600s. Give it an hour to load. diff --git a/benchmarks/single_node/dsv4_fp4_b200_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh similarity index 98% rename from benchmarks/single_node/dsv4_fp4_b200_vllm_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh index 21b40eeb8..6846223e8 100755 --- a/benchmarks/single_node/dsv4_fp4_b200_vllm_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b200_vllm_mtp.sh @@ -5,7 +5,7 @@ # routes prompts through chat-formatted encoding via --dsv4 (required for # meaningful MTP acceptance numbers). -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -27,7 +27,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # DeepSeek-V4-Pro weights are large; engine startup can exceed the default # 600s. Give it an hour to load. diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh similarity index 99% rename from benchmarks/single_node/dsv4_fp4_b300_sglang.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index 8f43ea8a3..6d406f2eb 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -40,7 +40,6 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # /workspace. 
SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh similarity index 98% rename from benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh index 03102778d..dc6af5c76 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" # Tuning inputs from the matrix (all required): # TP -- tensor parallel size -> --tp @@ -51,7 +51,6 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # /workspace. SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh similarity index 98% rename from benchmarks/single_node/dsv4_fp4_b300_trt.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh index 754846912..db27b4f7a 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt.sh @@ -4,7 +4,7 @@ # already contains a TensorRT-LLM DeepSeek-V4 build; do not build TRTLLM at # runtime from this benchmark path. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -54,7 +54,6 @@ fi nvidia-smi SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml" MOE_BACKEND="TRTLLM" diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh similarity index 98% rename from benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh index 8aa9d0e78..c725f350e 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_trt_mtp.sh @@ -3,7 +3,7 @@ # DeepSeek-V4-Pro B300 TensorRT-LLM MTP variant. The configured image already # contains the DeepSeek-V4 TRTLLM build; this path only toggles speculative MTP. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -53,7 +53,6 @@ fi nvidia-smi SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} EXTRA_CONFIG_FILE="dsv4-fp4-trt-mtp.yml" MOE_BACKEND="TRTLLM" diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh similarity index 97% rename from benchmarks/single_node/dsv4_fp4_b300_vllm.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh index 92d4bf4ad..947d16a6d 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm.sh @@ -4,7 +4,7 @@ # pareto sweep. TP mode (dp-attn=false) runs without expert parallel; DP mode # (dp-attn=true) enables expert parallel (EP_SIZE=TP value = DP size). 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # DeepSeek-V4-Pro weights are large; engine startup can exceed the default # 600s. Give it an hour to load. diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh similarity index 97% rename from benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh index cb41a9eb1..279e3693a 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_vllm_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -22,7 +22,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export VLLM_ENGINE_READY_TIMEOUT_S=3600 diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/dsv4_fp4_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh index 4307f9605..6771c1788 100644 --- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" diff --git 
a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh similarity index 97% rename from benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh index a4976bdb0..b02a09489 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -83,7 +83,6 @@ export SGLANG_OPT_USE_FUSED_PAGED_COMPRESS=true export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=0 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -124,7 +123,7 @@ python3 -m sglang.launch_server \ --disable-shared-experts-fusion \ --tool-call-parser deepseekv4 \ --reasoning-parser deepseek-v4 \ - --chat-template "$(dirname "$0")/chat_templates/deepseek_v4_thinking.jinja" \ + --chat-template "$(dirname "$0")/../chat_templates/deepseek_v4_thinking.jinja" \ --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh similarity index 97% rename from benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh index 83d807c6e..dc8989b3e 100755 --- a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh @@ -21,7 +21,7 @@ set -eo pipefail # --compilation-config mode=3 with FULL_AND_PIECEWISE cudagraph mode # enables full CUDA graph capture for improved throughput on MI355X. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -47,7 +47,6 @@ fi export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/dsv4_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200.sh similarity index 97% rename from benchmarks/single_node/dsv4_fp8_h200.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200.sh index 51e4a72d2..274dee995 100644 --- a/benchmarks/single_node/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200.sh @@ -4,7 +4,7 @@ # the cu129 image and omits the FP4 indexer cache flag (H200 has no FP4 # path). Max-model-len is pinned at 800k per the recipe. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # DeepSeek-V4-Pro weights are large; engine startup can exceed the default # 600s. Give it an hour to load. diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_mtp.sh similarity index 98% rename from benchmarks/single_node/dsv4_fp8_h200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_mtp.sh index 0446ac6d9..bf37eb2d0 100755 --- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_mtp.sh @@ -6,7 +6,7 @@ # routes prompts through chat-formatted encoding via --dsv4 (required for # meaningful MTP acceptance numbers per AGENTS.md). 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -28,7 +28,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # DeepSeek-V4-Pro weights are large; engine startup can exceed the default # 600s. Give it an hour to load. diff --git a/benchmarks/single_node/dsv4_fp8_h200_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang.sh similarity index 96% rename from benchmarks/single_node/dsv4_fp8_h200_sglang.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang.sh index bf5c6f7b2..3e7132ebe 100644 --- a/benchmarks/single_node/dsv4_fp8_h200_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/dsv4_fp8_h200_sglang_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh similarity index 97% rename from benchmarks/single_node/dsv4_fp8_h200_sglang_mtp.sh rename to benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh index bcba41543..788eff5b8 100644 --- a/benchmarks/single_node/dsv4_fp8_h200_sglang_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp8_h200_sglang_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/glm5.1_fp4_mi355x.sh 
b/benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x.sh similarity index 97% rename from benchmarks/single_node/glm5.1_fp4_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x.sh index c280f3c4f..aada63d56 100644 --- a/benchmarks/single_node/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -x -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -24,7 +24,6 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 export SAFETENSORS_FAST_GPU=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 32)) EVAL_CONTEXT_ARGS="" diff --git a/benchmarks/single_node/glm5.1_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/glm5.1_fp4_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x_atom.sh index 036346af3..b1d1b61c8 100644 --- a/benchmarks/single_node/glm5.1_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5.1_fp4_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/glm5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b200.sh similarity index 96% rename from benchmarks/single_node/glm5_fp4_b200.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp4_b200.sh index 53cb8afee..a1ae27021 100755 --- a/benchmarks/single_node/glm5_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname 
"$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -21,7 +21,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/glm5_fp4_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b200_mtp.sh similarity index 97% rename from benchmarks/single_node/glm5_fp4_b200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp4_b200_mtp.sh index ecd5ca0af..7181ae9bc 100755 --- a/benchmarks/single_node/glm5_fp4_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b200_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -25,7 +25,6 @@ export SGL_ENABLE_JIT_DEEPGEMM=1 export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} echo "CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/glm5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b300.sh similarity index 97% rename from benchmarks/single_node/glm5_fp4_b300.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp4_b300.sh index b751ddf7a..10c8a0e4c 100755 --- a/benchmarks/single_node/glm5_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b300.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # GLM-5 FP4 B200 SGLang recipe as-is until B300-specific tuning is available. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -25,7 +25,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/glm5_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b300_mtp.sh similarity index 97% rename from benchmarks/single_node/glm5_fp4_b300_mtp.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp4_b300_mtp.sh index db586dad8..bdea441a8 100755 --- a/benchmarks/single_node/glm5_fp4_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp4_b300_mtp.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # GLM5 FP8 B200 SGLang recipe as-is until B300-specific tuning is available. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -29,7 +29,6 @@ export SGL_ENABLE_JIT_DEEPGEMM=1 export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} echo "CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/glm5_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b200.sh similarity index 97% rename from benchmarks/single_node/glm5_fp8_b200.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_b200.sh index ccaa87b98..2e32a567c 100755 --- a/benchmarks/single_node/glm5_fp8_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -24,7 +24,6 @@ pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1" export SGL_ENABLE_JIT_DEEPGEMM=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} echo "CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git 
a/benchmarks/single_node/glm5_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b200_mtp.sh similarity index 97% rename from benchmarks/single_node/glm5_fp8_b200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_b200_mtp.sh index 5e4f98533..2c1f6e934 100755 --- a/benchmarks/single_node/glm5_fp8_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b200_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -25,7 +25,6 @@ export SGL_ENABLE_JIT_DEEPGEMM=1 export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} echo "CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/glm5_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b300.sh similarity index 97% rename from benchmarks/single_node/glm5_fp8_b300.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_b300.sh index 730cc3950..b9fe1c351 100644 --- a/benchmarks/single_node/glm5_fp8_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b300.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # GLM5 FP8 B200 SGLang recipe as-is until B300-specific tuning is available. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -32,7 +32,6 @@ pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1" export SGL_ENABLE_JIT_DEEPGEMM=0 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} echo "CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/glm5_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b300_mtp.sh similarity index 97% rename from benchmarks/single_node/glm5_fp8_b300_mtp.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_b300_mtp.sh index 0d4290dd3..5389e6a08 100755 --- a/benchmarks/single_node/glm5_fp8_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_b300_mtp.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # GLM5 FP8 B200 SGLang recipe as-is until B300-specific tuning is available. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -33,7 +33,6 @@ export SGL_ENABLE_JIT_DEEPGEMM=0 export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} echo "CONC: $CONC, ISL: $ISL, OSL: $OSL" diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_h200.sh similarity index 96% rename from benchmarks/single_node/glm5_fp8_h200.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_h200.sh index 410c66942..266587de9 100644 --- a/benchmarks/single_node/glm5_fp8_h200.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_h200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then diff --git 
a/benchmarks/single_node/glm5_fp8_h200_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_h200_mtp.sh similarity index 96% rename from benchmarks/single_node/glm5_fp8_h200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_h200_mtp.sh index ea7eaccde..133d757dc 100755 --- a/benchmarks/single_node/glm5_fp8_h200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_h200_mtp.sh @@ -6,7 +6,7 @@ # nsa/trtllm-mha) since those backends are Blackwell-specific and not # applicable to Hopper. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/glm5_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x.sh similarity index 96% rename from benchmarks/single_node/glm5_fp8_mi325x.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x.sh index c1d24b76d..0564ef8d8 100755 --- a/benchmarks/single_node/glm5_fp8_mi325x.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -18,7 +18,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/glm5_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x_mtp.sh similarity index 97% rename from benchmarks/single_node/glm5_fp8_mi325x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x_mtp.sh index 5e771e74e..fb77d84c2 100755 --- a/benchmarks/single_node/glm5_fp8_mi325x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi325x_mtp.sh @@ -3,7 
+3,7 @@ # GLM-5 FP8 on MI325X with EAGLE / MTP speculative decoding. # Mirrors glm5_fp8_mi325x.sh and adds the speculative-* flags. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -22,7 +22,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/glm5_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x.sh similarity index 96% rename from benchmarks/single_node/glm5_fp8_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x.sh index cd99536b9..21defe90c 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -23,7 +23,6 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 export SAFETENSORS_FAST_GPU=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/glm5_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/glm5_fp8_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_atom.sh index 036346af3..b1d1b61c8 100644 --- a/benchmarks/single_node/glm5_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git 
a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_mtp.sh similarity index 97% rename from benchmarks/single_node/glm5_fp8_mi355x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_mtp.sh index 49561dcde..90fa04f5d 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/glm5_fp8_mi355x_mtp.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -x -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -25,7 +25,6 @@ export SAFETENSORS_FAST_GPU=1 export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 32)) EVAL_CONTEXT_ARGS="" diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200.sh similarity index 97% rename from benchmarks/single_node/gptoss_fp4_b200.sh rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200.sh index 8ff373b63..743974df3 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -48,7 +48,6 @@ export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor diff --git a/benchmarks/single_node/gptoss_fp4_b200_trt.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200_trt.sh similarity index 98% rename from benchmarks/single_node/gptoss_fp4_b200_trt.sh rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200_trt.sh index 60bc9eb71..ced9162f9 100644 --- a/benchmarks/single_node/gptoss_fp4_b200_trt.sh +++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_b200_trt.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash 
# Source benchmark utilities early -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h100.sh similarity index 96% rename from benchmarks/single_node/gptoss_fp4_h100.sh rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_h100.sh index 7208e1b19..dfd842a88 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h100.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -34,7 +34,6 @@ EOF export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200.sh similarity index 96% rename from benchmarks/single_node/gptoss_fp4_h200.sh rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200.sh index 0c1b03bbb..b65c86782 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -47,7 +47,6 @@ EOF SERVER_LOG=/workspace/server.log export TORCH_CUDA_ARCH_LIST="9.0" -PORT=${PORT:-8888} export VLLM_MXFP4_USE_MARLIN=1 diff --git a/benchmarks/single_node/gptoss_fp4_h200_trt.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200_trt.sh similarity index 96% rename from benchmarks/single_node/gptoss_fp4_h200_trt.sh rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200_trt.sh index 3da862a0d..02dd05bc9 100644 --- a/benchmarks/single_node/gptoss_fp4_h200_trt.sh +++ 
b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_h200_trt.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} set +x diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi300x.sh similarity index 97% rename from benchmarks/single_node/gptoss_fp4_mi300x.sh rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi300x.sh index 572d6b279..c18a5a3ee 100644 --- a/benchmarks/single_node/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi300x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -40,7 +40,6 @@ ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi325x.sh similarity index 97% rename from benchmarks/single_node/gptoss_fp4_mi325x.sh rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi325x.sh index 572d6b279..c18a5a3ee 100644 --- a/benchmarks/single_node/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi325x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -40,7 +40,6 @@ ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if 
[ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh similarity index 97% rename from benchmarks/single_node/gptoss_fp4_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh index 3db687e22..14dedb141 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -41,7 +41,6 @@ ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN" FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/gptoss_fp4_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x_atom.sh index ee0810e8f..d3a8a66a1 100644 --- a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/gptoss_fp4_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/kimik2.5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh similarity index 97% rename from benchmarks/single_node/kimik2.5_fp4_b200.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh index b4e85c14f..59b55c90c 100644 --- 
a/benchmarks/single_node/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -24,7 +24,6 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/kimik2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh similarity index 96% rename from benchmarks/single_node/kimik2.5_fp4_b300.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh index 44a06ebd5..7526e57c2 100755 --- a/benchmarks/single_node/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_b300.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -28,7 +28,6 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x.sh similarity index 97% rename from benchmarks/single_node/kimik2.5_fp4_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x.sh index 56e927efc..d4616143a 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -29,7 +29,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/kimik2.5_fp4_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x_atom.sh index ca84f8228..6730aded2 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_fp4_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/kimik2.5_int4_b200.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b200.sh similarity index 
96% rename from benchmarks/single_node/kimik2.5_int4_b200.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b200.sh index 6dd4998ca..cbef22d67 100755 --- a/benchmarks/single_node/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -24,7 +24,6 @@ export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_INT4=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/kimik2.5_int4_b300.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b300.sh similarity index 96% rename from benchmarks/single_node/kimik2.5_int4_b300.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b300.sh index 6674ad8dd..432f97299 100755 --- a/benchmarks/single_node/kimik2.5_int4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_b300.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -28,7 +28,6 @@ export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_INT4=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/kimik2.5_int4_h200.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_h200.sh similarity index 96% rename from benchmarks/single_node/kimik2.5_int4_h200.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_h200.sh index 1c25d791a..1f18032ff 100755 --- a/benchmarks/single_node/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_h200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -23,7 +23,6 @@ nvidia-smi export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/kimik2.5_int4_mi300x.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi300x.sh similarity index 96% rename from benchmarks/single_node/kimik2.5_int4_mi300x.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi300x.sh index bb653a7b6..bb5145a66 100755 --- a/benchmarks/single_node/kimik2.5_int4_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi300x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -24,7 +24,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 diff --git a/benchmarks/single_node/kimik2.5_int4_mi325x.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi325x.sh similarity index 96% rename from 
benchmarks/single_node/kimik2.5_int4_mi325x.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi325x.sh index bb653a7b6..bb5145a66 100755 --- a/benchmarks/single_node/kimik2.5_int4_mi325x.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi325x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -24,7 +24,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 diff --git a/benchmarks/single_node/kimik2.5_int4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi355x.sh similarity index 96% rename from benchmarks/single_node/kimik2.5_int4_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi355x.sh index 24685a7e3..5c6b8c73a 100755 --- a/benchmarks/single_node/kimik2.5_int4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/kimik2.5_int4_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -24,7 +24,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b200.sh similarity index 96% rename from benchmarks/single_node/minimaxm2.5_fp4_b200.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b200.sh index 27aef1cc9..fc7877a1c 100755 --- a/benchmarks/single_node/minimaxm2.5_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -23,7 +23,6 @@ nvidia-smi if 
[[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export VLLM_FLOAT32_MATMUL_PRECISION=high diff --git a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b300.sh similarity index 97% rename from benchmarks/single_node/minimaxm2.5_fp4_b300.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b300.sh index a2861b441..1253c116d 100755 --- a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_b300.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -27,7 +27,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export VLLM_FLOAT32_MATMUL_PRECISION=high diff --git a/benchmarks/single_node/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh similarity index 97% rename from benchmarks/single_node/minimaxm2.5_fp4_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh index 4d8fbc9ed..28677ae1e 100755 --- a/benchmarks/single_node/minimaxm2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -36,7 +36,6 @@ EXTRA_VLLM_ARGS="" # fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/minimaxm2.5_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x_atom.sh similarity index 96% rename from 
benchmarks/single_node/minimaxm2.5_fp4_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x_atom.sh index ca84f8228..6730aded2 100644 --- a/benchmarks/single_node/minimaxm2.5_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp4_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b200.sh similarity index 96% rename from benchmarks/single_node/minimaxm2.5_fp8_b200.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b200.sh index 19b62239d..9897afca3 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -22,7 +22,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export VLLM_FLOAT32_MATMUL_PRECISION=high diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b300.sh similarity index 96% rename from benchmarks/single_node/minimaxm2.5_fp8_b300.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b300.sh index 30821961f..d5b03b59a 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_b300.sh @@ -4,7 +4,7 @@ # does not have a B300-specific recipe, so this script reuses the existing # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific 
tuning is available. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export VLLM_FLOAT32_MATMUL_PRECISION=high diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h100.sh similarity index 96% rename from benchmarks/single_node/minimaxm2.5_fp8_h100.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h100.sh index 258ec7dc1..012c8b535 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h100.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -23,7 +23,6 @@ nvidia-smi export PYTHONNOUSERSITE=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h200.sh similarity index 97% rename from benchmarks/single_node/minimaxm2.5_fp8_h200.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h200.sh index 2e87cd828..eab6e6087 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_h200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi300x.sh similarity index 96% rename from 
benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi300x.sh index 65cb8ee8e..8a95dc138 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi300x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ fi export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi325x.sh similarity index 96% rename from benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi325x.sh index 13867ce7e..06ad39726 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi325x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -29,7 +29,6 @@ fi export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x.sh similarity index 98% rename from benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x.sh index 56bae46f0..5093a56d6 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -65,7 +65,6 @@ elif [[ "$ISL" == "8192" && 
"$OSL" == "1024" ]]; then fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/minimaxm2.5_fp8_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x_atom.sh index 2a8c67da0..325c97726 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm2.5_fp8_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/qwen3.5_bf16_b200.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_bf16_b200.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200.sh index 4087d7973..3f7c6a314 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true export PYTHONUNBUFFERED=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. 
if [[ $CONC -ge 16 ]]; then diff --git a/benchmarks/single_node/qwen3.5_bf16_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200_mtp.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_bf16_b200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200_mtp.sh index 319d39f58..be314c872 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b200_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true export PYTHONUNBUFFERED=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then diff --git a/benchmarks/single_node/qwen3.5_bf16_b300.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_bf16_b300.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300.sh index f1056c896..48dc98fa9 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true export PYTHONUNBUFFERED=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. 
if [[ $CONC -ge 16 ]]; then diff --git a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300_mtp.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300_mtp.sh index 705ca9775..774ca8a3c 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_b300_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -26,7 +26,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true export PYTHONUNBUFFERED=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then diff --git a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi300x.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_bf16_mi300x.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi300x.sh index 644b6db8c..32fe60a73 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi300x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -18,7 +18,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_bf16_mi325x.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x.sh index 644b6db8c..32fe60a73 100644 --- 
a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -18,7 +18,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x_mtp.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_bf16_mi325x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x_mtp.sh index 65e68e9c8..e9df93c7d 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi325x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi325x_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -19,7 +19,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_bf16_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x.sh index d149e7a40..1661df465 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -19,7 +19,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) 
MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x_mtp.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_bf16_mi355x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x_mtp.sh index 87605fa80..38230cc88 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_bf16_mi355x_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -19,7 +19,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/qwen3.5_fp4_b200.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp4_b200.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200.sh index 76dbf5e0f..638bc85ec 100755 --- a/benchmarks/single_node/qwen3.5_fp4_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -21,7 +21,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/qwen3.5_fp4_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_mtp.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_fp4_b200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_mtp.sh index 55e1bd723..5da51d974 100755 --- a/benchmarks/single_node/qwen3.5_fp4_b200_mtp.sh +++ 
b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -21,7 +21,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/qwen3.5_fp4_b300.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_fp4_b300.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300.sh index 18b6cda09..84205cf51 100755 --- a/benchmarks/single_node/qwen3.5_fp4_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300.sh @@ -3,7 +3,7 @@ # Follows the SGLang cookbook recipe at # https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -29,7 +29,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true export PYTHONUNBUFFERED=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC >= 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then diff --git a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300_mtp.sh similarity index 98% rename from benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300_mtp.sh index 9cb5d5464..0cac9bef7 100755 --- a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b300_mtp.sh @@ -3,7 +3,7 @@ # Follows the SGLang cookbook recipe at # https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5 as of 2026-04-17. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -29,7 +29,6 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true export PYTHONUNBUFFERED=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # Default: recv every ~10 requests; if CONC >= 16, relax to ~30 requests between scheduler recv polls. if [[ $CONC -ge 16 ]]; then diff --git a/benchmarks/single_node/qwen3.5_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp4_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh index 2a0976f8d..e400729ff 100644 --- a/benchmarks/single_node/qwen3.5_fp4_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGLANG_USE_AITER=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8} if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/qwen3.5_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp4_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_atom.sh index 2a8c67da0..325c97726 100644 --- a/benchmarks/single_node/qwen3.5_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff 
--git a/benchmarks/single_node/qwen3.5_fp4_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp4_mi355x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh index 425fbfcc6..e98dec2db 100755 --- a/benchmarks/single_node/qwen3.5_fp4_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_mi355x_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ hf download "$MODEL" export SGLANG_USE_AITER=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.8} if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/qwen3.5_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp8_b200.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200.sh index 2450493be..4b9005eb8 100755 --- a/benchmarks/single_node/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -21,7 +21,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200_mtp.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200_mtp.sh index f6ef90864..a7093d4b8 100755 --- a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b200_mtp.sh @@ -1,6 +1,6 @@ 
#!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -21,7 +21,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp8_b300.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300.sh index cbceb6f1b..6644c1320 100644 --- a/benchmarks/single_node/qwen3.5_fp8_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -19,7 +19,6 @@ fi nvidia-smi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300_mtp.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300_mtp.sh index ca3b87120..7e799875c 100644 --- a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_b300_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -19,7 +19,6 @@ fi nvidia-smi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) if [ "${EVAL_ONLY}" = "true" ]; then diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_fp8_h100.sh 
rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100.sh index 4c70657aa..daf03a05d 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100.sh @@ -7,7 +7,7 @@ # chunked-prefill-size from 16384 → 8192 to leave more headroom. # Sweep tops out at conc=32 instead of 64 for the same reason. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -28,7 +28,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} MAX_SEQ_LEN=$((ISL + OSL + 20)) if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100_mtp.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100_mtp.sh index 86b35f5e7..faa666f8b 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100_mtp.sh @@ -4,7 +4,7 @@ # Mirrors qwen3.5_fp8_h100.sh; adds the speculative-* flags + SGLANG_ENABLE_SPEC_V2=1 # and passes --use-chat-template per AGENTS.md. 
-source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -27,7 +27,6 @@ if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} MAX_SEQ_LEN=$((ISL + OSL + 20)) if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/qwen3.5_fp8_h200.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp8_h200.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200.sh index a8071c520..07ce08a58 100644 --- a/benchmarks/single_node/qwen3.5_fp8_h200.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -21,7 +21,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} MAX_SEQ_LEN=$((ISL + OSL + 20)) if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200_mtp.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200_mtp.sh index b68c9d060..98c1ec9db 100644 --- a/benchmarks/single_node/qwen3.5_fp8_h200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h200_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -22,7 +22,6 @@ nvidia-smi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} # MTP (Multi-Token Prediction) Config - EAGLE speculative decoding SPECULATIVE_NUM_STEPS=3 diff --git 
a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi300x.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp8_mi300x.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi300x.sh index 760f01403..e1607860d 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi300x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -18,7 +18,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp8_mi325x.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x.sh index 760f01403..e1607860d 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -18,7 +18,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x_mtp.sh similarity index 97% rename from benchmarks/single_node/qwen3.5_fp8_mi325x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x_mtp.sh index 3a8c1d3dd..a8e04064b 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi325x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi325x_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source 
"$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -19,7 +19,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp8_mi355x.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh index d149e7a40..1661df465 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -19,7 +19,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp8_mi355x_atom.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom.sh index 2a8c67da0..325c97726 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom_mtp.sh 
similarity index 96% rename from benchmarks/single_node/qwen3.5_fp8_mi355x_atom_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom_mtp.sh index 50d90f380..29351cf33 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_atom_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -20,7 +20,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} export OMP_NUM_THREADS=1 diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh similarity index 96% rename from benchmarks/single_node/qwen3.5_fp8_mi355x_mtp.sh rename to benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh index 87605fa80..38230cc88 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -19,7 +19,6 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} CONTEXT_LENGTH=$((ISL + OSL + 20)) MAX_PREFILL_TOKENS=32768 diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 25e7f4db5..875cbcdd5 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -5,13 +5,35 @@ # the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). The SGLang # recipes are copied exactly from the pinned srt-slurm commit below. -set -x +# -e: abort on any unhandled error. -o pipefail: pipeline fails if any +# stage fails. 
Without these, errors like a bad `git checkout SHA` get +# silently swallowed and the script continues with broken state. R5 of +# dsv4-fp4-gb300-dynamo-vllm-agentic caught this — a bad checkout left +# the cw shards on origin/HEAD (which happened to be the right commit), +# masking the bug entirely until upstream main moves and breaks us. +set -exo pipefail if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then # Weights staged on shared storage; avoid node-local /scratch symlink drift. export MODEL_PATH="/mnt/vast/models/dsv4" - if [[ $FRAMEWORK == "dynamo-sglang" ]]; then + if [[ "$IS_AGENTIC" == "1" ]]; then + # Agentic multi-node uses upstream NVIDIA/srt-slurm@main, which has + # caught up on every schema feature we need: + # - BenchmarkType.CUSTOM + benchmark.command + benchmark.env + # (the hook that hands off to benchmarks/multi_node/agentic_srt.sh) + # - DynamoConfig.wheel (so our vllm recipes can pin the same + # ai-dynamo wheel as the fixed-seq-len path) + # - default_bash_preamble (no more "Unknown field" warning) + # Per-worker --mem=0 is set via `srun_options:` in the recipe yaml + # (a documented top-level field that srtctl threads through to + # start_srun_process → see docs/config-reference.md#srun_options). + # Pin to HEAD as of when this landed; bump as upstream evolves. 
+ SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git" + SRT_SLURM_RECIPES_REF="127597c2926467db06e6707e0aa9227261c6c02a" + SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic" + SRT_RECIPE_DST="recipes/vllm/deepseek-v4/agentic" + elif [[ $FRAMEWORK == "dynamo-sglang" ]]; then SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git" SRT_SLURM_RECIPES_REF="main" SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" @@ -45,6 +67,15 @@ export SLURM_ACCOUNT="cw-sup" export NVIDIA_VISIBLE_DEVICES=all export NVIDIA_DRIVER_CAPABILITIES=compute,utility +# Host-side directory holding aiperf's content-addressed dataset mmap cache. +# Bind-mounted into worker containers at /aiperf_mmap_cache via the +# default_mounts: block in srtslurm.yaml below; aiperf reads it via +# AIPERF_DATASET_MMAP_CACHE_DIR (set in each agentic recipe's benchmark.env). +# Without it, every run re-tokenizes and re-writes ~65 GB of mmap files +# per dataset on first use. 777 mode so all gharunner_X SLURM users can +# write to it. +export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/ai-perf-cache" + NGINX_IMAGE="nginx:1.27.4" # Squash files live alongside models on /mnt/vast (shared across nodes). @@ -132,7 +163,11 @@ if [ -e "$HOME/.local/bin/uv" ]; then exit 1 fi -uv venv +# --seed installs pip+setuptools+wheel into the venv. Without it, the +# upstream prefetch-ai-dynamo-wheel.sh script (called by srtctl when a +# recipe has dynamo.wheel set) fails with "No module named pip" because +# uv venv defaults to no-pip. +uv venv --seed source .venv/bin/activate uv pip install -e . 
@@ -173,6 +208,7 @@ srtctl_root: "${SRTCTL_ROOT}" default_mounts: ${DYNAMO_WHEELS_CACHE_HOST}: /configs/dynamo-wheels + ${AIPERF_MMAP_CACHE_HOST_PATH}: /aiperf_mmap_cache model_paths: dspro: "${MODEL_PATH}" @@ -243,6 +279,23 @@ echo "Extracted JOB_ID: $JOB_ID" LOGS_DIR="outputs/$JOB_ID/logs" LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" +# Snapshot worker logs on any exit path — normal completion, error, +# SIGTERM (gh run cancel sends this to the launcher), even SIGKILL of +# our parent. Without this trap, the cancel-time tar lives only in the +# main flow below (after `wait $POLL_PID`), so a manual `gh run cancel` +# during the tail wait skips it entirely and the +# `Upload server logs` workflow step finds nothing to upload. +# The main flow at the bottom of this script no longer runs cp/tar +# itself: it only logs a notice, and this trap produces the LOGS copy +# and the tarball when the script exits. +_snapshot_server_logs() { + if [ -n "${LOGS_DIR:-}" ] && [ -d "$LOGS_DIR" ] && [ -n "${GITHUB_WORKSPACE:-}" ]; then + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" 2>/dev/null || true + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 2>/dev/null || true + fi +} +trap _snapshot_server_logs EXIT + while ! ls "$LOG_FILE" &>/dev/null; do if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then echo "ERROR: Job $JOB_ID failed before creating log file" @@ -273,8 +326,9 @@ echo "Collecting results..." if [ -d "$LOGS_DIR" ]; then echo "Found logs directory: $LOGS_DIR" - cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" - tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . + # Tarball + LOGS copy are produced by the EXIT trap defined near + # JOB_ID extraction (so cancel paths also get them); just log here. + echo "multinode_server_logs.tar.gz will be (re)produced on script EXIT."
else echo "Warning: Logs directory not found at $LOGS_DIR" fi diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 5248e63ed..ec8c1adb7 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -8,6 +8,15 @@ export SLURM_PARTITION="batch_1" export SLURM_ACCOUNT="benchmark" export ENROOT_ROOTFS_WRITABLE=1 +# Host-side directory holding aiperf's content-addressed dataset mmap cache. +# Bind-mounted into worker containers at /aiperf_mmap_cache via the +# default_mounts: block in srtslurm.yaml below; aiperf reads it via +# AIPERF_DATASET_MMAP_CACHE_DIR (set in each agentic recipe's benchmark.env). +# Without it, every run re-tokenizes and re-writes ~65 GB of mmap files +# per dataset on first use. 777 mode so all gharunner_X SLURM users can +# write to it. +export AIPERF_MMAP_CACHE_HOST_PATH="/data/home/sa-shared/gharunners/ai-perf-cache" + export MODEL_PATH=$MODEL if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then @@ -19,6 +28,15 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH=/scratch/models/DeepSeek-R1-0528 export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Use the node-local /scratch SSD for the 806 GB DSv4-Pro + # checkpoint. Faster than the Vast NFS path, but this dir only + # exists on compute nodes — the GHA runner pod's view does NOT + # have /scratch/models, so srtctl preflight (which stats the path + # from the runner pod) may fail with "Model alias resolved to + # /scratch/models/DeepSeek-V4-Pro, but that path is unavailable." + # If that happens, the next step is either to (a) patch srt-slurm + # to add a skip_model_preflight recipe field, or (b) stub a + # symlink on the runner pod that points at the NFS copy. 
export MODEL_PATH=/scratch/models/DeepSeek-V4-Pro export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then @@ -31,8 +49,14 @@ fi NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/home/sa-shared/gharunners/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +# Squash files live on the Vast NFS storage; use the /data/ mount +# (not /home/sa-shared/) — both are the same backing storage but the +# /home/sa-shared/ mount has a chronic ELOOP / "Too many levels of +# symbolic links" bug from workflow worker NFS sessions on lockfiles +# AND data files. /data/ has a separate NFS client cache that isn't +# poisoned. See feedback_gb300_nfs_eloop_workaround for diagnosis. +SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" # Run the import on a compute node via srun, not on the login node: # the login node is x86_64 while the compute nodes are aarch64, so the @@ -65,7 +89,40 @@ RUN_KEY=$(printf "%s" "${RESULT_FILENAME:-${RUNNER_NAME:-gb300-nv}}" | sha1sum | SRT_REPO_DIR="${GITHUB_WORKSPACE}/srt-slurm-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}" rm -rf "$SRT_REPO_DIR" -if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then +if [[ "$IS_AGENTIC" == "1" ]]; then + # Agentic multi-node uses cquil11/srt-slurm-nv@cam/no-preflight-flag, + # a thin branch off NVIDIA/srt-slurm@127597c that adds one CLI flag + # (`srtctl apply --no-preflight`) — needed because: + # + # - We want MODEL_PATH=/scratch/models/DeepSeek-V4-Pro (node-local + # NVMe, fast) instead of the NFS path under /data/home/sa-shared. + # - /scratch only exists on GB300 compute nodes; it is NOT mounted + # on the GHA runner pod that invokes srtctl. 
+ # - srtctl's pre-submit model check (_preflight_model in + # src/srtctl/core/validation.py) does a Path.is_dir() in-process + # on the invoking node — so it fails before sbatch is ever + # called with "Model alias 'X' resolved to '/scratch/...', + # but that path is unavailable". + # - --no-preflight skips just the optional Python-level FS check. + # vLLM still fails loudly at runtime if the path is genuinely + # missing on the compute node. + # + # All other upstream schema features we need are inherited from + # NVIDIA HEAD: + # - BenchmarkType.CUSTOM + benchmark.command + benchmark.env + # (hook that hands off to benchmarks/multi_node/agentic_srt.sh) + # - DynamoConfig.wheel (so vllm recipes can pin the ai-dynamo wheel) + # - sbatch_directives / srun_options (top-level recipe fields) + git clone https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + # 854b3fd = --no-preflight flag + # 6e34b8b = benchmark_stage propagates srun_options (needed for + # container-remap-root to reach the agentic_srt.sh srun) + git checkout 6e34b8b83229634d732e41a4e2d6595f46ef60b5 + mkdir -p recipes/vllm/deepseek-v4/agentic + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic" \ + recipes/vllm/deepseek-v4/agentic +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout aflowers/gb200-dsv4-recipes @@ -95,7 +152,11 @@ export PATH="$UV_INSTALL_DIR:$PATH" VENV_DIR="${GITHUB_WORKSPACE}/.venv-srt-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}" rm -rf "$VENV_DIR" -uv venv "$VENV_DIR" +# --seed installs pip+setuptools+wheel into the venv. Without it, the +# upstream prefetch-ai-dynamo-wheel.sh script (called by srtctl when a +# recipe has dynamo.wheel set) fails with "No module named pip" because +# uv venv defaults to no-pip. 
+uv venv --seed "$VENV_DIR" source "$VENV_DIR/bin/activate" uv pip install -e . @@ -124,6 +185,13 @@ network_interface: "" # Path to srtctl repo root (where the configs live) srtctl_root: "${SRTCTL_ROOT}" +# Cluster-level bind mounts applied to every worker container +# (see srtctl/core/runtime.py — get_srtslurm_setting("default_mounts")). +# Used here for aiperf's persistent mmap cache so the dataset isn't +# re-tokenized + re-written every job. +default_mounts: + "${AIPERF_MMAP_CACHE_HOST_PATH}": "/aiperf_mmap_cache" + # Model path aliases model_paths: "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}" @@ -155,7 +223,18 @@ fi # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" -SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +# --no-preflight is only safe on the agentic path, where the recipe +# resolves model.path to /scratch (compute-node-only NVMe) and the +# srtctl process running on the GHA runner pod can't see it. Fixed- +# seq-len recipes still resolve model.path to an NFS-visible location +# where the precheck is a useful sanity guard, so keep enforcement on +# for them. +PREFLIGHT_FLAG="" +if [[ "$IS_AGENTIC" == "1" ]]; then + PREFLIGHT_FLAG="--no-preflight" +fi + +SRTCTL_OUTPUT=$(srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) echo "$SRTCTL_OUTPUT" JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') @@ -174,6 +253,26 @@ echo "Extracted JOB_ID: $JOB_ID" LOGS_DIR="outputs/$JOB_ID/logs" LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" +# Snapshot worker logs on any exit path — normal completion, error, +# SIGTERM (gh run cancel sends this to the launcher), even SIGKILL of +# our parent. 
Without this trap, the cancel-time tar lives only in the +# main flow below (after `wait $POLL_PID`), so a manual `gh run cancel` +# during the tail wait skips it entirely and the +# `Upload server logs` workflow step finds nothing to upload. +# The main flow at the bottom of this script no longer runs cp/tar +# itself: it only logs a notice, and the LOGS copy and tarball come +# from this function (called before cleanup and again via the trap). +_snapshot_server_logs() { + if [ -n "${LOGS_DIR:-}" ] && [ -d "$LOGS_DIR" ] && [ -n "${GITHUB_WORKSPACE:-}" ]; then + # Copy + tar are independent best-effort; an in-flight write + # from a worker .out file at SIGTERM time would otherwise abort + # the whole script before either succeeds. + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" 2>/dev/null || true + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . 2>/dev/null || true + fi +} +trap _snapshot_server_logs EXIT + # Wait for log file to appear (also check job is still alive) while ! ls "$LOG_FILE" &>/dev/null; do if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then @@ -207,8 +306,9 @@ echo "Collecting results..." if [ -d "$LOGS_DIR" ]; then echo "Found logs directory: $LOGS_DIR" - cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" - tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . + # Tarball + LOGS copy are produced by the EXIT trap defined near + # JOB_ID extraction (so cancel paths also get them); just log here. + echo "multinode_server_logs.tar.gz will be (re)produced on script EXIT." else echo "Warning: Logs directory not found at $LOGS_DIR" fi @@ -281,6 +381,12 @@ if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then fi fi +# Snapshot logs to GITHUB_WORKSPACE BEFORE cleanup, so the EXIT trap's +# `[ -d "$LOGS_DIR" ]` guard isn't already false by the time it fires +# (it runs AFTER the rm below, since EXIT traps are last-thing-before-exit).
+# Without this inline call, R25 lost both 1p6d shards' logs. +_snapshot_server_logs + # Clean up srt-slurm outputs to prevent NFS silly-rename lock files # from blocking the next job's checkout on this runner echo "Cleaning up srt-slurm outputs..." diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index b4f594d51..988addedd 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -280,6 +280,7 @@ EOF else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" + AIPERF_MMAP_CACHE_HOST_PATH="/mnt/nfs/sa-shared/gharunners/ai-perf-cache" SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" @@ -306,10 +307,10 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \ --no-container-mount-home \ --container-workdir=/workspace/ \ - --no-container-entrypoint --export=ALL,PORT=8888 \ + --no-container-entrypoint --export=ALL,PORT=8888,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \ bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_h100.sh scancel $JOB_ID diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 1486c4fa6..684721497 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" +export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/gharunner/ai-perf-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" @@ -40,10 +41,10 @@ fi srun --jobid=$JOB_ID \ --container-image=$CONTAINER_IMAGE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ 
+--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \ --container-mount-home \ --container-workdir=/workspace/ \ ---no-container-entrypoint --export=ALL \ +--no-container-entrypoint --export=ALL,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \ bash benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh rmdir $SAGEMAKER_SHM_PATH diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index b701d65a6..572056956 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -315,6 +315,7 @@ else --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \ --no-container-mount-home \ + --container-remap-root \ --container-workdir=$CONTAINER_MOUNT_DIR/ \ --no-container-entrypoint --export=ALL,PORT=8888,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \ bash $BENCH_SCRIPT diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 158c30792..23d8d816b 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -1,6 +1,7 @@ #!/usr/bin/bash export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" +export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/data/gharunners/ai-perf-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" @@ -13,10 +14,10 @@ set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,$AIPERF_MMAP_CACHE_HOST_PATH:/aiperf_mmap_cache \ --container-remap-root \ --container-writable \ --container-mount-home \ --container-workdir=/workspace/ \ 
---no-container-entrypoint --export=ALL \ +--no-container-entrypoint --export=ALL,AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache \ bash benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index a8033847e..7a54e3848 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -59,7 +59,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else - BENCHMARK_SUBDIR="single_node" + BENCHMARK_SUBDIR="single_node/fixed_seq_len" fi JOB_ID=$(bash "benchmarks/${BENCHMARK_SUBDIR}/${SCRIPT_NAME}") @@ -223,8 +223,8 @@ else fi SCRIPT_BASE="${EXP_NAME%%_*}_${PRECISION}_mi355x" - SCRIPT_FW="benchmarks/single_node/${SCENARIO_SUBDIR:-}${SCRIPT_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" - SCRIPT_FALLBACK="benchmarks/single_node/${SCENARIO_SUBDIR:-}${SCRIPT_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" + SCRIPT_FW="benchmarks/single_node/${SCENARIO_SUBDIR:-fixed_seq_len/}${SCRIPT_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" + SCRIPT_FALLBACK="benchmarks/single_node/${SCENARIO_SUBDIR:-fixed_seq_len/}${SCRIPT_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" if [[ -f "$SCRIPT_FW" ]]; then BENCHMARK_SCRIPT="$SCRIPT_FW" else diff --git a/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py b/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py index aa4b639ca..78925636f 100644 --- a/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py +++ b/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -"""Analyze ISL/OSL/turn distributions from AIPerf benchmark results. +"""Analyze ISL/OSL distributions from AIPerf benchmark results. -Reads profile_export.jsonl and produces summary stats + distribution plots -to verify the benchmark workload matches the intended Qwen trace profile. 
+Reads profile_export.jsonl and produces mean/median/p75/p90/p95 summary stats +plus all-requests ISL and OSL histograms. Usage: python analyze_benchmark_distributions.py path/to/aiperf_artifacts/ -o output_dir/ @@ -12,8 +12,6 @@ import argparse import json -import math -from collections import Counter, defaultdict from pathlib import Path @@ -29,331 +27,124 @@ def load_records(artifacts_dir: Path) -> list[dict]: return records -def load_trace_replay_records(trace_replay_dir: Path) -> list[dict]: - """Load per-request records from trace_replay detailed_results.csv. +def _stats(values: list[int]) -> dict[str, float]: + sv = sorted(values) + n = len(sv) + return { + "n": n, + "mean": sum(sv) / n, + "median": sv[n // 2], + "p75": sv[int(n * 0.75)], + "p90": sv[int(n * 0.90)], + "p95": sv[int(n * 0.95)], + } - Converts to the same format as AIPerf JSONL records so the analyze() - function can process both formats identically. - """ - import csv - import sys - csv.field_size_limit(sys.maxsize) - csv_path = trace_replay_dir / "detailed_results.csv" - records = [] - with open(csv_path) as f: - reader = csv.DictReader(f) - for row in reader: - if row.get("success") != "True": - continue - records.append({ - "metadata": { - "x_correlation_id": row["trace_id"], - "conversation_id": row["trace_id"], - "turn_index": int(row["request_idx"]), - "benchmark_phase": "profiling", - }, - "metrics": { - "input_sequence_length": {"value": int(row["input_tokens"])}, - "output_sequence_length": {"value": int(row["output_tokens_actual"])}, - }, - }) - return records +def _fmt(s: dict[str, float]) -> str: + return ( + f" n={s['n']:,} mean={s['mean']:,.0f} median={s['median']:,} " + f"p75={s['p75']:,} p90={s['p90']:,} p95={s['p95']:,}" + ) def analyze(records: list[dict], output_dir: Path) -> None: - """Run distribution analysis and save results.""" output_dir.mkdir(parents=True, exist_ok=True) - # Group by conversation - convos: dict[str, list[dict]] = defaultdict(list) + all_isl: 
list[int] = [] + all_osl: list[int] = [] for r in records: metrics = r.get("metrics", {}) if "input_sequence_length" not in metrics or "output_sequence_length" not in metrics: continue - # Use x_correlation_id (unique per session) not conversation_id (template, reused) - cid = r["metadata"].get("x_correlation_id") or r["metadata"]["conversation_id"] - ti = r["metadata"]["turn_index"] - isl = metrics["input_sequence_length"]["value"] - osl = metrics["output_sequence_length"]["value"] - convos[cid].append({"turn": ti, "isl": isl, "osl": osl}) - - # Sort turns within each conversation - for v in convos.values(): - v.sort(key=lambda x: x["turn"]) - - # Turn count distribution - turn_counts = Counter(len(v) for v in convos.values()) - total_convos = len(convos) - total_requests = len(records) - - lines = [] - lines.append("=" * 70) - lines.append("BENCHMARK WORKLOAD DISTRIBUTION ANALYSIS") - lines.append("=" * 70) - lines.append(f"Total conversations: {total_convos:,}") - lines.append(f"Total requests: {total_requests:,}") - lines.append(f"Avg turns/conv: {total_requests / total_convos:.2f}") - lines.append("") - - lines.append("TURN COUNT DISTRIBUTION:") - lines.append(f" {'Turns':>5s} {'Count':>6s} {'Pct':>6s} Target") - target = {1: 59, 2: 20, 3: 10, 4: 5, 5: 3, 6: 2, 7: 1} - for k in sorted(turn_counts.keys()): - pct = 100 * turn_counts[k] / total_convos - tgt = f"{target.get(k, 0):.0f}%" if k in target else "" - lines.append(f" {k:5d} {turn_counts[k]:6,} {pct:5.1f}% {tgt}") - - # ISL/OSL by turn index - lines.append("") - lines.append("ISL BY TURN INDEX:") - lines.append( - f" {'Turn':>4s} {'N':>6s} {'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" - ) - max_turn = max(t["turn"] for v in convos.values() for t in v) - for ti in range(max_turn + 1): - vals = sorted(t["isl"] for v in convos.values() for t in v if t["turn"] == ti) - if not vals: - continue - n = len(vals) - mean = sum(vals) / n - std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) - 
median = vals[n // 2] - p5 = vals[int(n * 0.05)] - p95 = vals[int(n * 0.95)] - lines.append( - f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} {p95:8.0f}" - ) - - lines.append("") - lines.append("OSL BY TURN INDEX:") - lines.append( - f" {'Turn':>4s} {'N':>6s} {'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" - ) - for ti in range(max_turn + 1): - vals = sorted(t["osl"] for v in convos.values() for t in v if t["turn"] == ti) - if not vals: - continue - n = len(vals) - mean = sum(vals) / n - std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) - median = vals[n // 2] - p5 = vals[int(n * 0.05)] - p95 = vals[int(n * 0.95)] - lines.append( - f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} {p95:8.0f}" - ) - - # Overall ISL/OSL stats - all_isl = sorted(t["isl"] for v in convos.values() for t in v) - all_osl = sorted(t["osl"] for v in convos.values() for t in v) - n = len(all_isl) - isl_mean = sum(all_isl) / n - osl_mean = sum(all_osl) / n - lines.append("") - lines.append("ALL REQUESTS ISL:") - lines.append( - f" n={n:,} mean={isl_mean:.0f} median={all_isl[n//2]} " - f"p5={all_isl[int(n*0.05)]} p95={all_isl[int(n*0.95)]}" - ) - lines.append("ALL REQUESTS OSL:") - lines.append( - f" n={n:,} mean={osl_mean:.0f} median={all_osl[n//2]} " - f"p5={all_osl[int(n*0.05)]} p95={all_osl[int(n*0.95)]}" - ) - - # Per-conversation stats - conv_max_isl = sorted(max(t["isl"] for t in v) for v in convos.values()) - conv_total_osl = sorted(sum(t["osl"] for t in v) for v in convos.values()) - nc = len(conv_max_isl) - lines.append("") - lines.append("PER-CONVERSATION MAX ISL (final context size):") - lines.append( - f" n={nc:,} mean={sum(conv_max_isl)/nc:.0f} median={conv_max_isl[nc//2]} " - f"p5={conv_max_isl[int(nc*0.05)]} p95={conv_max_isl[int(nc*0.95)]}" - ) - lines.append("PER-CONVERSATION TOTAL OSL:") - lines.append( - f" n={nc:,} mean={sum(conv_total_osl)/nc:.0f} median={conv_total_osl[nc//2]} " - f"p5={conv_total_osl[int(nc*0.05)]} 
p95={conv_total_osl[int(nc*0.95)]}" - ) + all_isl.append(metrics["input_sequence_length"]["value"]) + all_osl.append(metrics["output_sequence_length"]["value"]) - # ISL context growth (shows accumulation across turns) - lines.append("") - lines.append("ISL CONTEXT GROWTH (sample multi-turn conversations):") - multi = [(cid, v) for cid, v in convos.items() if len(v) >= 3][:10] - for cid, turns in multi: - isls = " -> ".join(str(t["isl"]) for t in turns) - lines.append(f" {cid}: {isls}") - - lines.append("=" * 70) + if not all_isl: + print("No records with ISL/OSL metrics found.") + return + isl_stats = _stats(all_isl) + osl_stats = _stats(all_osl) + + lines = [ + "=" * 70, + "BENCHMARK WORKLOAD DISTRIBUTION ANALYSIS", + "=" * 70, + f"Total requests: {len(records):,}", + "", + "ALL REQUESTS ISL:", + _fmt(isl_stats), + "ALL REQUESTS OSL:", + _fmt(osl_stats), + "=" * 70, + ] summary_text = "\n".join(lines) print(summary_text) - - # Save summary (output_dir / "workload_distribution_summary.txt").write_text(summary_text) - # Try to generate plots (matplotlib may not be available) try: - _generate_plots(convos, records, output_dir) + _generate_plots(all_isl, all_osl, isl_stats, osl_stats, output_dir) except ImportError: print("matplotlib not available, skipping plots") def _generate_plots( - convos: dict[str, list[dict]], records: list[dict], output_dir: Path + all_isl: list[int], + all_osl: list[int], + isl_stats: dict[str, float], + osl_stats: dict[str, float], + output_dir: Path, ) -> None: - """Generate distribution plots.""" import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt - fig, axes = plt.subplots(3, 3, figsize=(18, 15)) + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) fig.suptitle("Benchmark Workload Distribution Analysis", fontsize=14) - # (0,0) Turn count distribution - ax = axes[0, 0] - turn_counts = Counter(len(v) for v in convos.values()) - turns = sorted(turn_counts.keys()) - counts = [turn_counts[t] for t in turns] - total = 
sum(counts) - bars = ax.bar(turns, [100 * c / total for c in counts], edgecolor="black", alpha=0.7) - for bar, t in zip(bars, turns): - ax.text( - bar.get_x() + bar.get_width() / 2, - bar.get_height(), - f"{bar.get_height():.0f}%", - ha="center", - va="bottom", - fontsize=8, - ) - ax.set_xlabel("Number of Turns") - ax.set_ylabel("% of Conversations") - ax.set_title(f"Turn Count Distribution (n={total:,})") - ax.grid(True, alpha=0.3, axis="y") - - # (0,1) All requests ISL histogram - ax = axes[0, 1] - all_isl = [t["isl"] for v in convos.values() for t in v] - clip = int(sorted(all_isl)[int(len(all_isl) * 0.99)] * 1.2) - ax.hist([v for v in all_isl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="steelblue") - all_isl_sorted = sorted(all_isl) - median_isl = all_isl_sorted[len(all_isl) // 2] - mean_isl = sum(all_isl) / len(all_isl) - ax.axvline(median_isl, color="red", linestyle="--", label=f"Median: {median_isl:,}") - ax.axvline(mean_isl, color="orange", linestyle="--", label=f"Mean: {mean_isl:,.0f}") - ax.set_xlabel("Input Sequence Length") - ax.set_ylabel("Count") - ax.set_title(f"All Requests ISL (n={len(all_isl):,})") - ax.legend(fontsize=8) - ax.grid(True, alpha=0.3, axis="y") - - # (0,2) All requests OSL histogram - ax = axes[0, 2] - all_osl = [t["osl"] for v in convos.values() for t in v] - clip = min(3000, int(sorted(all_osl)[int(len(all_osl) * 0.99)] * 1.2)) - ax.hist([v for v in all_osl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="coral") - all_osl_sorted = sorted(all_osl) - median_osl = all_osl_sorted[len(all_osl) // 2] - mean_osl = sum(all_osl) / len(all_osl) - ax.axvline(median_osl, color="red", linestyle="--", label=f"Median: {median_osl:,}") - ax.axvline(mean_osl, color="orange", linestyle="--", label=f"Mean: {mean_osl:,.0f}") - ax.set_xlabel("Output Sequence Length") - ax.set_ylabel("Count") - ax.set_title(f"All Requests OSL (n={len(all_osl):,})") - ax.legend(fontsize=8) - ax.grid(True, alpha=0.3, axis="y") - - # (1,0) 
Average new prefill tokens by turn index (ISL delta per turn) - ax = axes[1, 0] - # Collect deltas grouped by turn index - deltas_by_turn: dict[int, list[int]] = defaultdict(list) - for v in convos.values(): - for i, t in enumerate(v): - if i == 0: - deltas_by_turn[t["turn"]].append(t["isl"]) - else: - deltas_by_turn[t["turn"]].append(max(0, t["isl"] - v[i - 1]["isl"])) - if deltas_by_turn: - turn_indices = sorted(deltas_by_turn.keys()) - means = [sum(deltas_by_turn[ti]) / len(deltas_by_turn[ti]) for ti in turn_indices] - ns = [len(deltas_by_turn[ti]) for ti in turn_indices] - ax.plot(turn_indices, means, marker="o", markersize=3, linewidth=1, color="mediumseagreen") - ax.fill_between(turn_indices, 0, means, alpha=0.2, color="mediumseagreen") - # Label first and last points - if len(turn_indices) > 0: - ax.annotate(f"{means[0]:,.0f}", (turn_indices[0], means[0]), fontsize=7, ha="left", va="bottom") - if len(turn_indices) > 1: - ax.annotate(f"{means[-1]:,.0f}\n(n={ns[-1]})", (turn_indices[-1], means[-1]), fontsize=7, ha="right", va="bottom") - # Overall mean/median across all deltas - all_deltas = [d for dlist in deltas_by_turn.values() for d in dlist] - if all_deltas: - overall_mean = sum(all_deltas) / len(all_deltas) - all_deltas_sorted = sorted(all_deltas) - overall_median = all_deltas_sorted[len(all_deltas) // 2] - ax.axhline(overall_mean, color="orange", linestyle="--", linewidth=1, label=f"Mean: {overall_mean:,.0f}") - ax.axhline(overall_median, color="red", linestyle="--", linewidth=1, label=f"Median: {overall_median:,}") - ax.legend(fontsize=7) - ax.set_xlabel("Turn Index") - ax.set_ylabel("Mean New Prefill Tokens") - ax.set_title("Avg New Prefill Tokens by Turn") - ax.grid(True, alpha=0.3) - - # (1,1) ISL vs OSL scatter - ax = axes[1, 1] - ax.scatter(all_isl, all_osl, alpha=0.15, s=3, c="purple") - ax.set_xlabel("ISL (tokens)") - ax.set_ylabel("OSL (tokens)") - ax.set_title("ISL vs OSL (all requests)") - ax.grid(True, alpha=0.3) - - # (1,2) Per-conversation 
max ISL vs num turns scatter - ax = axes[1, 2] - conv_turns = [len(v) for v in convos.values()] - conv_max_isl_list = [max(t["isl"] for t in v) for v in convos.values()] - ax.scatter(conv_turns, conv_max_isl_list, alpha=0.3, s=8, c="steelblue") - ax.set_xlabel("Number of Turns") - ax.set_ylabel("Max ISL (tokens)") - ax.set_title("Final Context Size vs Turn Count") - ax.grid(True, alpha=0.3) - - # (2,0) Per-conversation max ISL (final context size per conversation) - ax = axes[2, 0] - conv_max_isl = [max(t["isl"] for t in v) for v in convos.values()] - clip = int(sorted(conv_max_isl)[int(len(conv_max_isl) * 0.99)] * 1.2) - ax.hist([v for v in conv_max_isl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="steelblue") - conv_max_isl_sorted = sorted(conv_max_isl) - median_max = conv_max_isl_sorted[len(conv_max_isl) // 2] - mean_max = sum(conv_max_isl) / len(conv_max_isl) - ax.axvline(median_max, color="red", linestyle="--", label=f"Median: {median_max:,}") - ax.axvline(mean_max, color="orange", linestyle="--", label=f"Mean: {mean_max:,.0f}") - ax.set_xlabel("Max ISL per Conversation (tokens)") + # ISL histogram + ax = axes[0] + isl_sorted = sorted(all_isl) + clip = int(isl_sorted[int(len(isl_sorted) * 0.99)] * 1.2) + ax.hist( + [v for v in all_isl if v <= clip], + bins=80, + edgecolor="black", + alpha=0.7, + color="steelblue", + ) + ax.axvline(isl_stats["median"], color="red", linestyle="--", label=f"Median: {isl_stats['median']:,}") + ax.axvline(isl_stats["mean"], color="orange", linestyle="--", label=f"Mean: {isl_stats['mean']:,.0f}") + ax.axvline(isl_stats["p90"], color="green", linestyle=":", label=f"P90: {isl_stats['p90']:,}") + ax.axvline(isl_stats["p95"], color="purple", linestyle=":", label=f"P95: {isl_stats['p95']:,}") + ax.set_xlabel("Input Sequence Length (tokens)") ax.set_ylabel("Count") - ax.set_title(f"Per-Conversation Final Context Size (n={len(conv_max_isl):,})") + ax.set_title(f"All Requests ISL (n={isl_stats['n']:,})") ax.legend(fontsize=8) 
ax.grid(True, alpha=0.3, axis="y") - # (3,1) Per-conversation total OSL (sum of all output tokens across turns) - ax = axes[2, 1] - conv_total_osl = [sum(t["osl"] for t in v) for v in convos.values()] - clip = int(sorted(conv_total_osl)[int(len(conv_total_osl) * 0.99)] * 1.2) - ax.hist([v for v in conv_total_osl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="coral") - conv_total_osl_sorted = sorted(conv_total_osl) - median_tosl = conv_total_osl_sorted[len(conv_total_osl) // 2] - mean_tosl = sum(conv_total_osl) / len(conv_total_osl) - ax.axvline(median_tosl, color="red", linestyle="--", label=f"Median: {median_tosl:,}") - ax.axvline(mean_tosl, color="orange", linestyle="--", label=f"Mean: {mean_tosl:,.0f}") - ax.set_xlabel("Total OSL per Conversation (tokens)") + # OSL histogram + ax = axes[1] + osl_sorted = sorted(all_osl) + clip = min(3000, int(osl_sorted[int(len(osl_sorted) * 0.99)] * 1.2)) + ax.hist( + [v for v in all_osl if v <= clip], + bins=80, + edgecolor="black", + alpha=0.7, + color="coral", + ) + ax.axvline(osl_stats["median"], color="red", linestyle="--", label=f"Median: {osl_stats['median']:,}") + ax.axvline(osl_stats["mean"], color="orange", linestyle="--", label=f"Mean: {osl_stats['mean']:,.0f}") + ax.axvline(osl_stats["p90"], color="green", linestyle=":", label=f"P90: {osl_stats['p90']:,}") + ax.axvline(osl_stats["p95"], color="purple", linestyle=":", label=f"P95: {osl_stats['p95']:,}") + ax.set_xlabel("Output Sequence Length (tokens)") ax.set_ylabel("Count") - ax.set_title(f"Per-Conversation Total Output Tokens (n={len(conv_total_osl):,})") + ax.set_title(f"All Requests OSL (n={osl_stats['n']:,})") ax.legend(fontsize=8) ax.grid(True, alpha=0.3, axis="y") - # (2,2) is empty — already placed scatter at (1,2) - axes[2, 2].axis("off") - plt.tight_layout() out = output_dir / "workload_distribution_plots.png" plt.savefig(out, dpi=150, bbox_inches="tight") @@ -362,32 +153,27 @@ def _generate_plots( def main() -> None: - parser = 
argparse.ArgumentParser( - description="Analyze benchmark workload distributions" - ) - parser.add_argument("artifacts_dir", help="Path to aiperf_artifacts/ or trace_replay/ directory") + parser = argparse.ArgumentParser(description="Analyze benchmark workload distributions") + parser.add_argument("artifacts_dir", help="Path to aiperf_artifacts/ directory") parser.add_argument( - "-o", "--output", default=None, help="Output directory (default: same as artifacts_dir)" + "-o", + "--output", + default=None, + help="Output directory (default: same as artifacts_dir)", ) args = parser.parse_args() artifacts_dir = Path(args.artifacts_dir) output_dir = Path(args.output) if args.output else artifacts_dir - # Auto-detect format - trace_replay_csv = artifacts_dir / "detailed_results.csv" aiperf_jsonl = artifacts_dir / "profile_export.jsonl" - - if trace_replay_csv.exists(): - records = load_trace_replay_records(artifacts_dir) - print(f"Loaded {len(records):,} records from {artifacts_dir} (trace replay)") - elif aiperf_jsonl.exists(): - records = load_records(artifacts_dir) - print(f"Loaded {len(records):,} records from {artifacts_dir} (AIPerf)") - else: - print(f"No recognized data files in {artifacts_dir}") + if not aiperf_jsonl.exists(): + print(f"No profile_export.jsonl found in {artifacts_dir}") return + records = load_records(artifacts_dir) + print(f"Loaded {len(records):,} records from {artifacts_dir}") + analyze(records, output_dir) diff --git a/utils/agentic-benchmark/scripts/collect_sweep_results.py b/utils/agentic-benchmark/scripts/collect_sweep_results.py index a7c6111ad..8206385b3 100644 --- a/utils/agentic-benchmark/scripts/collect_sweep_results.py +++ b/utils/agentic-benchmark/scripts/collect_sweep_results.py @@ -100,51 +100,12 @@ def scalar_val(metric_name): } -def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: - """Load per-request metrics from trace_replay detailed_results.csv.""" - df = pd.read_csv(csv_path) - if len(df) == 0: - return None - 
- # Filter to successful requests only - df = df[df["success"] == True].copy() - if len(df) == 0: - return None - - # Convert to the same schema as _load_aiperf_jsonl - latency_s = df["request_complete_time"] - df["request_start_time"] - return pd.DataFrame({ - "start_time_ms": df["request_start_time"] * 1000, - "ttft_ms": df["ttft"] * 1000, - "tpot_ms": df["itl"] * 1000, - "latency_ms": latency_s * 1000, - "input_num_tokens": df["input_tokens"], - "output_num_tokens": df["output_tokens_actual"], - }) - - def load_experiment(exp_dir: Path) -> dict | None: """Load metrics from a single experiment artifact directory.""" client_csv = exp_dir / "metrics_client_metrics.csv" server_csv = exp_dir / "metrics_server_metrics.csv" - # No more status.txt: an experiment is considered SUCCESS iff its - # trace_replay/detailed_results.csv has at least one successful row. - # Failed / missing jobs show up as FAILED in the summary. - trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" - status = "FAILED" - if trace_replay_csv.exists(): - try: - import csv as _csv - import sys as _sys - _csv.field_size_limit(_sys.maxsize) - with open(trace_replay_csv) as _f: - if any(r.get('success') == 'True' for r in _csv.DictReader(_f)): - status = "SUCCESS" - except Exception: - pass - - # Check for aiperf summary CSV (preferred) or per-record JSONL (fallback) + # An experiment is considered SUCCESS iff aiperf produced a summary CSV. 
aiperf_summary_csv = None aiperf_artifacts = exp_dir / "aiperf_artifacts" if aiperf_artifacts.exists(): @@ -152,10 +113,9 @@ def load_experiment(exp_dir: Path) -> dict | None: if candidate.exists(): aiperf_summary_csv = candidate - # Check for trace replay output - trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" + status = "SUCCESS" if aiperf_summary_csv is not None else "FAILED" - if not client_csv.exists() and aiperf_summary_csv is None and not trace_replay_csv.exists(): + if not client_csv.exists() and aiperf_summary_csv is None: return None # Parse experiment name from directory. @@ -165,7 +125,10 @@ def load_experiment(exp_dir: Path) -> dict | None: # agentic_{model}_tp{N}_conc{M}_offload{mode}_{extra...} import re name = exp_dir.name - match = re.search(r'tp(\d+)_conc(\d+)_offload(none|cpu|ssd)', name) + match = re.search( + r'tp(\d+)_conc(\d+)_offload(none|cpu|ssd|lmcache-mp|lmcache|hicache)', + name, + ) if not match: print(f"Warning: cannot parse experiment name '{exp_dir.name}', skipping") return None @@ -186,7 +149,7 @@ def load_experiment(exp_dir: Path) -> dict | None: return result try: - # Determine data source: aiperf summary CSV (preferred), custom client CSV, or trace replay CSV + # Determine data source: aiperf summary CSV (preferred) or custom client CSV if aiperf_summary_csv is not None: aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) if aiperf_metrics is None: @@ -215,48 +178,6 @@ def load_experiment(exp_dir: Path) -> dict | None: if total_time_sec <= 0: total_time_sec = df["latency_ms"].sum() / 1000 - num_requests = len(df) - result.update({ - "num_requests": num_requests, - "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, - "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, - "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, - "total_throughput_tps": (df["input_num_tokens"].sum() 
+ df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, - "mean_ttft_ms": df["ttft_ms"].mean(), - "p50_ttft_ms": df["ttft_ms"].median(), - "p90_ttft_ms": df["ttft_ms"].quantile(0.9), - "p99_ttft_ms": df["ttft_ms"].quantile(0.99), - "mean_tpot_ms": df["tpot_ms"].mean(), - "p50_tpot_ms": df["tpot_ms"].median(), - "p90_tpot_ms": df["tpot_ms"].quantile(0.9), - "p99_tpot_ms": df["tpot_ms"].quantile(0.99), - "mean_latency_ms": df["latency_ms"].mean(), - "p50_latency_ms": df["latency_ms"].median(), - "p90_latency_ms": df["latency_ms"].quantile(0.9), - "p99_latency_ms": df["latency_ms"].quantile(0.99), - }) - elif trace_replay_csv.exists(): - df = _load_trace_replay_csv(trace_replay_csv) - if df is None or len(df) == 0: - return result - - metadata_file = exp_dir / "benchmark_metadata.json" - total_time_sec = None - if metadata_file.exists(): - try: - with open(metadata_file) as f: - metadata = json.load(f) - total_time_sec = metadata.get("benchmark_runtime_sec") - except Exception: - pass - - if not total_time_sec or total_time_sec <= 0: - first_start_ms = df["start_time_ms"].min() - last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() - total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 - if total_time_sec <= 0: - total_time_sec = df["latency_ms"].sum() / 1000 - num_requests = len(df) result.update({ "num_requests": num_requests, diff --git a/utils/aiperf b/utils/aiperf index 7d880a1ef..8473e1545 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 7d880a1ef1ef3d045ca8f8d5c95e142b5bcdf6c2 +Subproject commit 8473e1545476c1d91932aa2402b642b416a23df6 diff --git a/utils/generate_aiperf_plots.py b/utils/generate_aiperf_plots.py new file mode 100755 index 000000000..baefa7db2 --- /dev/null +++ b/utils/generate_aiperf_plots.py @@ -0,0 +1,780 @@ +#!/usr/bin/env python3 +"""Generate metrics_plots.png matching kv-cache-tester's 6x2 layout. 
+ +Reads aiperf's per-record JSONL + server-metrics JSON (with timeslices +enabled via ``--slice-duration``) and emits a PNG with the same panels +the legacy kv-cache-tester pipeline produced. The launchers feed this +$RESULT_DIR after each run so downstream tooling and humans see the +same visual. + +Layout (6 rows x 2 cols, suptitle "vLLM Server Metrics During Benchmark"): + (0,0) KV Cache Utilization Over Time (HBM + External) + (0,1) Request Queue Depth (running / waiting / total) + (1,0) Prefix Cache Hit Rate Per Interval (GPU / External / Combined) + (1,1) Throughput (Total & Decode) with running average + (2,0) KV Offload Transfer Rate (GPU↔CPU MB/s) + (2,1) Cumulative Prefill Token Source Breakdown (stackplot) + (3,0) KV Offload GPU→CPU (Cumulative GB) + (3,1) KV Offload CPU→GPU (Cumulative GB) + (4,0) TTFT vs Time (scatter + rolling avg) + (4,1) Request Latency vs Time (scatter + rolling avg) + (5,0) Interactivity 1/TPOT vs Time (scatter + rolling avg) + (5,1) Preemptions Over Time (rate + cumulative) + +Time-series data comes from server_metrics_export.json's per-series +``timeslices`` array (populated when ``--slice-duration`` is set on the +aiperf CLI). Per-record TTFT / Latency / ITL come from +profile_export.jsonl. Panels with no data still render so the output +shape is constant across run configs. 
# ---- Loaders --------------------------------------------------------------


def load_jsonl_records(path: Path) -> list[dict]:
    """Read aiperf's per-record JSONL export, keeping only usable records.

    Skipped silently: blank lines, lines that are not valid JSON, lines whose
    JSON value is not an object, and records carrying a truthy ``error`` field.

    Args:
        path: Path to the JSONL file (one JSON object per line).

    Returns:
        The surviving records in file order.
    """
    records: list[dict] = []
    with open(path) as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                continue
            try:
                obj = json.loads(raw)
            except json.JSONDecodeError:
                continue
            # A syntactically valid non-object line (e.g. ``[1, 2]``) has no
            # ``.get`` and would otherwise abort the whole load.
            if not isinstance(obj, dict) or obj.get("error"):
                continue
            records.append(obj)
    return records


def load_server_metrics(path: Path) -> dict:
    """Load the server-metrics JSON document.

    Returns an empty dict when the file does not exist or when its top-level
    JSON value is not an object, so callers can always use ``.get``.
    """
    if not path.exists():
        return {}
    with open(path) as f:
        data = json.load(f)
    return data if isinstance(data, dict) else {}


def metric_value(record: dict, key: str) -> float | None:
    """Return ``record["metrics"][key]`` as a float, or None if unavailable.

    The metric may be stored either as a mapping with a ``value`` field or as
    a bare scalar. Missing, null and non-numeric values all yield None.
    """
    m = (record.get("metrics") or {}).get(key)
    if m is None:
        return None
    v = m.get("value") if isinstance(m, dict) else m
    if v is None:
        return None
    try:
        return float(v)
    except (TypeError, ValueError):
        return None


# ---- Server-metrics helpers ----------------------------------------------


def first_update_ns(server_metrics: dict) -> int | None:
    """Return the smallest ``first_update_ns`` across all endpoints, or None.

    Looks under ``summary.endpoint_info``; entries that are not mappings or
    that lack ``first_update_ns`` are ignored.
    """
    summary = server_metrics.get("summary") or {}
    info = (summary.get("endpoint_info") or {}).values()
    candidates = [
        v.get("first_update_ns")
        for v in info
        if isinstance(v, dict) and v.get("first_update_ns") is not None
    ]
    return min(candidates) if candidates else None


def metric_entry(server_metrics: dict, name: str) -> dict | None:
    """Return the ``metrics[name]`` mapping, or None if absent or not a dict."""
    metrics = server_metrics.get("metrics") or {}
    entry = metrics.get(name)
    return entry if isinstance(entry, dict) else None


def all_series(entry: dict | None) -> list[dict]:
    """Return the ``series`` list of a metric entry ([] when missing/invalid)."""
    if entry is None:
        return []
    s = entry.get("series") or []
    return s if isinstance(s, list) else []


def series_with_label(
    entry: dict | None, label_key: str, label_value: str
) -> dict | None:
    """Pick the first series whose labels[label_key] equals label_value."""
    for s in all_series(entry):
        labels = s.get("labels") or {}
        if labels.get(label_key) == label_value:
            return s
    return None


def timeseries_from_series(
    series: dict | None, t0_ns: int | None, value_key_priority=("avg", "rate", "total", "max")
) -> tuple[list[float], list[float]]:
    """Extract (relative-time-s, value) pairs from a series' timeslices.

    For each timeslice the first key in ``value_key_priority`` that is present,
    non-null and convertible to float supplies the value. Times are
    ``(start_ns - t0_ns) / 1e9``.

    Returns:
        ``(times, values)`` of equal length; both empty when ``series`` or
        ``t0_ns`` is None.
    """
    if series is None or t0_ns is None:
        return [], []
    slices = series.get("timeslices") or []
    times: list[float] = []
    values: list[float] = []
    for ts in slices:
        start = ts.get("start_ns")
        if start is None:
            continue
        for k in value_key_priority:
            if ts.get(k) is None:
                continue
            try:
                # Compute both halves of the pair before appending either, so
                # a failure can never leave the two lists at different lengths.
                value = float(ts[k])
                rel_s = (start - t0_ns) / 1e9
            except (TypeError, ValueError):
                continue
            values.append(value)
            times.append(rel_s)
            break
    return times, values


def aggregate_timeseries(
    server_metrics: dict, name: str, t0_ns: int | None,
    *,
    aggregator=sum,
    value_key_priority=("avg", "rate", "total", "max"),
) -> tuple[list[float], list[float]]:
    """Aggregate timeslices across every series of a metric (sums by default).

    Values from all series are grouped by their slice ``start_ns`` and each
    group is reduced with ``aggregator``. Per slice, the first key in
    ``value_key_priority`` that is present, non-null and convertible to float
    is used; slices with no usable value contribute nothing.

    Returns:
        ``(times, values)`` sorted by time, with times relative to ``t0_ns``
        in seconds. Both are empty when the metric is missing, ``t0_ns`` is
        None, or no slice had a usable value.
    """
    entry = metric_entry(server_metrics, name)
    if entry is None or t0_ns is None:
        return [], []
    bucket: dict[int, list[float]] = defaultdict(list)
    for s in all_series(entry):
        for ts in s.get("timeslices") or []:
            start = ts.get("start_ns")
            if start is None:
                continue
            for k in value_key_priority:
                if ts.get(k) is None:
                    continue
                try:
                    # Convert first, index the defaultdict second: touching
                    # bucket[...] before a failing float() would leave an
                    # empty group behind, and max([]) raises ValueError.
                    slot = int(start)
                    value = float(ts[k])
                except (TypeError, ValueError):
                    continue
                bucket[slot].append(value)
                break
    if not bucket:
        return [], []
    times: list[float] = []
    values: list[float] = []
    for start_ns in sorted(bucket):
        times.append((start_ns - t0_ns) / 1e9)
        values.append(aggregator(bucket[start_ns]))
    return times, values
def rolling_average(values: list[float], window: int) -> list[float]:
    """Trailing moving average over at most ``window`` samples.

    Element ``i`` of the result is the mean of ``values[i - window + 1 : i + 1]``
    (fewer samples at the start of the list). With ``window <= 1`` or empty
    input a plain copy of ``values`` is returned.
    """
    if window <= 1 or not values:
        return list(values)
    out: list[float] = []
    for i in range(len(values)):
        # "+ 1" keeps the chunk at exactly ``window`` samples, matching the
        # "avg n={win}" legends drawn by the panels.
        chunk = values[max(0, i - window + 1) : i + 1]
        out.append(sum(chunk) / len(chunk))
    return out


def rolling_window(n: int, max_window: int = 50) -> int:
    """Choose a smoothing window for ``n`` samples.

    Returns 1 (no smoothing) for ``n <= 10``, otherwise ``n // 10`` capped at
    ``max_window``.
    """
    if n <= 10:
        return 1
    return min(max_window, max(1, n // 10))


# ---- Panels --------------------------------------------------------------


def panel_kv_cache_usage(ax, server_metrics: dict, t0_ns: int | None) -> None:
    """Draw the KV-cache utilization panel onto ``ax``.

    Plots ``vllm:kv_cache_usage_perc`` (scatter plus line, clamped to 100%) and
    ``vllm:cpu_kv_cache_usage_perc`` (dashed line), each reduced across series
    with ``max``. Axis labels, title, limits and grid are always set, so the
    panel renders even without data.
    """
    times, values = aggregate_timeseries(
        server_metrics, "vllm:kv_cache_usage_perc", t0_ns, aggregator=max
    )
    cpu_times, cpu_values = aggregate_timeseries(
        server_metrics, "vllm:cpu_kv_cache_usage_perc", t0_ns, aggregator=max
    )

    def _norm(v: float) -> float:
        # Values in [0, 1] are treated as fractions and scaled to percent;
        # anything else is assumed to already be a percentage.
        return v * 100.0 if 0 <= v <= 1.0 else v

    if values:
        gpu_pct = [min(_norm(v), 100.0) for v in values]
        ax.scatter(times, gpu_pct, alpha=0.15, s=2, c="blue")
        win = rolling_window(len(gpu_pct))
        if win > 1:
            ax.plot(
                times,
                rolling_average(gpu_pct, win),
                "b-",
                linewidth=2,
                label=f"GPU (avg n={win})",
            )
        else:
            ax.plot(times, gpu_pct, "b-", linewidth=2, label="GPU")
    if cpu_values:
        cpu_pct = [_norm(v) for v in cpu_values]
        ax.plot(cpu_times, cpu_pct, "r--", linewidth=1.5, label="External")
    if values or cpu_values:
        ax.legend(fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("KV Cache Usage (%)")
    ax.set_title("KV Cache Utilization Over Time")
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3)
def panel_queue_depth(ax, server_metrics: dict, t0_ns: int | None) -> None:
    """Draw running / waiting / total request counts over time onto ``ax``.

    Running and waiting come from ``vllm:num_requests_running`` and
    ``vllm:num_requests_waiting`` (reduced across series with ``max``). The
    total line is drawn only when both series exist with the same number of
    samples. Labels, title and grid are always applied.
    """

    def _smooth(samples: list[float]) -> tuple[int, list[float]]:
        # Returns the chosen window alongside the (possibly unsmoothed) data.
        width = rolling_window(len(samples))
        return width, (rolling_average(samples, width) if width > 1 else samples)

    run_t, run_v = aggregate_timeseries(
        server_metrics, "vllm:num_requests_running", t0_ns, aggregator=max
    )
    wait_t, wait_v = aggregate_timeseries(
        server_metrics, "vllm:num_requests_waiting", t0_ns, aggregator=max
    )

    if run_t:
        win, ys = _smooth(run_v)
        ax.plot(run_t, ys, "g-", label=f"Running (avg n={win})", linewidth=1.5)
    if wait_t:
        win, ys = _smooth(wait_v)
        ax.plot(wait_t, ys, "r-", label=f"Waiting (avg n={win})", linewidth=1.5)
    if run_t and wait_t and len(run_t) == len(wait_t):
        combined = [r + w for r, w in zip(run_v, wait_v)]
        win, ys = _smooth(combined)
        ax.plot(run_t, ys, "b-", label=f"Total (avg n={win})", linewidth=1.5)

    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Requests")
    ax.set_title("Request Queue Depth")
    if run_t or wait_t:
        ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)


def _hit_rate_intervals(
    server_metrics: dict,
    hits_name: str,
    queries_name: str,
    t0_ns: int | None,
) -> tuple[list[float], list[float]]:
    """Compute a hit-rate percentage for every timeslice.

    Each slice's ``total`` for the hits metric is divided by the ``total`` for
    the queries metric (the values are used as-is; presumably ``total`` is
    already the per-slice delta -- confirm against the aiperf export). A slice
    with no queries repeats the previous rate, starting from 0.

    Returns:
        ``(times, rates)``; both empty when either metric is missing or the
        two series have different lengths.
    """
    hit_t, hit_v = aggregate_timeseries(
        server_metrics, hits_name, t0_ns, value_key_priority=("total",)
    )
    qry_t, qry_v = aggregate_timeseries(
        server_metrics, queries_name, t0_ns, value_key_priority=("total",)
    )
    if not hit_t or not qry_t or len(hit_t) != len(qry_t):
        return [], []

    rates: list[float] = []
    carried = 0.0
    for hits, queries in zip(hit_v, qry_v):
        if queries > 0:
            carried = 100.0 * hits / queries
        rates.append(carried)
    return list(hit_t), rates
def panel_prefix_cache_hit_rate(ax, server_metrics: dict, t0_ns: int | None) -> None:
    """Draw per-interval prefix-cache hit rates (GPU, external, combined).

    GPU rates use ``vllm:prefix_cache_hits`` / ``vllm:prefix_cache_queries``;
    external rates use the ``vllm:external_prefix_cache_*`` pair and are shown
    only if at least one external rate is positive. The combined curve is the
    plain mean of the GPU and external rates and is drawn only when both have
    the same number of samples.
    """

    def _points_and_trend(xs, ys, *, color, alpha, size, width, label, trend_label):
        # Scatter the raw points, then overlay a rolling mean when there are
        # enough samples for a window larger than one.
        ax.scatter(xs, ys, alpha=alpha, s=size, c=color, label=label)
        win = rolling_window(len(ys))
        if win > 1:
            ax.plot(
                xs,
                rolling_average(ys, win),
                color,
                linewidth=width,
                label=f"{trend_label} (n={win})",
            )

    gpu_t, gpu_r = _hit_rate_intervals(
        server_metrics,
        "vllm:prefix_cache_hits",
        "vllm:prefix_cache_queries",
        t0_ns,
    )
    ext_t, ext_r = _hit_rate_intervals(
        server_metrics,
        "vllm:external_prefix_cache_hits",
        "vllm:external_prefix_cache_queries",
        t0_ns,
    )

    if gpu_t:
        _points_and_trend(
            gpu_t, gpu_r,
            color="purple", alpha=0.3, size=5, width=1.5,
            label="GPU (HBM)", trend_label="GPU avg",
        )

    has_ext = bool(ext_t and any(rate > 0 for rate in ext_r))
    if has_ext:
        _points_and_trend(
            ext_t, ext_r,
            color="orange", alpha=0.3, size=5, width=1.5,
            label="External", trend_label="External avg",
        )
        # Combined (only meaningful when external exists).
        if gpu_t and len(gpu_t) == len(ext_t):
            combined = [
                (g + e) / 2.0 if (g or e) else 0.0 for g, e in zip(gpu_r, ext_r)
            ]
            _points_and_trend(
                gpu_t, combined,
                color="green", alpha=0.2, size=3, width=2,
                label="Combined", trend_label="Combined avg",
            )

    if gpu_t or has_ext:
        ax.legend(loc="best", fontsize=8)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Hit Rate (%)")
    ax.set_title("Prefix Cache Hit Rate Per Interval (tokens hit / tokens queried)")
    ax.set_ylim(0, 105)
    ax.grid(True, alpha=0.3)
def panel_throughput(ax, server_metrics: dict, t0_ns: int | None) -> None:
    """Draw total and decode token throughput, plus a running average.

    Uses the per-slice ``rate`` of ``vllm:generation_tokens`` and
    ``vllm:prompt_tokens``. Curves are drawn only when both metrics are
    present with the same number of samples; labels, title and grid are
    always applied.
    """
    gen_t, gen_v = aggregate_timeseries(
        server_metrics, "vllm:generation_tokens", t0_ns, value_key_priority=("rate",)
    )
    prompt_t, prompt_v = aggregate_timeseries(
        server_metrics, "vllm:prompt_tokens", t0_ns, value_key_priority=("rate",)
    )

    if gen_t and prompt_t and len(gen_t) == len(prompt_t):
        total = [g + p for g, p in zip(gen_v, prompt_v)]
        win = rolling_window(len(total))
        smoothed = win > 1
        total_curve = rolling_average(total, win) if smoothed else total
        decode_curve = rolling_average(gen_v, win) if smoothed else gen_v
        if smoothed:
            ax.plot(
                gen_t, total_curve, "steelblue",
                linewidth=1.5, label=f"Total (avg n={win})",
            )
            ax.plot(
                gen_t, decode_curve, "orange",
                linewidth=1.5, label=f"Decode (avg n={win})",
            )
        else:
            ax.plot(gen_t, total_curve, "steelblue", linewidth=1, alpha=0.8, label="Total")
            ax.plot(gen_t, decode_curve, "orange", linewidth=1, alpha=0.8, label="Decode")

        # Cumulative running average: cumsum tokens / elapsed.
        if gen_t:
            origin = gen_t[0]
            tokens_so_far = 0.0
            running_avg = []
            previous = None
            for now, rate in zip(gen_t, total):
                # rate = tokens/s in that window; multiply by window width.
                # The first sample has no predecessor, hence zero width.
                width = (now - previous) if previous is not None else 0.0
                tokens_so_far += rate * width
                elapsed = now - origin if now > origin else 1e-9
                running_avg.append(tokens_so_far / elapsed if elapsed > 0 else 0.0)
                previous = now
            ax.plot(gen_t, running_avg, "red", linewidth=2, label="Total Running Avg")
        ax.legend(fontsize=8)

    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Tokens/sec")
    ax.set_title("Throughput (Total & Decode)")
    ax.grid(True, alpha=0.3)


def panel_kv_offload_transfer_rate(
    ax, server_metrics: dict, t0_ns: int | None
) -> None:
    """Draw GPU->CPU and CPU->GPU KV-offload transfer rates in MB/s.

    Rates come from the ``rate`` field of ``vllm:kv_offload_bytes_gpu_to_cpu``
    and ``vllm:kv_offload_bytes_cpu_to_gpu`` and are divided by 1e6. Nothing
    is plotted unless at least one direction has a positive sample.
    """

    def _direction(xs, byte_rates, *, color, style, name):
        # One direction: faint scatter of raw points plus a (smoothed) line.
        mb = [v / 1e6 for v in byte_rates]
        ax.scatter(xs, mb, alpha=0.15, s=3, c=color)
        win = rolling_window(len(mb))
        if win > 1:
            ax.plot(
                xs,
                rolling_average(mb, win),
                style,
                linewidth=1.5,
                label=f"{name} (avg n={win})",
            )
        else:
            ax.plot(xs, mb, style, linewidth=1, alpha=0.8, label=name)

    g2c_t, g2c_v = aggregate_timeseries(
        server_metrics,
        "vllm:kv_offload_bytes_gpu_to_cpu",
        t0_ns,
        value_key_priority=("rate",),
    )
    c2g_t, c2g_v = aggregate_timeseries(
        server_metrics,
        "vllm:kv_offload_bytes_cpu_to_gpu",
        t0_ns,
        value_key_priority=("rate",),
    )

    outbound_active = g2c_t and any(v > 0 for v in g2c_v)
    inbound_active = c2g_t and any(v > 0 for v in c2g_v)
    if outbound_active or inbound_active:
        if g2c_t:
            _direction(g2c_t, g2c_v, color="blue", style="b-", name="GPU→CPU")
        if c2g_t:
            _direction(c2g_t, c2g_v, color="red", style="r-", name="CPU→GPU")
        ax.legend(fontsize=8)

    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Transfer Rate (MB/s)")
    ax.set_title("KV Offload Transfer Rate")
    ax.grid(True, alpha=0.3)
def _prompt_token_source_series(
    server_metrics: dict, source_label: str, t0_ns: int | None
) -> tuple[list[float], list[float]]:
    """Return the per-slice ``total`` series for one prompt-token source.

    Selects, within ``vllm:prompt_tokens_by_source``, the series whose
    ``source`` label equals ``source_label``. The panel in this file queries
    the labels ``local_compute``, ``local_cache_hit`` and
    ``external_kv_transfer``. Yields two empty lists when no series matches.
    """
    matching = series_with_label(
        metric_entry(server_metrics, "vllm:prompt_tokens_by_source"),
        "source",
        source_label,
    )
    return timeseries_from_series(matching, t0_ns, value_key_priority=("total",))
def panel_prefill_source_breakdown(
    ax, server_metrics: dict, t0_ns: int | None
) -> None:
    """Draw a stacked chart of where prefill tokens came from, cumulatively.

    The three sources (local compute, local cache hit, external KV transfer)
    are accumulated over time and shown as percentages of their running sum.
    With no data for any source only the axis decoration is applied.
    """

    def _decorate() -> None:
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("% of Prefill Tokens")
        ax.set_title("Cumulative Prefill Token Source Breakdown")
        ax.set_ylim(0, 105)
        ax.grid(True, alpha=0.3)

    c_t, c_v = _prompt_token_source_series(server_metrics, "local_compute", t0_ns)
    h_t, h_v = _prompt_token_source_series(server_metrics, "local_cache_hit", t0_ns)
    e_t, e_v = _prompt_token_source_series(
        server_metrics, "external_kv_transfer", t0_ns
    )
    if not (c_t or h_t or e_t):
        _decorate()
        return

    # Align timestamps: use the union of all sample timestamps.
    samples = sorted(set(c_t) | set(h_t) | set(e_t))

    def _cum_at(times: list[float], values: list[float]) -> dict:
        # Running sum of the slice values at each timestamp the source has...
        seen: dict[float, float] = {}
        acc = 0.0
        for stamp, amount in zip(times, values):
            acc += amount
            seen[stamp] = acc
        # ...then forward-filled onto the shared timeline (0 before the
        # source's first sample).
        filled: dict[float, float] = {}
        latest = 0.0
        for stamp in samples:
            latest = seen.get(stamp, latest)
            filled[stamp] = latest
        return filled

    cum_c = _cum_at(c_t, c_v)
    cum_h = _cum_at(h_t, h_v)
    cum_e = _cum_at(e_t, e_v)

    pct_c: list[float] = []
    pct_h: list[float] = []
    pct_e: list[float] = []
    for stamp in samples:
        parts = (cum_c[stamp], cum_h[stamp], cum_e[stamp])
        denom = parts[0] + parts[1] + parts[2]
        for column, part in zip((pct_c, pct_h, pct_e), parts):
            column.append(100.0 * part / denom if denom > 0 else 0.0)

    ax.stackplot(
        samples,
        pct_c,
        pct_h,
        pct_e,
        labels=["Prefill", "HBM Cache Hit", "Offload Cache Hit"],
        colors=["coral", "steelblue", "mediumseagreen"],
        alpha=0.8,
    )
    ax.legend(fontsize=8, loc="lower left")
    _decorate()


def panel_kv_offload_cumulative(
    ax,
    server_metrics: dict,
    metric_name: str,
    title: str,
    color: str,
    t0_ns: int | None,
) -> None:
    """Draw the cumulative volume of one KV-offload byte counter in GB.

    Per-slice ``total`` values of ``metric_name`` are summed progressively and
    divided by 1e9. The curve is drawn only when some slice is positive;
    labels, ``title`` and grid are always applied.
    """
    times, values = aggregate_timeseries(
        server_metrics, metric_name, t0_ns, value_key_priority=("total",)
    )
    if times and any(v > 0 for v in values):
        gigabytes: list[float] = []
        byte_sum = 0.0
        for slice_bytes in values:
            byte_sum += slice_bytes
            gigabytes.append(byte_sum / 1e9)  # GB
        ax.plot(times, gigabytes, f"{color}-", linewidth=1.5)
        ax.fill_between(times, gigabytes, alpha=0.2, color=color)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Cumulative Transfer (GB)")
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
+ if win > 1: + ax.plot( + request_times_s, + rolling_average(values, win), + "r-", + linewidth=1.5, + label=f"Rolling avg (n={win})", + ) + ax.legend(loc="best", fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel(ylabel) + ax.set_title(title) + ax.grid(True, alpha=0.3) + + +def panel_preemptions(ax, server_metrics: dict, t0_ns: int | None) -> None: + times, values = aggregate_timeseries( + server_metrics, "vllm:num_preemptions", t0_ns, value_key_priority=("total",) + ) + if not times: + ax.set_xlabel("Time (s)") + ax.set_ylabel("Preemptions/sec") + ax.set_title("Preemptions Over Time") + ax.grid(True, alpha=0.3) + return + # ``total`` is the per-slice delta; convert to rate by dividing by slice + # width (assume uniform: median diff between consecutive starts). + if len(times) >= 2: + diffs = [times[i] - times[i - 1] for i in range(1, len(times))] + slice_w = max(1e-9, statistics.median(diffs)) + else: + slice_w = 1.0 + rates = [v / slice_w for v in values] + if any(r > 0 for r in rates): + ax.scatter(times, rates, alpha=0.15, s=3, c="red") + win = rolling_window(len(rates), max_window=30) + if win > 1: + ax.plot( + times, + rolling_average(rates, win), + "r-", + linewidth=1.5, + label=f"Rolling avg (n={win})", + ) + # Cumulative on twin axis. 
+ cumulative: list[float] = [] + running = 0.0 + for v in values: + running += v + cumulative.append(running) + ax2 = ax.twinx() + ax2.plot(times, cumulative, "b--", linewidth=1, alpha=0.5, label="Cumulative") + ax2.set_ylabel("Cumulative Preemptions", color="blue") + ax2.tick_params(axis="y", labelcolor="blue") + ax.set_xlabel("Time (s)") + ax.set_ylabel("Preemptions/sec", color="red") + ax.tick_params(axis="y", labelcolor="red") + ax.set_title("Preemptions Over Time") + ax.grid(True, alpha=0.3) + + +# ---- Main ---------------------------------------------------------------- + + +def main(argv: list[str]) -> int: + parser = argparse.ArgumentParser( + description="Generate metrics_plots.png from aiperf artifacts (kv-cache-tester layout)" + ) + parser.add_argument( + "result_dir", + type=Path, + help="Result dir containing trace_replay/ subdirectory", + ) + parser.add_argument( + "-o", + "--output", + type=Path, + default=None, + help="Output PNG path (default: /metrics_plots.png)", + ) + args = parser.parse_args(argv) + + # benchmark_lib.sh writes aiperf output to /aiperf_artifacts/ + # (--output-artifact-dir). Older runs used trace_replay/, kept as fallback. 
+ artifact = args.result_dir / "aiperf_artifacts" + if not (artifact / "profile_export.jsonl").exists(): + legacy = args.result_dir / "trace_replay" + if (legacy / "profile_export.jsonl").exists(): + artifact = legacy + jsonl_path = artifact / "profile_export.jsonl" + server_metrics_path = artifact / "server_metrics_export.json" + + if not jsonl_path.exists() and artifact.is_dir(): + for child in sorted(artifact.iterdir()): + if child.is_dir() and (child / "profile_export.jsonl").is_file(): + jsonl_path = child / "profile_export.jsonl" + server_metrics_path = child / "server_metrics_export.json" + break + + if not jsonl_path.exists(): + print(f"ERROR: {jsonl_path} not found", file=sys.stderr) + return 1 + + records = load_jsonl_records(jsonl_path) + server_metrics = load_server_metrics(server_metrics_path) + t0_ns = first_update_ns(server_metrics) + + starts_ns = [ + int(r["metadata"]["request_start_ns"]) + for r in records + if r.get("metadata", {}).get("request_start_ns") + ] + first_record_start = min(starts_ns) if starts_ns else 0 + request_times_s = [(s - first_record_start) / 1e9 for s in starts_ns] + + ttfts_ms: list[float] = [] + e2es_ms: list[float] = [] + interactivities: list[float] = [] + for r in records: + ttft = metric_value(r, "time_to_first_token") + e2e = metric_value(r, "request_latency") + itl = metric_value(r, "inter_token_latency") + ttfts_ms.append(ttft if ttft is not None else 0.0) + e2es_ms.append(e2e if e2e is not None else 0.0) + # Interactivity: tokens/sec from per-token latency (ms). 
+ interactivities.append(1000.0 / itl if itl and itl > 0 else 0.0) + + fig, axes = plt.subplots(6, 2, figsize=(14, 24)) + fig.suptitle("vLLM Server Metrics During Benchmark", fontsize=14) + + panel_kv_cache_usage(axes[0, 0], server_metrics, t0_ns) + panel_queue_depth(axes[0, 1], server_metrics, t0_ns) + panel_prefix_cache_hit_rate(axes[1, 0], server_metrics, t0_ns) + panel_throughput(axes[1, 1], server_metrics, t0_ns) + panel_kv_offload_transfer_rate(axes[2, 0], server_metrics, t0_ns) + panel_prefill_source_breakdown(axes[2, 1], server_metrics, t0_ns) + panel_kv_offload_cumulative( + axes[3, 0], + server_metrics, + "vllm:kv_offload_bytes_gpu_to_cpu", + "KV Offload: GPU → CPU (Cumulative)", + "b", + t0_ns, + ) + panel_kv_offload_cumulative( + axes[3, 1], + server_metrics, + "vllm:kv_offload_bytes_cpu_to_gpu", + "KV Offload: CPU → GPU (Cumulative)", + "r", + t0_ns, + ) + panel_per_record_metric( + axes[4, 0], + request_times_s, + ttfts_ms, + color="blue", + ylabel="TTFT (ms)", + title="Time to First Token vs Time", + ) + panel_per_record_metric( + axes[4, 1], + request_times_s, + e2es_ms, + color="green", + ylabel="Latency (ms)", + title="Request Latency vs Time", + ) + panel_per_record_metric( + axes[5, 0], + request_times_s, + interactivities, + color="purple", + ylabel="Interactivity (tokens/sec)", + title="Decode Speed (1/TPOT) vs Time", + ) + panel_preemptions(axes[5, 1], server_metrics, t0_ns) + + plt.tight_layout() + out_path = args.output or (args.result_dir / "metrics_plots.png") + plt.savefig(out_path, dpi=150) + plt.close(fig) + print(f"Saved {out_path}") + if records: + ttft_clean = [v for v in ttfts_ms if v > 0] + e2e_clean = [v for v in e2es_ms if v > 0] + if ttft_clean and e2e_clean: + print( + f" Records: {len(records)} | " + f"TTFT median {statistics.median(ttft_clean):.0f}ms | " + f"E2E median {statistics.median(e2e_clean):.0f}ms" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git 
a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 9f38292f4..53efcca9f 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -832,44 +832,45 @@ def generate_test_config_sweep(args, all_config_data, runner_data=None): continue for conc in conc_values: - if is_multinode: - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.MODEL_PREFIX.value: model_code, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner, - Fields.SPEC_DECODING.value: spec_decoding, - Fields.PREFILL.value: prefill, - Fields.DECODE.value: decode, - Fields.CONC.value: conc, - Fields.DURATION.value: duration, - Fields.EXP_NAME.value: ( - f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}" - f"_d{decode[Fields.NUM_WORKER.value]}x{decode[Fields.TP.value]}_conc{conc}" - ), - Fields.DISAGG.value: disagg, - Fields.SCENARIO_TYPE.value: "agentic-coding", - } - else: - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.MODEL_PREFIX.value: model_code, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner, - Fields.TP.value: tp, - Fields.EP.value: ep if ep is not None else 1, - Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, - Fields.CONC.value: conc, - Fields.OFFLOADING.value: offloading, - Fields.DURATION.value: duration, - Fields.EXP_NAME.value: f"{model_code}_tp{tp}_conc{conc}_offload{offloading}", - Fields.SCENARIO_TYPE.value: "agentic-coding", - } - matrix_values.append(validate_agentic_matrix_entry(entry)) + for runner_value in runners_for_entry: + if is_multinode: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + 
Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.CONC.value: conc, + Fields.DURATION.value: duration, + Fields.EXP_NAME.value: ( + f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}" + f"_d{decode[Fields.NUM_WORKER.value]}x{decode[Fields.TP.value]}_conc{conc}" + ), + Fields.DISAGG.value: disagg, + Fields.SCENARIO_TYPE.value: "agentic-coding", + } + else: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.TP.value: tp, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, + Fields.CONC.value: conc, + Fields.OFFLOADING.value: offloading, + Fields.DURATION.value: duration, + Fields.EXP_NAME.value: f"{model_code}_tp{tp}_conc{conc}_offload{offloading}", + Fields.SCENARIO_TYPE.value: "agentic-coding", + } + matrix_values.append(validate_agentic_matrix_entry(entry)) return matrix_values diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index 297e57524..9bb473896 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -1619,6 +1619,48 @@ def test_runner_node_filter_no_match_skips_config(self, sample_multinode_config, assert result == [] + def test_runner_node_filter_expands_agentic_config_runner(self, sample_runner_config): + """Agentic test-config entries should support concrete runner targeting.""" + config = { + "qwen-agentic-hicache": { + "image": "sglang-rocm", + "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model-prefix": "qwen3.5", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "scenarios": { + "agentic-coding": [ + { + "duration": 1800, + "search-space": [ + { + 
"tp": 8, + "ep": 1, + "offloading": "hicache", + "conc-list": [64], + } + ], + } + ] + }, + } + } + args = argparse.Namespace( + config_keys=["qwen-agentic-hicache"], + seq_lens=None, + conc=None, + scenario_type=["agentic-coding"], + runner_node_filter="mi300x-amd_1", + ) + + result = generate_test_config_sweep(args, config, sample_runner_config) + + assert len(result) == 1 + assert result[0]["runner"] == "mi300x-amd_1" + assert result[0]["scenario-type"] == "agentic-coding" + # ============================================================================= # Test apply_node_type_defaults @@ -1970,4 +2012,3 @@ def test_prefill_entries_never_in_single_or_evals(self, mixed_entries): assert all('prefill' in x for x in multi) assert all('prefill' not in x for x in single) assert all('prefill' not in x for x in evals) - diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 1274fd86a..c385017b1 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -3,9 +3,11 @@ from validation import ( Fields, SingleNodeMatrixEntry, + SingleNodeAgenticMatrixEntry, MultiNodeMatrixEntry, WorkerConfig, SingleNodeSearchSpaceEntry, + AgenticCodingSearchSpaceEntry, MultiNodeSearchSpaceEntry, SingleNodeSeqLenConfig, MultiNodeSeqLenConfig, @@ -305,6 +307,61 @@ def test_extra_field_forbidden(self, valid_single_node_matrix_entry): SingleNodeMatrixEntry(**valid_single_node_matrix_entry) +# ============================================================================= +# Test Agentic Matrix Entries +# ============================================================================= + +class TestAgenticMatrixEntries: + """Tests for agentic coding validation models.""" + + def test_lmcache_mp_offloading_is_valid_for_single_node_agentic_entry(self): + """LMCache MP is a valid agentic offloading backend.""" + entry = SingleNodeAgenticMatrixEntry(**{ + "image": "cquil/vllm-openai:v0.21.0-8813c92", + "model": 
"deepseek-ai/DeepSeek-V4-Pro", + "model-prefix": "dsv4", + "precision": "fp4", + "framework": "vllm", + "runner": "b200-dgxc", + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 1, + "offloading": "lmcache-mp", + "duration": 1800, + "exp-name": "dsv4_tp8_conc1_offloadlmcache-mp", + "scenario-type": "agentic-coding", + }) + assert entry.offloading == "lmcache-mp" + + def test_lmcache_mp_offloading_is_valid_for_agentic_search_space(self): + """Agentic search-space entries can request LMCache MP offloading.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "lmcache-mp", + "conc-list": [1, 2], + }) + assert entry.offloading == "lmcache-mp" + + def test_lmcache_offloading_is_valid_for_agentic_search_space(self): + """Agentic search-space entries can request in-process LMCache.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "lmcache", + "conc-list": [1, 2], + }) + assert entry.offloading == "lmcache" + + def test_hicache_offloading_is_valid_for_agentic_search_space(self): + """Agentic search-space entries can request SGLang HiCache.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "hicache", + "conc-list": [1, 2], + }) + assert entry.offloading == "hicache" + + # ============================================================================= # Test MultiNodeMatrixEntry # ============================================================================= diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index dd245aec7..4e3f0bbd7 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -156,7 +156,9 @@ class SingleNodeAgenticMatrixEntry(BaseModel): ep: int dp_attn: bool = Field(alias=Fields.DP_ATTN.value) conc: int - offloading: Literal["none", "cpu", "ssd"] = Field(alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "ssd", "lmcache", "lmcache-mp", "hicache"] = Field( + alias=Fields.OFFLOADING.value + ) duration: int = 
Field(default=1800, alias=Fields.DURATION.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) scenario_type: str = Field(alias=Fields.SCENARIO_TYPE.value) @@ -338,7 +340,9 @@ class AgenticCodingSearchSpaceEntry(BaseModel): default="none", alias=Fields.SPEC_DECODING.value) prefill: Optional[WorkerConfig] = None decode: Optional[WorkerConfig] = None - offloading: Literal["none", "cpu", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "ssd", "lmcache", "lmcache-mp", "hicache"] = Field( + default="none", alias=Fields.OFFLOADING.value + ) conc_start: Optional[int] = Field(default=None, alias=Fields.CONC_START.value) conc_end: Optional[int] = Field(default=None, alias=Fields.CONC_END.value) conc_list: Optional[List[int]] = Field(default=None, alias=Fields.CONC_LIST.value) diff --git a/utils/process_agentic_result.py b/utils/process_agentic_result.py index 10aaff80e..3c4015ce6 100644 --- a/utils/process_agentic_result.py +++ b/utils/process_agentic_result.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Process aiperf agentic-replay output into the InferenceX agg_*.json shape. -Reads aiperf's three artifact files from $RESULT_DIR/trace_replay/ and emits +Reads aiperf's three artifact files from $RESULT_DIR/aiperf_artifacts/ and emits $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json with the same key schema fixed-seq-len and the legacy kv-cache-tester pipeline produce, so utils/summarize.py and sibling aggregators keep working without changes. @@ -37,7 +37,7 @@ # Trace metadata lookup: conversation_id (= trace id) -> per-turn dict with # ``hash_ids`` and ``output_length``. Built lazily from the HF dataset cache. 
_TRACE_METADATA_CACHE: dict[str, list[dict]] | None = None -_HF_DATASET = "semianalysisai/cc-traces-weka-042026" +_HF_DATASET = "semianalysisai/cc-traces-weka-with-subagents-051926" # ---- helpers --------------------------------------------------------------- @@ -626,11 +626,11 @@ def _resolve_artifact_dir(result_dir: Path) -> Path: aiperf accepts ``--output-artifact-dir`` and writes directly into it when ``--num-profile-runs == 1`` (our default), but creates a per-run subdir - when that flag is > 1. Handle both: prefer ``result_dir/trace_replay`` + when that flag is > 1. Handle both: prefer ``result_dir/aiperf_artifacts`` when it has the export files, else descend into the first child dir that does. """ - base = result_dir / "trace_replay" + base = result_dir / "aiperf_artifacts" if (base / "profile_export.jsonl").is_file(): return base if base.is_dir(): diff --git a/utils/proxy_to_weka.py b/utils/proxy_to_weka.py new file mode 100644 index 000000000..3b5a28afb --- /dev/null +++ b/utils/proxy_to_weka.py @@ -0,0 +1,514 @@ +#!/usr/bin/env python3 +"""Convert flat per-session JSONL dumps into weka-format trace JSON. + +Reads /.jsonl produced by `sample_proxy_traces.py` +and writes /..//.json in the v1 weka trace +format consumed by the kv-cache-tester replayer (see +utils/aiperf/src/aiperf/dataset/loader/weka_trace_models.py). + +Subagent grouping mirrors the conversation-view algorithm from the +SemiAnalysis claude-code-proxy: + + 1. Walk session rows chronologically. + 2. A row with `subagent_label IS NULL` is a parent (main-agent) turn. + 3. A run of consecutive non-null-label rows is a "stretch". The + stretch ends as soon as a NULL-label row appears. + 4. Inside the stretch, group by `subagent_label`. Each label group + becomes one WekaSubagentEntry with its label rows as inner + WekaNormalRequest entries (in chronological order). + 5. Different labels inside the same stretch produce sibling entries + (the dashboard renders parallel groups for each). 
+ +Hash IDs (24-char hex strings in the proxy DB) are remapped to small +per-trace ints so we can emit `hash_id_scope: "local"`. The mapping is +session-scoped: first-seen hash gets 0, second 1, etc. +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Any + + +def _dump_trace_inline_hash_ids(trace: dict, path: Path) -> None: + """Write the trace as indented JSON, but with every ``hash_ids`` + array kept on a single line regardless of length. + + `json.dump(..., indent=2)` always expands arrays to one element + per line, which turns the weka file into thousands of one-int + lines that drown out the actual structure. We work around it + with a two-phase serialize: substitute each ``hash_ids`` list + with a placeholder string before dumping, then text-replace the + placeholder with a compact one-line array. Robust against weird + list contents because the substitution happens at object level, + not at the JSON-text level. 
+ """ + placeholders: list[list[Any]] = [] + + def _substitute(obj): + if isinstance(obj, dict): + out: dict[str, Any] = {} + for k, v in obj.items(): + if k == "hash_ids" and isinstance(v, list): + idx = len(placeholders) + placeholders.append(v) + out[k] = f"@@HASHIDS_{idx}@@" + else: + out[k] = _substitute(v) + return out + if isinstance(obj, list): + return [_substitute(x) for x in obj] + return obj + + text = json.dumps(_substitute(trace), indent=2) + text = re.sub( + r'"@@HASHIDS_(\d+)@@"', + lambda m: json.dumps(placeholders[int(m.group(1))], separators=(", ", ": ")), + text, + ) + with path.open("w") as f: + f.write(text + "\n") + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.add_argument( + "--in-dir", "-i", type=Path, required=True, + help="Directory containing .jsonl files (the output of sample_proxy_traces.py).", + ) + p.add_argument( + "--out-dir", "-o", type=Path, required=True, + help="Directory to write .json weka traces into.", + ) + return p.parse_args() + + +_SLUG_RE = re.compile(r"[^a-z0-9]+") + + +def slugify(label: str) -> str: + return _SLUG_RE.sub("_", label.lower()).strip("_") or "subagent" + + +def load_session_rows(path: Path) -> list[dict]: + rows: list[dict] = [] + with path.open() as f: + for line in f: + line = line.strip() + if line: + rows.append(json.loads(line)) + rows.sort(key=lambda r: r["timestamp"]) + + # Drop exact-duplicate rows. The proxy occasionally records the same + # logical request twice — observed at ~1.5% of subagent inner rows on + # the v5 + CC>=2.1.139 pool, concentrated in heavy-fanout subagents. + # Without deduping, the weka conversion would inflate token counts / + # request counts and the converter would also misclassify the + # duplicate row as "concurrent with itself" when grouping. + # + # Fingerprint: (timestamp, model, input_tokens, output_tokens, + # duration_ms, agent_id). 
Two distinct logical requests landing on + # the same nanosecond timestamp with identical token counts AND the + # same agent_id are so unlikely that collapsing them is safe. + seen: set[tuple] = set() + deduped: list[dict] = [] + for r in rows: + fp = ( + r.get("timestamp"), + r.get("model"), + r.get("input_tokens"), + r.get("output_tokens"), + r.get("duration_ms"), + r.get("agent_id") or "", + ) + if fp in seen: + continue + seen.add(fp) + deduped.append(r) + n_dropped = len(rows) - len(deduped) + if n_dropped: + print( + f" dedup: dropped {n_dropped} exact-duplicate row(s) from {path.name}", + file=sys.stderr, + ) + return deduped + + +def remap_hash(h: str, m: dict[str, int]) -> int: + if h not in m: + m[h] = len(m) + return m[h] + + +def infer_block_size(rows: list[dict]) -> int: + """Anthropic's KV-cache uses a constant 64-token block. The proxy's + `hash_token_count` can drift below `len(hash_ids) * 64` on rows + where the prompt's trailing partial block isn't hashed — naive + division over the first row gives nonsense (53 for a 377-token + utility call). We don't infer; we constant 64. + """ + return 64 + + +def effective_input_length(row: dict, block_size: int = 64) -> int: + """Effective ``in`` for the weka request. + + We want the replayed prompt to be EXACTLY what the proxy hashed and + nothing more — the unhashed tail (typically the volatile user + message of the turn) is synthesized junk at replay time and doesn't + represent real content. So ``in`` is the proxy's own + ``hash_token_count`` whenever it's populated. Fallback chain: + + 1. ``hash_token_count`` — proxy's exact accounting, handles + last-block-partial residues + (e.g. 212 not 256 for 4 blocks). + 2. ``len(hash_ids) * block_size`` — clean block-multiple if the + proxy didn't record the count. + 3. ``input + cache_read + cache_write`` — total prompt length, + used only when no hash + coverage exists. 
+ """ + hash_tok = row.get("hash_token_count") or 0 + if hash_tok > 0: + return hash_tok + hashes = row.get("hash_ids") or [] + if hashes: + return len(hashes) * block_size + return ( + (row.get("input_tokens") or 0) + + (row.get("cache_read_input_tokens") or 0) + + (row.get("cache_write_tokens") or 0) + ) + + +def build_normal_request( + row: dict, hash_map: dict[str, int], think_time: float | None +) -> dict: + """Inner subagent request — Normal type, per weka v1 spec.""" + out = { + "t": row["t_sec"], + "type": "n", + "model": row["model"], + "in": effective_input_length(row), + "out": row.get("output_tokens") or 0, + "hash_ids": [remap_hash(h, hash_map) for h in (row.get("hash_ids") or [])], + "api_time": (row.get("duration_ms") or 0) / 1000.0, + } + if think_time is not None: + out["think_time"] = think_time + return out + + +def build_top_request( + row: dict, hash_map: dict[str, int], think_time: float | None +) -> dict: + """Top-level main-agent request — Normal or Streaming.""" + out = { + "t": row["t_sec"], + "model": row["model"], + "in": effective_input_length(row), + "out": row.get("output_tokens") or 0, + "hash_ids": [remap_hash(h, hash_map) for h in (row.get("hash_ids") or [])], + "api_time": (row.get("duration_ms") or 0) / 1000.0, + } + if think_time is not None: + out["think_time"] = think_time + if row.get("is_streaming"): + out["type"] = "s" + ttft_ms = row.get("ttft_ms") + if ttft_ms is not None: + out["ttft"] = ttft_ms / 1000.0 + else: + out["type"] = "n" + return out + + +def compute_think_times(rows: list[dict]) -> list[float | None]: + """Wall-clock gap from the previous chronological row's end. + + First row gets None (no prior). Negative gaps clamp to 0 (the proxy + timestamps are millisecond-precise; minor reorderings within the + same millisecond can produce small negatives). 
+ """ + out: list[float | None] = [] + prev_end: float | None = None + for r in rows: + if prev_end is None: + out.append(None) + else: + gap = r["t_sec"] - prev_end + out.append(max(0.0, gap)) + prev_end = r["t_sec"] + (r.get("duration_ms") or 0) / 1000.0 + return out + + +# Claude CLI version at which `x-claude-code-agent-id` became the +# canonical sub-agent signal. On rows >= this version, a labelled row +# without a header id is treated as a utility call (Title Generation, +# Statusline Agent, …), demoted to a main turn instead of getting its +# own SubagentEntry. Diverges intentionally from the dashboard, which +# still renders those as subagents — we want clean weka traces. +MIN_CLI_FOR_HEADER_AS_TRUTH = (2, 1, 139) + + +def _parse_cli_version(s: str | None) -> tuple[int, int, int] | None: + if not s: + return None + parts = s.split(".") + if len(parts) != 3: + return None + try: + return (int(parts[0]), int(parts[1]), int(parts[2])) + except ValueError: + return None + + +def _is_utility_label_only(row: dict) -> bool: + """True if the row's `subagent_label` should be ignored on new CLI. + + A "utility" row is one labelled as a sub-agent by the proxy's + pattern matcher but with no header-derived id. On CLI versions + where `x-claude-code-agent-id` is authoritative, the absence of + that header means this isn't a Task-tool-spawned sub-agent — it's + a utility call (Title Generation / Name Generation / Statusline) + that should appear in the trace as a regular main turn. + """ + if not row.get("subagent_label"): + return False + if row.get("agent_id") or row.get("thread_id"): + return False + cli = _parse_cli_version(row.get("cli_version")) + return cli is not None and cli >= MIN_CLI_FOR_HEADER_AS_TRUTH + + +def _id_group_key(row: dict) -> str | None: + """Match `idGroupKey` in subagent-runs.ts. + + Returns a stable cross-session key when we have a header-derived id, + else None (caller falls back to legacy contiguous-stretch grouping). 
+ """ + if not row.get("subagent_label"): + return None + if row.get("agent_id"): + return f"cc-agent::{row['agent_id']}" + if row.get("thread_id"): + return f"{row['subagent_label']}::thread::{row['thread_id']}" + return None + + +def build_subagent_entry( + label: str, + instance_idx: int, + items: list[tuple[dict, float | None]], + hash_map: dict[str, int], +) -> dict: + inner = [build_normal_request(row, hash_map, tt) for row, tt in items] + first_row = items[0][0] + last_row = items[-1][0] + end_t = last_row["t_sec"] + (last_row.get("duration_ms") or 0) / 1000.0 + duration_ms = int(round((end_t - first_row["t_sec"]) * 1000)) + total_tokens = sum(r["in"] + r["out"] for r in inner) + models = sorted({row["model"] for row, _ in items}) + # agent_id suffix priority: Claude Code agent-id (canonical when + # present) > Codex thread-id. Matches the dashboard's + # getSubagentRunLabel which suffixes with the last 8 chars. + cc_agent_id = first_row.get("agent_id") + thread_id = first_row.get("thread_id") + agent_id = f"{slugify(label)}_{instance_idx:03d}" + suffix = cc_agent_id or thread_id + if suffix: + agent_id = f"{agent_id}_{suffix[-8:]}" + return { + "t": first_row["t_sec"], + "type": "subagent", + "agent_id": agent_id, + "subagent_type": label, + "duration_ms": duration_ms, + "total_tokens": total_tokens, + # tool_use_count is not tracked in the proxy DB; leave as None + # (the model field defaults to None). + "tool_use_count": None, + "status": "completed", + "requests": inner, + "models": models, + } + + +def session_to_weka(session_id: str, rows: list[dict]) -> dict: + if not rows: + return { + "id": session_id, + "models": [], + "block_size": 64, + "hash_id_scope": "local", + "requests": [], + } + + # Demote utility-labelled rows (no header id) on new CLI versions + # so they appear as main turns instead of 1-inner SubagentEntries. + # We work on a shallow copy that nulls out subagent_label on those + # rows; everything else is unchanged. 
+ n_demoted = 0 + demoted_rows: list[dict] = [] + for r in rows: + if _is_utility_label_only(r): + r = {**r, "subagent_label": None} + n_demoted += 1 + demoted_rows.append(r) + if n_demoted: + print( + f" demoted {n_demoted} utility-labelled row(s) to main turns " + f"(no x-claude-code-agent-id on CLI >= " + f"{'.'.join(str(x) for x in MIN_CLI_FOR_HEADER_AS_TRUTH)})", + file=sys.stderr, + ) + rows = demoted_rows + + think_times = compute_think_times(rows) + hash_map: dict[str, int] = {} + block_size = infer_block_size(rows) + + out_requests: list[dict] = [] + instance_count: dict[str, int] = {} + models_seen: set[str] = set() + + # Pass 1: pre-collect ALL rows belonging to each header-keyed group + # across the entire session, not just within contiguous label + # stretches. A sub-agent running in the background while the user + # makes more main-agent requests would otherwise get fragmented + # into one entry per stretch. The agent-id / thread-id header is + # stable across fragments — collapse them. Mirrors the pass-1 logic + # in subagent-runs.ts:buildRequestRuns. + id_groups: dict[str, list[tuple[dict, float | None]]] = {} + for r, tt in zip(rows, think_times): + key = _id_group_key(r) + if key is None: + continue + id_groups.setdefault(key, []).append((r, tt)) + + # Pass 2: walk chronologically and emit: + # - main turn (null label) → emit at its position + # - id-keyed sub-agent, first sight → emit FULL collected group + # - id-keyed sub-agent, already seen → skip (already grouped) + # - label-only sub-agent (no header) → fall back to old stretch- + # based grouping + # + # For agent-id (Claude Code ≥ 2.1.139) groups, the per-request label + # drifts arbitrarily across the agent's life (e.g. General Agent ↔ + # Web Search Agent). We follow the dashboard and use a flat + # 'Subagent' label for those. For thread-id (Codex) groups, the + # label is stable so we keep the original. 
+ emitted: set[str] = set() + i = 0 + while i < len(rows): + row = rows[i] + if row.get("subagent_label") is None: + out_requests.append(build_top_request(row, hash_map, think_times[i])) + models_seen.add(row["model"]) + i += 1 + continue + + key = _id_group_key(row) + if key is not None: + if key not in emitted: + emitted.add(key) + items = id_groups[key] + # Claude Code agent-id groups use the flat 'Subagent' + # label since per-request system-prompt labels drift. + use_label = ( + "Subagent" if row.get("agent_id") else row["subagent_label"] + ) + instance_count[use_label] = instance_count.get(use_label, 0) + 1 + entry = build_subagent_entry( + use_label, instance_count[use_label], items, hash_map + ) + out_requests.append(entry) + models_seen.update(entry["models"]) + i += 1 + continue + + # Legacy contiguous-stretch fallback for label-only sub-agents + # (pre-2.1.139 Claude Code or rows with no header coverage). + # Same algorithm as before: collect consecutive same-label rows + # bounded by main-agent turns, group by label. 
+ stretch_rows: list[tuple[dict, float | None]] = [] + while (i < len(rows) + and rows[i].get("subagent_label") is not None + and _id_group_key(rows[i]) is None): + stretch_rows.append((rows[i], think_times[i])) + i += 1 + groups: dict[str, list[tuple[dict, float | None]]] = {} + for r, tt in stretch_rows: + groups.setdefault(r["subagent_label"], []).append((r, tt)) + for label, items in groups.items(): + instance_count[label] = instance_count.get(label, 0) + 1 + entry = build_subagent_entry( + label, instance_count[label], items, hash_map + ) + out_requests.append(entry) + models_seen.update(entry["models"]) + + return { + "id": session_id, + "models": sorted(models_seen), + "block_size": block_size, + "hash_id_scope": "local", + "requests": out_requests, + } + + +def main() -> int: + args = parse_args() + + in_files = sorted(p for p in args.in_dir.glob("*.jsonl")) + if not in_files: + sys.exit(f"ERROR: no .jsonl files in {args.in_dir}") + + args.out_dir.mkdir(parents=True, exist_ok=True) + + n_traces = 0 + n_top = 0 + n_subagent_entries = 0 + n_inner = 0 + for src in in_files: + session_id = src.stem + rows = load_session_rows(src) + trace = session_to_weka(session_id, rows) + + out_path = args.out_dir / f"{session_id}.json" + _dump_trace_inline_hash_ids(trace, out_path) + + n_traces += 1 + for entry in trace["requests"]: + if entry.get("type") == "subagent": + n_subagent_entries += 1 + n_inner += len(entry["requests"]) + else: + n_top += 1 + + print( + f"{session_id}: {len(rows)} row(s) -> " + f"{len(trace['requests'])} entries " + f"({sum(1 for e in trace['requests'] if e.get('type') == 'subagent')} subagent groups)" + f" -> {out_path}", + file=sys.stderr, + ) + + print( + f"\nWrote {n_traces} trace(s): " + f"{n_top} main turns, " + f"{n_subagent_entries} subagent groups ({n_inner} inner requests)", + file=sys.stderr, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/utils/test_process_agentic_result.py 
b/utils/test_process_agentic_result.py index c54e79736..38477b62a 100644 --- a/utils/test_process_agentic_result.py +++ b/utils/test_process_agentic_result.py @@ -1,6 +1,6 @@ """Smoke tests for process_agentic_result.py against synthetic aiperf output. -The processor consumes three files in $RESULT_DIR/trace_replay/: +The processor consumes three files in $RESULT_DIR/aiperf_artifacts/: profile_export.jsonl, profile_export_aiperf.json, and (optionally) server_metrics_export.json. It writes one $RESULT_FILENAME.json under $AGENTIC_OUTPUT_DIR. We build a minimal @@ -94,7 +94,7 @@ def _make_record( def _write_fixture(tmp_path: Path) -> Path: """Build a $RESULT_DIR with aiperf-shaped artifacts. Returns RESULT_DIR.""" result_dir = tmp_path / "results" - artifact = result_dir / "trace_replay" + artifact = result_dir / "aiperf_artifacts" artifact.mkdir(parents=True) # 5 records across 2 conversations; turn indices grow within each. @@ -264,7 +264,7 @@ def test_processor_response_cache_hit_rate_populated_when_cached_tokens_present( tmp_path: Path, ): result_dir = tmp_path / "results" - artifact = result_dir / "trace_replay" + artifact = result_dir / "aiperf_artifacts" artifact.mkdir(parents=True) rec = _make_record( conv_id="trace-A", @@ -301,7 +301,7 @@ def test_processor_parses_real_server_metrics_schema(tmp_path: Path): iterated the metrics dict like a list. 
""" result_dir = _write_fixture(tmp_path) - artifact = result_dir / "trace_replay" + artifact = result_dir / "aiperf_artifacts" server_metrics = { "schema_version": "1.0", "summary": { @@ -368,7 +368,7 @@ def test_processor_parses_real_server_metrics_schema(tmp_path: Path): def test_processor_aggregates_across_multiple_series(tmp_path: Path): """Counters with multiple series (multi-endpoint) sum across them.""" result_dir = _write_fixture(tmp_path) - artifact = result_dir / "trace_replay" + artifact = result_dir / "aiperf_artifacts" server_metrics = { "metrics": { "vllm:prefix_cache_hits": { @@ -468,7 +468,7 @@ def test_processor_loads_traces_jsonl_for_theoretical_cache(tmp_path: Path): def test_processor_supports_per_run_subdir_layout(tmp_path: Path): """When --num-profile-runs > 1, aiperf writes into a per-run subdir.""" result_dir = tmp_path / "results" - artifact = result_dir / "trace_replay" / "run_0" + artifact = result_dir / "aiperf_artifacts" / "run_0" artifact.mkdir(parents=True) rec = _make_record( conv_id="trace-A", diff --git a/utils/trace-replay b/utils/trace-replay deleted file mode 160000 index 9074e186d..000000000 --- a/utils/trace-replay +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9074e186da47998c0171a6053aecc70b24625b3b