From d06b95854686cda2591ccd1d1572e6e832676470 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Tue, 26 May 2026 10:42:03 -0700 Subject: [PATCH 1/2] Update glm-5 b200 sglang container image to nightly-dev-cu13-20260523-c112f762 --- .github/configs/nvidia-master.yaml | 8 ++++---- perf-changelog.yaml | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3d1a70d42..553e69e86 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2208,7 +2208,7 @@ qwen3.5-fp4-b200-sglang-mtp: - { tp: 2, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-b200-sglang: - image: lmsysorg/sglang:v0.5.12-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: b200 @@ -2227,7 +2227,7 @@ glm5-fp8-b200-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp8-b200-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: b200 @@ -2307,7 +2307,7 @@ glm5-fp8-b300-sglang-mtp: - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } glm5-fp4-b200-sglang: - image: lmsysorg/sglang:v0.5.12-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762 model: nvidia/GLM-5-NVFP4 model-prefix: glm5 runner: b200 @@ -2328,7 +2328,7 @@ glm5-fp4-b200-sglang: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp4-b200-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762 model: nvidia/GLM-5-NVFP4 model-prefix: glm5 runner: b200 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 208a2da6f..69756453e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3129,3 +3129,27 @@ description: - "Add --use-chat-template to run_benchmark_serving so prompts are formatted with the Qwen chat 
template (matching the other Qwen MTP recipes)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555 + +- config-keys: + - glm5-fp4-b200-sglang + description: + - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561 + +- config-keys: + - glm5-fp4-b200-sglang-mtp + description: + - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561 + +- config-keys: + - glm5-fp8-b200-sglang + description: + - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561 + +- config-keys: + - glm5-fp8-b200-sglang-mtp + description: + - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561 From 12cae11a40131af097975b2105e41297b5bb25fc Mon Sep 17 00:00:00 2001 From: Ankur Singh Date: Thu, 28 May 2026 10:16:35 -0700 Subject: [PATCH 2/2] remove cutlass gemm backend --- benchmarks/single_node/glm5_fp8_b200.sh | 1 - benchmarks/single_node/glm5_fp8_b200_mtp.sh | 1 - perf-changelog.yaml | 66 ++++++++++++++++++--- 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/benchmarks/single_node/glm5_fp8_b200.sh b/benchmarks/single_node/glm5_fp8_b200.sh index ccaa87b98..2cd84dddc 100755 --- a/benchmarks/single_node/glm5_fp8_b200.sh +++ b/benchmarks/single_node/glm5_fp8_b200.sh @@ -45,7 +45,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. 
--tool-call-parser glm47 \ --reasoning-parser glm45 \ --kv-cache-dtype fp8_e4m3 --quantization fp8 \ ---fp8-gemm-backend cutlass \ --attention-backend nsa \ --nsa-decode-backend trtllm --nsa-prefill-backend trtllm \ --moe-runner-backend flashinfer_trtllm \ diff --git a/benchmarks/single_node/glm5_fp8_b200_mtp.sh b/benchmarks/single_node/glm5_fp8_b200_mtp.sh index 5e4f98533..ecd5ca0af 100755 --- a/benchmarks/single_node/glm5_fp8_b200_mtp.sh +++ b/benchmarks/single_node/glm5_fp8_b200_mtp.sh @@ -46,7 +46,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --tool-call-parser glm47 \ --reasoning-parser glm45 \ --kv-cache-dtype fp8_e4m3 --quantization fp8 \ ---fp8-gemm-backend cutlass \ --attention-backend nsa \ --nsa-decode-backend trtllm --nsa-prefill-backend trtllm \ --moe-runner-backend flashinfer_trtllm \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 69756453e..b7142c023 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2449,6 +2449,15 @@ - "Update SGLang image from v0.5.10.post1-cu130 to v0.5.11-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1346 +- config-keys: + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Fix the eval result of dsr1 fp4 with fp8 blockwise combine" + - "Bump the image to May 19" + - "Add conc 512 new sweep point" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1566 + - config-keys: - kimik2.5-int4-h200-vllm description: @@ -2965,6 +2974,18 @@ - "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440 +- config-keys: + - kimik2.5-fp4-mi355x-vllm-disagg + description: + - "Add Kimi-K2.5-MXFP4 FP4 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569 + +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm-disagg + 
description: + - "Add MiniMax-M2.5 FP8 vLLM disagg PD recipe (1P2D, MoRI-EP + MoRI-IO) for MI355X on vllm/vllm-openai-rocm:nightly" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1569 + - config-keys: - dsv4-fp4-mi355x-vllm description: @@ -3131,24 +3152,51 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1555 - config-keys: - - glm5-fp4-b200-sglang + - minimaxm2.5-fp8-h200-vllm description: - - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561 + - "Update MiniMax-M2.5 FP8 H200 vLLM to vllm/vllm-openai:v0.20.1-ubuntu2404" + - "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: generated benchmark max-model-len, previous eval max-model-len handling, fp8 KV cache, FlashInfer attention/autotune, Triton MoE, and MiniMax QK norm fusion" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354 - config-keys: - - glm5-fp4-b200-sglang-mtp + - dsv4-fp4-mi355x-sglang description: - - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561 + - "Bump image to rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4" + - "Add args to avoid kvcache pool full issue on high conc" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1568 - config-keys: - - glm5-fp8-b200-sglang + - qwen3.5-fp8-h200-sglang + - dsr1-fp8-mi355x-sglang description: - - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1561 + - "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. 
Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang-disagg + description: + - "Add Qwen3.5-397B-A17B-FP8 MI355X SGLang disaggregated prefill-decode benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.11-rocm700-mi35x-20260511" + - "1P+1D TP8/EP1 smoke sweep for 1k1k and 8k1k (conc 8-512); MoRI transfer backend" + - "Add models.yaml server flags and multinode launch script qwen3.5_fp8_mi355x_sglang-disagg.sh" + - "8k1k row uses dp-attn=false (matches 1k1k): with --enable-dp-attention + --moe-a2a-backend mori, sglang auto-promotes moe_ep_size=tp_size=8, but is_deepep_class_backend() excludes MoRI, so num_shared_slots stays at the global value (1) and the (num_experts - num_shared_slots) % moe_ep_size assertion in fused_moe_triton/layer.py fires for Qwen3.5 (512 routed + 1 shared). Track upstream sglang; flip back to dp-attn=true once MoRI is added to is_deepep_class_backend() or shared-slot accounting is reconciled." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1570 + +- config-keys: + - glm5-fp8-mi355x-sglang-disagg + description: + - "Add GLM-5-FP8 MI355X SGLang disaggregated prefill-decode benchmark" + - "Image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523 (bumped from .v0.5.12-...-20260517 to unlock the PD-disagg MoRI overlay; matches chun-chang/sglang-disagg-qwen3.5)" + - "Adds patches/mori_conn.py overlay (bind-mounted via job.slurm) to fix sglang v0.5.12.post1 MoRI/PD startup crashes for hybrid-attention models (GLM-5 NSA, etc.): sender flatten, state_types plural fallback, consumer normalize, SWA/DSA rank/length normalize. Validated: GSM8K=0.971 strict/0.970 flexible on chun-chang. Auto-applied for v0.5.12.post1 images; opt-out via MORI_CONN_PATCH=skip." 
+ - "1P+1D TP8/EP1 CI smoke sweep for 1k1k and 8k1k (conc 8-512)" + - "Add GLM-5-FP8 models.yaml flags, setup_deps.sh (aiter gluon + transformers glm_moe_dsa), GLM-5 env tuning in env.sh" + - "Add multinode launch script glm5_fp8_mi355x_sglang-disagg.sh; server.sh sources setup_deps.sh" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1572 - config-keys: + - glm5-fp4-b200-sglang + - glm5-fp4-b200-sglang-mtp + - glm5-fp8-b200-sglang - glm5-fp8-b200-sglang-mtp description: - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762"