diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f8cc486b2..fed0f2119 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2208,7 +2208,7 @@ qwen3.5-fp4-b200-sglang-mtp: - { tp: 2, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-b200-sglang: - image: lmsysorg/sglang:v0.5.12-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: b200 @@ -2227,7 +2227,7 @@ glm5-fp8-b200-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp8-b200-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: b200 @@ -2307,7 +2307,7 @@ glm5-fp8-b300-sglang-mtp: - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } glm5-fp4-b200-sglang: - image: lmsysorg/sglang:v0.5.12-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762 model: nvidia/GLM-5-NVFP4 model-prefix: glm5 runner: b200 @@ -2328,7 +2328,7 @@ glm5-fp4-b200-sglang: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp4-b200-sglang-mtp: - image: lmsysorg/sglang:v0.5.12-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260523-c112f762 model: nvidia/GLM-5-NVFP4 model-prefix: glm5 runner: b200 diff --git a/benchmarks/single_node/glm5_fp8_b200.sh b/benchmarks/single_node/glm5_fp8_b200.sh index ccaa87b98..2cd84dddc 100755 --- a/benchmarks/single_node/glm5_fp8_b200.sh +++ b/benchmarks/single_node/glm5_fp8_b200.sh @@ -45,7 +45,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. 
--tool-call-parser glm47 \ --reasoning-parser glm45 \ --kv-cache-dtype fp8_e4m3 --quantization fp8 \ ---fp8-gemm-backend cutlass \ --attention-backend nsa \ --nsa-decode-backend trtllm --nsa-prefill-backend trtllm \ --moe-runner-backend flashinfer_trtllm \ diff --git a/benchmarks/single_node/glm5_fp8_b200_mtp.sh b/benchmarks/single_node/glm5_fp8_b200_mtp.sh index 5e4f98533..ecd5ca0af 100755 --- a/benchmarks/single_node/glm5_fp8_b200_mtp.sh +++ b/benchmarks/single_node/glm5_fp8_b200_mtp.sh @@ -46,7 +46,6 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --tool-call-parser glm47 \ --reasoning-parser glm45 \ --kv-cache-dtype fp8_e4m3 --quantization fp8 \ ---fp8-gemm-backend cutlass \ --attention-backend nsa \ --nsa-decode-backend trtllm --nsa-prefill-backend trtllm \ --moe-runner-backend flashinfer_trtllm \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b7182a39c..941ee7d72 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3192,3 +3192,12 @@ - "Add GLM-5-FP8 models.yaml flags, setup_deps.sh (aiter gluon + transformers glm_moe_dsa), GLM-5 env tuning in env.sh" - "Add multinode launch script glm5_fp8_mi355x_sglang-disagg.sh; server.sh sources setup_deps.sh" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1572 + +- config-keys: + - glm5-fp4-b200-sglang + - glm5-fp4-b200-sglang-mtp + - glm5-fp8-b200-sglang + - glm5-fp8-b200-sglang-mtp + description: + - "Update SGLang image from v0.5.12-cu130 to nightly-dev-cu13-20260523-c112f762" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1567