From c3b92eb8cf791f36c6816a0af88aa4ad0ba1f185 Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Wed, 20 May 2026 17:32:01 -0700 Subject: [PATCH 1/3] Tune H100 Qwen SGLang Pareto recipe --- .github/configs/nvidia-master.yaml | 8 ++- benchmarks/single_node/qwen3.5_fp8_h100.sh | 61 ++++++++++++++++++---- perf-changelog.yaml | 9 ++++ 3 files changed, 65 insertions(+), 13 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f8cc486b2..18d215ca8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9208,11 +9208,15 @@ qwen3.5-fp8-h100-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } qwen3.5-fp8-h100-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/qwen3.5_fp8_h100.sh index 4c70657aa..fa7697b25 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h100.sh @@ -1,17 +1,15 @@ #!/usr/bin/env bash # Qwen-3.5-397B-A17B FP8 on H100 via sglang. -# Mirrors qwen3.5_fp8_h200.sh but with tighter memory accommodations: -# H100 has 80GB HBM3 vs H200's 141GB HBM3e, so weights + KV cache fit -# more snugly. Mem-fraction-static lowered from 0.8 → 0.75 and -# chunked-prefill-size from 16384 → 8192 to leave more headroom. -# Sweep tops out at conc=32 instead of 64 for the same reason. +# Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64, +# and TP8/EP8 with DP attention at conc 128-256. source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -35,7 +33,47 @@ if [ "${EVAL_ONLY}" = "true" ]; then MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN" fi -echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN" +PARALLEL_ARGS=(--tp "$TP") +if [ "${EP_SIZE}" -gt 1 ]; then + PARALLEL_ARGS+=(--expert-parallel-size "$EP_SIZE") +fi + +SCHEDULER_RECV_INTERVAL= +if [ "${DP_ATTENTION}" != "true" ]; then + case "$CONC" in + 1|2|4) + SCHEDULER_RECV_INTERVAL=2 + ;; + 8) + SCHEDULER_RECV_INTERVAL=60 + ;; + 16) + SCHEDULER_RECV_INTERVAL=30 + ;; + 32) + SCHEDULER_RECV_INTERVAL=1200 + ;; + 64) + SCHEDULER_RECV_INTERVAL=600 + ;; + *) + echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2 + exit 1 + ;; + esac +fi + +SCHEDULER_ARGS=() +if [ -n "$SCHEDULER_RECV_INTERVAL" ]; then + SCHEDULER_ARGS=(--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL") +fi +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention) +fi + +echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN" +echo "SCHEDULER_RECV_INTERVAL: ${SCHEDULER_RECV_INTERVAL:-none}" +echo "SCHEDULER_ARGS: ${SCHEDULER_ARGS[*]}" start_gpu_monitor @@ -44,15 +82,14 @@ python3 -m sglang.launch_server \ --model "$MODEL" \ --host 0.0.0.0 \ --port "$PORT" \ - --tp "$TP" \ - --expert-parallel-size "$EP_SIZE" \ + "${PARALLEL_ARGS[@]}" \ --reasoning-parser qwen3 \ --tool-call-parser qwen3_coder \ --enable-flashinfer-allreduce-fusion \ - --max-running-requests 64 \ - --chunked-prefill-size 8192 \ + --max-running-requests 256 \ + --chunked-prefill-size 16384 \ --decode-log-interval 1 \ - --mem-fraction-static 0.75 \ + --mem-fraction-static 0.8 \ --cuda-graph-max-bs "$CONC" \ --context-length "$MAX_SEQ_LEN" \ --kv-cache-dtype fp8_e4m3 \ @@ -62,7 +99,9 @@ python3 -m sglang.launch_server \ --tokenizer-worker-num 6 \ --mamba-ssm-dtype bfloat16 \ --disable-radix-cache \ + --enable-symm-mem \ --trust-remote-code \ + "${SCHEDULER_ARGS[@]}" \ > "$SERVER_LOG" 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 935cded22..66b38c131 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3200,3 +3200,12 @@ - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523, 1P1D TP8/EP1, dp-attn false, conc [8..512]" - "MoRI conn.py overlay (48e459bd) via job.slurm; launcher qwen3.5_fp4_mi355x_sglang-disagg.sh" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1579 + +- config-keys: + - qwen3.5-fp8-h100-sglang + description: + - "Tune Qwen3.5-397B-A17B-FP8 H100 SGLang aggregate recipe for 1k/1k and 8k/1k sweeps" + - "Use TP8/EP1 for conc 1-8, TP8/EP8 for conc 16-64, and TP8/EP8 DP-attention for conc 128-256" + - "Use scheduler-recv-interval values 2/60/30/1200/600 for non-DP conc 1-4/8/16/32/64" + - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544 From 4587b6e0789c2cbbae4d31900257deb9306c799b Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 28 May 2026 17:42:18 -0700 Subject: [PATCH 2/3] Use TEP for Qwen H100 high concurrency --- .github/configs/nvidia-master.yaml | 4 ++-- benchmarks/single_node/qwen3.5_fp8_h100.sh | 6 ++++-- perf-changelog.yaml | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 18d215ca8..b9fde5b29 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9210,13 +9210,13 @@ qwen3.5-fp8-h100-sglang: search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 } qwen3.5-fp8-h100-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/qwen3.5_fp8_h100.sh index fa7697b25..0c8afdd94 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h100.sh @@ -1,8 +1,7 @@ #!/usr/bin/env bash # Qwen-3.5-397B-A17B FP8 on H100 via sglang. -# Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64, -# and TP8/EP8 with DP attention at conc 128-256. +# Uses TP8/EP1 at conc 1-8 and TP8/EP8 at conc 16-256. source "$(dirname "$0")/../benchmark_lib.sh" @@ -56,6 +55,9 @@ if [ "${DP_ATTENTION}" != "true" ]; then 64) SCHEDULER_RECV_INTERVAL=600 ;; + 128|256) + SCHEDULER_RECV_INTERVAL=1920 + ;; *) echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2 exit 1 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 66b38c131..07702b9bf 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3205,7 +3205,7 @@ - qwen3.5-fp8-h100-sglang description: - "Tune Qwen3.5-397B-A17B-FP8 H100 SGLang aggregate recipe for 1k/1k and 8k/1k sweeps" - - "Use TP8/EP1 for conc 1-8, TP8/EP8 for conc 16-64, and TP8/EP8 DP-attention for conc 128-256" - - "Use scheduler-recv-interval values 2/60/30/1200/600 for non-DP conc 1-4/8/16/32/64" + - "Use TP8/EP1 for conc 1-8 and TP8/EP8 for conc 16-256" + - "Use scheduler-recv-interval values 2/60/30/1200/600/1920 for conc 1-4/8/16/32/64/128-256" - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544 From 213a1d250682ecfe0e4c7318974d3cb4c21482bd Mon Sep 17 00:00:00 2001 From: Anish Shanbhag Date: Thu, 28 May 2026 17:46:53 -0700 Subject: [PATCH 3/3] Simplify Qwen H100 TEP sweep config --- .github/configs/nvidia-master.yaml | 6 +-- benchmarks/single_node/qwen3.5_fp8_h100.sh | 56 ++++++++++------------ 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b9fde5b29..93ff553f8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9209,14 +9209,12 @@ qwen3.5-fp8-h100-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } - - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } - - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 128, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 256 } qwen3.5-fp8-h100-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/qwen3.5_fp8_h100.sh index 0c8afdd94..d3b3dd8b2 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h100.sh @@ -8,7 +8,6 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ - DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -38,42 +37,37 @@ if [ "${EP_SIZE}" -gt 1 ]; then fi SCHEDULER_RECV_INTERVAL= -if [ "${DP_ATTENTION}" != "true" ]; then - case "$CONC" in - 1|2|4) - SCHEDULER_RECV_INTERVAL=2 - ;; - 8) - SCHEDULER_RECV_INTERVAL=60 - ;; - 16) - SCHEDULER_RECV_INTERVAL=30 - ;; - 32) - SCHEDULER_RECV_INTERVAL=1200 - ;; - 64) - SCHEDULER_RECV_INTERVAL=600 - ;; - 128|256) - SCHEDULER_RECV_INTERVAL=1920 - ;; - *) - echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2 - exit 1 - ;; - esac -fi +case "$CONC" in + 1|2|4) + SCHEDULER_RECV_INTERVAL=2 + ;; + 8) + SCHEDULER_RECV_INTERVAL=60 + ;; + 16) + SCHEDULER_RECV_INTERVAL=30 + ;; + 32) + SCHEDULER_RECV_INTERVAL=1200 + ;; + 64) + SCHEDULER_RECV_INTERVAL=600 + ;; + 128|256) + SCHEDULER_RECV_INTERVAL=1920 + ;; + *) + echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2 + exit 1 + ;; +esac SCHEDULER_ARGS=() if [ -n "$SCHEDULER_RECV_INTERVAL" ]; then SCHEDULER_ARGS=(--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL") fi -if [ "${DP_ATTENTION}" = "true" ]; then - PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention) -fi -echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN" +echo "TP: $TP, EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN" echo "SCHEDULER_RECV_INTERVAL: ${SCHEDULER_RECV_INTERVAL:-none}" echo "SCHEDULER_ARGS: ${SCHEDULER_ARGS[*]}"