Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8854,6 +8854,77 @@ dsv4-fp4-gb300-dynamo-sglang:
tp: 12
ep: 12
dp-attn: true
# --- Weiliang wide-EP sweep (srt-slurm PR#173), 18 nodes total ---
# Each sweep entry pairs one concurrency point with a prefill/decode split
# whose P+D counts sum to 18; the decode side is always a single worker.
# EP=12: 15P+3D, conc=12000.
# NOTE(review): "15P+3D" presumably counts nodes (3 decode nodes x 4 GPUs
# would give the tp=12 decode worker below) - confirm against the recipe.
- conc-list: [12000]
# Prefill side: 15 workers, each tp=4 / ep=4 with DP attention enabled.
prefill:
num-worker: 15
tp: 4
ep: 4
dp-attn: true
# CONFIG_FILE names the recipe YAML used for this sweep point.
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-15p1d-dep4-dep12-18-c12000.yaml"
# Decode side: one wide worker at tp=12 / ep=12 with DP attention enabled.
decode:
num-worker: 1
tp: 12
ep: 12
dp-attn: true
# EP=16: 14P+4D, conc=8192.
# NOTE(review): "14P+4D" presumably counts nodes (4 decode nodes x 4 GPUs
# would give the tp=16 decode worker below) - confirm against the recipe.
- conc-list: [8192]
# Prefill side: 14 workers, each tp=4 / ep=4 with DP attention enabled.
prefill:
num-worker: 14
tp: 4
ep: 4
dp-attn: true
# CONFIG_FILE names the recipe YAML used for this sweep point.
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-18-c8192.yaml"
# Decode side: one wide worker at tp=16 / ep=16 with DP attention enabled.
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# EP=24: 12P+6D, conc=3000.
# The referenced recipe declares 12 prefill nodes and 6 decode nodes at
# 4 GPUs per node, which yields the single 24-GPU decode worker below.
- conc-list: [3000]
# Prefill side: 12 workers, each tp=4 / ep=4 with DP attention enabled.
prefill:
num-worker: 12
tp: 4
ep: 4
dp-attn: true
# CONFIG_FILE names the recipe YAML used for this sweep point.
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep24-18-c3000.yaml"
# Decode side: one wide worker at tp=24 / ep=24 with DP attention enabled.
decode:
num-worker: 1
tp: 24
ep: 24
dp-attn: true
# EP=32: 10P+8D, conc=2500.
# The referenced recipe declares 10 prefill nodes and 8 decode nodes at
# 4 GPUs per node, which yields the single 32-GPU decode worker below.
- conc-list: [2500]
# Prefill side: 10 workers, each tp=4 / ep=4 with DP attention enabled.
prefill:
num-worker: 10
tp: 4
ep: 4
dp-attn: true
# CONFIG_FILE names the recipe YAML used for this sweep point.
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml"
# Decode side: one wide worker at tp=32 / ep=32 with DP attention enabled.
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
# EP=40: 8P+10D, conc=2048.
# NOTE(review): "8P+10D" presumably counts nodes (10 decode nodes x 4 GPUs
# would give the tp=40 decode worker below) - confirm against the recipe.
- conc-list: [2048]
# Prefill side: 8 workers, each tp=4 / ep=4 with DP attention enabled.
prefill:
num-worker: 8
tp: 4
ep: 4
dp-attn: true
# CONFIG_FILE names the recipe YAML used for this sweep point.
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep40-18-c2048.yaml"
# Decode side: one wide worker at tp=40 / ep=40 with DP attention enabled.
decode:
num-worker: 1
tp: 40
ep: 40
dp-attn: true

glm5-fp8-b200-dynamo-sglang:
image: lmsysorg/sglang:v0.5.11-cu130
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Recipe identifier. The name appears to encode the topology: 10 prefill
# workers, 1 decode worker, parallel size 4 (prefill) / 32 (decode),
# 18 nodes, concurrency 2500 - all of which match the values in this file.
name: "disagg-gb300-10p1d-dep4-dep32-18-c2500"

# Weiliang wide-EP sweep point: EP=32, 10 prefill nodes + 8 decode nodes
# = 18 nodes, conc=2500.
# Matches srt-slurm PR#173 zip_override EP=32 topology.
# Env vars and sglang_config from InferenceX main (not Weiliang's 0510 image).

# Model alias, serving container image, and weight precision.
model:
path: "deepseek-v4-pro"
container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd"
precision: "fp4"

# Dynamo is installed at a pinned commit hash.
dynamo:
hash: "81d0555ee23519cea80a42b4fe824e30368b7300"
install: true

# Job wall-clock limit (HH:MM:SS).
slurm:
time_limit: "03:00:00"

# NOTE(review): presumably forwarded to sbatch, where mem=0 requests all
# memory on each node - confirm how the launcher consumes these.
sbatch_directives:
cpus-per-task: "144"
mem: "0"

# Node/GPU layout. The numbers are self-consistent at 4 GPUs per node:
#   prefill: 10 nodes x 4 GPUs = 40 GPUs = 10 workers x 4 GPUs each
#   decode:   8 nodes x 4 GPUs = 32 GPUs =  1 worker  x 32 GPUs
# Total: 10 + 8 = 18 nodes.
resources:
gpu_type: "gb300"
gpus_per_node: 4
prefill_nodes: 10
prefill_workers: 10
gpus_per_prefill: 4
decode_nodes: 8
decode_workers: 1
gpus_per_decode: 32

# Single Dynamo frontend (multiple frontends disabled) using the "kv"
# router mode.
frontend:
type: dynamo
enable_multiple_frontends: false
# Environment variables set for the frontend process.
env:
DYN_ROUTER_LOAD_BLOCK_SIZE: "1"
# Router arguments.
# NOTE(review): overlap-score weight 0 together with no-kv-events
# presumably makes routing load-based rather than prefix-cache-aware -
# confirm against the Dynamo router documentation.
args:
router-mode: "kv"
router-kv-overlap-score-weight: 0
router-queue-threshold: 64
router-temperature: 0.5
no-kv-events: true

# SGLang backend. Prefill and decode workers get separate environments.
# The two differ only in MEGA_MOE_NUM_MAX_TOKENS_PER_RANK (8192 vs 1280)
# and in the decode-only SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION.
backend:
type: sglang

# Environment for prefill workers. All values are quoted strings.
prefill_environment:
PYTHONUNBUFFERED: "1"
SGLANG_RADIX_FORCE_MISS: "1"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
SGLANG_DEFAULT_THINKING: "1"
SGLANG_DSV4_REASONING_EFFORT: "max"
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
# 8192 equals prefill chunked-prefill-size (32768) / data-parallel-size (4);
# NOTE(review): appears to be sized from those values - confirm.
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1"
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
MC_FORCE_MNNVL: "1"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_LOG_FORWARD_ITERS: "1"
SGLANG_LOG_MS: "1"
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"

# Environment for the decode worker. All values are quoted strings.
decode_environment:
PYTHONUNBUFFERED: "1"
SGLANG_RADIX_FORCE_MISS: "1"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
SGLANG_DEFAULT_THINKING: "1"
SGLANG_DSV4_REASONING_EFFORT: "max"
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
# 1280 equals the decode cuda-graph-max-bs in sglang_config;
# NOTE(review): the two presumably need to stay in sync - confirm.
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1"
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
# Decode-only setting; not present in prefill_environment.
SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8"
MC_FORCE_MNNVL: "1"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_LOG_FORWARD_ITERS: "1"
SGLANG_LOG_MS: "1"
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"

# SGLang server arguments, given separately for prefill and decode workers.
sglang_config:
# Prefill worker arguments.
prefill:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
watchdog-timeout: 86400
skip-tokenizer-init: true
stream-interval: 60

# TP = DP = EP = 4, matching gpus_per_prefill in resources.
tensor-parallel-size: 4
data-parallel-size: 4
expert-parallel-size: 4

enable-dp-attention: true
moe-a2a-backend: "megamoe"
# NOTE(review): a deepep-config is supplied while moe-a2a-backend is
# "megamoe"; confirm it is still consumed (the decode section has none).
deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}'
moe-dense-tp-size: 1

# This worker is the prefill half of the disaggregated pair; the transfer
# backend is mooncake on both halves.
disaggregation-mode: "prefill"
disaggregation-transfer-backend: mooncake
enable-dp-lm-head: true

mem-fraction-static: 0.90
max-running-requests: 512
cuda-graph-max-bs: 512
chunked-prefill-size: 32768

# Decode worker arguments.
decode:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
watchdog-timeout: 86400
skip-tokenizer-init: true
stream-interval: 60

load-balance-method: "total_requests"
moe-a2a-backend: "megamoe"

moe-dense-tp-size: 1

# This worker is the decode half of the disaggregated pair.
disaggregation-mode: "decode"
disaggregation-transfer-backend: mooncake
disaggregation-decode-polling-interval: 8

mem-fraction-static: 0.94
swa-full-tokens-ratio: 0.20
# 9216 = benchmark isl (8192) + osl (1024).
context-length: 9216
# TP = DP = EP = 32, matching gpus_per_decode in resources.
tensor-parallel-size: 32
data-parallel-size: 32
expert-parallel-size: 32
enable-dp-attention: true
enable-dp-lm-head: true
max-running-requests: 18432
cuda-graph-max-bs: 1280


# sa-bench load settings: 8192 input / 1024 output tokens at a single
# concurrency of 2500 (same value as the "c2500" suffix in the recipe name).
benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
# Quoted string, not an integer.
concurrencies: "2500"
# NOTE(review): "inf" presumably means no request-rate cap, leaving
# concurrency as the only throttle - confirm against sa-bench.
req_rate: "inf"
use_chat_template: false
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Recipe identifier. The name appears to encode the topology: 12 prefill
# workers, 1 decode worker, parallel size 4 (prefill) / 24 (decode),
# 18 nodes, concurrency 3000 - all of which match the values in this file.
name: "disagg-gb300-12p1d-dep4-dep24-18-c3000"

# Weiliang wide-EP sweep point: EP=24, 12 prefill nodes + 6 decode nodes
# = 18 nodes, conc=3000.
# Matches srt-slurm PR#173 zip_override EP=24 topology.
# Env vars and sglang_config from InferenceX main (not Weiliang's 0510 image).

# Model alias, serving container image, and weight precision.
model:
path: "deepseek-v4-pro"
container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd"
precision: "fp4"

# Dynamo is installed at a pinned commit hash.
dynamo:
hash: "81d0555ee23519cea80a42b4fe824e30368b7300"
install: true

# Job wall-clock limit (HH:MM:SS).
slurm:
time_limit: "03:00:00"

# NOTE(review): presumably forwarded to sbatch, where mem=0 requests all
# memory on each node - confirm how the launcher consumes these.
sbatch_directives:
cpus-per-task: "144"
mem: "0"

# Node/GPU layout. The numbers are self-consistent at 4 GPUs per node:
#   prefill: 12 nodes x 4 GPUs = 48 GPUs = 12 workers x 4 GPUs each
#   decode:   6 nodes x 4 GPUs = 24 GPUs =  1 worker  x 24 GPUs
# Total: 12 + 6 = 18 nodes.
resources:
gpu_type: "gb300"
gpus_per_node: 4
prefill_nodes: 12
prefill_workers: 12
gpus_per_prefill: 4
decode_nodes: 6
decode_workers: 1
gpus_per_decode: 24

# Single Dynamo frontend (multiple frontends disabled) using the "kv"
# router mode.
frontend:
type: dynamo
enable_multiple_frontends: false
# Environment variables set for the frontend process.
env:
DYN_ROUTER_LOAD_BLOCK_SIZE: "1"
# Router arguments.
# NOTE(review): overlap-score weight 0 together with no-kv-events
# presumably makes routing load-based rather than prefix-cache-aware -
# confirm against the Dynamo router documentation.
args:
router-mode: "kv"
router-kv-overlap-score-weight: 0
router-queue-threshold: 64
router-temperature: 0.5
no-kv-events: true

# SGLang backend. Prefill and decode workers get separate environments.
# The two differ only in MEGA_MOE_NUM_MAX_TOKENS_PER_RANK (8192 vs 1280)
# and in the decode-only SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION.
backend:
type: sglang

# Environment for prefill workers. All values are quoted strings.
prefill_environment:
PYTHONUNBUFFERED: "1"
SGLANG_RADIX_FORCE_MISS: "1"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
SGLANG_DEFAULT_THINKING: "1"
SGLANG_DSV4_REASONING_EFFORT: "max"
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
# 8192 equals prefill chunked-prefill-size (32768) / data-parallel-size (4);
# NOTE(review): appears to be sized from those values - confirm.
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1"
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
MC_FORCE_MNNVL: "1"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_LOG_FORWARD_ITERS: "1"
SGLANG_LOG_MS: "1"
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"

# Environment for the decode worker. All values are quoted strings.
decode_environment:
PYTHONUNBUFFERED: "1"
SGLANG_RADIX_FORCE_MISS: "1"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
SGLANG_DEFAULT_THINKING: "1"
SGLANG_DSV4_REASONING_EFFORT: "max"
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
# 1280 equals the decode cuda-graph-max-bs in sglang_config;
# NOTE(review): the two presumably need to stay in sync - confirm.
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1"
SGLANG_OPT_USE_ONLINE_COMPRESS: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
# Decode-only setting; not present in prefill_environment.
SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8"
MC_FORCE_MNNVL: "1"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_LOG_FORWARD_ITERS: "1"
SGLANG_LOG_MS: "1"
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"

# SGLang server arguments, given separately for prefill and decode workers.
sglang_config:
# Prefill worker arguments.
prefill:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
watchdog-timeout: 86400
skip-tokenizer-init: true
stream-interval: 60

# TP = DP = EP = 4, matching gpus_per_prefill in resources.
tensor-parallel-size: 4
data-parallel-size: 4
expert-parallel-size: 4

enable-dp-attention: true
moe-a2a-backend: "megamoe"
# NOTE(review): a deepep-config is supplied while moe-a2a-backend is
# "megamoe"; confirm it is still consumed (the decode section has none).
deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}'
moe-dense-tp-size: 1

# This worker is the prefill half of the disaggregated pair; the transfer
# backend is mooncake on both halves.
disaggregation-mode: "prefill"
disaggregation-transfer-backend: mooncake
enable-dp-lm-head: true

mem-fraction-static: 0.90
max-running-requests: 512
cuda-graph-max-bs: 512
chunked-prefill-size: 32768

# Decode worker arguments.
decode:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
watchdog-timeout: 86400
skip-tokenizer-init: true
stream-interval: 60

load-balance-method: "total_requests"
moe-a2a-backend: "megamoe"

moe-dense-tp-size: 1

# This worker is the decode half of the disaggregated pair.
disaggregation-mode: "decode"
disaggregation-transfer-backend: mooncake
disaggregation-decode-polling-interval: 8

mem-fraction-static: 0.94
swa-full-tokens-ratio: 0.20
# 9216 = benchmark isl (8192) + osl (1024).
context-length: 9216
# TP = DP = EP = 24, matching gpus_per_decode in resources.
tensor-parallel-size: 24
data-parallel-size: 24
expert-parallel-size: 24
enable-dp-attention: true
enable-dp-lm-head: true
max-running-requests: 18432
cuda-graph-max-bs: 1280


# sa-bench load settings: 8192 input / 1024 output tokens at a single
# concurrency of 3000 (same value as the "c3000" suffix in the recipe name).
benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
# Quoted string, not an integer.
concurrencies: "3000"
# NOTE(review): "inf" presumably means no request-rate cap, leaving
# concurrency as the only throttle - confirm against sa-bench.
req_rate: "inf"
use_chat_template: false
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
Loading
Loading