diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f8cc486b2..636d678e4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8854,6 +8854,77 @@ dsv4-fp4-gb300-dynamo-sglang: tp: 12 ep: 12 dp-attn: true + # --- Weiliang wide-EP sweep (srt-slurm PR#173), 18 nodes total --- + # EP=12: 15P+3D, conc=12000. + - conc-list: [12000] + prefill: + num-worker: 15 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-15p1d-dep4-dep12-18-c12000.yaml" + decode: + num-worker: 1 + tp: 12 + ep: 12 + dp-attn: true + # EP=16: 14P+4D, conc=8192. + - conc-list: [8192] + prefill: + num-worker: 14 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-18-c8192.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # EP=24: 12P+6D, conc=3000. + - conc-list: [3000] + prefill: + num-worker: 12 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep24-18-c3000.yaml" + decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true + # EP=32: 10P+8D, conc=2500. + - conc-list: [2500] + prefill: + num-worker: 10 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + # EP=40: 8P+10D, conc=2048. 
+ - conc-list: [2048] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep40-18-c2048.yaml" + decode: + num-worker: 1 + tp: 40 + ep: 40 + dp-attn: true glm5-fp8-b200-dynamo-sglang: image: lmsysorg/sglang:v0.5.11-cu130 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml new file mode 100644 index 000000000..70f703c37 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml @@ -0,0 +1,158 @@ +name: "disagg-gb300-10p1d-dep4-dep32-18-c2500" + +# Weiliang wide-EP sweep point: EP=32, 10P+8D = 18 nodes, conc=2500. +# Matches srt-slurm PR#173 zip_override EP=32 topology. +# Env vars and sglang_config from InferenceX main (not Weiliang's 0510 image). 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" + precision: "fp4" + +dynamo: + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 10 + prefill_workers: 10 + gpus_per_prefill: 4 + decode_nodes: 8 + decode_workers: 1 + gpus_per_decode: 32 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: 
"1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + enable-dp-lm-head: true + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + moe-dense-tp-size: 1 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.20 + context-length: 9216 + tensor-parallel-size: 32 + data-parallel-size: 32 + expert-parallel-size: 32 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 18432 + cuda-graph-max-bs: 1280 + + 
+benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2500" + req_rate: "inf" + use_chat_template: false + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep24-18-c3000.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep24-18-c3000.yaml new file mode 100644 index 000000000..ec64cc9a5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep24-18-c3000.yaml @@ -0,0 +1,158 @@ +name: "disagg-gb300-12p1d-dep4-dep24-18-c3000" + +# Weiliang wide-EP sweep point: EP=24, 12P+6D = 18 nodes, conc=3000. +# Matches srt-slurm PR#173 zip_override EP=24 topology. +# Env vars and sglang_config from InferenceX main (not Weiliang's 0510 image). + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" + precision: "fp4" + +dynamo: + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 12 + prefill_workers: 12 + gpus_per_prefill: 4 + decode_nodes: 6 + decode_workers: 1 + gpus_per_decode: 24 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + 
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": 
{"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + enable-dp-lm-head: true + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + moe-dense-tp-size: 1 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.20 + context-length: 9216 + tensor-parallel-size: 24 + data-parallel-size: 24 + expert-parallel-size: 24 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 18432 + cuda-graph-max-bs: 1280 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3000" + req_rate: "inf" + use_chat_template: false + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-18-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-18-c8192.yaml new file mode 100644 index 000000000..d393ae421 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-18-c8192.yaml @@ -0,0 +1,158 @@ +name: "disagg-gb300-14p1d-dep4-dep16-18-c8192" + +# Weiliang wide-EP sweep point: EP=16, 14P+4D = 18 nodes, conc=8192. +# Matches srt-slurm PR#173 zip_override EP=16 topology. +# Env vars and sglang_config from InferenceX main (not Weiliang's 0510 image). 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" + precision: "fp4" + +dynamo: + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 14 + prefill_workers: 14 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: 
"1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + enable-dp-lm-head: true + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + moe-dense-tp-size: 1 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.20 + context-length: 9216 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 18432 + cuda-graph-max-bs: 1280 + + 
+benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: false + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-15p1d-dep4-dep12-18-c12000.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-15p1d-dep4-dep12-18-c12000.yaml new file mode 100644 index 000000000..720f9cdf3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-15p1d-dep4-dep12-18-c12000.yaml @@ -0,0 +1,158 @@ +name: "disagg-gb300-15p1d-dep4-dep12-18-c12000" + +# Weiliang wide-EP sweep point: EP=12, 15P+3D = 18 nodes, conc=12000. +# Matches srt-slurm PR#173 zip_override EP=12 topology. +# Env vars and sglang_config from InferenceX main (not Weiliang's 0510 image). + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" + precision: "fp4" + +dynamo: + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 15 + prefill_workers: 15 + gpus_per_prefill: 4 + decode_nodes: 3 + decode_workers: 1 + gpus_per_decode: 12 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + 
SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": 
{"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + enable-dp-lm-head: true + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + moe-dense-tp-size: 1 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.20 + context-length: 9216 + tensor-parallel-size: 12 + data-parallel-size: 12 + expert-parallel-size: 12 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 18432 + cuda-graph-max-bs: 1280 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "12000" + req_rate: "inf" + use_chat_template: false + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep40-18-c2048.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep40-18-c2048.yaml new file mode 100644 index 000000000..3fe8a107e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep40-18-c2048.yaml @@ -0,0 +1,159 @@ +name: "disagg-gb300-8p1d-dep4-dep40-18-c2048" + +# Weiliang wide-EP sweep point: EP=40, 8P+10D = 18 nodes, conc=2048. +# Matches srt-slurm PR#173 zip_override EP=40 topology. +# Env vars and sglang_config from InferenceX main (not Weiliang's 0510 image). 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd" + precision: "fp4" + +dynamo: + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + install: true + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 8 + prefill_workers: 8 + gpus_per_prefill: 4 + decode_nodes: 10 + decode_workers: 1 + gpus_per_decode: 40 + +frontend: + type: dynamo + enable_multiple_frontends: false + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_FORCE_MISS: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1280" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS: 
"1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND: "1" + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "megamoe" + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + moe-dense-tp-size: 1 + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + enable-dp-lm-head: true + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + load-balance-method: "total_requests" + moe-a2a-backend: "megamoe" + + moe-dense-tp-size: 1 + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + disaggregation-decode-polling-interval: 8 + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.20 + context-length: 9216 + tensor-parallel-size: 40 + data-parallel-size: 40 + expert-parallel-size: 40 + ep-num-redundant-experts: 16 + enable-dp-attention: true + enable-dp-lm-head: true + max-running-requests: 18432 + 
cuda-graph-max-bs: 1280 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" + use_chat_template: false + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 935cded22..025be7f78 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3200,3 +3200,11 @@ - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523, 1P1D TP8/EP1, dp-attn false, conc [8..512]" - "MoRI conn.py overlay (48e459bd) via job.slurm; launcher qwen3.5_fp4_mi355x_sglang-disagg.sh" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1579 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Add wide-EP sweep configs (EP=12/16/24/32/40) matching srt-slurm PR#173 topology (18 nodes total)" + - "EP=12 15P+3D conc=12000, EP=16 14P+4D conc=8192, EP=24 12P+6D conc=3000, EP=32 10P+8D conc=2500, EP=40 8P+10D conc=2048 (xP+yD = prefill + decode node counts; every point runs a single decode worker spanning all decode nodes)" + - "Aligned decode params with Weiliang config: swa-full-tokens-ratio=0.20, max-running-requests=18432, moe-dense-tp-size=1; added prefill enable-dp-lm-head and cuda-graph-max-bs=512; EP=40 decode also sets ep-num-redundant-experts=16" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1586