NVIDIA · hungryGeek16 · Feb 15, 2025 · Feb 18, 2025 · Feb 18, 2025 · Feb 19, 2025
diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend
diff --git a/build_tools/VERSION.txt b/build_tools/VERSION.txt
@@ -1 +1 @@
-2.15.0.dev0
+2.15.0
diff --git a/docs/api/pytorch.rst b/docs/api/pytorch.rst
@@ -229,6 +229,16 @@ Operation fuser
 
 .. autoapiclass:: transformer_engine.pytorch.ops.SwiGLU
 
+.. autoapifunction:: transformer_engine.pytorch.triton.mhc.mhc_fused_sinkhorn
+
+.. autoapifunction:: transformer_engine.pytorch.triton.mhc.mhc_fused_scale
+
+.. autoapifunction:: transformer_engine.pytorch.triton.mhc.mhc_fused_aggregate
+
+.. autoapifunction:: transformer_engine.pytorch.triton.mhc.mhc_fused_expand_combine
+
+.. autoapifunction:: transformer_engine.pytorch.triton.mhc.mhc_fused_projection
+
 Deprecated functions
 --------------------
 

diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
@@ -36,7 +36,7 @@ NVTE_TORCH_COMPILE=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_api_features.xml
 pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || test_fail "test_perf.py"
 
 # standard sanity and numerics tests with initialized debug
-NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "debug test_sanity.py"
+NVTE_GROUPED_LINEAR_SINGLE_PARAM=1 NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "debug test_sanity.py"
 NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "debug test_numerics.py"
 
 if [ "$RET" -ne 0 ]; then

diff --git a/qa/L0_pytorch_lint/test.sh b/qa/L0_pytorch_lint/test.sh
diff --git a/qa/L0_pytorch_unittest/test.sh b/qa/L0_pytorch_unittest/test.sh
@@ -24,7 +24,7 @@ mkdir -p "$XML_LOG_DIR"
 
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "test_sanity.py"
+NVTE_GROUPED_LINEAR_SINGLE_PARAM=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "test_sanity.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_deferred_init.xml $TE_PATH/tests/pytorch/test_deferred_init.py || test_fail "test_deferred_init.py"
 PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "test_numerics.py"
@@ -37,11 +37,11 @@ python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_quantized_tensor
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwisetensor.xml $TE_PATH/tests/pytorch/test_float8blockwisetensor.py || test_fail "test_float8blockwisetensor.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/test_grouped_tensor.xml $TE_PATH/tests/pytorch/test_grouped_tensor.py || test_fail "test_grouped_tensor.py"
+NVTE_GROUPED_LINEAR_SINGLE_PARAM=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/test_grouped_tensor.xml $TE_PATH/tests/pytorch/test_grouped_tensor.py || test_fail "test_grouped_tensor.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
-NVTE_CUTEDSL_FUSED_GROUPED_MLP=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py || test_fail "test_fusible_ops.py"
+NVTE_GROUPED_LINEAR_SINGLE_PARAM=1 NVTE_CUTEDSL_FUSED_GROUPED_MLP=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py || test_fail "test_fusible_ops.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_backward_override.xml $TE_PATH/tests/pytorch/test_backward_override.py || test_fail "test_backward_override.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py || test_fail "test_permutation.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"
@@ -58,6 +58,8 @@ fi
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_checkpoint.xml $TE_PATH/tests/pytorch/test_checkpoint.py || test_fail "test_checkpoint.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_router.xml $TE_PATH/tests/pytorch/test_fused_router.py || test_fail "test_fused_router.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_partial_cast.xml $TE_PATH/tests/pytorch/test_partial_cast.py || test_fail "test_partial_cast.py"
+# Disable autotuning to make the unit tests faster. In addition, disable the TF32 path to fully align with the precision of the PyTorch reference implementation
+NVTE_DISABLE_TRITON_AUTOTUNING=1 NVIDIA_TF32_OVERRIDE=0 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_mhc.xml $TE_PATH/tests/pytorch/test_mhc.py || test_fail "test_mhc.py"
 
 if [ "$RET" -ne 0 ]; then
     echo "Error in the following test cases:$FAILED_CASES"

diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt
@@ -32,6 +32,7 @@ add_executable(test_operator
                test_multi_unpadding.cu
                test_causal_softmax.cu
                test_swizzle.cu
+               test_multi_swizzle.cu
                test_swap_first_dims.cu
                test_grouped_gemm.cu
                ../test_common.cu)
+1 −1		CMakeLists.txt
+2 −0		README.md
+359 −199		include/cudnn_frontend/graph_interface.h
+14 −0		include/cudnn_frontend/graph_properties.h
+7 −7		include/cudnn_frontend/node/diagonal_band_mask.h
+23 −2		include/cudnn_frontend/node/scaled_dot_product_flash_attention.h
+38 −5		include/cudnn_frontend/node/sdpa_fp8_bwd.h
+7 −7		include/cudnn_frontend/node/softmax.h
+202 −192		include/cudnn_frontend/plans.h
+1 −1		include/cudnn_frontend_version.h
+1 −0		python/cudnn/README.md
+25 −1		python/cudnn/__init__.py
+137 −61		python/cudnn/discrete_grouped_gemm/discrete_grouped_gemm_dswiglu/api.py
+207 −173		...cudnn/discrete_grouped_gemm/discrete_grouped_gemm_dswiglu/discrete_B_blockscaled_grouped_gemm_dglu_dbias.py
+146 −61		python/cudnn/discrete_grouped_gemm/discrete_grouped_gemm_swiglu/api.py
+241 −128		...on/cudnn/discrete_grouped_gemm/discrete_grouped_gemm_swiglu/discrete_B_blockscaled_grouped_gemm_glu_bias.py
+37 −8		python/cudnn/discrete_grouped_gemm/discrete_kernel_utils.py
+3 −0		python/cudnn/experimental/__init__.py
+3 −0		python/cudnn/experimental/ops/__init__.py
+1,079 −0		python/cudnn/experimental/ops/sdpa.py
+189 −412		python/cudnn/grouped_gemm/grouped_gemm_dglu/api.py
+0 −4,427		python/cudnn/grouped_gemm/grouped_gemm_dglu/continugous_blockscaled_grouped_gemm_dglu_quant_dbias_fusion.py
+159 −97		python/cudnn/grouped_gemm/grouped_gemm_dglu/moe_blockscaled_grouped_gemm_dglu_dbias.py
+4 −2		python/cudnn/grouped_gemm/grouped_gemm_dswiglu/grouped_gemm_dswiglu_quant.py
+202 −403		python/cudnn/grouped_gemm/grouped_gemm_glu/api.py
+0 −3,713		python/cudnn/grouped_gemm/grouped_gemm_glu/continugous_blockscaled_grouped_gemm_glu_quant_bias_fusion.py
+218 −90		python/cudnn/grouped_gemm/grouped_gemm_glu/moe_blockscaled_grouped_gemm_glu_bias.py
+349 −60		python/cudnn/grouped_gemm/grouped_gemm_quant/api.py
+10 −5		python/cudnn/grouped_gemm/grouped_gemm_quant/grouped_gemm_quant.py
+6 −4		python/cudnn/grouped_gemm/grouped_gemm_swiglu/grouped_gemm_swiglu_quant.py
+36 −7		python/cudnn/grouped_gemm/moe_kernel_helpers.py
+12 −0		python/cudnn/sdpa/__init__.py
+581 −0		python/cudnn/sdpa/api.py
+438 −0		python/cudnn/sdpa/fmha_backward_sm100_2kernel.py
+3,016 −0		python/cudnn/sdpa/fmha_dkdv_d256_sm100.py
+1,968 −0		python/cudnn/sdpa/fmha_dq_d256_sm100.py
+1,143 −0		python/cudnn/sdpa/fmha_utils.py
+784 −0		python/cudnn/sdpa/utils.py
+24 −0		python/cudnn/wrapper.py
+47 −0		python/pygraph/pygraph.cpp
+23 −2		python/pygraph/pygraph.h
+10 −4		python/pygraph/sdpa.cpp
+2 −4		samples/cpp/misc/serialization.cpp
+2 −2		samples/cpp/sdpa/fp16_fwd_with_max_and_sum_exp.cpp
+2 −1		samples/legacy_samples/fp8_flash_mha_sample.cpp
+2 −2		samples/legacy_samples/fp8_flash_mha_sample.h
+1 −1		samples/legacy_samples/test_list.cpp
+4 −4		test/cpp/tensor.cpp
+9 −1		test/python/conftest.py
+152 −0		test/python/fe_api/test_discrete_grouped_gemm_dswiglu.py
+201 −7		test/python/fe_api/test_discrete_grouped_gemm_dswiglu_utils.py
+148 −0		test/python/fe_api/test_discrete_grouped_gemm_swiglu.py
+15 −1		test/python/fe_api/test_discrete_grouped_gemm_swiglu_utils.py
+3 −0		test/python/fe_api/test_fe_api_utils.py
+384 −0		test/python/fe_api/test_grouped_gemm_dglu.py
+19 −8		test/python/fe_api/test_grouped_gemm_dswiglu_utils.py
+389 −0		test/python/fe_api/test_grouped_gemm_glu.py
+391 −0		test/python/fe_api/test_grouped_gemm_quant.py
+45 −22		test/python/fe_api/test_grouped_gemm_quant_utils.py
+28 −12		test/python/fe_api/test_grouped_gemm_swiglu_utils.py
+157 −0		test/python/fe_api/test_sdpa_bwd.py
+352 −0		test/python/fe_api/test_sdpa_bwd_utils.py
+1 −0		test/python/sdpa/fp16.py
+6 −2		test/python/sdpa/fp8.py
+11 −9		test/python/sdpa/mxfp8.py
+4 −1		test/python/sdpa/mxfp8_ref.py
+1 −0		test/python/sdpa/random_config.py
+579 −0		test/python/test_cudnn_sdpa_op.py
+32 −6		test/python/test_mhas_v2.py
+107 −0		test/python/test_sdpa_fp8_serialization.py
+7 −1		tools/cudnn_repro/README.md
+13 −34		tools/cudnn_repro/cudnn_repro/__main__.py
+44 −0		tools/cudnn_repro/cudnn_repro/repro_command.py
+55 −0		tools/cudnn_repro/cudnn_repro/routing.py
+2 −7		tools/cudnn_repro/cudnn_repro/stage1_annotate.py
+67 −15		tools/cudnn_repro/cudnn_repro/stage1_annotate_sdpa_bwd.py
+168 −0		tools/cudnn_repro/cudnn_repro/stage1_annotate_sdpa_fp8_bwd.py
+168 −0		tools/cudnn_repro/cudnn_repro/stage1_annotate_sdpa_fp8_fwd.py
+2 −7		tools/cudnn_repro/cudnn_repro/stage2_build_repro.py
+4 −32		tools/cudnn_repro/cudnn_repro/stage2_build_repro_sdpa_bwd.py
+26 −0		tools/cudnn_repro/cudnn_repro/stage2_build_repro_sdpa_fp8_bwd.py
+26 −0		tools/cudnn_repro/cudnn_repro/stage2_build_repro_sdpa_fp8_fwd.py
+4 −31		tools/cudnn_repro/cudnn_repro/stage2_build_repro_sdpa_fwd.py
+61 −0		tools/cudnn_repro/cudnn_repro/utils.py
+172 −0		tools/cudnn_repro/tests/test_cudnn_repro_bwd.py
+90 −0		tools/cudnn_repro/tests/test_cudnn_repro_closed_loop.py
+229 −0		tools/cudnn_repro/tests/test_cudnn_repro_fp8.py
+25 −0		tools/cudnn_repro/tests/test_cudnn_repro_fp8_closed_loop.py
+94 −0		tools/cudnn_repro/tests/test_cudnn_repro_schema.py