From e6ebca7137197f2bf5973501c3eb5a4147daa73b Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Fri, 17 Apr 2026 14:14:13 +0800
Subject: [PATCH 01/15] Create test_method.py

---
 cookbook/transformers/test_method.py | 51 ++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 cookbook/transformers/test_method.py

diff --git a/cookbook/transformers/test_method.py b/cookbook/transformers/test_method.py
new file mode 100644
index 00000000..a846309d
--- /dev/null
+++ b/cookbook/transformers/test_method.py
@@ -0,0 +1,51 @@
+# Fix all random seeds for reproducibility
+import os
+import random
+import numpy as np
+import torch
+def seed_all_own(seed=1234, mode=True, is_gpu=True):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    os.environ['GLOBAL_SEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.use_deterministic_algorithms(mode)
+    if is_gpu:
+        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+        torch.cuda.manual_seed_all(seed)
+        torch.cuda.manual_seed(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.enabled = False
+        torch.backends.cudnn.benchmark = False
+    else:
+        import torch_npu
+        os.environ['HCCL_DETERMINISTIC'] = 'true'
+        os.environ['CLOSE_MATMUL_K_SHIFT'] = '1'
+        torch_npu.npu.manual_seed_all(seed)
+        torch_npu.npu.manual_seed(seed)
+    print("====== seed all ========")
+seed_all_own(is_gpu=False)
+from msprobe.pytorch import seed_all
+seed_all(mode=True)
+
+def get_time():
+    import time
+    torch.npu.synchronize()
+    return time.time()
+
+
+def set_modules_to_forward_prefetch(block, num_to_forward_prefetch):
+    for i, layer in enumerate(block.layers):
+        if i >= len(block.layers) - num_to_forward_prefetch:
+            break
+        layers_to_prefetch = [block.layers[i + j] for j in range(1, num_to_forward_prefetch + 1)]
+        layer.set_modules_to_forward_prefetch(layers_to_prefetch)
+
+
+def set_modules_to_backward_prefetch(block, num_to_backward_prefetch):
+    for i, layer in enumerate(block.layers):
+        if i < num_to_backward_prefetch:
+            continue
+        layers_to_prefetch = [block.layers[i - j] for j in range(1, num_to_backward_prefetch + 1)]
+        layer.set_modules_to_backward_prefetch(layers_to_prefetch)
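The two prefetch helpers above follow the explicit-prefetching recipe for FSDP2: each wrapped block registers the next (for forward) or previous (for backward) N blocks, so their parameter all-gathers are issued early and overlap with compute. A minimal usage sketch, assuming every entry of `model.layers` has already been wrapped with `torch.distributed.fsdp.fully_shard` so it exposes the FSDPModule prefetch API; `model` and `batch` are illustrative names, not part of this patch:

    # Sketch only: `model` is an FSDP2-sharded module. Prefetch depth 2 means the
    # all-gathers for layers i+1 and i+2 are issued while layer i computes.
    from test_method import set_modules_to_forward_prefetch, set_modules_to_backward_prefetch

    set_modules_to_forward_prefetch(model, num_to_forward_prefetch=2)
    set_modules_to_backward_prefetch(model, num_to_backward_prefetch=2)
    loss = model(batch).sum()
    loss.backward()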
From 1e7f7e538afa5d1cdc77ace5f08711b1e5bc1e8b Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Tue, 21 Apr 2026 18:40:29 +0800
Subject: [PATCH 02/15] npu moe patch

---
 cookbook/transformers/fsdp2_moe.py        |  19 +++-
 cookbook/transformers/fsdp2_moe_npu.sh    |  10 ++
 cookbook/transformers/monkey_patch_npu.py | 116 ++++++++++++++++++++++
 cookbook/transformers/test_method.py      |  51 ----------
 src/twinkle/utils/__init__.py             |   2 +-
 src/twinkle/utils/utils.py                |   8 ++
 6 files changed, 151 insertions(+), 55 deletions(-)
 create mode 100644 cookbook/transformers/fsdp2_moe_npu.sh
 create mode 100644 cookbook/transformers/monkey_patch_npu.py
 delete mode 100644 cookbook/transformers/test_method.py

diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py
index 3ea649d3..0fdfbdb5 100644
--- a/cookbook/transformers/fsdp2_moe.py
+++ b/cookbook/transformers/fsdp2_moe.py
@@ -8,6 +8,7 @@
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
 from twinkle.preprocessor import SelfCognitionProcessor
+from twinkle.utils import is_torch_npu_available
 
 # Construct a device_mesh, fsdp=4, dp=2
 device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2)
@@ -16,6 +17,14 @@
 
 logger = get_logger()
 
+# npu patch
+if is_torch_npu_available():
+    from monkey_patch_npu import apply_hf_moe_grouped_mm_patch
+    apply_hf_moe_grouped_mm_patch()
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+
+
 def eval(model):
     # 100 Samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
@@ -30,8 +39,8 @@ def eval(model):
     metrics = model.calculate_metric(is_training=False)
     return metrics
 
-
 def train():
+    # 1000 samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
 
     # Set template to prepare encoding
@@ -44,8 +53,8 @@ def train():
     dataloader = DataLoader(dataset=dataset, batch_size=8)
     # Use a TransformersModel, transformer_cls_names_to_wrap=Qwen3MoeSparseMoeBlock to avoid hang of fsdp2
     model = TransformersModel(model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507', fsdp_config={'transformer_cls_names_to_wrap':['Qwen3MoeSparseMoeBlock']})
-    # Patch MoE model to fix the hang bug, support transformers==4.*
-    model.apply_patch('ms://twinkle-kit/qwen3_moe_transformers4_patch')
+    # # Patch MoE model to fix the hang bug, support transformers==4.*
+    # model.apply_patch('ms://twinkle-kit/qwen3_moe_transformers4_patch')
     lora_config = LoraConfig(
         r=8,
         lora_alpha=32,
@@ -65,7 +74,9 @@ def train():
     logger.info(f'Total steps: {len(dataloader)}')
     loss_metric = 99.0
     # lora: 34G * 8
+    rank = dist.get_rank()
     for step, batch in enumerate(dataloader):
+        start_time = get_time()
         # Do forward and backward
         model.forward_backward(inputs=batch)
         # Step
@@ -81,6 +92,8 @@ def train():
         if loss_metric > float(metrics['loss']):
             model.save(f'checkpoint-{step}')
             loss_metric = float(metrics['loss'])
+        if rank == 0:
+            print(f"step_time: {get_time() - start_time}")
 
     model.save(f'last-checkpoint')

diff --git a/cookbook/transformers/fsdp2_moe_npu.sh b/cookbook/transformers/fsdp2_moe_npu.sh
new file mode 100644
index 00000000..80de2583
--- /dev/null
+++ b/cookbook/transformers/fsdp2_moe_npu.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+# CANN loading
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+# export CPU_AFFINITY_CONF=2
+# export HCCL_BUFFSIZE=400
+# export HCCL_CONNECT_TIMEOUT=1600
+
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 fsdp2_moe.py >> gemm_change.log

diff --git a/cookbook/transformers/monkey_patch_npu.py b/cookbook/transformers/monkey_patch_npu.py
new file mode 100644
index 00000000..7557e741
--- /dev/null
+++ b/cookbook/transformers/monkey_patch_npu.py
@@ -0,0 +1,116 @@
+import functools
+import torch
+from twinkle.utils import is_torch_npu_available
+
+if is_torch_npu_available:
+    import torch_npu
+
+class GmmFunction(torch.autograd.Function):
+    """
+    Mirrors the semantics of transformers' _grouped_mm, but consumes the 3D weight directly:
+    x: [M, K]
+    group_list: [E]
+    weight_ekn: [E, K, N]
+
+    Output:
+    y: [M, N]
+    """
+
+    @staticmethod
+    def forward(ctx, x: torch.Tensor, group_list: torch.Tensor, weight_ekn: torch.Tensor):
+        assert x.dim() == 2, f"x must be [M, K], got {tuple(x.shape)}"
+        assert group_list.dim() == 1, f"group_list must be [E], got {tuple(group_list.shape)}"
+        assert weight_ekn.dim() == 3, f"weight_ekn must be [E, K, N], got {tuple(weight_ekn.shape)}"
+        assert group_list.numel() == weight_ekn.size(0), (
+            f"group_list len {group_list.numel()} != num_experts {weight_ekn.size(0)}"
+        )
+        assert x.size(1) == weight_ekn.size(1), (
+            f"input dim mismatch: x.shape={tuple(x.shape)}, weight_ekn.shape={tuple(weight_ekn.shape)}"
+        )
+
+        group_list = group_list.to(torch.int64)
+
+        ctx.save_for_backward(x, group_list, weight_ekn)
+
+        # Key point: a single x paired with a single 3D weight
+        outputs = torch_npu.npu_grouped_matmul(
+            [x],
+            [weight_ekn],
+            group_list=group_list,
+            group_type=0,
+            split_item=2,
+            group_list_type=1,
+        )
+        return outputs[0]
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):
+        x, group_list, weight_ekn = ctx.saved_tensors
+
+        # --------------------------------------------------
+        # 1) grad_input
+        #    dX_i = dY_i @ W_i^T
+        #    weight_ekn.transpose(-2, -1): [E, N, K]
+        # --------------------------------------------------
+        grad_input = torch_npu.npu_grouped_matmul(
+            [grad_output],
+            [weight_ekn.transpose(-2, -1).contiguous()],
+            bias=None,
+            group_list=group_list,
+            group_type=0,
+            split_item=2,
+            group_list_type=1,
+        )[0]
+
+        # --------------------------------------------------
+        # 2) grad_weight
+        #    dW_i = X_i^T @ dY_i
+        #
+        # Carry over the constraint verified earlier:
+        # with group_type=2, x must be a transposed view, not made contiguous()
+        #
+        # The output here is expected to be packed [E, K, N]
+        # --------------------------------------------------
+        grad_weight = torch_npu.npu_grouped_matmul(
+            [x.transpose(0, 1)],
+            [grad_output],
+            bias=None,
+            group_list=group_list,
+            group_type=2,
+            split_item=3,
+            group_list_type=1,
+        )[0]
+
+        return grad_input, None, grad_weight.contiguous()
+
+
+def _grouped_mm_npu(input: torch.Tensor, weight_ekn: torch.Tensor, offs: torch.Tensor) -> torch.Tensor:
+    """
+    Aligned with transformers' _grouped_mm(input, weight, offs)
+
+    input: [M, K]
+    weight_ekn: [E, K, N]
+    offs: [E] cumulative ends
+    """
+    assert input.dim() == 2, f"input must be [M, K], got {tuple(input.shape)}"
+    assert weight_ekn.dim() == 3, f"weight_ekn must be [E, K, N], got {tuple(weight_ekn.shape)}"
+    assert offs.dim() == 1, f"offs must be [E], got {tuple(offs.shape)}"
+    assert weight_ekn.size(0) == offs.numel(), (
+        f"weight_ekn.size(0)={weight_ekn.size(0)} != offs.numel()={offs.numel()}"
+    )
+
+    counts = torch.empty_like(offs)
+    counts[0] = offs[0]
+    if offs.numel() > 1:
+        counts[1:] = offs[1:] - offs[:-1]
+    counts = counts.to(torch.int64)
+
+    return GmmFunction.apply(input, counts, weight_ekn)
+
+
+def apply_hf_moe_grouped_mm_patch():
+    import transformers.integrations.moe as hf_moe
+
+    hf_moe._grouped_mm = _grouped_mm_npu
+    print("[PATCH] transformers.integrations.moe._grouped_mm -> _grouped_mm_npu")
+

diff --git a/cookbook/transformers/test_method.py b/cookbook/transformers/test_method.py
deleted file mode 100644
index a846309d..00000000
--- a/cookbook/transformers/test_method.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Fix all random seeds for reproducibility
-import os
-import random
-import numpy as np
-import torch
-def seed_all_own(seed=1234, mode=True, is_gpu=True):
-    random.seed(seed)
-    os.environ['PYTHONHASHSEED'] = str(seed)
-    os.environ['GLOBAL_SEED'] = str(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.use_deterministic_algorithms(mode)
-    if is_gpu:
-        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
-        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
-        torch.cuda.manual_seed_all(seed)
-        torch.cuda.manual_seed(seed)
-        torch.backends.cudnn.deterministic = True
-        torch.backends.cudnn.enabled = False
-        torch.backends.cudnn.benchmark = False
-    else:
-        import torch_npu
-        os.environ['HCCL_DETERMINISTIC'] = 'true'
-        os.environ['CLOSE_MATMUL_K_SHIFT'] = '1'
-        torch_npu.npu.manual_seed_all(seed)
-        torch_npu.npu.manual_seed(seed)
-    print("====== seed all ========")
-seed_all_own(is_gpu=False)
-from msprobe.pytorch import seed_all
-seed_all(mode=True)
-
-def get_time():
-    import time
-    torch.npu.synchronize()
-    return time.time()
-
-
-def set_modules_to_forward_prefetch(block, num_to_forward_prefetch):
-    for i, layer in enumerate(block.layers):
-        if i >= len(block.layers) - num_to_forward_prefetch:
-            break
-        layers_to_prefetch = [block.layers[i + j] for j in range(1, num_to_forward_prefetch + 1)]
-        layer.set_modules_to_forward_prefetch(layers_to_prefetch)
-
-
-def set_modules_to_backward_prefetch(block, num_to_backward_prefetch):
-    for i, layer in enumerate(block.layers):
-        if i < num_to_backward_prefetch:
-            continue
-        layers_to_prefetch = [block.layers[i - j] for j in range(1, num_to_backward_prefetch + 1)]
-        layer.set_modules_to_backward_prefetch(layers_to_prefetch)

diff --git a/src/twinkle/utils/__init__.py b/src/twinkle/utils/__init__.py
index cca7e63b..b852fa99 100644
--- a/src/twinkle/utils/__init__.py
+++ b/src/twinkle/utils/__init__.py
@@ -14,5 +14,5 @@
                     stateless_init_process_group, to_device)
 from .transformers_utils import find_all_linears, find_layers, get_modules_to_not_convert
 from .unsafe import check_unsafe, trust_remote_code
-from .utils import copy_files_by_pattern, deep_getattr
+from .utils import copy_files_by_pattern, deep_getattr, is_torch_npu_available
 from .vision_tools import load_image, load_mm_file

diff --git a/src/twinkle/utils/utils.py b/src/twinkle/utils/utils.py
index 0b0ae4d0..610489eb 100644
--- a/src/twinkle/utils/utils.py
+++ b/src/twinkle/utils/utils.py
@@ -77,3 +77,11 @@ def should_exclude_file(file_path, file_name):
         destination = os.path.join(dest_dir, file_name)
         if not os.path.exists(destination):
             shutil.copy2(file_path, destination)
+
+def is_torch_npu_available():
+    try:
+        import torch
+        import torch_npu  # noqa: F401
+        return hasattr(torch, "npu") and torch.npu.is_available()
+    except Exception:
+        return False
\ No newline at end of file
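For reference, the contract `_grouped_mm_npu` has to honor — `input` rows grouped by expert, `offs` holding cumulative row ends, `weight_ekn` holding one [K, N] matrix per expert — can be spelled out as a plain-PyTorch loop. A minimal CPU sketch of that contract, handy for spot-checking the NPU kernel on small shapes (`grouped_mm_reference` is our name, not part of the patch):

    import torch

    def grouped_mm_reference(x, weight_ekn, offs):
        # x: [M, K]; weight_ekn: [E, K, N]; offs: [E] cumulative row ends per expert.
        counts = torch.diff(offs, prepend=offs.new_zeros(1))  # rows owned by each expert
        chunks = torch.split(x, counts.tolist(), dim=0)       # one [m_e, K] slice per expert
        return torch.cat([c @ w for c, w in zip(chunks, weight_ekn)], dim=0)  # [M, N]

    x = torch.randn(10, 4)
    w = torch.randn(3, 4, 5)              # 3 experts
    offs = torch.tensor([4, 7, 10])       # experts own 4, 3 and 3 rows
    y = grouped_mm_reference(x, w, offs)  # [10, 5]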
From 1cebcfa69d68a7dee58a798ef2816aeaacab0eaa Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Tue, 21 Apr 2026 18:43:52 +0800
Subject: [PATCH 03/15] delete log

---
 cookbook/transformers/fsdp2_moe_npu.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cookbook/transformers/fsdp2_moe_npu.sh b/cookbook/transformers/fsdp2_moe_npu.sh
index 80de2583..8df9128e 100644
--- a/cookbook/transformers/fsdp2_moe_npu.sh
+++ b/cookbook/transformers/fsdp2_moe_npu.sh
@@ -7,4 +7,4 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh
 # export HCCL_BUFFSIZE=400
 # export HCCL_CONNECT_TIMEOUT=1600
 
-ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 fsdp2_moe.py >> gemm_change.log
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 fsdp2_moe.py

From 72f79bf4a57a9f23622e5b7a28019755bb2c5254 Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Tue, 21 Apr 2026 19:45:24 +0800
Subject: [PATCH 04/15] Apply suggestion from @gemini-code-assist[bot]

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 cookbook/transformers/monkey_patch_npu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cookbook/transformers/monkey_patch_npu.py b/cookbook/transformers/monkey_patch_npu.py
index 7557e741..15a97b77 100644
--- a/cookbook/transformers/monkey_patch_npu.py
+++ b/cookbook/transformers/monkey_patch_npu.py
@@ -2,7 +2,7 @@
 import torch
 from twinkle.utils import is_torch_npu_available
 
-if is_torch_npu_available:
+if is_torch_npu_available():
     import torch_npu
 
 class GmmFunction(torch.autograd.Function):
From 32d929561df1f184e4a042494807784fb567c07b Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Tue, 21 Apr 2026 19:48:50 +0800
Subject: [PATCH 05/15] Update fsdp2_moe.py

---
 cookbook/transformers/fsdp2_moe.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py
index 0fdfbdb5..5e268d77 100644
--- a/cookbook/transformers/fsdp2_moe.py
+++ b/cookbook/transformers/fsdp2_moe.py
@@ -40,7 +40,6 @@ def eval(model):
     return metrics
 
 def train():
-    # 1000 samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
 
     # Set template to prepare encoding
@@ -53,8 +52,8 @@ def train():
     dataloader = DataLoader(dataset=dataset, batch_size=8)
     # Use a TransformersModel, transformer_cls_names_to_wrap=Qwen3MoeSparseMoeBlock to avoid hang of fsdp2
     model = TransformersModel(model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507', fsdp_config={'transformer_cls_names_to_wrap':['Qwen3MoeSparseMoeBlock']})
-    # # Patch MoE model to fix the hang bug, support transformers==4.*
-    # model.apply_patch('ms://twinkle-kit/qwen3_moe_transformers4_patch')
+    # Patch MoE model to fix the hang bug, support transformers==4.*
+    model.apply_patch('ms://twinkle-kit/qwen3_moe_transformers4_patch')
     lora_config = LoraConfig(
         r=8,
         lora_alpha=32,
@@ -76,7 +75,6 @@ def train():
     logger.info(f'Total steps: {len(dataloader)}')
     loss_metric = 99.0
     # lora: 34G * 8
     rank = dist.get_rank()
     for step, batch in enumerate(dataloader):
-        start_time = get_time()
         # Do forward and backward
         model.forward_backward(inputs=batch)
         # Step
@@ -92,8 +90,6 @@ def train():
         if loss_metric > float(metrics['loss']):
             model.save(f'checkpoint-{step}')
             loss_metric = float(metrics['loss'])
-        if rank == 0:
-            print(f"step_time: {get_time() - start_time}")
 
     model.save(f'last-checkpoint')
From 08c2320566437fc729c08fb4dd171034251704f6 Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Tue, 21 Apr 2026 19:50:27 +0800
Subject: [PATCH 06/15] Remove unnecessary blank line in fsdp2_moe.py

---
 cookbook/transformers/fsdp2_moe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py
index 5e268d77..8cf6a5cb 100644
--- a/cookbook/transformers/fsdp2_moe.py
+++ b/cookbook/transformers/fsdp2_moe.py
@@ -25,7 +25,6 @@
     import torch_npu
     from torch_npu.contrib import transfer_to_npu
 
-
 def eval(model):
     # 100 Samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))

From f2c350f9d2ebdfe53dc5a6557757e62f4a76e377 Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Tue, 21 Apr 2026 19:51:03 +0800
Subject: [PATCH 07/15] Add NPU patch for HF MoE grouped MM

---
 cookbook/transformers/fsdp2_moe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py
index 8cf6a5cb..6b929fd2 100644
--- a/cookbook/transformers/fsdp2_moe.py
+++ b/cookbook/transformers/fsdp2_moe.py
@@ -17,6 +17,7 @@
 
 logger = get_logger()
 
+
 # npu patch
 if is_torch_npu_available():
     from monkey_patch_npu import apply_hf_moe_grouped_mm_patch
@@ -38,6 +39,7 @@ def eval(model):
     metrics = model.calculate_metric(is_training=False)
     return metrics
 
+
 def train():
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
From 3021e73cf9f67921f07be1a0ba2396efef1f07a7 Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Tue, 21 Apr 2026 19:52:30 +0800
Subject: [PATCH 08/15] Update fsdp2_moe.py

---
 cookbook/transformers/fsdp2_moe.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py
index 6b929fd2..a0ed3307 100644
--- a/cookbook/transformers/fsdp2_moe.py
+++ b/cookbook/transformers/fsdp2_moe.py
@@ -25,7 +25,6 @@
     import torch_npu
     from torch_npu.contrib import transfer_to_npu
 
-
 def eval(model):
     # 100 Samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
@@ -74,7 +73,6 @@ def train():
     logger.info(f'Total steps: {len(dataloader)}')
     loss_metric = 99.0
     # lora: 34G * 8
-    rank = dist.get_rank()
     for step, batch in enumerate(dataloader):
         # Do forward and backward
         model.forward_backward(inputs=batch)

From e1f172b58aef74d8002b19abd7b5d7bac42d8d46 Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Wed, 22 Apr 2026 09:15:04 +0800
Subject: [PATCH 09/15] Clean up comments in monkey_patch_npu.py

Removed Chinese comments and unnecessary code comments for clarity.
---
 cookbook/transformers/monkey_patch_npu.py | 36 ++---------------------
 1 file changed, 2 insertions(+), 34 deletions(-)

diff --git a/cookbook/transformers/monkey_patch_npu.py b/cookbook/transformers/monkey_patch_npu.py
index 15a97b77..a12f5c33 100644
--- a/cookbook/transformers/monkey_patch_npu.py
+++ b/cookbook/transformers/monkey_patch_npu.py
@@ -6,16 +6,6 @@
     import torch_npu
 
 class GmmFunction(torch.autograd.Function):
-    """
-    Mirrors the semantics of transformers' _grouped_mm, but consumes the 3D weight directly:
-    x: [M, K]
-    group_list: [E]
-    weight_ekn: [E, K, N]
-
-    Output:
-    y: [M, N]
-    """
-
     @staticmethod
     def forward(ctx, x: torch.Tensor, group_list: torch.Tensor, weight_ekn: torch.Tensor):
         assert x.dim() == 2, f"x must be [M, K], got {tuple(x.shape)}"
@@ -32,7 +22,6 @@ def forward(ctx, x: torch.Tensor, group_list: torch.Tensor, weight_ekn: torch.Te
 
         ctx.save_for_backward(x, group_list, weight_ekn)
 
-        # Key point: a single x paired with a single 3D weight
         outputs = torch_npu.npu_grouped_matmul(
             [x],
             [weight_ekn],
@@ -46,12 +35,7 @@ def forward(ctx, x: torch.Tensor, group_list: torch.Tensor, weight_ekn: torch.Te
     @staticmethod
     def backward(ctx, grad_output: torch.Tensor):
         x, group_list, weight_ekn = ctx.saved_tensors
-
-        # --------------------------------------------------
-        # 1) grad_input
-        #    dX_i = dY_i @ W_i^T
-        #    weight_ekn.transpose(-2, -1): [E, N, K]
-        # --------------------------------------------------
+
         grad_input = torch_npu.npu_grouped_matmul(
             [grad_output],
             [weight_ekn.transpose(-2, -1).contiguous()],
@@ -61,16 +45,7 @@ def backward(ctx, grad_output: torch.Tensor):
             split_item=2,
             group_list_type=1,
         )[0]
-
-        # --------------------------------------------------
-        # 2) grad_weight
-        #    dW_i = X_i^T @ dY_i
-        #
-        # Carry over the constraint verified earlier:
-        # with group_type=2, x must be a transposed view, not made contiguous()
-        #
-        # The output here is expected to be packed [E, K, N]
-        # --------------------------------------------------
+
         grad_weight = torch_npu.npu_grouped_matmul(
             [x.transpose(0, 1)],
             [grad_output],
@@ -85,13 +60,6 @@ def backward(ctx, grad_output: torch.Tensor):
 
 
 def _grouped_mm_npu(input: torch.Tensor, weight_ekn: torch.Tensor, offs: torch.Tensor) -> torch.Tensor:
-    """
-    Aligned with transformers' _grouped_mm(input, weight, offs)
-
-    input: [M, K]
-    weight_ekn: [E, K, N]
-    offs: [E] cumulative ends
-    """
     assert input.dim() == 2, f"input must be [M, K], got {tuple(input.shape)}"
     assert weight_ekn.dim() == 3, f"weight_ekn must be [E, K, N], got {tuple(weight_ekn.shape)}"
     assert offs.dim() == 1, f"offs must be [E], got {tuple(offs.shape)}"
From 87a999c3bebcf5fc4a378cfd8b01c8d1df9a1e87 Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Wed, 22 Apr 2026 15:06:17 +0800
Subject: [PATCH 10/15] Update fsdp2_moe_npu.sh

---
 cookbook/transformers/fsdp2_moe_npu.sh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/cookbook/transformers/fsdp2_moe_npu.sh b/cookbook/transformers/fsdp2_moe_npu.sh
index 8df9128e..349f9d0d 100644
--- a/cookbook/transformers/fsdp2_moe_npu.sh
+++ b/cookbook/transformers/fsdp2_moe_npu.sh
@@ -3,8 +3,4 @@
 # CANN loading
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 
-# export CPU_AFFINITY_CONF=2
-# export HCCL_BUFFSIZE=400
-# export HCCL_CONNECT_TIMEOUT=1600
-
 ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 fsdp2_moe.py

From 3021e73cf9f67921f07be1a0ba2396efef1f07a7 Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Fri, 24 Apr 2026 17:27:38 +0800
Subject: [PATCH 11/15] clean code

---
 cookbook/transformers/fsdp2_moe.py      | 7 +++----
 src/twinkle/kernel/__init__.py          | 1 +
 .../twinkle/kernel}/monkey_patch_npu.py | 9 +++++----
 3 files changed, 9 insertions(+), 8 deletions(-)
 rename {cookbook/transformers => src/twinkle/kernel}/monkey_patch_npu.py (95%)

diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py
index a0ed3307..cdc24ad6 100644
--- a/cookbook/transformers/fsdp2_moe.py
+++ b/cookbook/transformers/fsdp2_moe.py
@@ -9,6 +9,7 @@
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
 from twinkle.preprocessor import SelfCognitionProcessor
 from twinkle.utils import is_torch_npu_available
+from twinkle.kernel import apply_npu_patch
 
 # Construct a device_mesh, fsdp=4, dp=2
 device_mesh = DeviceMesh.from_sizes(fsdp_size=4, dp_size=2)
@@ -20,10 +21,8 @@
 
 # npu patch
 if is_torch_npu_available():
-    from monkey_patch_npu import apply_hf_moe_grouped_mm_patch
-    apply_hf_moe_grouped_mm_patch()
-    import torch_npu
-    from torch_npu.contrib import transfer_to_npu
+    apply_npu_patch()
+
 
 def eval(model):
     # 100 Samples

diff --git a/src/twinkle/kernel/__init__.py b/src/twinkle/kernel/__init__.py
index fb07ba03..8de317bf 100644
--- a/src/twinkle/kernel/__init__.py
+++ b/src/twinkle/kernel/__init__.py
@@ -7,6 +7,7 @@
 from .function import apply_function_kernel, register_function_kernel
 from .layer import apply_layer_kernel, register_layer_batch, register_layer_kernel
 from .registry import register_external_layer as _register_external_layer
+from .monkey_patch_npu import apply_npu_patch
 
 logger = getLogger(__name__)

diff --git a/cookbook/transformers/monkey_patch_npu.py b/src/twinkle/kernel/monkey_patch_npu.py
similarity index 95%
rename from cookbook/transformers/monkey_patch_npu.py
rename to src/twinkle/kernel/monkey_patch_npu.py
index a12f5c33..d1e94511 100644
--- a/cookbook/transformers/monkey_patch_npu.py
+++ b/src/twinkle/kernel/monkey_patch_npu.py
@@ -1,9 +1,5 @@
 import functools
 import torch
-from twinkle.utils import is_torch_npu_available
-
-if is_torch_npu_available():
-    import torch_npu
 
 class GmmFunction(torch.autograd.Function):
     @staticmethod
@@ -82,3 +78,8 @@ def apply_hf_moe_grouped_mm_patch():
 
     hf_moe._grouped_mm = _grouped_mm_npu
     print("[PATCH] transformers.integrations.moe._grouped_mm -> _grouped_mm_npu")
+def apply_npu_patch():
+    import torch
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+    apply_hf_moe_grouped_mm_patch()
\ No newline at end of file
From 588ae42796a746830a9d7c29cdee834a6e2417de Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Fri, 24 Apr 2026 18:13:31 +0800
Subject: [PATCH 12/15] update

---
 src/twinkle/kernel/monkey_patch_npu.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/twinkle/kernel/monkey_patch_npu.py b/src/twinkle/kernel/monkey_patch_npu.py
index d1e94511..cff8ebc7 100644
--- a/src/twinkle/kernel/monkey_patch_npu.py
+++ b/src/twinkle/kernel/monkey_patch_npu.py
@@ -1,5 +1,6 @@
 import functools
 import torch
+import torch_npu
 
 class GmmFunction(torch.autograd.Function):
     @staticmethod
From ad59ddd55bf43ee3887f2da381b7faeba50d4fe3 Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Sat, 25 Apr 2026 15:03:40 +0800
Subject: [PATCH 13/15] fix the linting

---
 cookbook/transformers/fsdp2_moe.py     |  4 ++--
 src/twinkle/kernel/__init__.py         |  2 +-
 src/twinkle/kernel/monkey_patch_npu.py | 32 +++++++++++++-------------
 src/twinkle/utils/utils.py             |  5 +++--
 4 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py
index cdc24ad6..12259877 100644
--- a/cookbook/transformers/fsdp2_moe.py
+++ b/cookbook/transformers/fsdp2_moe.py
@@ -22,8 +22,8 @@
 # npu patch
 if is_torch_npu_available():
     apply_npu_patch()
-
-
+
+
 def eval(model):
     # 100 Samples
     dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))

diff --git a/src/twinkle/kernel/__init__.py b/src/twinkle/kernel/__init__.py
index 8de317bf..a042afe8 100644
--- a/src/twinkle/kernel/__init__.py
+++ b/src/twinkle/kernel/__init__.py
@@ -6,8 +6,8 @@
 from .base import DeviceType, ModeType, is_kernels_enabled
 from .function import apply_function_kernel, register_function_kernel
 from .layer import apply_layer_kernel, register_layer_batch, register_layer_kernel
-from .registry import register_external_layer as _register_external_layer
 from .monkey_patch_npu import apply_npu_patch
+from .registry import register_external_layer as _register_external_layer
 
 logger = getLogger(__name__)

diff --git a/src/twinkle/kernel/monkey_patch_npu.py b/src/twinkle/kernel/monkey_patch_npu.py
index cff8ebc7..90616c9e 100644
--- a/src/twinkle/kernel/monkey_patch_npu.py
+++ b/src/twinkle/kernel/monkey_patch_npu.py
@@ -2,18 +2,18 @@
 import torch
 import torch_npu
 
+
 class GmmFunction(torch.autograd.Function):
+
     @staticmethod
     def forward(ctx, x: torch.Tensor, group_list: torch.Tensor, weight_ekn: torch.Tensor):
-        assert x.dim() == 2, f"x must be [M, K], got {tuple(x.shape)}"
-        assert group_list.dim() == 1, f"group_list must be [E], got {tuple(group_list.shape)}"
-        assert weight_ekn.dim() == 3, f"weight_ekn must be [E, K, N], got {tuple(weight_ekn.shape)}"
+        assert x.dim() == 2, f'x must be [M, K], got {tuple(x.shape)}'
+        assert group_list.dim() == 1, f'group_list must be [E], got {tuple(group_list.shape)}'
+        assert weight_ekn.dim() == 3, f'weight_ekn must be [E, K, N], got {tuple(weight_ekn.shape)}'
         assert group_list.numel() == weight_ekn.size(0), (
-            f"group_list len {group_list.numel()} != num_experts {weight_ekn.size(0)}"
-        )
+            f'group_list len {group_list.numel()} != num_experts {weight_ekn.size(0)}')
         assert x.size(1) == weight_ekn.size(1), (
-            f"input dim mismatch: x.shape={tuple(x.shape)}, weight_ekn.shape={tuple(weight_ekn.shape)}"
-        )
+            f'input dim mismatch: x.shape={tuple(x.shape)}, weight_ekn.shape={tuple(weight_ekn.shape)}')
 
         group_list = group_list.to(torch.int64)
@@ -32,7 +32,7 @@ def forward(ctx, x: torch.Tensor, group_list: torch.Tensor, weight_ekn: torch.Te
     @staticmethod
     def backward(ctx, grad_output: torch.Tensor):
         x, group_list, weight_ekn = ctx.saved_tensors
-
+
         grad_input = torch_npu.npu_grouped_matmul(
             [grad_output],
             [weight_ekn.transpose(-2, -1).contiguous()],
@@ -42,7 +42,7 @@ def backward(ctx, grad_output: torch.Tensor):
             split_item=2,
             group_list_type=1,
         )[0]
-
+
         grad_weight = torch_npu.npu_grouped_matmul(
             [x.transpose(0, 1)],
             [grad_output],
@@ -57,12 +57,11 @@ def backward(ctx, grad_output: torch.Tensor):
 
 
 def _grouped_mm_npu(input: torch.Tensor, weight_ekn: torch.Tensor, offs: torch.Tensor) -> torch.Tensor:
-    assert input.dim() == 2, f"input must be [M, K], got {tuple(input.shape)}"
-    assert weight_ekn.dim() == 3, f"weight_ekn must be [E, K, N], got {tuple(weight_ekn.shape)}"
-    assert offs.dim() == 1, f"offs must be [E], got {tuple(offs.shape)}"
+    assert input.dim() == 2, f'input must be [M, K], got {tuple(input.shape)}'
+    assert weight_ekn.dim() == 3, f'weight_ekn must be [E, K, N], got {tuple(weight_ekn.shape)}'
+    assert offs.dim() == 1, f'offs must be [E], got {tuple(offs.shape)}'
     assert weight_ekn.size(0) == offs.numel(), (
-        f"weight_ekn.size(0)={weight_ekn.size(0)} != offs.numel()={offs.numel()}"
-    )
+        f'weight_ekn.size(0)={weight_ekn.size(0)} != offs.numel()={offs.numel()}')
 
     counts = torch.empty_like(offs)
     counts[0] = offs[0]
@@ -77,10 +76,11 @@ def apply_hf_moe_grouped_mm_patch():
     import transformers.integrations.moe as hf_moe
 
     hf_moe._grouped_mm = _grouped_mm_npu
-    print("[PATCH] transformers.integrations.moe._grouped_mm -> _grouped_mm_npu")
+    print('[PATCH] transformers.integrations.moe._grouped_mm -> _grouped_mm_npu')
+
 
 def apply_npu_patch():
     import torch
     import torch_npu
     from torch_npu.contrib import transfer_to_npu
-    apply_hf_moe_grouped_mm_patch()
\ No newline at end of file
+    apply_hf_moe_grouped_mm_patch()

diff --git a/src/twinkle/utils/utils.py b/src/twinkle/utils/utils.py
index 610489eb..211a0a7a 100644
--- a/src/twinkle/utils/utils.py
+++ b/src/twinkle/utils/utils.py
@@ -78,10 +78,11 @@ def should_exclude_file(file_path, file_name):
         if not os.path.exists(destination):
             shutil.copy2(file_path, destination)
 
+
 def is_torch_npu_available():
     try:
         import torch
         import torch_npu  # noqa: F401
-        return hasattr(torch, "npu") and torch.npu.is_available()
+        return hasattr(torch, 'npu') and torch.npu.is_available()
     except Exception:
-        return False
\ No newline at end of file
+        return False
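Because GmmFunction hand-writes its backward, the two `npu_grouped_matmul` gradient calls are worth checking against ordinary autograd through an eager per-expert loop. A sketch of such a check, assuming an Ascend device and small float32 shapes; the helper name and tolerances are ours, not part of the patch:

    import torch
    import torch_npu  # noqa: F401  # assumes an Ascend environment
    from twinkle.kernel.monkey_patch_npu import GmmFunction

    def check_gmm_grads():
        counts = torch.tensor([5, 4, 3], dtype=torch.int64, device='npu')
        x = torch.randn(12, 8, device='npu', requires_grad=True)
        w = torch.randn(3, 8, 16, device='npu', requires_grad=True)

        # Gradients through the custom Function.
        GmmFunction.apply(x, counts, w).sum().backward()
        gx, gw = x.grad.clone(), w.grad.clone()
        x.grad = None
        w.grad = None

        # Eager reference: same math, gradients derived by autograd.
        chunks = torch.split(x, counts.tolist(), dim=0)
        torch.cat([c @ wi for c, wi in zip(chunks, w)], dim=0).sum().backward()

        torch.testing.assert_close(gx, x.grad, rtol=2e-3, atol=2e-3)
        torch.testing.assert_close(gw, w.grad, rtol=2e-3, atol=2e-3)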
From 090f91e67a8ed52cc68a2955edc6256676a6bc0f Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Sat, 25 Apr 2026 16:23:38 +0800
Subject: [PATCH 14/15] update is_npu_available use

---
 cookbook/transformers/fsdp2_moe.py | 4 ++--
 src/twinkle/utils/__init__.py      | 2 +-
 src/twinkle/utils/utils.py         | 8 --------
 3 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py
index 12259877..a2965ee5 100644
--- a/cookbook/transformers/fsdp2_moe.py
+++ b/cookbook/transformers/fsdp2_moe.py
@@ -8,7 +8,7 @@
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
 from twinkle.preprocessor import SelfCognitionProcessor
-from twinkle.utils import is_torch_npu_available
+from twinkle.utils.framework import Torch
 from twinkle.kernel import apply_npu_patch
 
 # Construct a device_mesh, fsdp=4, dp=2
@@ -20,7 +20,7 @@
 
 
 # npu patch
-if is_torch_npu_available():
+if Torch.is_npu_available():
     apply_npu_patch()
 
 

diff --git a/src/twinkle/utils/__init__.py b/src/twinkle/utils/__init__.py
index b852fa99..cca7e63b 100644
--- a/src/twinkle/utils/__init__.py
+++ b/src/twinkle/utils/__init__.py
@@ -14,5 +14,5 @@
                     stateless_init_process_group, to_device)
 from .transformers_utils import find_all_linears, find_layers, get_modules_to_not_convert
 from .unsafe import check_unsafe, trust_remote_code
-from .utils import copy_files_by_pattern, deep_getattr, is_torch_npu_available
+from .utils import copy_files_by_pattern, deep_getattr
 from .vision_tools import load_image, load_mm_file

diff --git a/src/twinkle/utils/utils.py b/src/twinkle/utils/utils.py
index 211a0a7a..782b9d74 100644
--- a/src/twinkle/utils/utils.py
+++ b/src/twinkle/utils/utils.py
@@ -78,11 +78,3 @@ def should_exclude_file(file_path, file_name):
         destination = os.path.join(dest_dir, file_name)
         if not os.path.exists(destination):
             shutil.copy2(file_path, destination)
-
-def is_torch_npu_available():
-    try:
-        import torch
-        import torch_npu  # noqa: F401
-        return hasattr(torch, 'npu') and torch.npu.is_available()
-    except Exception:
-        return False

From 429922c255d091cdb3fa827fc8d5b7c8f101ae69 Mon Sep 17 00:00:00 2001
From: a550580874 <82751568+a550580874@users.noreply.github.com>
Date: Sat, 25 Apr 2026 16:25:05 +0800
Subject: [PATCH 15/15] clean

---
 src/twinkle/utils/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/twinkle/utils/utils.py b/src/twinkle/utils/utils.py
index 782b9d74..0b0ae4d0 100644
--- a/src/twinkle/utils/utils.py
+++ b/src/twinkle/utils/utils.py
@@ -77,4 +77,3 @@ def should_exclude_file(file_path, file_name):
         destination = os.path.join(dest_dir, file_name)
         if not os.path.exists(destination):
             shutil.copy2(file_path, destination)
-