From dc7438bf7d8513f94f4690398121c3e8002a5cde Mon Sep 17 00:00:00 2001 From: 0hujun <96733800+0hujun@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:39:24 +0800 Subject: [PATCH 1/2] fix: do not use Megatron for gather on NPU when fsdp size is set --- src/twinkle/utils/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/twinkle/utils/framework.py b/src/twinkle/utils/framework.py index 1cd04fec..d59d0e2d 100644 --- a/src/twinkle/utils/framework.py +++ b/src/twinkle/utils/framework.py @@ -43,7 +43,7 @@ def gather_object(object: Any, device_mesh: DeviceMesh, process_group=None): output_objects = [object] group_size = 1 if dist.is_available() and dist.is_initialized(): - if Platform.device_prefix() == 'npu': + if Platform.device_prefix() == 'npu' and 'fsdp' not in device_mesh.mesh_dim_names: # On NPU, letting Python object collectives use the default HCCL # group previously hung in 8-card metric collection at # ``dist.all_gather_object(...)``. Reuse Megatron's dedicated Gloo From badf1a3c8364c8786cccc8bc82a6de160107737c Mon Sep 17 00:00:00 2001 From: 0hujun <96733800+0hujun@users.noreply.github.com> Date: Thu, 23 Apr 2026 23:15:45 +0800 Subject: [PATCH 2/2] fix: do not use Megatron for gather on NPU when fsdp size is set --- src/twinkle/utils/framework.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/twinkle/utils/framework.py b/src/twinkle/utils/framework.py index d59d0e2d..7bdb9b64 100644 --- a/src/twinkle/utils/framework.py +++ b/src/twinkle/utils/framework.py @@ -43,7 +43,7 @@ def gather_object(object: Any, device_mesh: DeviceMesh, process_group=None): output_objects = [object] group_size = 1 if dist.is_available() and dist.is_initialized(): - if Platform.device_prefix() == 'npu' and 'fsdp' not in device_mesh.mesh_dim_names: + if Platform.device_prefix() == 'npu' and not device_mesh.has_dim('fsdp'): # On NPU, letting Python object collectives use the default HCCL # group previously hung in 8-card metric 
collection at # ``dist.all_gather_object(...)``. Reuse Megatron's dedicated Gloo @@ -51,8 +51,9 @@ def gather_object(object: Any, device_mesh: DeviceMesh, process_group=None): # variant, otherwise the rank span for metric aggregation is wrong. if importlib.util.find_spec('megatron.core') is not None: from megatron.core import parallel_state as mpu - process_group = mpu.get_data_parallel_group_gloo( - with_context_parallel=getattr(device_mesh, 'cp_world_size', 1) > 1) + if mpu.model_parallel_is_initialized(): + process_group = mpu.get_data_parallel_group_gloo( + with_context_parallel=getattr(device_mesh, 'cp_world_size', 1) > 1) group_size = dist.get_world_size(group=process_group) if group_size > 1: output_objects = [None for _ in range(group_size)]