From 22e72df13dbe4ab197ea8a68d93160b63c4348eb Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 29 Apr 2026 06:53:06 +0000 Subject: [PATCH 1/3] Fix tensor bridge DLL import failure on Windows aoti_torch_get_current_cuda_stream lives in torch_cuda.dll, not torch_cpu.dll. The stub import library pointed at the wrong DLL, causing "The specified procedure could not be found" on Windows. - Move aoti_torch_get_current_cuda_stream from aoti_shim.def (torch_cpu.dll) to new aoti_shim_cuda.def (torch_cuda.dll) - Update build_hooks.py to generate stub libs for both DLLs via a loop - Add torch_cuda.dll to delvewheel exclude list Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/build_hooks.py | 23 +++++++++++-------- cuda_core/cuda/core/_include/aoti_shim.def | 1 - .../cuda/core/_include/aoti_shim_cuda.def | 5 ++++ cuda_core/pyproject.toml | 2 +- 4 files changed, 19 insertions(+), 12 deletions(-) create mode 100644 cuda_core/cuda/core/_include/aoti_shim_cuda.def diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 444da18eb13..7174df48764 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -183,19 +183,22 @@ def get_sources(mod_name): # related to free-threading builds. extra_compile_args += ["-DCYTHON_TRACE_NOGIL=1", "-DCYTHON_USE_SYS_MONITORING=0"] - # On Windows, _tensor_bridge.pyx needs a stub import library so the MSVC - # linker can resolve the AOTI symbols (they live in torch_cpu.dll at - # runtime). We generate the .lib from a .def file at build time. + # On Windows, _tensor_bridge.pyx needs stub import libraries so the MSVC + # linker can resolve the AOTI symbols at link time. At runtime the symbols + # resolve from the actual DLLs loaded by 'import torch'. 
+ # - aoti_shim.def -> torch_cpu.dll (dtype, device, tensor metadata) + # - aoti_shim_cuda.def -> torch_cuda.dll (CUDA stream access) _aoti_extra_link_args = [] if sys.platform == "win32": - _def_file = os.path.join("cuda", "core", "_include", "aoti_shim.def") - _lib_file = os.path.join("build", "aoti_shim.lib") os.makedirs("build", exist_ok=True) - subprocess.check_call( # noqa: S603 - ["lib", f"/DEF:{_def_file}", f"/OUT:{_lib_file}", "/MACHINE:X64"], # noqa: S607 - stdout=subprocess.DEVNULL, - ) - _aoti_extra_link_args = [_lib_file] + for def_name in ("aoti_shim", "aoti_shim_cuda"): + def_file = os.path.join("cuda", "core", "_include", f"{def_name}.def") + lib_file = os.path.join("build", f"{def_name}.lib") + subprocess.check_call( # noqa: S603 + ["lib", f"/DEF:{def_file}", f"/OUT:{lib_file}", "/MACHINE:X64"], # noqa: S607 + stdout=subprocess.DEVNULL, + ) + _aoti_extra_link_args.append(lib_file) def get_extra_link_args(mod_name): if mod_name == "_tensor_bridge" and _aoti_extra_link_args: diff --git a/cuda_core/cuda/core/_include/aoti_shim.def b/cuda_core/cuda/core/_include/aoti_shim.def index 5cc6897e815..e21097bd25e 100644 --- a/cuda_core/cuda/core/_include/aoti_shim.def +++ b/cuda_core/cuda/core/_include/aoti_shim.def @@ -34,4 +34,3 @@ EXPORTS aoti_torch_get_device_index aoti_torch_device_type_cpu aoti_torch_device_type_cuda - aoti_torch_get_current_cuda_stream diff --git a/cuda_core/cuda/core/_include/aoti_shim_cuda.def b/cuda_core/cuda/core/_include/aoti_shim_cuda.def new file mode 100644 index 00000000000..a754c3b7d6d --- /dev/null +++ b/cuda_core/cuda/core/_include/aoti_shim_cuda.def @@ -0,0 +1,5 @@ +; Stub import library for CUDA-specific AOTI symbols (torch_cuda.dll). +; See aoti_shim.def for the torch_cpu.dll counterpart. 
+LIBRARY torch_cuda.dll +EXPORTS + aoti_torch_get_current_cuda_stream diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index aa403409894..fca40ff707d 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -117,4 +117,4 @@ archs = "native" [tool.cibuildwheel.windows] archs = "AMD64" before-build = "pip install delvewheel" -repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude \"torch_cpu.dll;torch_python.dll\" -w {dest_dir} {wheel}" +repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude \"torch_cpu.dll;torch_python.dll;torch_cuda.dll\" -w {dest_dir} {wheel}" From 77c0b7bfcdfa46f36febe44e1c4a8d5193f079ff Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 29 Apr 2026 07:24:09 +0000 Subject: [PATCH 2/3] Add SPDX headers to aoti_shim_cuda.def Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_include/aoti_shim_cuda.def | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cuda_core/cuda/core/_include/aoti_shim_cuda.def b/cuda_core/cuda/core/_include/aoti_shim_cuda.def index a754c3b7d6d..0c38f3ffaa9 100644 --- a/cuda_core/cuda/core/_include/aoti_shim_cuda.def +++ b/cuda_core/cuda/core/_include/aoti_shim_cuda.def @@ -1,3 +1,6 @@ +; SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +; SPDX-License-Identifier: Apache-2.0 +; ; Stub import library for CUDA-specific AOTI symbols (torch_cuda.dll). ; See aoti_shim.def for the torch_cpu.dll counterpart. LIBRARY torch_cuda.dll From 50294b86f9c084c9a0ceceff380ceb7658dea6d2 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 29 Apr 2026 20:07:36 +0000 Subject: [PATCH 3/3] Resolve aoti_torch_get_current_cuda_stream lazily at runtime The symbol lives in torch_cuda (not torch_cpu), so linking against it at build time breaks CPU-only PyTorch installs and requires a second stub import library on Windows. Instead, resolve it lazily on first use via dlsym (Linux) / LoadLibrary+GetProcAddress (Windows). 
The cached function pointer keeps subsequent calls fully in C with zero Python overhead. This reverts the two-def-file approach from the previous commit and replaces it with a self-contained inline C helper that handles both platforms. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/build_hooks.py | 26 +++++------ cuda_core/cuda/core/_include/aoti_shim.h | 18 +++++--- .../cuda/core/_include/aoti_shim_cuda.def | 8 ---- cuda_core/cuda/core/_tensor_bridge.pyx | 43 +++++++++++++++++-- cuda_core/pyproject.toml | 2 +- 5 files changed, 65 insertions(+), 32 deletions(-) delete mode 100644 cuda_core/cuda/core/_include/aoti_shim_cuda.def diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 7174df48764..f4fb4af01f7 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -183,22 +183,22 @@ def get_sources(mod_name): # related to free-threading builds. extra_compile_args += ["-DCYTHON_TRACE_NOGIL=1", "-DCYTHON_USE_SYS_MONITORING=0"] - # On Windows, _tensor_bridge.pyx needs stub import libraries so the MSVC - # linker can resolve the AOTI symbols at link time. At runtime the symbols - # resolve from the actual DLLs loaded by 'import torch'. - # - aoti_shim.def -> torch_cpu.dll (dtype, device, tensor metadata) - # - aoti_shim_cuda.def -> torch_cuda.dll (CUDA stream access) + # On Windows, _tensor_bridge.pyx needs a stub import library so the MSVC + # linker can resolve the AOTI symbols (they live in torch_cpu.dll at + # runtime). We generate the .lib from a .def file at build time. + # Note: aoti_torch_get_current_cuda_stream lives in torch_cuda.dll and + # is resolved lazily at runtime (not via the stub lib) — see + # _tensor_bridge.pyx. 
_aoti_extra_link_args = [] if sys.platform == "win32": + _def_file = os.path.join("cuda", "core", "_include", "aoti_shim.def") + _lib_file = os.path.join("build", "aoti_shim.lib") os.makedirs("build", exist_ok=True) - for def_name in ("aoti_shim", "aoti_shim_cuda"): - def_file = os.path.join("cuda", "core", "_include", f"{def_name}.def") - lib_file = os.path.join("build", f"{def_name}.lib") - subprocess.check_call( # noqa: S603 - ["lib", f"/DEF:{def_file}", f"/OUT:{lib_file}", "/MACHINE:X64"], # noqa: S607 - stdout=subprocess.DEVNULL, - ) - _aoti_extra_link_args.append(lib_file) + subprocess.check_call( # noqa: S603 + ["lib", f"/DEF:{_def_file}", f"/OUT:{_lib_file}", "/MACHINE:X64"], # noqa: S607 + stdout=subprocess.DEVNULL, + ) + _aoti_extra_link_args = [_lib_file] def get_extra_link_args(mod_name): if mod_name == "_tensor_bridge" and _aoti_extra_link_args: diff --git a/cuda_core/cuda/core/_include/aoti_shim.h b/cuda_core/cuda/core/_include/aoti_shim.h index 809bdb1a2a6..464d27de46c 100644 --- a/cuda_core/cuda/core/_include/aoti_shim.h +++ b/cuda_core/cuda/core/_include/aoti_shim.h @@ -52,10 +52,13 @@ typedef struct AtenTensorOpaque* AtenTensorHandle; /* * IMPORTANT: Keep the AOTI_SHIM_API declaration list below in sync with - * aoti_shim.def. On Windows, build_hooks.py turns that .def file into the + * aoti_shim.def. On Windows, build_hooks.py turns that .def file into the * stub import library that MSVC needs to link _tensor_bridge without making - * PyTorch a build-time dependency. If you add, remove, or rename an imported - * AOTI symbol here, update aoti_shim.def in the same change. + * PyTorch a build-time dependency. If you add, remove, or rename an + * imported AOTI symbol here, update aoti_shim.def in the same change. + * + * Exception: aoti_torch_get_current_cuda_stream lives in torch_cuda (not + * torch_cpu) and is resolved lazily at runtime — see _tensor_bridge.pyx. 
*/ /* ---- tensor metadata --------------------------------------------------- */ @@ -105,10 +108,11 @@ AOTI_SHIM_API AOTITorchError aoti_torch_get_device_index( AOTI_SHIM_API int32_t aoti_torch_device_type_cpu(void); AOTI_SHIM_API int32_t aoti_torch_device_type_cuda(void); -/* ---- stream -------------------------------------------------------------- */ - -AOTI_SHIM_API AOTITorchError aoti_torch_get_current_cuda_stream( - int32_t device_index, void** ret_stream); +/* ---- stream -------------------------------------------------------------- + * aoti_torch_get_current_cuda_stream is NOT declared here — it lives in + * torch_cuda (not torch_cpu) and is resolved at runtime. See the inline + * C helper _resolve_cuda_stream_fn() in _tensor_bridge.pyx. + * ---------------------------------------------------------------------- */ #ifdef __cplusplus } /* extern "C" */ diff --git a/cuda_core/cuda/core/_include/aoti_shim_cuda.def b/cuda_core/cuda/core/_include/aoti_shim_cuda.def deleted file mode 100644 index 0c38f3ffaa9..00000000000 --- a/cuda_core/cuda/core/_include/aoti_shim_cuda.def +++ /dev/null @@ -1,8 +0,0 @@ -; SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -; SPDX-License-Identifier: Apache-2.0 -; -; Stub import library for CUDA-specific AOTI symbols (torch_cuda.dll). -; See aoti_shim.def for the torch_cpu.dll counterpart. 
-LIBRARY torch_cuda.dll -EXPORTS - aoti_torch_get_current_cuda_stream diff --git a/cuda_core/cuda/core/_tensor_bridge.pyx b/cuda_core/cuda/core/_tensor_bridge.pyx index 388ca738dbb..07eec56537b 100644 --- a/cuda_core/cuda/core/_tensor_bridge.pyx +++ b/cuda_core/cuda/core/_tensor_bridge.pyx @@ -103,8 +103,38 @@ cdef extern from "_include/aoti_shim.h": int32_t aoti_torch_device_type_cpu() int32_t aoti_torch_device_type_cuda() - # stream - AOTITorchError aoti_torch_get_current_cuda_stream(int32_t, void**) + # Note: aoti_torch_get_current_cuda_stream is NOT declared here because + # it lives in torch_cuda.dll (not torch_cpu.dll). It is resolved lazily + # at runtime via dlsym / GetProcAddress — see _resolve_cuda_stream_fn(). + +# Runtime resolution for aoti_torch_get_current_cuda_stream. +# This symbol lives in torch_cuda.dll (Windows) / libtorch_cuda.so (Linux), +# NOT in torch_cpu. We resolve it lazily on first use so that the module +# can be imported even with CPU-only PyTorch. +ctypedef AOTITorchError (*_get_cuda_stream_fn_t)(int32_t, void**) nogil + +cdef extern from *: + """ + #ifdef _WIN32 + #include <windows.h> + static void* _resolve_cuda_stream_fn(void) { + HMODULE h = LoadLibraryA("torch_cuda.dll"); + if (!h) return NULL; + return (void*)GetProcAddress(h, "aoti_torch_get_current_cuda_stream"); + } + #else + #include <dlfcn.h> + #ifndef RTLD_DEFAULT + #define RTLD_DEFAULT ((void*)0) + #endif + static void* _resolve_cuda_stream_fn(void) { + return dlsym(RTLD_DEFAULT, "aoti_torch_get_current_cuda_stream"); + } + #endif + """ + void* _resolve_cuda_stream_fn() nogil + +cdef _get_cuda_stream_fn_t _cached_get_cuda_stream = NULL import numpy import sys @@ -274,10 +304,17 @@ cpdef int sync_torch_stream(int32_t device_index, the consumer stream wait on it. This is a no-op if both streams are the same.
""" + global _cached_get_cuda_stream cdef void* producer_s cdef EventHandle h_event - check_aoti(aoti_torch_get_current_cuda_stream(device_index, &producer_s), + if _cached_get_cuda_stream == NULL: + _cached_get_cuda_stream = <_get_cuda_stream_fn_t>_resolve_cuda_stream_fn() + if _cached_get_cuda_stream == NULL: + raise RuntimeError( + "Cannot resolve aoti_torch_get_current_cuda_stream from " + "torch_cuda — is CUDA-enabled PyTorch installed?") + check_aoti(_cached_get_cuda_stream(device_index, &producer_s), b"aoti_torch_get_current_cuda_stream") if producer_s != consumer_s: with nogil: diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index fca40ff707d..aa403409894 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -117,4 +117,4 @@ archs = "native" [tool.cibuildwheel.windows] archs = "AMD64" before-build = "pip install delvewheel" -repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude \"torch_cpu.dll;torch_python.dll;torch_cuda.dll\" -w {dest_dir} {wheel}" +repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude \"torch_cpu.dll;torch_python.dll\" -w {dest_dir} {wheel}"