From 22e72df13dbe4ab197ea8a68d93160b63c4348eb Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 06:53:06 +0000
Subject: [PATCH 1/3] Fix tensor bridge DLL import failure on Windows

aoti_torch_get_current_cuda_stream lives in torch_cuda.dll, not
torch_cpu.dll. The stub import library pointed at the wrong DLL,
causing "The specified procedure could not be found" on Windows.

- Move aoti_torch_get_current_cuda_stream from aoti_shim.def
  (torch_cpu.dll) to new aoti_shim_cuda.def (torch_cuda.dll)
- Update build_hooks.py to generate stub libs for both DLLs
  via a loop
- Add torch_cuda.dll to delvewheel exclude list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cuda_core/build_hooks.py                      | 23 +++++++++++--------
 cuda_core/cuda/core/_include/aoti_shim.def    |  1 -
 .../cuda/core/_include/aoti_shim_cuda.def     |  5 ++++
 cuda_core/pyproject.toml                      |  2 +-
 4 files changed, 19 insertions(+), 12 deletions(-)
 create mode 100644 cuda_core/cuda/core/_include/aoti_shim_cuda.def

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index 444da18eb13..7174df48764 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -183,19 +183,22 @@ def get_sources(mod_name):
         # related to free-threading builds.
         extra_compile_args += ["-DCYTHON_TRACE_NOGIL=1", "-DCYTHON_USE_SYS_MONITORING=0"]
 
-    # On Windows, _tensor_bridge.pyx needs a stub import library so the MSVC
-    # linker can resolve the AOTI symbols (they live in torch_cpu.dll at
-    # runtime).  We generate the .lib from a .def file at build time.
+    # On Windows, _tensor_bridge.pyx needs stub import libraries so the MSVC
+    # linker can resolve the AOTI symbols at link time.  At runtime the symbols
+    # resolve from the actual DLLs loaded by 'import torch'.
+    #   - aoti_shim.def       -> torch_cpu.dll   (dtype, device, tensor metadata)
+    #   - aoti_shim_cuda.def  -> torch_cuda.dll   (CUDA stream access)
     _aoti_extra_link_args = []
     if sys.platform == "win32":
-        _def_file = os.path.join("cuda", "core", "_include", "aoti_shim.def")
-        _lib_file = os.path.join("build", "aoti_shim.lib")
         os.makedirs("build", exist_ok=True)
-        subprocess.check_call(  # noqa: S603
-            ["lib", f"/DEF:{_def_file}", f"/OUT:{_lib_file}", "/MACHINE:X64"],  # noqa: S607
-            stdout=subprocess.DEVNULL,
-        )
-        _aoti_extra_link_args = [_lib_file]
+        for def_name in ("aoti_shim", "aoti_shim_cuda"):
+            def_file = os.path.join("cuda", "core", "_include", f"{def_name}.def")
+            lib_file = os.path.join("build", f"{def_name}.lib")
+            subprocess.check_call(  # noqa: S603
+                ["lib", f"/DEF:{def_file}", f"/OUT:{lib_file}", "/MACHINE:X64"],  # noqa: S607
+                stdout=subprocess.DEVNULL,
+            )
+            _aoti_extra_link_args.append(lib_file)
 
     def get_extra_link_args(mod_name):
         if mod_name == "_tensor_bridge" and _aoti_extra_link_args:
diff --git a/cuda_core/cuda/core/_include/aoti_shim.def b/cuda_core/cuda/core/_include/aoti_shim.def
index 5cc6897e815..e21097bd25e 100644
--- a/cuda_core/cuda/core/_include/aoti_shim.def
+++ b/cuda_core/cuda/core/_include/aoti_shim.def
@@ -34,4 +34,3 @@ EXPORTS
     aoti_torch_get_device_index
     aoti_torch_device_type_cpu
     aoti_torch_device_type_cuda
-    aoti_torch_get_current_cuda_stream
diff --git a/cuda_core/cuda/core/_include/aoti_shim_cuda.def b/cuda_core/cuda/core/_include/aoti_shim_cuda.def
new file mode 100644
index 00000000000..a754c3b7d6d
--- /dev/null
+++ b/cuda_core/cuda/core/_include/aoti_shim_cuda.def
@@ -0,0 +1,5 @@
+; Stub import library for CUDA-specific AOTI symbols (torch_cuda.dll).
+; See aoti_shim.def for the torch_cpu.dll counterpart.
+LIBRARY torch_cuda.dll
+EXPORTS
+    aoti_torch_get_current_cuda_stream
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index aa403409894..fca40ff707d 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -117,4 +117,4 @@ archs = "native"
 [tool.cibuildwheel.windows]
 archs = "AMD64"
 before-build = "pip install delvewheel"
-repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude \"torch_cpu.dll;torch_python.dll\" -w {dest_dir} {wheel}"
+repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude \"torch_cpu.dll;torch_python.dll;torch_cuda.dll\" -w {dest_dir} {wheel}"

From 77c0b7bfcdfa46f36febe44e1c4a8d5193f079ff Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 07:24:09 +0000
Subject: [PATCH 2/3] Add SPDX headers to aoti_shim_cuda.def

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cuda_core/cuda/core/_include/aoti_shim_cuda.def | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cuda_core/cuda/core/_include/aoti_shim_cuda.def b/cuda_core/cuda/core/_include/aoti_shim_cuda.def
index a754c3b7d6d..0c38f3ffaa9 100644
--- a/cuda_core/cuda/core/_include/aoti_shim_cuda.def
+++ b/cuda_core/cuda/core/_include/aoti_shim_cuda.def
@@ -1,3 +1,6 @@
+; SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+; SPDX-License-Identifier: Apache-2.0
+;
 ; Stub import library for CUDA-specific AOTI symbols (torch_cuda.dll).
 ; See aoti_shim.def for the torch_cpu.dll counterpart.
 LIBRARY torch_cuda.dll

From 50294b86f9c084c9a0ceceff380ceb7658dea6d2 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Wed, 29 Apr 2026 20:07:36 +0000
Subject: [PATCH 3/3] Resolve aoti_torch_get_current_cuda_stream lazily at
 runtime

The symbol lives in torch_cuda (not torch_cpu), so linking against it
at build time breaks CPU-only PyTorch installs and requires a second
stub import library on Windows.

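Instead, resolve it lazily on first use: via dlsym(RTLD_DEFAULT, ...)
on non-Windows platforms and via LoadLibraryA + GetProcAddress on
Windows.  Once resolved, the function pointer is cached in a
module-level variable, so subsequent calls go straight through the
pointer in C without repeating the lookup.  If the symbol cannot be
resolved, sync_torch_stream raises RuntimeError.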

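This reverts the two-def-file approach introduced in PATCH 1/3 of this
series (aoti_shim_cuda.def, the stub-library loop in build_hooks.py,
and the torch_cuda.dll entry in the delvewheel exclude list) and
replaces it with a self-contained inline C helper that handles both
Windows and non-Windows platforms.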

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 cuda_core/build_hooks.py                      | 26 +++++------
 cuda_core/cuda/core/_include/aoti_shim.h      | 18 +++++---
 .../cuda/core/_include/aoti_shim_cuda.def     |  8 ----
 cuda_core/cuda/core/_tensor_bridge.pyx        | 43 +++++++++++++++++--
 cuda_core/pyproject.toml                      |  2 +-
 5 files changed, 65 insertions(+), 32 deletions(-)
 delete mode 100644 cuda_core/cuda/core/_include/aoti_shim_cuda.def

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index 7174df48764..f4fb4af01f7 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -183,22 +183,22 @@ def get_sources(mod_name):
         # related to free-threading builds.
         extra_compile_args += ["-DCYTHON_TRACE_NOGIL=1", "-DCYTHON_USE_SYS_MONITORING=0"]
 
-    # On Windows, _tensor_bridge.pyx needs stub import libraries so the MSVC
-    # linker can resolve the AOTI symbols at link time.  At runtime the symbols
-    # resolve from the actual DLLs loaded by 'import torch'.
-    #   - aoti_shim.def       -> torch_cpu.dll   (dtype, device, tensor metadata)
-    #   - aoti_shim_cuda.def  -> torch_cuda.dll   (CUDA stream access)
+    # On Windows, _tensor_bridge.pyx needs a stub import library so the MSVC
+    # linker can resolve the AOTI symbols (they live in torch_cpu.dll at
+    # runtime).  We generate the .lib from a .def file at build time.
+    # Note: aoti_torch_get_current_cuda_stream lives in torch_cuda.dll and
+    # is resolved lazily at runtime (not via the stub lib) — see
+    # _tensor_bridge.pyx.
     _aoti_extra_link_args = []
     if sys.platform == "win32":
+        _def_file = os.path.join("cuda", "core", "_include", "aoti_shim.def")
+        _lib_file = os.path.join("build", "aoti_shim.lib")
         os.makedirs("build", exist_ok=True)
-        for def_name in ("aoti_shim", "aoti_shim_cuda"):
-            def_file = os.path.join("cuda", "core", "_include", f"{def_name}.def")
-            lib_file = os.path.join("build", f"{def_name}.lib")
-            subprocess.check_call(  # noqa: S603
-                ["lib", f"/DEF:{def_file}", f"/OUT:{lib_file}", "/MACHINE:X64"],  # noqa: S607
-                stdout=subprocess.DEVNULL,
-            )
-            _aoti_extra_link_args.append(lib_file)
+        subprocess.check_call(  # noqa: S603
+            ["lib", f"/DEF:{_def_file}", f"/OUT:{_lib_file}", "/MACHINE:X64"],  # noqa: S607
+            stdout=subprocess.DEVNULL,
+        )
+        _aoti_extra_link_args = [_lib_file]
 
     def get_extra_link_args(mod_name):
         if mod_name == "_tensor_bridge" and _aoti_extra_link_args:
diff --git a/cuda_core/cuda/core/_include/aoti_shim.h b/cuda_core/cuda/core/_include/aoti_shim.h
index 809bdb1a2a6..464d27de46c 100644
--- a/cuda_core/cuda/core/_include/aoti_shim.h
+++ b/cuda_core/cuda/core/_include/aoti_shim.h
@@ -52,10 +52,13 @@ typedef struct AtenTensorOpaque* AtenTensorHandle;
 
 /*
  * IMPORTANT: Keep the AOTI_SHIM_API declaration list below in sync with
- * aoti_shim.def. On Windows, build_hooks.py turns that .def file into the
+ * aoti_shim.def.  On Windows, build_hooks.py turns that .def file into the
  * stub import library that MSVC needs to link _tensor_bridge without making
- * PyTorch a build-time dependency. If you add, remove, or rename an imported
- * AOTI symbol here, update aoti_shim.def in the same change.
+ * PyTorch a build-time dependency.  If you add, remove, or rename an
+ * imported AOTI symbol here, update aoti_shim.def in the same change.
+ *
+ * Exception: aoti_torch_get_current_cuda_stream lives in torch_cuda (not
+ * torch_cpu) and is resolved lazily at runtime — see _tensor_bridge.pyx.
  */
 
 /* ---- tensor metadata --------------------------------------------------- */
@@ -105,10 +108,11 @@ AOTI_SHIM_API AOTITorchError aoti_torch_get_device_index(
 AOTI_SHIM_API int32_t aoti_torch_device_type_cpu(void);
 AOTI_SHIM_API int32_t aoti_torch_device_type_cuda(void);
 
-/* ---- stream -------------------------------------------------------------- */
-
-AOTI_SHIM_API AOTITorchError aoti_torch_get_current_cuda_stream(
-    int32_t device_index, void** ret_stream);
+/* ---- stream --------------------------------------------------------------
+ * aoti_torch_get_current_cuda_stream is NOT declared here — it lives in
+ * torch_cuda (not torch_cpu) and is resolved at runtime.  See the inline
+ * C helper _resolve_cuda_stream_fn() in _tensor_bridge.pyx.
+ * ---------------------------------------------------------------------- */
 
 #ifdef __cplusplus
 }  /* extern "C" */
diff --git a/cuda_core/cuda/core/_include/aoti_shim_cuda.def b/cuda_core/cuda/core/_include/aoti_shim_cuda.def
deleted file mode 100644
index 0c38f3ffaa9..00000000000
--- a/cuda_core/cuda/core/_include/aoti_shim_cuda.def
+++ /dev/null
@@ -1,8 +0,0 @@
-; SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-; SPDX-License-Identifier: Apache-2.0
-;
-; Stub import library for CUDA-specific AOTI symbols (torch_cuda.dll).
-; See aoti_shim.def for the torch_cpu.dll counterpart.
-LIBRARY torch_cuda.dll
-EXPORTS
-    aoti_torch_get_current_cuda_stream
diff --git a/cuda_core/cuda/core/_tensor_bridge.pyx b/cuda_core/cuda/core/_tensor_bridge.pyx
index 388ca738dbb..07eec56537b 100644
--- a/cuda_core/cuda/core/_tensor_bridge.pyx
+++ b/cuda_core/cuda/core/_tensor_bridge.pyx
@@ -103,8 +103,38 @@ cdef extern from "_include/aoti_shim.h":
     int32_t aoti_torch_device_type_cpu()
     int32_t aoti_torch_device_type_cuda()
 
-    # stream
-    AOTITorchError aoti_torch_get_current_cuda_stream(int32_t, void**)
+    # Note: aoti_torch_get_current_cuda_stream is NOT declared here because
+    # it lives in torch_cuda (not torch_cpu).  It is resolved lazily at
+    # runtime via dlsym / GetProcAddress — see _resolve_cuda_stream_fn().
+
+# Runtime resolution for aoti_torch_get_current_cuda_stream.
+# This symbol lives in torch_cuda.dll (Windows) / libtorch_cuda.so (Linux),
+# NOT in torch_cpu.  We resolve it lazily on first use so that the module
+# can be imported even with CPU-only PyTorch.
+ctypedef AOTITorchError (*_get_cuda_stream_fn_t)(int32_t, void**) nogil
+
+cdef extern from *:
+    """
+    #ifdef _WIN32
+    #include <windows.h>
+    static void* _resolve_cuda_stream_fn(void) {
+        HMODULE h = LoadLibraryA("torch_cuda.dll");
+        if (!h) return NULL;
+        return (void*)GetProcAddress(h, "aoti_torch_get_current_cuda_stream");
+    }
+    #else
+    #include <dlfcn.h>
+    #ifndef RTLD_DEFAULT
+    #define RTLD_DEFAULT ((void*)0)
+    #endif
+    static void* _resolve_cuda_stream_fn(void) {
+        return dlsym(RTLD_DEFAULT, "aoti_torch_get_current_cuda_stream");
+    }
+    #endif
+    """
+    void* _resolve_cuda_stream_fn() nogil
+
+cdef _get_cuda_stream_fn_t _cached_get_cuda_stream = NULL
 
 import numpy
 import sys
@@ -274,10 +304,17 @@ cpdef int sync_torch_stream(int32_t device_index,
     the consumer stream wait on it.  This is a no-op if both streams are
     the same.
     """
+    global _cached_get_cuda_stream
     cdef void* producer_s
     cdef EventHandle h_event
 
-    check_aoti(aoti_torch_get_current_cuda_stream(device_index, &producer_s),
+    if _cached_get_cuda_stream == NULL:
+        _cached_get_cuda_stream = <_get_cuda_stream_fn_t>_resolve_cuda_stream_fn()
+        if _cached_get_cuda_stream == NULL:
+            raise RuntimeError(
+                "Cannot resolve aoti_torch_get_current_cuda_stream from "
+                "torch_cuda — is CUDA-enabled PyTorch installed?")
+    check_aoti(_cached_get_cuda_stream(device_index, &producer_s),
                b"aoti_torch_get_current_cuda_stream")
     if <intptr_t>producer_s != consumer_s:
         with nogil:
diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml
index fca40ff707d..aa403409894 100644
--- a/cuda_core/pyproject.toml
+++ b/cuda_core/pyproject.toml
@@ -117,4 +117,4 @@ archs = "native"
 [tool.cibuildwheel.windows]
 archs = "AMD64"
 before-build = "pip install delvewheel"
-repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude \"torch_cpu.dll;torch_python.dll;torch_cuda.dll\" -w {dest_dir} {wheel}"
+repair-wheel-command = "delvewheel repair --namespace-pkg cuda --exclude \"torch_cpu.dll;torch_python.dll\" -w {dest_dir} {wheel}"