From 3dea1acf870a9b36c2b70de262153f488bc47364 Mon Sep 17 00:00:00 2001
From: jcorners68
Date: Thu, 16 Apr 2026 04:48:02 -0700
Subject: [PATCH] uvm: fix NULL pointer dereference in uvm_hmm_unregister_gpu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Guard the gpu->parent->devmem dereference with a NULL check. devmem is
only initialized when uvm_pmm_devmem_init() succeeds during
init_parent_gpu(). When it is NULL, because init failed or was never
called for this GPU, the unconditional dereference at the top of
uvm_hmm_unregister_gpu() causes a kernel NULL pointer dereference during
CUDA process exit (reported alongside Xid 79, "GPU has fallen off the
bus"):

  BUG: kernel NULL pointer dereference, address: 00000000000000a8
  RIP: uvm_hmm_unregister_gpu+0x51/0x3a0 [nvidia_uvm]
  Call Trace:
   unregister_gpu+0xc4/0x5b0 [nvidia_uvm]
   uvm_va_space_destroy+0x22e/0x770 [nvidia_uvm]
   uvm_release+0x7a/0x180 [nvidia_uvm]
   __fput+0xea/0x2d0
   do_exit+0x1fa/0x480

The existing uvm_hmm_is_enabled(va_space) check only validates that HMM
is enabled for the VA space; it does not guarantee that this particular
GPU has device-private memory initialized. When devmem is NULL there are
no device-private pages to evict, so skip the eviction loop and proceed
directly to block unregistration.

Also move the devmem access below the lock assertions so the assertions
can fire before any NULL dereference rather than after it.

Fixes: NVIDIA/open-gpu-kernel-modules#1082
Tested: confirmed that the uvm_disable_hmm=1 workaround avoids the
crash; this patch fixes the same code path directly.
Signed-off-by: Jonathan Corners
---
 kernel-open/nvidia-uvm/uvm_hmm.c | 63 ++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/kernel-open/nvidia-uvm/uvm_hmm.c b/kernel-open/nvidia-uvm/uvm_hmm.c
index 5ba0024cd0..76e446e010 100644
--- a/kernel-open/nvidia-uvm/uvm_hmm.c
+++ b/kernel-open/nvidia-uvm/uvm_hmm.c
@@ -344,42 +344,49 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
     if (!uvm_hmm_is_enabled(va_space))
         return;
 
-    devmem_start = gpu->parent->devmem->pagemap.range.start + gpu->mem_info.phys_start;
-    devmem_end = devmem_start + gpu->mem_info.size;
-
     if (mm)
         uvm_assert_mmap_lock_locked(mm);
 
     uvm_assert_rwsem_locked_write(&va_space->lock);
 
-    // There could be pages with page->zone_device_data pointing to the va_space
-    // which may be about to be freed. Migrate those back to the CPU so we don't
-    // fault on them. Normally infinite retries are bad, but we don't have any
-    // option here. Device-private pages can't be pinned so migration should
-    // eventually succeed. Even if we did eventually bail out of the loop we'd
-    // just stall in memunmap_pages() anyway.
-    do {
-        retry = false;
-
-        for (pfn = __phys_to_pfn(devmem_start); pfn <= __phys_to_pfn(devmem_end); pfn++) {
-            struct page *page = pfn_to_page(pfn);
-
-            // No need to keep scanning if no HMM pages are allocated for this
-            // va_space.
-            if (!atomic64_read(&va_space->hmm.allocated_page_count))
-                break;
+    // gpu->parent->devmem is only initialized when device-private memory
+    // was successfully set up for this GPU in init_parent_gpu(). If it
+    // was not (e.g. uvm_pmm_devmem_init() failed or was never called),
+    // there are no device-private pages to evict, so skip straight to
+    // block unregistration.
+    if (gpu->parent->devmem) {
+        devmem_start = gpu->parent->devmem->pagemap.range.start + gpu->mem_info.phys_start;
+        devmem_end = devmem_start + gpu->mem_info.size;
+
+        // There could be pages with page->zone_device_data pointing to the va_space
+        // which may be about to be freed. Migrate those back to the CPU so we don't
+        // fault on them. Normally infinite retries are bad, but we don't have any
+        // option here. Device-private pages can't be pinned so migration should
+        // eventually succeed. Even if we did eventually bail out of the loop we'd
+        // just stall in memunmap_pages() anyway.
+        do {
+            retry = false;
+
+            for (pfn = __phys_to_pfn(devmem_start); pfn <= __phys_to_pfn(devmem_end); pfn++) {
+                struct page *page = pfn_to_page(pfn);
+
+                // No need to keep scanning if no HMM pages are allocated for this
+                // va_space.
+                if (!atomic64_read(&va_space->hmm.allocated_page_count))
+                    break;
 
-            UVM_ASSERT(is_device_private_page(page));
+                UVM_ASSERT(is_device_private_page(page));
 
-            // This check is racy because nothing stops the page being freed and
-            // even reused. That doesn't matter though - worst case the
-            // migration fails, we retry and find the va_space doesn't match.
-            if (uvm_pmm_devmem_page_to_va_space(page) == va_space) {
-                if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
-                    retry = true;
+                // This check is racy because nothing stops the page being freed and
+                // even reused. That doesn't matter though - worst case the
+                // migration fails, we retry and find the va_space doesn't match.
+                if (uvm_pmm_devmem_page_to_va_space(page) == va_space) {
+                    if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
+                        retry = true;
+                }
             }
-        }
-    } while (retry);
+        } while (retry);
+    }
 
     uvm_range_tree_for_each(node, &va_space->hmm.blocks) {
         va_block = hmm_va_block_from_node(node);
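-- 
For reviewers, a standalone sketch of why the existing
uvm_hmm_is_enabled() check is not sufficient on its own. This is not
UVM code: the struct layouts and the unregister_gpu()/main() harness
below are illustrative stand-ins. It demonstrates that "HMM enabled" is
a per-va_space property while devmem is a per-parent-GPU pointer that
is only non-NULL when devmem init succeeded, so the two can disagree
and the per-GPU pointer must be checked separately:

  /* Standalone illustration, not UVM code; build with: cc sketch.c */
  #include <stdio.h>
  #include <stdbool.h>
  #include <stddef.h>

  struct devmem { unsigned long range_start; }; /* stand-in for the real devmem */
  struct parent_gpu { struct devmem *devmem; }; /* NULL unless devmem init succeeded */
  struct gpu { struct parent_gpu *parent; };
  struct va_space { bool hmm_enabled; };        /* per-VA-space property */

  static void unregister_gpu(struct va_space *vas, struct gpu *gpu)
  {
      if (!vas->hmm_enabled)
          return;

      /* The fix in this patch, in miniature: guard the per-GPU pointer
       * instead of assuming HMM-enabled implies devmem was set up. */
      if (gpu->parent->devmem)
          printf("evict device-private pages from %#lx\n",
                 gpu->parent->devmem->range_start);
      else
          printf("devmem never initialized: skip eviction\n");
  }

  int main(void)
  {
      struct va_space vas = { .hmm_enabled = true };
      struct devmem dm = { .range_start = 0x10000000UL };
      struct parent_gpu p_ok = { .devmem = &dm };
      struct parent_gpu p_bad = { .devmem = NULL }; /* init failed or skipped */
      struct gpu gpu_ok = { .parent = &p_ok };
      struct gpu gpu_bad = { .parent = &p_bad };

      unregister_gpu(&vas, &gpu_ok);  /* HMM on, devmem present: evicts */
      unregister_gpu(&vas, &gpu_bad); /* HMM on, devmem NULL: pre-fix code crashed here */
      return 0;
  }

With gpu_bad, the pre-fix code path would have dereferenced the NULL
devmem pointer exactly as in the oops quoted above; the guarded version
falls through to block unregistration instead.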