From 9bf37ed8b64a4b09b8633eeaeb15561bfc6e6e7e Mon Sep 17 00:00:00 2001
From: Minh Vu
Date: Thu, 18 Jun 2026 00:23:59 +0200
Subject: [PATCH] Avoid freeing failed VMM grow reservation

---
Reviewer note (text between "---" and the first "diff --git" line is not part
of the commit): the old code took a single branch when cuMemAddressReserve
either failed or returned a non-adjacent address, and that branch always called
cuMemAddressFree(new_ptr, ...). This patch splits the two cases:
  * reserve failed   -> re-raise the listed unrecoverable errors, otherwise go
                        straight to the slow path without freeing new_ptr;
  * reserve succeeded at a different address -> free that reservation, then
                        take the slow path (unchanged behaviour).
The two added tests pin the exact driver-call sequence for each case.

 .../core/_memory/_virtual_memory_resource.py |  15 ++-
 cuda_core/tests/test_memory.py               | 101 ++++++++++++++++++
 2 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py
index f30e6e3838d..d226f5eec6c 100644
--- a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py
+++ b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py
@@ -265,7 +265,8 @@ def modify_allocation(
             0,
         )

-        if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size):
+        expected_ptr = int(buf.handle) + aligned_prev_size
+        if res != driver.CUresult.CUDA_SUCCESS:
             # Check for specific errors that are not recoverable with the slow path
             if res in (
                 driver.CUresult.CUDA_ERROR_INVALID_VALUE,
@@ -274,15 +275,21 @@ def modify_allocation(
                 driver.CUresult.CUDA_ERROR_NOT_SUPPORTED,
             ):
                 raise_if_driver_error(res)
+            # Fallback: couldn't reserve contiguously, need full remapping
+            return self._grow_allocation_slow_path(
+                buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align
+            )
+
+        if new_ptr != expected_ptr:
             (res2,) = driver.cuMemAddressFree(new_ptr, aligned_additional_size)
             raise_if_driver_error(res2)
             # Fallback: couldn't extend contiguously, need full remapping
             return self._grow_allocation_slow_path(
                 buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align
             )
-        else:
-            # Success! We can extend the VA range contiguously
-            return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr)
+
+        # Success! We can extend the VA range contiguously
+        return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr)

     def _grow_allocation_fast_path(
         self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, new_ptr: int
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 920cd4bb0fd..1b6ca35489b 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1035,6 +1035,107 @@ def __init__(self, size):
     assert ("set_access", new_ptr, aligned_additional, 1) in calls


+def _make_mock_vmm_resource():
+    vmm_mr = VirtualMemoryResource.__new__(VirtualMemoryResource)
+    vmm_mr.device = type("FakeDevice", (), {"device_id": 0})()
+    vmm_mr.config = VirtualMemoryResourceOptions(handle_type="win32_kmt" if IS_WINDOWS else "posix_fd")
+    return vmm_mr
+
+
+def test_vmm_allocator_grow_allocation_does_not_free_failed_adjacent_reservation(monkeypatch):
+    vmm_mr = _make_mock_vmm_resource()
+
+    SUCCESS = driver.CUresult.CUDA_SUCCESS
+    ERROR = driver.CUresult.CUDA_ERROR_OUT_OF_MEMORY
+    base_ptr = 0x10_0000
+    old_size = 2048
+    new_size = 4096
+    granularity = 1024
+    stale_ptr = 0xBAD
+    calls = []
+
+    class FakeBuffer:
+        handle = base_ptr
+        size = old_size
+
+    def fake_get_allocation_granularity(_, _granularity_flag):
+        calls.append(("granularity",))
+        return (SUCCESS, granularity)
+
+    def fake_addr_reserve(size, align, hint, flags):
+        calls.append(("reserve", size, align, hint, flags))
+        return (ERROR, stale_ptr)
+
+    def fake_addr_free(ptr, size):
+        calls.append(("addr_free", ptr, size))
+        return (SUCCESS,)
+
+    def fake_slow_path(self, buf, result_size, prop, aligned_additional_size, total_aligned_size, addr_align):
+        calls.append(("slow_path", result_size, aligned_additional_size, total_aligned_size, addr_align))
+        return buf
+
+    monkeypatch.setattr(driver, "cuMemGetAllocationGranularity", fake_get_allocation_granularity)
+    monkeypatch.setattr(driver, "cuMemAddressReserve", fake_addr_reserve)
+    monkeypatch.setattr(driver, "cuMemAddressFree", fake_addr_free)
+    monkeypatch.setattr(VirtualMemoryResource, "_grow_allocation_slow_path", fake_slow_path)
+
+    result = vmm_mr.modify_allocation(FakeBuffer(), new_size)
+
+    assert isinstance(result, FakeBuffer)
+    assert calls == [
+        ("granularity",),
+        ("reserve", 2048, granularity, base_ptr + old_size, 0),
+        ("slow_path", new_size, 2048, 4096, granularity),
+    ]
+
+
+def test_vmm_allocator_grow_allocation_frees_noncontiguous_adjacent_reservation(monkeypatch):
+    vmm_mr = _make_mock_vmm_resource()
+
+    SUCCESS = driver.CUresult.CUDA_SUCCESS
+    base_ptr = 0x10_0000
+    old_size = 2048
+    new_size = 4096
+    granularity = 1024
+    noncontiguous_ptr = base_ptr + 4 * granularity
+    calls = []
+
+    class FakeBuffer:
+        handle = base_ptr
+        size = old_size
+
+    def fake_get_allocation_granularity(_, _granularity_flag):
+        calls.append(("granularity",))
+        return (SUCCESS, granularity)
+
+    def fake_addr_reserve(size, align, hint, flags):
+        calls.append(("reserve", size, align, hint, flags))
+        return (SUCCESS, noncontiguous_ptr)
+
+    def fake_addr_free(ptr, size):
+        calls.append(("addr_free", ptr, size))
+        return (SUCCESS,)
+
+    def fake_slow_path(self, buf, result_size, prop, aligned_additional_size, total_aligned_size, addr_align):
+        calls.append(("slow_path", result_size, aligned_additional_size, total_aligned_size, addr_align))
+        return buf
+
+    monkeypatch.setattr(driver, "cuMemGetAllocationGranularity", fake_get_allocation_granularity)
+    monkeypatch.setattr(driver, "cuMemAddressReserve", fake_addr_reserve)
+    monkeypatch.setattr(driver, "cuMemAddressFree", fake_addr_free)
+    monkeypatch.setattr(VirtualMemoryResource, "_grow_allocation_slow_path", fake_slow_path)
+
+    result = vmm_mr.modify_allocation(FakeBuffer(), new_size)
+
+    assert isinstance(result, FakeBuffer)
+    assert calls == [
+        ("granularity",),
+        ("reserve", 2048, granularity, base_ptr + old_size, 0),
+        ("addr_free", noncontiguous_ptr, 2048),
+        ("slow_path", new_size, 2048, 4096, granularity),
+    ]
+
+
 def test_vmm_allocator_rdma_unsupported_exception():
     """Test that VirtualMemoryResource throws an exception when RDMA is requested but device doesn't support it.