From c4b11c08704d679c50417d49a7c781204f338b68 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 10 Jun 2026 21:34:12 +0200 Subject: [PATCH 1/4] TST: Mark tests as thread-unsafe or limit the number of threads - thread_unsafe: nvml init ref-count, graphMem attr, mock-based tests, OpenGL, peer-access pool state, multiprocessing warning, program-cache race reproduction, and functools.cache mutation tests - parallel_threads_limit: IPC / worker-pool tests that spawn subprocesses or open file descriptors (limit 4), example tests (limit 8), and the event-registration test whose timeouts are slow Signed-off-by: Sebastian Berg --- cuda_bindings/tests/nvml/test_init.py | 1 + cuda_bindings/tests/test_cuda.py | 1 + cuda_core/tests/example_tests/test_basic_examples.py | 1 + cuda_core/tests/memory_ipc/test_errors.py | 4 ++++ cuda_core/tests/memory_ipc/test_event_ipc.py | 4 ++++ cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py | 3 +++ cuda_core/tests/memory_ipc/test_leaks.py | 2 ++ cuda_core/tests/memory_ipc/test_memory_ipc.py | 3 +++ cuda_core/tests/memory_ipc/test_peer_access.py | 3 +++ cuda_core/tests/memory_ipc/test_send_buffers.py | 3 +++ cuda_core/tests/memory_ipc/test_serialize.py | 3 +++ cuda_core/tests/memory_ipc/test_workerpool.py | 3 +++ cuda_core/tests/system/test_system_device.py | 1 + cuda_core/tests/test_graphics.py | 3 +++ cuda_core/tests/test_memory.py | 1 + cuda_core/tests/test_memory_peer_access.py | 2 ++ cuda_core/tests/test_multiprocessing_warning.py | 5 +++++ cuda_core/tests/test_program_cache.py | 1 + cuda_pathfinder/tests/test_find_nvidia_binaries.py | 1 + 19 files changed, 45 insertions(+) diff --git a/cuda_bindings/tests/nvml/test_init.py b/cuda_bindings/tests/nvml/test_init.py index 4c94dc26a3e..19e573c9cc6 100644 --- a/cuda_bindings/tests/nvml/test_init.py +++ b/cuda_bindings/tests/nvml/test_init.py @@ -42,6 +42,7 @@ def get_architecture_name(arch): @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows") 
+@pytest.mark.thread_unsafe(reason="nvml init affects other threads") def test_init_ref_count(): """ Verifies that we can call NVML shutdown and init(2) multiple times, and that ref counting works diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index 959534cbbdf..d8d4f26e9c0 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -456,6 +456,7 @@ def test_cuda_mem_range_attr(device): @pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason="Mempool for graphs not supported") +@pytest.mark.thread_unsafe(reason="used high memory can be higher if threaded.") def test_cuda_graphMem_attr(device): err, stream = cuda.cuStreamCreate(0) assert err == cuda.CUresult.CUDA_SUCCESS diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index 43fab4241db..c8d15677a54 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -100,6 +100,7 @@ def has_recent_memory_pool_support() -> bool: @pytest.mark.parametrize("example", sample_files) +@pytest.mark.parallel_threads_limit(8) def test_example(example): example_path = os.path.join(samples_path, example) has_package_requirements_or_skip(example_path) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index cc36575ff45..40cbcc2826b 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -17,6 +17,10 @@ POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + + def test_outer_timeout_marker_is_applied(request): """Verify that memory_ipc/conftest.py applies the outer pytest-timeout marker. 
diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index 48985e67b58..e3cefe6a211 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -16,6 +16,10 @@ NBYTES = 64 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + + @pytest.mark.skipif(Device().compute_capability.major < 7, reason="__nanosleep is only available starting Volta (sm70)") class TestEventIpc: """Check the basic usage of IPC-enabled events with a latch kernel.""" diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index 8d450fa8e3f..eaa6ddec92f 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -24,6 +24,9 @@ ENABLE_LOGGING = False # Set True for test debugging and development +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + def child_main(log, queue): log.prefix = " child: " diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index 6fc4d03f142..c6e44824137 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -23,6 +23,8 @@ not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable" ) +pytestmark = pytest.mark.thread_unsafe(reason="Tests number of fds which is shared.") + @pytest.mark.flaky(reruns=2) @skip_if_unrunnable diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 0923fe28d8b..43d356789e7 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -14,6 +14,9 @@ NWORKERS = 2 NTASKS = 2 +# these tests spawn new
processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + class TestIpcMempool: @pytest.mark.flaky(reruns=2) diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 9e9e2879ae7..efb67b4cdb8 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -14,6 +14,9 @@ NBYTES = 64 POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + class TestPeerAccessNotPreservedOnImport: """ diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index cc7f45d67c2..01c9496e773 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -16,6 +16,9 @@ NTASKS = 7 POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + class TestIpcSendBuffers: @pytest.mark.flaky(reruns=2) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 2f0e429b103..4289de4b5a9 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -15,6 +15,9 @@ NBYTES = 64 POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + class TestObjectSerializationDirect: """ diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 609fadbcf3e..08d9bd79d92 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -16,6 +16,9 @@ NTASKS = 20 POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = 
pytest.mark.parallel_threads_limit(4) + class TestIpcWorkerPool: """ diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 4aa13840b48..c202407dc55 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -268,6 +268,7 @@ def test_unpack_bitmask_single_value(): _device._unpack_bitmask(1) +@pytest.mark.parallel_threads_limit(4) # timeouts are slow @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Events not supported on WSL or Windows") def test_register_events(): # This is not the world's greatest test. All of the events are pretty diff --git a/cuda_core/tests/test_graphics.py b/cuda_core/tests/test_graphics.py index 6f5877f76b0..e2b22a20c59 100644 --- a/cuda_core/tests/test_graphics.py +++ b/cuda_core/tests/test_graphics.py @@ -20,6 +20,9 @@ ) from cuda.core.utils import StridedMemoryView +# TODO(seberg): Maybe some of these tests can be made threadable? +pytestmark = pytest.mark.thread_unsafe(reason="OpenGL context not threadable") + # --------------------------------------------------------------------------- # GL context + buffer helpers # --------------------------------------------------------------------------- diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 920cd4bb0fd..35592485c94 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1323,6 +1323,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): ) +@pytest.mark.thread_unsafe(reason="Uses mock.") def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index 71beb459143..68c32ce69c6 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ 
b/cuda_core/tests/test_memory_peer_access.py @@ -12,6 +12,8 @@ NBYTES = 1024 +pytestmark = pytest.mark.thread_unsafe(reason="peer access tests mutate process-global CUDA memory-pool access state") + def test_peer_access_basic(mempool_device_x2): """Basic tests for dmr.peer_accessible_by.""" diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py index 0f96e0abfbc..94a671ff2f8 100644 --- a/cuda_core/tests/test_multiprocessing_warning.py +++ b/cuda_core/tests/test_multiprocessing_warning.py @@ -12,12 +12,17 @@ import warnings from unittest.mock import patch +import pytest + from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions from cuda.core._event import _reduce_event from cuda.core._memory._device_memory_resource import _deep_reduce_device_memory_resource from cuda.core._memory._ipc import _reduce_allocation_handle from cuda.core._utils.cuda_utils import check_multiprocessing_start_method, reset_fork_warning +# We could move these to a (session) fixtures +pytestmark = pytest.mark.thread_unsafe(reason="all tests use unittest.mock.patch") + def test_warn_on_fork_method_device_memory_resource(ipc_device): """Test that warning is emitted when DeviceMemoryResource is pickled with fork method.""" diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py index 01a39e0032c..963ec1cc04b 100644 --- a/cuda_core/tests/test_program_cache.py +++ b/cuda_core/tests/test_program_cache.py @@ -1927,6 +1927,7 @@ def test_filestream_cache_tracker_reconciles_after_external_drift(tmp_path): assert cache._tracked_size_bytes <= 1100 # actual on-disk is 'b' + 'c' or just 'c' +@pytest.mark.thread_unsafe(reason="already threaded and patches _file_stream") def test_filestream_cache_tracker_clamps_at_zero_under_delete_race(tmp_path): """Two-thread reproduction of the ``__delitem__`` vs ``_enforce_size_cap`` race. 
Thread A is mid-delete: it has stat'd the diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py index ec9740cd853..0f9e5ed31c1 100644 --- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py +++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py @@ -173,6 +173,7 @@ def test_find_binary_cache_negative_result(monkeypatch, mocker): @pytest.mark.usefixtures("clear_find_binary_cache") +@pytest.mark.thread_unsafe(reason="functools.cache may replace entry.") def test_caching_per_utility(): """Verify that different utilities have independent cache entries.""" nvdisasm1 = find_nvidia_binary_utility("nvdisasm") From 86629a453f4290e5dfbc7218a89f7c0fdc889890 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Sat, 13 Jun 2026 09:27:11 +0200 Subject: [PATCH 2/4] TST: Add two more thread-unsafe markers --- cuda_bindings/tests/nvml/test_device.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_bindings/tests/nvml/test_device.py b/cuda_bindings/tests/nvml/test_device.py index 4b5f6a1e160..e571de67a76 100644 --- a/cuda_bindings/tests/nvml/test_device.py +++ b/cuda_bindings/tests/nvml/test_device.py @@ -120,6 +120,7 @@ def test_read_prm_counters(all_devices): assert len(read_counters) == 5 +@pytest.mark.thread_unsafe(reason="API appears to be thread-unsafe (2026-06)") def test_read_write_prm(all_devices): for device in all_devices: # Docs say supported in BLACKWELL or later From 6ee5f86169c96c9a2fb20ba2b0bdab16e170f74b Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 16 Jun 2026 11:13:27 +0200 Subject: [PATCH 3/4] TST: Mark `mocker` as a thread-unsafe fixture --- cuda_pathfinder/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml index 9f0a955cc01..0905f0b84ad 100644 --- a/cuda_pathfinder/pyproject.toml +++ b/cuda_pathfinder/pyproject.toml @@ -102,6 +102,7 @@ git_describe_command = [ "git", "describe", "--dirty", "--tags", 
"--long", "--ma [tool.pytest.ini_options] addopts = "--showlocals" +thread_unsafe_fixtures = ['mocker'] [tool.mypy] # Try to keep the mypy configuration similar between the subprojects From 1f7783f9f3d0a581553fabfde47cdc6a34680f4d Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 16 Jun 2026 11:18:07 +0200 Subject: [PATCH 4/4] Remove unsafe marker from test_multiprocessing_warning (can do the fixtures) --- cuda_core/tests/test_multiprocessing_warning.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py index 94a671ff2f8..0f96e0abfbc 100644 --- a/cuda_core/tests/test_multiprocessing_warning.py +++ b/cuda_core/tests/test_multiprocessing_warning.py @@ -12,17 +12,12 @@ import warnings from unittest.mock import patch -import pytest - from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions from cuda.core._event import _reduce_event from cuda.core._memory._device_memory_resource import _deep_reduce_device_memory_resource from cuda.core._memory._ipc import _reduce_allocation_handle from cuda.core._utils.cuda_utils import check_multiprocessing_start_method, reset_fork_warning -# We could move these to a (session) fixtures -pytestmark = pytest.mark.thread_unsafe(reason="all tests use unittest.mock.patch") - def test_warn_on_fork_method_device_memory_resource(ipc_device): """Test that warning is emitted when DeviceMemoryResource is pickled with fork method."""