From c4b11c08704d679c50417d49a7c781204f338b68 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Wed, 10 Jun 2026 21:34:12 +0200
Subject: [PATCH 1/4] TST: Mark tests as thread-unsafe or limit the number of
 threads

- thread_unsafe: nvml init ref-count, graphMem attr, mock-based tests,
  OpenGL, peer-access pool state, multiprocessing warning, program-cache
  race reproduction, and functools.cache mutation tests
- parallel_threads_limit: IPC / worker-pool tests that spawn subprocesses
  or open file descriptors (limit 4), example tests (limit 8), and the
  event-registration test whose timeouts are slow

Signed-off-by: Sebastian Berg <sebastianb@nvidia.com>
---
 cuda_bindings/tests/nvml/test_init.py                   | 1 +
 cuda_bindings/tests/test_cuda.py                        | 1 +
 cuda_core/tests/example_tests/test_basic_examples.py    | 1 +
 cuda_core/tests/memory_ipc/test_errors.py               | 4 ++++
 cuda_core/tests/memory_ipc/test_event_ipc.py            | 4 ++++
 cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py | 3 +++
 cuda_core/tests/memory_ipc/test_leaks.py                | 2 ++
 cuda_core/tests/memory_ipc/test_memory_ipc.py           | 3 +++
 cuda_core/tests/memory_ipc/test_peer_access.py          | 3 +++
 cuda_core/tests/memory_ipc/test_send_buffers.py         | 3 +++
 cuda_core/tests/memory_ipc/test_serialize.py            | 3 +++
 cuda_core/tests/memory_ipc/test_workerpool.py           | 3 +++
 cuda_core/tests/system/test_system_device.py            | 1 +
 cuda_core/tests/test_graphics.py                        | 3 +++
 cuda_core/tests/test_memory.py                          | 1 +
 cuda_core/tests/test_memory_peer_access.py              | 2 ++
 cuda_core/tests/test_multiprocessing_warning.py         | 5 +++++
 cuda_core/tests/test_program_cache.py                   | 1 +
 cuda_pathfinder/tests/test_find_nvidia_binaries.py      | 1 +
 19 files changed, 45 insertions(+)

diff --git a/cuda_bindings/tests/nvml/test_init.py b/cuda_bindings/tests/nvml/test_init.py
index 4c94dc26a3e..19e573c9cc6 100644
--- a/cuda_bindings/tests/nvml/test_init.py
+++ b/cuda_bindings/tests/nvml/test_init.py
@@ -42,6 +42,7 @@ def get_architecture_name(arch):
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
+@pytest.mark.thread_unsafe(reason="nvml init affects other threads")
 def test_init_ref_count():
     """
     Verifies that we can call NVML shutdown and init(2) multiple times, and that ref counting works
diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
index 959534cbbdf..d8d4f26e9c0 100644
--- a/cuda_bindings/tests/test_cuda.py
+++ b/cuda_bindings/tests/test_cuda.py
@@ -456,6 +456,7 @@ def test_cuda_mem_range_attr(device):
 
 
 @pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason="Mempool for graphs not supported")
+@pytest.mark.thread_unsafe(reason="used high memory can be higher if threaded.")
 def test_cuda_graphMem_attr(device):
     err, stream = cuda.cuStreamCreate(0)
     assert err == cuda.CUresult.CUDA_SUCCESS
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
index 43fab4241db..c8d15677a54 100644
--- a/cuda_core/tests/example_tests/test_basic_examples.py
+++ b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -100,6 +100,7 @@ def has_recent_memory_pool_support() -> bool:
 
 
 @pytest.mark.parametrize("example", sample_files)
+@pytest.mark.parallel_threads_limit(8)
 def test_example(example):
     example_path = os.path.join(samples_path, example)
     has_package_requirements_or_skip(example_path)
diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py
index cc36575ff45..40cbcc2826b 100644
--- a/cuda_core/tests/memory_ipc/test_errors.py
+++ b/cuda_core/tests/memory_ipc/test_errors.py
@@ -17,6 +17,10 @@
 POOL_SIZE = 2097152
 
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
+
 def test_outer_timeout_marker_is_applied(request):
     """Verify that memory_ipc/conftest.py applies the outer pytest-timeout marker.
 
diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py
index 48985e67b58..e3cefe6a211 100644
--- a/cuda_core/tests/memory_ipc/test_event_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -16,6 +16,10 @@
 NBYTES = 64
 
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
+
 @pytest.mark.skipif(Device().compute_capability.major < 7, reason="__nanosleep is only available starting Volta (sm70)")
 class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
index 8d450fa8e3f..eaa6ddec92f 100644
--- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
+++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
@@ -24,6 +24,9 @@
 
 ENABLE_LOGGING = False  # Set True for test debugging and development
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 def child_main(log, queue):
     log.prefix = " child: "
diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py
index 6fc4d03f142..c6e44824137 100644
--- a/cuda_core/tests/memory_ipc/test_leaks.py
+++ b/cuda_core/tests/memory_ipc/test_leaks.py
@@ -23,6 +23,8 @@
     not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable"
 )
 
+pytestmark = pytest.mark.thread_unsafe(reason="Tests number of fds which is shared.")
+
 
 @pytest.mark.flaky(reruns=2)
 @skip_if_unrunnable
diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py
index 0923fe28d8b..43d356789e7 100644
--- a/cuda_core/tests/memory_ipc/test_memory_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py
@@ -14,6 +14,9 @@
 NWORKERS = 2
 NTASKS = 2
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcMempool:
     @pytest.mark.flaky(reruns=2)
diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py
index 9e9e2879ae7..efb67b4cdb8 100644
--- a/cuda_core/tests/memory_ipc/test_peer_access.py
+++ b/cuda_core/tests/memory_ipc/test_peer_access.py
@@ -14,6 +14,9 @@
 NBYTES = 64
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestPeerAccessNotPreservedOnImport:
     """
diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py
index cc7f45d67c2..01c9496e773 100644
--- a/cuda_core/tests/memory_ipc/test_send_buffers.py
+++ b/cuda_core/tests/memory_ipc/test_send_buffers.py
@@ -16,6 +16,9 @@
 NTASKS = 7
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcSendBuffers:
     @pytest.mark.flaky(reruns=2)
diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py
index 2f0e429b103..4289de4b5a9 100644
--- a/cuda_core/tests/memory_ipc/test_serialize.py
+++ b/cuda_core/tests/memory_ipc/test_serialize.py
@@ -15,6 +15,9 @@
 NBYTES = 64
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestObjectSerializationDirect:
     """
diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py
index 609fadbcf3e..08d9bd79d92 100644
--- a/cuda_core/tests/memory_ipc/test_workerpool.py
+++ b/cuda_core/tests/memory_ipc/test_workerpool.py
@@ -16,6 +16,9 @@
 NTASKS = 20
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcWorkerPool:
     """
diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py
index 4aa13840b48..c202407dc55 100644
--- a/cuda_core/tests/system/test_system_device.py
+++ b/cuda_core/tests/system/test_system_device.py
@@ -268,6 +268,7 @@ def test_unpack_bitmask_single_value():
         _device._unpack_bitmask(1)
 
 
+@pytest.mark.parallel_threads_limit(4)  # timeouts are slow
 @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Events not supported on WSL or Windows")
 def test_register_events():
     # This is not the world's greatest test.  All of the events are pretty
diff --git a/cuda_core/tests/test_graphics.py b/cuda_core/tests/test_graphics.py
index 6f5877f76b0..e2b22a20c59 100644
--- a/cuda_core/tests/test_graphics.py
+++ b/cuda_core/tests/test_graphics.py
@@ -20,6 +20,9 @@
 )
 from cuda.core.utils import StridedMemoryView
 
+# TODO(seberg): Maybe some of these tests can be made threadable?
+pytestmark = pytest.mark.thread_unsafe(reason="OpenGL context not threadable")
+
 # ---------------------------------------------------------------------------
 # GL context + buffer helpers
 # ---------------------------------------------------------------------------
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 920cd4bb0fd..35592485c94 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1323,6 +1323,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda):
         )
 
 
+@pytest.mark.thread_unsafe(reason="Uses mock.")
 def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda):
     """host_numa with None raises RuntimeError when NUMA ID cannot be determined."""
     from unittest.mock import MagicMock, patch
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index 71beb459143..68c32ce69c6 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -12,6 +12,8 @@
 
 NBYTES = 1024
 
+pytestmark = pytest.mark.thread_unsafe(reason="peer access tests mutate process-global CUDA memory-pool access state")
+
 
 def test_peer_access_basic(mempool_device_x2):
     """Basic tests for dmr.peer_accessible_by."""
diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py
index 0f96e0abfbc..94a671ff2f8 100644
--- a/cuda_core/tests/test_multiprocessing_warning.py
+++ b/cuda_core/tests/test_multiprocessing_warning.py
@@ -12,12 +12,17 @@
 import warnings
 from unittest.mock import patch
 
+import pytest
+
 from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions
 from cuda.core._event import _reduce_event
 from cuda.core._memory._device_memory_resource import _deep_reduce_device_memory_resource
 from cuda.core._memory._ipc import _reduce_allocation_handle
 from cuda.core._utils.cuda_utils import check_multiprocessing_start_method, reset_fork_warning
 
+# We could move these to a (session) fixtures
+pytestmark = pytest.mark.thread_unsafe(reason="all tests use unittest.mock.patch")
+
 
 def test_warn_on_fork_method_device_memory_resource(ipc_device):
     """Test that warning is emitted when DeviceMemoryResource is pickled with fork method."""
diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
index 01a39e0032c..963ec1cc04b 100644
--- a/cuda_core/tests/test_program_cache.py
+++ b/cuda_core/tests/test_program_cache.py
@@ -1927,6 +1927,7 @@ def test_filestream_cache_tracker_reconciles_after_external_drift(tmp_path):
         assert cache._tracked_size_bytes <= 1100  # actual on-disk is 'b' + 'c' or just 'c'
 
 
+@pytest.mark.thread_unsafe(reason="already threaded and patches _file_stream")
 def test_filestream_cache_tracker_clamps_at_zero_under_delete_race(tmp_path):
     """Two-thread reproduction of the ``__delitem__`` vs
     ``_enforce_size_cap`` race. Thread A is mid-delete: it has stat'd the
diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
index ec9740cd853..0f9e5ed31c1 100644
--- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py
+++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
@@ -173,6 +173,7 @@ def test_find_binary_cache_negative_result(monkeypatch, mocker):
 
 
 @pytest.mark.usefixtures("clear_find_binary_cache")
+@pytest.mark.thread_unsafe(reason="functools.cache may replace entry.")
 def test_caching_per_utility():
     """Verify that different utilities have independent cache entries."""
     nvdisasm1 = find_nvidia_binary_utility("nvdisasm")

From 86629a453f4290e5dfbc7218a89f7c0fdc889890 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Sat, 13 Jun 2026 09:27:11 +0200
Subject: [PATCH 2/4] TST: Add one more thread-unsafe marker

---
 cuda_bindings/tests/nvml/test_device.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_bindings/tests/nvml/test_device.py b/cuda_bindings/tests/nvml/test_device.py
index 4b5f6a1e160..e571de67a76 100644
--- a/cuda_bindings/tests/nvml/test_device.py
+++ b/cuda_bindings/tests/nvml/test_device.py
@@ -120,6 +120,7 @@ def test_read_prm_counters(all_devices):
         assert len(read_counters) == 5
 
 
+@pytest.mark.thread_unsafe(reason="API appears to be thread-unsafe (2026-06)")
 def test_read_write_prm(all_devices):
     for device in all_devices:
         # Docs say supported in BLACKWELL or later

From 6ee5f86169c96c9a2fb20ba2b0bdab16e170f74b Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Tue, 16 Jun 2026 11:13:27 +0200
Subject: [PATCH 3/4] TST: Mark `mocker` as a thread-unsafe fixture

---
 cuda_pathfinder/pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml
index 9f0a955cc01..0905f0b84ad 100644
--- a/cuda_pathfinder/pyproject.toml
+++ b/cuda_pathfinder/pyproject.toml
@@ -102,6 +102,7 @@ git_describe_command = [ "git", "describe", "--dirty", "--tags", "--long", "--ma
 
 [tool.pytest.ini_options]
 addopts = "--showlocals"
+thread_unsafe_fixtures = ['mocker']
 
 [tool.mypy]
 # Try to keep the mypy configuration similar between the subprojects

From 1f7783f9f3d0a581553fabfde47cdc6a34680f4d Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Tue, 16 Jun 2026 11:18:07 +0200
Subject: [PATCH 4/4] Remove unsafe marker from test_multiprocessing_warning
 (can do the fixtures)

---
 cuda_core/tests/test_multiprocessing_warning.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py
index 94a671ff2f8..0f96e0abfbc 100644
--- a/cuda_core/tests/test_multiprocessing_warning.py
+++ b/cuda_core/tests/test_multiprocessing_warning.py
@@ -12,17 +12,12 @@
 import warnings
 from unittest.mock import patch
 
-import pytest
-
 from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions
 from cuda.core._event import _reduce_event
 from cuda.core._memory._device_memory_resource import _deep_reduce_device_memory_resource
 from cuda.core._memory._ipc import _reduce_allocation_handle
 from cuda.core._utils.cuda_utils import check_multiprocessing_start_method, reset_fork_warning
 
-# We could move these to a (session) fixtures
-pytestmark = pytest.mark.thread_unsafe(reason="all tests use unittest.mock.patch")
-
 
 def test_warn_on_fork_method_device_memory_resource(ipc_device):
     """Test that warning is emitted when DeviceMemoryResource is pickled with fork method."""