NVIDIA · seberg · Jun 10, 2026 · Jun 13, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/cuda_bindings/tests/nvml/test_device.py b/cuda_bindings/tests/nvml/test_device.py
@@ -120,6 +120,7 @@ def test_read_prm_counters(all_devices):
         assert len(read_counters) == 5
 
 
+@pytest.mark.thread_unsafe(reason="API appears to be thread-unsafe (2026-06)")
 def test_read_write_prm(all_devices):
     for device in all_devices:
         # Docs say supported in BLACKWELL or later

diff --git a/cuda_bindings/tests/nvml/test_init.py b/cuda_bindings/tests/nvml/test_init.py
@@ -42,6 +42,7 @@ def get_architecture_name(arch):
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
+@pytest.mark.thread_unsafe(reason="nvml init affects other threads")
 def test_init_ref_count():
     """
     Verifies that we can call NVML shutdown and init(2) multiple times, and that ref counting works

diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
@@ -456,6 +456,7 @@ def test_cuda_mem_range_attr(device):
 
 
 @pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason="Mempool for graphs not supported")
+@pytest.mark.thread_unsafe(reason="used high memory can be higher if threaded.")
 def test_cuda_graphMem_attr(device):
     err, stream = cuda.cuStreamCreate(0)
     assert err == cuda.CUresult.CUDA_SUCCESS

diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -100,6 +100,7 @@ def has_recent_memory_pool_support() -> bool:
 
 
 @pytest.mark.parametrize("example", sample_files)
+@pytest.mark.parallel_threads_limit(8)
 def test_example(example):
     example_path = os.path.join(samples_path, example)
     has_package_requirements_or_skip(example_path)

diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py
@@ -17,6 +17,10 @@
 POOL_SIZE = 2097152
 
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
+
 def test_outer_timeout_marker_is_applied(request):
     """Verify that memory_ipc/conftest.py applies the outer pytest-timeout marker.
 

diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -16,6 +16,10 @@
 NBYTES = 64
 
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
+
 @pytest.mark.skipif(Device().compute_capability.major < 7, reason="__nanosleep is only available starting Volta (sm70)")
 class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""

diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
@@ -24,6 +24,9 @@
 
 ENABLE_LOGGING = False  # Set True for test debugging and development
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 def child_main(log, queue):
     log.prefix = " child: "

diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py
@@ -23,6 +23,8 @@
     not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable"
 )
 
+pytestmark = pytest.mark.thread_unsafe(reason="Tests number of fds which is shared.")
+
 
 @pytest.mark.flaky(reruns=2)
 @skip_if_unrunnable

diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py
@@ -14,6 +14,9 @@
 NWORKERS = 2
 NTASKS = 2
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcMempool:
     @pytest.mark.flaky(reruns=2)

diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py
@@ -14,6 +14,9 @@
 NBYTES = 64
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestPeerAccessNotPreservedOnImport:
     """

diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py
@@ -16,6 +16,9 @@
 NTASKS = 7
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcSendBuffers:
     @pytest.mark.flaky(reruns=2)

diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py
@@ -15,6 +15,9 @@
 NBYTES = 64
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestObjectSerializationDirect:
     """

diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py
@@ -16,6 +16,9 @@
 NTASKS = 20
 POOL_SIZE = 2097152
 
+# these tests spawn new processes and files which fails for very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcWorkerPool:
     """

diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py
@@ -268,6 +268,7 @@ def test_unpack_bitmask_single_value():
         _device._unpack_bitmask(1)
 
 
+@pytest.mark.parallel_threads_limit(4)  # timeouts are slow
 @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Events not supported on WSL or Windows")
 def test_register_events():
     # This is not the world's greatest test.  All of the events are pretty

diff --git a/cuda_core/tests/test_graphics.py b/cuda_core/tests/test_graphics.py
@@ -20,6 +20,9 @@
 )
 from cuda.core.utils import StridedMemoryView
 
+# TODO(seberg): Maybe some of these tests can be made threadable?
+pytestmark = pytest.mark.thread_unsafe(reason="OpenGL context not threadable")
+
 # ---------------------------------------------------------------------------
 # GL context + buffer helpers
 # ---------------------------------------------------------------------------

diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
@@ -1323,6 +1323,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda):
         )
 
 
+@pytest.mark.thread_unsafe(reason="Uses mock.")
 def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda):
     """host_numa with None raises RuntimeError when NUMA ID cannot be determined."""
     from unittest.mock import MagicMock, patch

diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
@@ -12,6 +12,8 @@
 
 NBYTES = 1024
 
+pytestmark = pytest.mark.thread_unsafe(reason="peer access tests mutate process-global CUDA memory-pool access state")
+
 
 def test_peer_access_basic(mempool_device_x2):
     """Basic tests for dmr.peer_accessible_by."""

diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
@@ -1927,6 +1927,7 @@ def test_filestream_cache_tracker_reconciles_after_external_drift(tmp_path):
         assert cache._tracked_size_bytes <= 1100  # actual on-disk is 'b' + 'c' or just 'c'
 
 
+@pytest.mark.thread_unsafe(reason="already threaded and patches _file_stream")
 def test_filestream_cache_tracker_clamps_at_zero_under_delete_race(tmp_path):
     """Two-thread reproduction of the ``__delitem__`` vs
     ``_enforce_size_cap`` race. Thread A is mid-delete: it has stat'd the

diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml
@@ -102,6 +102,7 @@ git_describe_command = [ "git", "describe", "--dirty", "--tags", "--long", "--ma
 
 [tool.pytest.ini_options]
 addopts = "--showlocals"
+thread_unsafe_fixtures = ['mocker']
 
 [tool.mypy]
 # Try to keep the mypy configuration similar between the subprojects

diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
@@ -173,6 +173,7 @@ def test_find_binary_cache_negative_result(monkeypatch, mocker):
 
 
 @pytest.mark.usefixtures("clear_find_binary_cache")
+@pytest.mark.thread_unsafe(reason="functools.cache may replace entry.")
 def test_caching_per_utility():
     """Verify that different utilities have independent cache entries."""
     nvdisasm1 = find_nvidia_binary_utility("nvdisasm")