diff --git a/cuda_bindings/tests/nvml/test_device.py b/cuda_bindings/tests/nvml/test_device.py
index 4b5f6a1e160..e571de67a76 100644
--- a/cuda_bindings/tests/nvml/test_device.py
+++ b/cuda_bindings/tests/nvml/test_device.py
@@ -120,6 +120,7 @@ def test_read_prm_counters(all_devices):
         assert len(read_counters) == 5
 
 
+@pytest.mark.thread_unsafe(reason="API appears to be thread-unsafe (2026-06)")
 def test_read_write_prm(all_devices):
     for device in all_devices:
         # Docs say supported in BLACKWELL or later
diff --git a/cuda_bindings/tests/nvml/test_init.py b/cuda_bindings/tests/nvml/test_init.py
index 4c94dc26a3e..19e573c9cc6 100644
--- a/cuda_bindings/tests/nvml/test_init.py
+++ b/cuda_bindings/tests/nvml/test_init.py
@@ -42,6 +42,7 @@ def get_architecture_name(arch):
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
+@pytest.mark.thread_unsafe(reason="nvml init affects other threads")
 def test_init_ref_count():
     """
     Verifies that we can call NVML shutdown and init(2) multiple times, and that ref counting works
diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
index 959534cbbdf..d8d4f26e9c0 100644
--- a/cuda_bindings/tests/test_cuda.py
+++ b/cuda_bindings/tests/test_cuda.py
@@ -456,6 +456,7 @@ def test_cuda_mem_range_attr(device):
 
 
 @pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason="Mempool for graphs not supported")
+@pytest.mark.thread_unsafe(reason="used high memory can be higher if threaded.")
 def test_cuda_graphMem_attr(device):
     err, stream = cuda.cuStreamCreate(0)
     assert err == cuda.CUresult.CUDA_SUCCESS
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
index 43fab4241db..c8d15677a54 100644
--- a/cuda_core/tests/example_tests/test_basic_examples.py
+++ b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -100,6 +100,7 @@ def has_recent_memory_pool_support() -> bool:
 
 
 @pytest.mark.parametrize("example", sample_files)
+@pytest.mark.parallel_threads_limit(8)
 def test_example(example):
     example_path = os.path.join(samples_path, example)
     has_package_requirements_or_skip(example_path)
diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py
index cc36575ff45..40cbcc2826b 100644
--- a/cuda_core/tests/memory_ipc/test_errors.py
+++ b/cuda_core/tests/memory_ipc/test_errors.py
@@ -17,6 +17,10 @@
 POOL_SIZE = 2097152
 
 
+# These tests spawn new processes and files, which fails when run with very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
+
 def test_outer_timeout_marker_is_applied(request):
     """Verify that memory_ipc/conftest.py applies the outer pytest-timeout marker.
 
diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py
index 48985e67b58..e3cefe6a211 100644
--- a/cuda_core/tests/memory_ipc/test_event_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -16,6 +16,10 @@
 NBYTES = 64
 
 
+# These tests spawn new processes and files, which fails when run with very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
+
 @pytest.mark.skipif(Device().compute_capability.major < 7, reason="__nanosleep is only available starting Volta (sm70)")
 class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
index 8d450fa8e3f..eaa6ddec92f 100644
--- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
+++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
@@ -24,6 +24,9 @@
 
 ENABLE_LOGGING = False  # Set True for test debugging and development
 
+# These tests spawn new processes and files, which fails when run with very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 def child_main(log, queue):
     log.prefix = " child: "
diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py
index 6fc4d03f142..c6e44824137 100644
--- a/cuda_core/tests/memory_ipc/test_leaks.py
+++ b/cuda_core/tests/memory_ipc/test_leaks.py
@@ -23,6 +23,8 @@
     not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable"
 )
 
+pytestmark = pytest.mark.thread_unsafe(reason="Tests number of fds which is shared.")
+
 
 @pytest.mark.flaky(reruns=2)
 @skip_if_unrunnable
diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py
index 0923fe28d8b..43d356789e7 100644
--- a/cuda_core/tests/memory_ipc/test_memory_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py
@@ -14,6 +14,9 @@
 NWORKERS = 2
 NTASKS = 2
 
+# These tests spawn new processes and files, which fails when run with very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcMempool:
     @pytest.mark.flaky(reruns=2)
diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py
index 9e9e2879ae7..efb67b4cdb8 100644
--- a/cuda_core/tests/memory_ipc/test_peer_access.py
+++ b/cuda_core/tests/memory_ipc/test_peer_access.py
@@ -14,6 +14,9 @@
 NBYTES = 64
 POOL_SIZE = 2097152
 
+# These tests spawn new processes and files, which fails when run with very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestPeerAccessNotPreservedOnImport:
     """
diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py
index cc7f45d67c2..01c9496e773 100644
--- a/cuda_core/tests/memory_ipc/test_send_buffers.py
+++ b/cuda_core/tests/memory_ipc/test_send_buffers.py
@@ -16,6 +16,9 @@
 NTASKS = 7
 POOL_SIZE = 2097152
 
+# These tests spawn new processes and files, which fails when run with very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcSendBuffers:
     @pytest.mark.flaky(reruns=2)
diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py
index 2f0e429b103..4289de4b5a9 100644
--- a/cuda_core/tests/memory_ipc/test_serialize.py
+++ b/cuda_core/tests/memory_ipc/test_serialize.py
@@ -15,6 +15,9 @@
 NBYTES = 64
 POOL_SIZE = 2097152
 
+# These tests spawn new processes and files, which fails when run with very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestObjectSerializationDirect:
     """
diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py
index 609fadbcf3e..08d9bd79d92 100644
--- a/cuda_core/tests/memory_ipc/test_workerpool.py
+++ b/cuda_core/tests/memory_ipc/test_workerpool.py
@@ -16,6 +16,9 @@
 NTASKS = 20
 POOL_SIZE = 2097152
 
+# These tests spawn new processes and files, which fails when run with very many threads
+pytestmark = pytest.mark.parallel_threads_limit(4)
+
 
 class TestIpcWorkerPool:
     """
diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py
index 4aa13840b48..c202407dc55 100644
--- a/cuda_core/tests/system/test_system_device.py
+++ b/cuda_core/tests/system/test_system_device.py
@@ -268,6 +268,7 @@ def test_unpack_bitmask_single_value():
         _device._unpack_bitmask(1)
 
 
+@pytest.mark.parallel_threads_limit(4)  # timeouts are slow
 @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Events not supported on WSL or Windows")
 def test_register_events():
     # This is not the world's greatest test.  All of the events are pretty
diff --git a/cuda_core/tests/test_graphics.py b/cuda_core/tests/test_graphics.py
index 6f5877f76b0..e2b22a20c59 100644
--- a/cuda_core/tests/test_graphics.py
+++ b/cuda_core/tests/test_graphics.py
@@ -20,6 +20,9 @@
 )
 from cuda.core.utils import StridedMemoryView
 
+# TODO(seberg): Maybe some of these tests can be made threadable?
+pytestmark = pytest.mark.thread_unsafe(reason="OpenGL context not threadable")
+
 # ---------------------------------------------------------------------------
 # GL context + buffer helpers
 # ---------------------------------------------------------------------------
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 920cd4bb0fd..35592485c94 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1323,6 +1323,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda):
         )
 
 
+@pytest.mark.thread_unsafe(reason="Uses mock.")
 def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda):
     """host_numa with None raises RuntimeError when NUMA ID cannot be determined."""
     from unittest.mock import MagicMock, patch
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index 71beb459143..68c32ce69c6 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -12,6 +12,8 @@
 
 NBYTES = 1024
 
+pytestmark = pytest.mark.thread_unsafe(reason="peer access tests mutate process-global CUDA memory-pool access state")
+
 
 def test_peer_access_basic(mempool_device_x2):
     """Basic tests for dmr.peer_accessible_by."""
diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
index 01a39e0032c..963ec1cc04b 100644
--- a/cuda_core/tests/test_program_cache.py
+++ b/cuda_core/tests/test_program_cache.py
@@ -1927,6 +1927,7 @@ def test_filestream_cache_tracker_reconciles_after_external_drift(tmp_path):
         assert cache._tracked_size_bytes <= 1100  # actual on-disk is 'b' + 'c' or just 'c'
 
 
+@pytest.mark.thread_unsafe(reason="already threaded and patches _file_stream")
 def test_filestream_cache_tracker_clamps_at_zero_under_delete_race(tmp_path):
     """Two-thread reproduction of the ``__delitem__`` vs
     ``_enforce_size_cap`` race. Thread A is mid-delete: it has stat'd the
diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml
index 9f0a955cc01..0905f0b84ad 100644
--- a/cuda_pathfinder/pyproject.toml
+++ b/cuda_pathfinder/pyproject.toml
@@ -102,6 +102,7 @@ git_describe_command = [ "git", "describe", "--dirty", "--tags", "--long", "--ma
 
 [tool.pytest.ini_options]
 addopts = "--showlocals"
+thread_unsafe_fixtures = ['mocker']
 
 [tool.mypy]
 # Try to keep the mypy configuration similar between the subprojects
diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
index ec9740cd853..0f9e5ed31c1 100644
--- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py
+++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py
@@ -173,6 +173,7 @@ def test_find_binary_cache_negative_result(monkeypatch, mocker):
 
 
 @pytest.mark.usefixtures("clear_find_binary_cache")
+@pytest.mark.thread_unsafe(reason="functools.cache may replace entry.")
 def test_caching_per_utility():
     """Verify that different utilities have independent cache entries."""
     nvdisasm1 = find_nvidia_binary_utility("nvdisasm")