diff --git a/cuda_bindings/tests/nvml/test_device.py b/cuda_bindings/tests/nvml/test_device.py index 4b5f6a1e160..e571de67a76 100644 --- a/cuda_bindings/tests/nvml/test_device.py +++ b/cuda_bindings/tests/nvml/test_device.py @@ -120,6 +120,7 @@ def test_read_prm_counters(all_devices): assert len(read_counters) == 5 +@pytest.mark.thread_unsafe(reason="API appears to be thread-unsafe (2026-06)") def test_read_write_prm(all_devices): for device in all_devices: # Docs say supported in BLACKWELL or later diff --git a/cuda_bindings/tests/nvml/test_init.py b/cuda_bindings/tests/nvml/test_init.py index 4c94dc26a3e..19e573c9cc6 100644 --- a/cuda_bindings/tests/nvml/test_init.py +++ b/cuda_bindings/tests/nvml/test_init.py @@ -42,6 +42,7 @@ def get_architecture_name(arch): @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows") +@pytest.mark.thread_unsafe(reason="nvml init affects other threads") def test_init_ref_count(): """ Verifies that we can call NVML shutdown and init(2) multiple times, and that ref counting works diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index 959534cbbdf..d8d4f26e9c0 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -456,6 +456,7 @@ def test_cuda_mem_range_attr(device): @pytest.mark.skipif(driverVersionLessThan(11040) or not supportsMemoryPool(), reason="Mempool for graphs not supported") +@pytest.mark.thread_unsafe(reason="used high memory can be higher if threaded.") def test_cuda_graphMem_attr(device): err, stream = cuda.cuStreamCreate(0) assert err == cuda.CUresult.CUDA_SUCCESS diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index 43fab4241db..c8d15677a54 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -100,6 +100,7 @@ def has_recent_memory_pool_support() -> bool: 
@pytest.mark.parametrize("example", sample_files) +@pytest.mark.parallel_threads_limit(8) def test_example(example): example_path = os.path.join(samples_path, example) has_package_requirements_or_skip(example_path) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index cc36575ff45..40cbcc2826b 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -17,6 +17,10 @@ POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + + def test_outer_timeout_marker_is_applied(request): """Verify that memory_ipc/conftest.py applies the outer pytest-timeout marker. diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index 48985e67b58..e3cefe6a211 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -16,6 +16,10 @@ NBYTES = 64 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + + @pytest.mark.skipif(Device().compute_capability.major < 7, reason="__nanosleep is only available starting Volta (sm70)") class TestEventIpc: """Check the basic usage of IPC-enabled events with a latch kernel.""" diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index 8d450fa8e3f..eaa6ddec92f 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -24,6 +24,9 @@ ENABLE_LOGGING = False # Set True for test debugging and development +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + def child_main(log, queue): log.prefix = " child: " diff --git a/cuda_core/tests/memory_ipc/test_leaks.py 
b/cuda_core/tests/memory_ipc/test_leaks.py index 6fc4d03f142..c6e44824137 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -23,6 +23,8 @@ not USING_FDS or not HAVE_PSUTIL, reason="mempool allocation handle is not using fds or psutil is unavailable" ) +pytestmark = pytest.mark.thread_unsafe(reason="Tests number of fds which is shared.") + @pytest.mark.flaky(reruns=2) @skip_if_unrunnable diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 0923fe28d8b..43d356789e7 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -14,6 +14,9 @@ NWORKERS = 2 NTASKS = 2 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + class TestIpcMempool: @pytest.mark.flaky(reruns=2) diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 9e9e2879ae7..efb67b4cdb8 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -14,6 +14,9 @@ NBYTES = 64 POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + class TestPeerAccessNotPreservedOnImport: """ diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index cc7f45d67c2..01c9496e773 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -16,6 +16,9 @@ NTASKS = 7 POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + class TestIpcSendBuffers: @pytest.mark.flaky(reruns=2) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py 
index 2f0e429b103..4289de4b5a9 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -15,6 +15,9 @@ NBYTES = 64 POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + class TestObjectSerializationDirect: """ diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 609fadbcf3e..08d9bd79d92 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -16,6 +16,9 @@ NTASKS = 20 POOL_SIZE = 2097152 +# these tests spawn new processes and files which fails for very many threads +pytestmark = pytest.mark.parallel_threads_limit(4) + class TestIpcWorkerPool: """ diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 4aa13840b48..c202407dc55 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -268,6 +268,7 @@ def test_unpack_bitmask_single_value(): _device._unpack_bitmask(1) +@pytest.mark.parallel_threads_limit(4) # timeouts are slow @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Events not supported on WSL or Windows") def test_register_events(): # This is not the world's greatest test. All of the events are pretty diff --git a/cuda_core/tests/test_graphics.py b/cuda_core/tests/test_graphics.py index 6f5877f76b0..e2b22a20c59 100644 --- a/cuda_core/tests/test_graphics.py +++ b/cuda_core/tests/test_graphics.py @@ -20,6 +20,9 @@ ) from cuda.core.utils import StridedMemoryView +# TODO(seberg): Maybe some of these tests can be made threadable? 
+pytestmark = pytest.mark.thread_unsafe(reason="OpenGL context not threadable") + # --------------------------------------------------------------------------- # GL context + buffer helpers # --------------------------------------------------------------------------- diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 920cd4bb0fd..35592485c94 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1323,6 +1323,7 @@ def test_managed_memory_resource_preferred_location_validation(init_cuda): ) +@pytest.mark.thread_unsafe(reason="Uses mock.") def test_managed_memory_resource_host_numa_auto_resolve_failure(init_cuda): """host_numa with None raises RuntimeError when NUMA ID cannot be determined.""" from unittest.mock import MagicMock, patch diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index 71beb459143..68c32ce69c6 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -12,6 +12,8 @@ NBYTES = 1024 +pytestmark = pytest.mark.thread_unsafe(reason="peer access tests mutate process-global CUDA memory-pool access state") + def test_peer_access_basic(mempool_device_x2): """Basic tests for dmr.peer_accessible_by.""" diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py index 01a39e0032c..963ec1cc04b 100644 --- a/cuda_core/tests/test_program_cache.py +++ b/cuda_core/tests/test_program_cache.py @@ -1927,6 +1927,7 @@ def test_filestream_cache_tracker_reconciles_after_external_drift(tmp_path): assert cache._tracked_size_bytes <= 1100 # actual on-disk is 'b' + 'c' or just 'c' +@pytest.mark.thread_unsafe(reason="already threaded and patches _file_stream") def test_filestream_cache_tracker_clamps_at_zero_under_delete_race(tmp_path): """Two-thread reproduction of the ``__delitem__`` vs ``_enforce_size_cap`` race. 
Thread A is mid-delete: it has stat'd the diff --git a/cuda_pathfinder/pyproject.toml b/cuda_pathfinder/pyproject.toml index 9f0a955cc01..0905f0b84ad 100644 --- a/cuda_pathfinder/pyproject.toml +++ b/cuda_pathfinder/pyproject.toml @@ -102,6 +102,7 @@ git_describe_command = [ "git", "describe", "--dirty", "--tags", "--long", "--ma [tool.pytest.ini_options] addopts = "--showlocals" +thread_unsafe_fixtures = ['mocker'] [tool.mypy] # Try to keep the mypy configuration similar between the subprojects diff --git a/cuda_pathfinder/tests/test_find_nvidia_binaries.py b/cuda_pathfinder/tests/test_find_nvidia_binaries.py index ec9740cd853..0f9e5ed31c1 100644 --- a/cuda_pathfinder/tests/test_find_nvidia_binaries.py +++ b/cuda_pathfinder/tests/test_find_nvidia_binaries.py @@ -173,6 +173,7 @@ def test_find_binary_cache_negative_result(monkeypatch, mocker): @pytest.mark.usefixtures("clear_find_binary_cache") +@pytest.mark.thread_unsafe(reason="functools.cache may replace entry.") def test_caching_per_utility(): """Verify that different utilities have independent cache entries.""" nvdisasm1 = find_nvidia_binary_utility("nvdisasm")