From 6df760815631962194ba188919f47b11f1168109 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 15 Jun 2026 12:34:49 +0200 Subject: [PATCH 1/2] Add pytest-run-parallel markers to pytest.ini --- pytest.ini | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index d1a82feb749..9c11c2b5f56 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 [pytest] @@ -21,3 +21,6 @@ markers = cython: cython tests smoke: meta-level smoke tests flaky: mark test as flaky (provided by pytest-rerunfailures) + # pytest-run-parallel related markers + thread_unsafe: mark test as thread unsafe (provided by pytest-run-parallel) + parallel_threads_limit: max number of threads (provided by pytest-run-parallel) From b9f4c54ffa4bf3b7efea15b3ddb9512b5b0a7856 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 15 Jun 2026 12:35:01 +0200 Subject: [PATCH 2/2] TST: Use tempdir for cufile tests (and mark some as thread unsafe) The cufile tests now create their test files in pytest's per-test `tmpdir` instead of the current working directory, which is necessary to run them in parallel in multiple threads. This also seems to fix the handle_register failures seen in CI, so the `xfail_handle_register` marker is removed. --- cuda_bindings/tests/test_cufile.py | 128 +++++++---------------------- 1 file changed, 31 insertions(+), 97 deletions(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 6e614ca1b05..3a4d9b1c0e3 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -1,8 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import ctypes -import errno import logging import os import pathlib @@ -118,12 +117,6 @@ def get_tegra_kind(): ), ] -xfail_handle_register = pytest.mark.xfail( - condition=isSupportedFilesystem() and os.environ.get("CI") is not None, - raises=cufile.cuFileError, - reason="handle_register call fails in CI for unknown reasons", -) - def test_cufile_success_defined(): """Check if CUFILE_SUCCESS is defined in OpError enum.""" @@ -204,11 +197,10 @@ def driver(ctx): @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("driver") -@xfail_handle_register -def test_handle_register(): +def test_handle_register(tmpdir): """Test file handle registration with cuFile.""" # Create test file - file_path = "test_handle_register.bin" + file_path = tmpdir / "test_handle_register.bin" # Create file with POSIX operations fd = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600) @@ -242,8 +234,6 @@ def test_handle_register(): finally: os.close(fd) - with suppress(OSError): - os.unlink(file_path) @pytest.mark.usefixtures("driver") @@ -397,11 +387,10 @@ def test_buf_register_already_registered(): @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("driver") -@xfail_handle_register -def test_cufile_read_write(): +def test_cufile_read_write(tmpdir): """Test cuFile read and write operations.""" # Create test file - file_path = "test_cufile_rw.bin" + file_path = tmpdir / "test_cufile_rw.bin" # Allocate CUDA memory for write and read write_size = 65536 # 64KB, aligned to 4096 bytes (65536 % 4096 == 0) @@ -478,21 +467,14 @@ def test_cufile_read_write(): # Free CUDA memory cuda.cuMemFree(write_buf) cuda.cuMemFree(read_buf) - # Clean up test file - try: - os.unlink(file_path) - except OSError as e: - if e.errno != errno.ENOENT: - raise @pytest.mark.skipif(not 
isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("driver") -@xfail_handle_register -def test_cufile_read_write_host_memory(): +def test_cufile_read_write_host_memory(tmpdir): """Test cuFile read and write operations using host memory.""" # Create test file - file_path = "test_cufile_rw_host.bin" + file_path = tmpdir / "test_cufile_rw_host.bin" # Allocate host memory for write and read write_size = 65536 # 64KB, aligned to 4096 bytes (65536 % 4096 == 0) @@ -565,21 +547,14 @@ def test_cufile_read_write_host_memory(): # Free host memory cuda.cuMemFreeHost(write_buf) cuda.cuMemFreeHost(read_buf) - # Clean up test file - try: - os.unlink(file_path) - except OSError as e: - if e.errno != errno.ENOENT: - raise @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("driver") -@xfail_handle_register -def test_cufile_read_write_large(): +def test_cufile_read_write_large(tmpdir): """Test cuFile read and write operations with large data.""" # Create test file - file_path = "test_cufile_rw_large.bin" + file_path = tmpdir / "test_cufile_rw_large.bin" # Allocate large CUDA memory (1MB, aligned to 4096 bytes) write_size = 1024 * 1024 # 1MB, aligned to 4096 bytes (1048576 % 4096 == 0) @@ -659,21 +634,14 @@ def test_cufile_read_write_large(): # Free CUDA memory cuda.cuMemFree(write_buf) cuda.cuMemFree(read_buf) - # Clean up test file - try: - os.unlink(file_path) - except OSError as e: - if e.errno != errno.ENOENT: - raise @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver") -@xfail_handle_register -def test_cufile_write_async(): +def test_cufile_write_async(tmpdir): """Test cuFile asynchronous write operations.""" # Create test file - file_path = "test_cufile_write_async.bin" + file_path = tmpdir / 
"test_cufile_write_async.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -741,17 +709,14 @@ def test_cufile_write_async(): finally: os.close(fd) - with suppress(OSError): - os.unlink(file_path) @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver") -@xfail_handle_register -def test_cufile_read_async(): +def test_cufile_read_async(tmpdir): """Test cuFile asynchronous read operations.""" # Create test file - file_path = "test_cufile_read_async.bin" + file_path = tmpdir / "test_cufile_read_async.bin" # First create and write test data without O_DIRECT fd_temp = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600) @@ -832,17 +797,14 @@ def test_cufile_read_async(): finally: os.close(fd) - with suppress(OSError): - os.unlink(file_path) @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@xfail_handle_register @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver") -def test_cufile_async_read_write(): +def test_cufile_async_read_write(tmpdir): """Test cuFile asynchronous read and write operations in sequence.""" # Create test file - file_path = "test_cufile_async_rw.bin" + file_path = tmpdir / "test_cufile_async_rw.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -946,17 +908,14 @@ def test_cufile_async_read_write(): finally: os.close(fd) - with suppress(OSError): - os.unlink(file_path) @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("driver") -@xfail_handle_register -def test_batch_io_basic(): +def test_batch_io_basic(tmpdir): """Test basic batch IO operations with multiple read/write operations.""" # Create test file - file_path = "test_batch_io.bin" + file_path = tmpdir / "test_batch_io.bin" # Allocate CUDA memory for multiple 
operations buf_size = 65536 # 64KB @@ -1145,21 +1104,14 @@ def test_batch_io_basic(): # Free CUDA memory for buf in buffers + read_buffers: cuda.cuMemFree(buf) - # Clean up test file - try: - os.unlink(file_path) - except OSError as e: - if e.errno != errno.ENOENT: - raise @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("driver") -@xfail_handle_register -def test_batch_io_cancel(): +def test_batch_io_cancel(tmpdir): """Test batch IO cancellation.""" # Create test file - file_path = "test_batch_cancel.bin" + file_path = tmpdir / "test_batch_cancel.bin" # Allocate CUDA memory buf_size = 4096 # 4KB, aligned to 4096 bytes @@ -1229,21 +1181,14 @@ def test_batch_io_cancel(): # Free CUDA memory for buf in buffers: cuda.cuMemFree(buf) - # Clean up test file - try: - os.unlink(file_path) - except OSError as e: - if e.errno != errno.ENOENT: - raise @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("driver") -@xfail_handle_register -def test_batch_io_large_operations(): +def test_batch_io_large_operations(tmpdir): """Test batch IO with large buffer operations.""" # Create test file - file_path = "test_batch_large.bin" + file_path = tmpdir / "test_batch_large.bin" # Allocate large CUDA memory (1MB, aligned to 4096 bytes) buf_size = 1024 * 1024 # 1MB, aligned to 4096 bytes @@ -1421,12 +1366,6 @@ def test_batch_io_large_operations(): # Free CUDA memory for buf in all_buffers: cuda.cuMemFree(buf) - # Clean up test file - try: - os.unlink(file_path) - except OSError as e: - if e.errno != errno.ENOENT: - raise @pytest.mark.skipif( @@ -1631,6 +1570,7 @@ def test_get_parameter_min_max_value(): cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later" ) @pytest.mark.usefixtures("stats") +@pytest.mark.thread_unsafe(reason="not safe to stats_start() from multiple 
threads") def test_stats_start_stop(): """Test cuFile statistics collection stop.""" # Set statistics level first (required before starting stats) @@ -1647,11 +1587,11 @@ def test_stats_start_stop(): ) @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("stats") -@xfail_handle_register -def test_get_stats_l1(): +@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global") +def test_get_stats_l1(tmpdir): """Test cuFile L1 statistics retrieval with file operations.""" # Create test file directly with O_DIRECT - file_path = "test_stats_l1.bin" + file_path = tmpdir / "test_stats_l1.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -1718,8 +1658,6 @@ def test_get_stats_l1(): finally: os.close(fd) - with suppress(OSError): - os.unlink(file_path) @pytest.mark.skipif( @@ -1727,11 +1665,11 @@ def test_get_stats_l1(): ) @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("stats") -@xfail_handle_register -def test_get_stats_l2(): +@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global") +def test_get_stats_l2(tmpdir): """Test cuFile L2 statistics retrieval with file operations.""" # Create test file directly with O_DIRECT - file_path = "test_stats_l2.bin" + file_path = tmpdir / "test_stats_l2.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -1802,8 +1740,6 @@ def test_get_stats_l2(): finally: os.close(fd) - with suppress(OSError): - os.unlink(file_path) @pytest.mark.skipif( @@ -1811,11 +1747,11 @@ def test_get_stats_l2(): ) @pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") @pytest.mark.usefixtures("stats") -@xfail_handle_register -def test_get_stats_l3(): +@pytest.mark.thread_unsafe(reason="cuFile 
stats counters and collection state are process-global") +def test_get_stats_l3(tmpdir): """Test cuFile L3 statistics retrieval with file operations.""" # Create test file directly with O_DIRECT - file_path = "test_stats_l3.bin" + file_path = tmpdir / "test_stats_l3.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -1896,8 +1832,6 @@ def test_get_stats_l3(): finally: os.close(fd) - with suppress(OSError): - os.unlink(file_path) @pytest.mark.skipif(