Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 31 additions & 97 deletions cuda_bindings/tests/test_cufile.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import ctypes
import errno
import logging
import os
import pathlib
Expand Down Expand Up @@ -118,12 +117,6 @@ def get_tegra_kind():
),
]

xfail_handle_register = pytest.mark.xfail(
condition=isSupportedFilesystem() and os.environ.get("CI") is not None,
raises=cufile.cuFileError,
reason="handle_register call fails in CI for unknown reasons",
)
Comment on lines -121 to -125

@leofang leofang Jun 17, 2026

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for catching up slowly. IIRC this decorator was first added in #1271 (there was a long discussion that I can no longer decipher) and is still tracked in #1307. Not sure if this rings a bell for @rwgk? Is there any QA system that would require this decorator?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bot told me that if @sourabgupta3 or @rsarpangalav could confirm that the actual root cause of us having to add xfail_handle_register was the CWD being on a non-ext4 mount inside the container, then this PR would allow us to declare #1307 as resolved and close it.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for being impatient. This only affects os.environ.get("CI") is not None so I would think QA doesn't matter.

However... I glossed over isSupportedFilesystem() and that was silly. The temporary folder is not necessarily on the same file system after all. I'll follow up.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I opened gh-2233 but maybe with some input (or revisiting) we can also figure out which check is missing from the supported filesystem check that would have made it reliable on CI.



def test_cufile_success_defined():
"""Check if CUFILE_SUCCESS is defined in OpError enum."""
Expand Down Expand Up @@ -204,11 +197,10 @@ def driver(ctx):

@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("driver")
@xfail_handle_register
def test_handle_register():
def test_handle_register(tmpdir):
"""Test file handle registration with cuFile."""
# Create test file
file_path = "test_handle_register.bin"
file_path = tmpdir / "test_handle_register.bin"

# Create file with POSIX operations
fd = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600)
Expand Down Expand Up @@ -242,8 +234,6 @@ def test_handle_register():

finally:
os.close(fd)
with suppress(OSError):
os.unlink(file_path)


@pytest.mark.usefixtures("driver")
Expand Down Expand Up @@ -397,11 +387,10 @@ def test_buf_register_already_registered():

@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("driver")
@xfail_handle_register
def test_cufile_read_write():
def test_cufile_read_write(tmpdir):
"""Test cuFile read and write operations."""
# Create test file
file_path = "test_cufile_rw.bin"
file_path = tmpdir / "test_cufile_rw.bin"

# Allocate CUDA memory for write and read
write_size = 65536 # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
Expand Down Expand Up @@ -478,21 +467,14 @@ def test_cufile_read_write():
# Free CUDA memory
cuda.cuMemFree(write_buf)
cuda.cuMemFree(read_buf)
# Clean up test file
try:
os.unlink(file_path)
except OSError as e:
if e.errno != errno.ENOENT:
raise


@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("driver")
@xfail_handle_register
def test_cufile_read_write_host_memory():
def test_cufile_read_write_host_memory(tmpdir):
"""Test cuFile read and write operations using host memory."""
# Create test file
file_path = "test_cufile_rw_host.bin"
file_path = tmpdir / "test_cufile_rw_host.bin"

# Allocate host memory for write and read
write_size = 65536 # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
Expand Down Expand Up @@ -565,21 +547,14 @@ def test_cufile_read_write_host_memory():
# Free host memory
cuda.cuMemFreeHost(write_buf)
cuda.cuMemFreeHost(read_buf)
# Clean up test file
try:
os.unlink(file_path)
except OSError as e:
if e.errno != errno.ENOENT:
raise


@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("driver")
@xfail_handle_register
def test_cufile_read_write_large():
def test_cufile_read_write_large(tmpdir):
"""Test cuFile read and write operations with large data."""
# Create test file
file_path = "test_cufile_rw_large.bin"
file_path = tmpdir / "test_cufile_rw_large.bin"

# Allocate large CUDA memory (1MB, aligned to 4096 bytes)
write_size = 1024 * 1024 # 1MB, aligned to 4096 bytes (1048576 % 4096 == 0)
Expand Down Expand Up @@ -659,21 +634,14 @@ def test_cufile_read_write_large():
# Free CUDA memory
cuda.cuMemFree(write_buf)
cuda.cuMemFree(read_buf)
# Clean up test file
try:
os.unlink(file_path)
except OSError as e:
if e.errno != errno.ENOENT:
raise


@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
@xfail_handle_register
def test_cufile_write_async():
def test_cufile_write_async(tmpdir):
"""Test cuFile asynchronous write operations."""
# Create test file
file_path = "test_cufile_write_async.bin"
file_path = tmpdir / "test_cufile_write_async.bin"
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)

try:
Expand Down Expand Up @@ -741,17 +709,14 @@ def test_cufile_write_async():

finally:
os.close(fd)
with suppress(OSError):
os.unlink(file_path)


@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
@xfail_handle_register
def test_cufile_read_async():
def test_cufile_read_async(tmpdir):
"""Test cuFile asynchronous read operations."""
# Create test file
file_path = "test_cufile_read_async.bin"
file_path = tmpdir / "test_cufile_read_async.bin"

# First create and write test data without O_DIRECT
fd_temp = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600)
Expand Down Expand Up @@ -832,17 +797,14 @@ def test_cufile_read_async():

finally:
os.close(fd)
with suppress(OSError):
os.unlink(file_path)


@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@xfail_handle_register
@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
def test_cufile_async_read_write():
def test_cufile_async_read_write(tmpdir):
"""Test cuFile asynchronous read and write operations in sequence."""
# Create test file
file_path = "test_cufile_async_rw.bin"
file_path = tmpdir / "test_cufile_async_rw.bin"
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)

try:
Expand Down Expand Up @@ -946,17 +908,14 @@ def test_cufile_async_read_write():

finally:
os.close(fd)
with suppress(OSError):
os.unlink(file_path)


@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("driver")
@xfail_handle_register
def test_batch_io_basic():
def test_batch_io_basic(tmpdir):
"""Test basic batch IO operations with multiple read/write operations."""
# Create test file
file_path = "test_batch_io.bin"
file_path = tmpdir / "test_batch_io.bin"

# Allocate CUDA memory for multiple operations
buf_size = 65536 # 64KB
Expand Down Expand Up @@ -1145,21 +1104,14 @@ def test_batch_io_basic():
# Free CUDA memory
for buf in buffers + read_buffers:
cuda.cuMemFree(buf)
# Clean up test file
try:
os.unlink(file_path)
except OSError as e:
if e.errno != errno.ENOENT:
raise


@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("driver")
@xfail_handle_register
def test_batch_io_cancel():
def test_batch_io_cancel(tmpdir):
"""Test batch IO cancellation."""
# Create test file
file_path = "test_batch_cancel.bin"
file_path = tmpdir / "test_batch_cancel.bin"

# Allocate CUDA memory
buf_size = 4096 # 4KB, aligned to 4096 bytes
Expand Down Expand Up @@ -1229,21 +1181,14 @@ def test_batch_io_cancel():
# Free CUDA memory
for buf in buffers:
cuda.cuMemFree(buf)
# Clean up test file
try:
os.unlink(file_path)
except OSError as e:
if e.errno != errno.ENOENT:
raise


@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("driver")
@xfail_handle_register
def test_batch_io_large_operations():
def test_batch_io_large_operations(tmpdir):
"""Test batch IO with large buffer operations."""
# Create test file
file_path = "test_batch_large.bin"
file_path = tmpdir / "test_batch_large.bin"

# Allocate large CUDA memory (1MB, aligned to 4096 bytes)
buf_size = 1024 * 1024 # 1MB, aligned to 4096 bytes
Expand Down Expand Up @@ -1421,12 +1366,6 @@ def test_batch_io_large_operations():
# Free CUDA memory
for buf in all_buffers:
cuda.cuMemFree(buf)
# Clean up test file
try:
os.unlink(file_path)
except OSError as e:
if e.errno != errno.ENOENT:
raise


@pytest.mark.skipif(
Expand Down Expand Up @@ -1631,6 +1570,7 @@ def test_get_parameter_min_max_value():
cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later"
)
@pytest.mark.usefixtures("stats")
@pytest.mark.thread_unsafe(reason="not safe to stats_start() from multiple threads")
def test_stats_start_stop():
"""Test cuFile statistics collection stop."""
# Set statistics level first (required before starting stats)
Expand All @@ -1647,11 +1587,11 @@ def test_stats_start_stop():
)
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("stats")
@xfail_handle_register
def test_get_stats_l1():
@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
def test_get_stats_l1(tmpdir):
"""Test cuFile L1 statistics retrieval with file operations."""
# Create test file directly with O_DIRECT
file_path = "test_stats_l1.bin"
file_path = tmpdir / "test_stats_l1.bin"
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)

try:
Expand Down Expand Up @@ -1718,20 +1658,18 @@ def test_get_stats_l1():

finally:
os.close(fd)
with suppress(OSError):
os.unlink(file_path)


@pytest.mark.skipif(
cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later"
)
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("stats")
@xfail_handle_register
def test_get_stats_l2():
@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
def test_get_stats_l2(tmpdir):
"""Test cuFile L2 statistics retrieval with file operations."""
# Create test file directly with O_DIRECT
file_path = "test_stats_l2.bin"
file_path = tmpdir / "test_stats_l2.bin"
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)

try:
Expand Down Expand Up @@ -1802,20 +1740,18 @@ def test_get_stats_l2():

finally:
os.close(fd)
with suppress(OSError):
os.unlink(file_path)


@pytest.mark.skipif(
cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later"
)
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
@pytest.mark.usefixtures("stats")
@xfail_handle_register
def test_get_stats_l3():
@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
def test_get_stats_l3(tmpdir):
"""Test cuFile L3 statistics retrieval with file operations."""
# Create test file directly with O_DIRECT
file_path = "test_stats_l3.bin"
file_path = tmpdir / "test_stats_l3.bin"
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)

try:
Expand Down Expand Up @@ -1896,8 +1832,6 @@ def test_get_stats_l3():

finally:
os.close(fd)
with suppress(OSError):
os.unlink(file_path)


@pytest.mark.skipif(
Expand Down
5 changes: 4 additions & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

[pytest]
Expand All @@ -21,3 +21,6 @@ markers =
cython: cython tests
smoke: meta-level smoke tests
flaky: mark test as flaky (provided by pytest-rerunfailures)
# pytest-run-parallel related markers
thread_unsafe: mark test as thread unsafe (provided by pytest-run-parallel)
parallel_threads_limit: max number of threads (provided by pytest-run-parallel)
Loading