diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 4a089cefb2..0574094683 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -261,16 +261,6 @@ jobs: LOCAL_CTK: ${{ matrix.LOCAL_CTK }} run: run-tests bindings - - name: Run cuda.bindings examples - if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }} - env: - CUDA_VER: ${{ matrix.CUDA_VER }} - LOCAL_CTK: ${{ matrix.LOCAL_CTK }} - run: | - pushd cuda_bindings - ${SANITIZER_CMD} pytest -ra -s -vv examples/ - popd - - name: Run cuda.core tests env: CUDA_VER: ${{ matrix.CUDA_VER }} diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index fbe8bad1a5..5cfee3b892 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -245,17 +245,6 @@ jobs: shell: bash --noprofile --norc -xeuo pipefail {0} run: run-tests bindings - - name: Run cuda.bindings examples - if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }} - env: - CUDA_VER: ${{ matrix.CUDA_VER }} - LOCAL_CTK: ${{ matrix.LOCAL_CTK }} - shell: bash --noprofile --norc -xeuo pipefail {0} - run: | - pushd cuda_bindings - ${SANITIZER_CMD} pytest -ra -s -vv examples/ - popd - - name: Run cuda.core tests env: CUDA_VER: ${{ matrix.CUDA_VER }} diff --git a/cuda_bindings/cuda/bindings/_example_helpers/__init__.py b/cuda_bindings/cuda/bindings/_example_helpers/__init__.py new file mode 100644 index 0000000000..fa061cc346 --- /dev/null +++ b/cuda_bindings/cuda/bindings/_example_helpers/__init__.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from .common import KernelHelper, check_compute_capability_too_low, requirement_not_met +from .helper_cuda import check_cuda_errors, find_cuda_device, find_cuda_device_drv +from .helper_string import check_cmd_line_flag, get_cmd_line_argument_int + +__all__ = [ + "KernelHelper", + "check_cmd_line_flag", + "check_compute_capability_too_low", + "check_cuda_errors", + "find_cuda_device", + "find_cuda_device_drv", + "get_cmd_line_argument_int", + "requirement_not_met", +] diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/cuda/bindings/_example_helpers/common.py similarity index 77% rename from cuda_bindings/examples/common/common.py rename to cuda_bindings/cuda/bindings/_example_helpers/common.py index 5b5151ef24..15317ace29 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/cuda/bindings/_example_helpers/common.py @@ -1,19 +1,27 @@ -# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +import os +import sys + import numpy as np -from common.helper_cuda import check_cuda_errors from cuda import pathfinder from cuda.bindings import driver as cuda from cuda.bindings import nvrtc from cuda.bindings import runtime as cudart +from .helper_cuda import check_cuda_errors + + +def requirement_not_met(message): + print(message, file=sys.stderr) # noqa: T201 + exitcode = os.environ.get("CUDA_BINDINGS_SKIP_EXAMPLE", "1") + return sys.exit(int(exitcode)) -def pytest_skipif_compute_capability_too_low(dev_id, required_cc_major_minor): - import pytest +def check_compute_capability_too_low(dev_id, required_cc_major_minor): cc_major = check_cuda_errors( cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) @@ -22,7 +30,9 @@ def pytest_skipif_compute_capability_too_low(dev_id, required_cc_major_minor): ) have_cc_major_minor = (cc_major, cc_minor) if have_cc_major_minor < required_cc_major_minor: - pytest.skip(f"cudaDevAttrComputeCapability too low: {have_cc_major_minor=!r}, {required_cc_major_minor=!r}") + requirement_not_met( + f"CUDA device compute capability too low: {have_cc_major_minor=!r}, {required_cc_major_minor=!r}" + ) class KernelHelper: @@ -31,9 +41,7 @@ def __init__(self, code, dev_id): for libname in ("cudart", "cccl"): hdr_dir = pathfinder.find_nvidia_header_directory(libname) if hdr_dir is None: - import pytest - - pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None') + requirement_not_met(f'pathfinder.find_nvidia_header_directory("{libname}") returned None') include_dirs.append(hdr_dir) prog = check_cuda_errors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) @@ -69,8 +77,8 @@ def __init__(self, code, dev_id): check_cuda_errors(nvrtc.nvrtcGetProgramLog(prog, log)) import sys - print(log.decode(), file=sys.stderr) - print(err, file=sys.stderr) + print(log.decode(), file=sys.stderr) # noqa: 
T201 + print(err, file=sys.stderr) # noqa: T201 sys.exit(1) if use_cubin: diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/cuda/bindings/_example_helpers/helper_cuda.py similarity index 88% rename from cuda_bindings/examples/common/helper_cuda.py rename to cuda_bindings/cuda/bindings/_example_helpers/helper_cuda.py index 9fbfe8c82f..0e56fa8fd1 100644 --- a/cuda_bindings/examples/common/helper_cuda.py +++ b/cuda_bindings/cuda/bindings/_example_helpers/helper_cuda.py @@ -1,12 +1,12 @@ -# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int - from cuda.bindings import driver as cuda from cuda.bindings import nvrtc from cuda.bindings import runtime as cudart +from .helper_string import check_cmd_line_flag, get_cmd_line_argument_int + def _cuda_get_error_enum(error): if isinstance(error, cuda.CUresult): diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/cuda/bindings/_example_helpers/helper_string.py similarity index 78% rename from cuda_bindings/examples/common/helper_string.py rename to cuda_bindings/cuda/bindings/_example_helpers/helper_string.py index 47d9d36569..1540db447a 100644 --- a/cuda_bindings/examples/common/helper_string.py +++ b/cuda_bindings/cuda/bindings/_example_helpers/helper_string.py @@ -1,4 +1,4 @@ -# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import sys diff --git a/cuda_bindings/cuda/bindings/_test_helpers/pep723.py b/cuda_bindings/cuda/bindings/_test_helpers/pep723.py new file mode 100644 index 0000000000..e1f6f920b7 --- /dev/null +++ b/cuda_bindings/cuda/bindings/_test_helpers/pep723.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + + +import importlib.metadata +import os +import re + +import pytest + + +def has_package_requirements_or_skip(example): + example_name = os.path.basename(example) + + with open(example, encoding="utf-8") as f: + content = f.read() + + # The canonical regex as defined in PEP 723 + pep723 = re.search(r"(?m)^# /// (?P<type>[a-zA-Z0-9-]+)$\s(?P<content>(^#(| .*)$\s)+)^# ///$", content) + if not pep723: + raise ValueError(f"PEP 723 metadata not found in {example_name}") + + metadata = {} + for line in pep723.group("content").splitlines(): + line = line.lstrip("# ").rstrip() + if not line: + continue + key, value = line.split("=", 1) + key = key.strip() + value = value.strip() + metadata[key] = value + + if "dependencies" not in metadata: + raise ValueError(f"PEP 723 dependencies not found in {example_name}") + + missing_dependencies = [] + dependencies = eval(metadata["dependencies"]) # noqa: S307 + for dependency in dependencies: + name = re.match("[a-zA-Z0-9_-]+", dependency) + try: + importlib.metadata.distribution(name.group(0)) + except importlib.metadata.PackageNotFoundError: + missing_dependencies.append(name.string) + + if missing_dependencies: + pytest.skip(f"Skipping {example} due to missing package requirement: {', '.join(missing_dependencies)}") diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index 540c9b4c11..26f02eba30 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ 
b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -8,13 +8,16 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import platform import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device, requirement_not_met clock_nvrtc = """\ extern "C" __global__ void timedReduction(const float *hinput, float *output, clock_t *timer) @@ -65,11 +68,13 @@ def elems_to_bytes(nelems, dt): return nelems * np.dtype(dt).itemsize -def main(): - import pytest - +def check_requirements(): if platform.machine() == "armv7l": - pytest.skip("clock_nvrtc is not supported on ARMv7") + requirement_not_met("clock_nvrtc is not supported on ARMv7") + + +def main(): + check_requirements() timer = np.empty(num_blocks * 2, dtype="int64") hinput = np.empty(num_threads * 2, dtype="float32") @@ -78,7 +83,7 @@ def main(): hinput[i] = i dev_id = find_cuda_device() - kernel_helper = common.KernelHelper(clock_nvrtc, dev_id) + kernel_helper = KernelHelper(clock_nvrtc, dev_id) kernel_addr = kernel_helper.get_function(b"timedReduction") dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes)) diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index c92d33e975..da8227a6c3 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -7,16 +7,21 @@ # # ################################################################################ + +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + + import ctypes import sys import time import numpy as np -from common import common -from 
common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device, requirement_not_met simple_cubemap_texture = """\ extern "C" @@ -97,9 +102,7 @@ def main(): f"CUDA device [{device_props.name}] has {device_props.multiProcessorCount} Multi-Processors SM {device_props.major}.{device_props.minor}" ) if device_props.major < 2: - import pytest - - pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.") + requirement_not_met("Test requires SM 2.0 or higher for support of Texture Arrays.") # Generate input data for layered texture width = 64 @@ -162,7 +165,7 @@ def main(): f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dim_grid.x} x {dim_grid.y}, each block has 8 x 8 threads" ) - kernel_helper = common.KernelHelper(simple_cubemap_texture, dev_id) + kernel_helper = KernelHelper(simple_cubemap_texture, dev_id) _transform_kernel = kernel_helper.get_function(b"transformKernel") kernel_args = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None)) check_cuda_errors( diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index 637c31cf0e..f1548adc25 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -8,16 +8,19 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import platform import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, requirement_not_met simplep2p = """\ 
extern "C" @@ -32,19 +35,17 @@ def main(): - import pytest - if platform.system() == "Darwin": - pytest.skip("simpleP2P is not supported on Mac OSX") + requirement_not_met("simpleP2P is not supported on Mac OSX") if platform.machine() == "armv7l": - pytest.skip("simpleP2P is not supported on ARMv7") + requirement_not_met("simpleP2P is not supported on ARMv7") if platform.machine() == "aarch64": - pytest.skip("simpleP2P is not supported on aarch64") + requirement_not_met("simpleP2P is not supported on aarch64") if platform.machine() == "sbsa": - pytest.skip("simpleP2P is not supported on sbsa") + requirement_not_met("simpleP2P is not supported on sbsa") # Number of GPUs print("Checking for multiple GPUs...") @@ -52,7 +53,7 @@ def main(): print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: - pytest.skip("Two or more GPUs with Peer-to-Peer access capability are required") + requirement_not_met("Two or more GPUs with Peer-to-Peer access capability are required") prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access @@ -83,7 +84,7 @@ def main(): break if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1: - pytest.skip("Peer to Peer access is not available amongst GPUs in the system") + requirement_not_met("Peer to Peer access is not available amongst GPUs in the system") # Use first pair of p2p capable GPUs detected gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]] @@ -158,7 +159,7 @@ def main(): _simple_kernel = [None] * 2 kernel_args = [None] * 2 - kernel_helper[1] = common.KernelHelper(simplep2p, gpuid[1]) + kernel_helper[1] = KernelHelper(simplep2p, gpuid[1]) _simple_kernel[1] = kernel_helper[1].get_function(b"SimpleKernel") kernel_args[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p)) check_cuda_errors( @@ -183,7 +184,7 @@ def main(): # output to the GPU 0 buffer print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...") 
check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) - kernel_helper[0] = common.KernelHelper(simplep2p, gpuid[0]) + kernel_helper[0] = KernelHelper(simplep2p, gpuid[0]) _simple_kernel[0] = kernel_helper[0].get_function(b"SimpleKernel") kernel_args[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p)) check_cuda_errors( diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index e4dc439b9b..ff47696cb6 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform @@ -15,12 +19,16 @@ import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors -from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import ( + KernelHelper, + check_cmd_line_flag, + check_cuda_errors, + get_cmd_line_argument_int, + requirement_not_met, +) simple_zero_copy = """\ extern "C" @@ -40,19 +48,17 @@ def main(): idev = 0 b_pin_generic_memory = False - import pytest - if platform.system() == "Darwin": - pytest.skip("simpleZeroCopy is not supported on Mac OSX") + requirement_not_met("simpleZeroCopy is not supported on Mac OSX") if platform.machine() == "armv7l": - pytest.skip("simpleZeroCopy is not supported on ARMv7") + requirement_not_met("simpleZeroCopy is not supported on ARMv7") if platform.machine() == "aarch64": - pytest.skip("simpleZeroCopy is not supported on aarch64") + requirement_not_met("simpleZeroCopy is not supported on aarch64") if platform.machine() == "sbsa": - pytest.skip("simpleZeroCopy is not supported on sbsa") + 
requirement_not_met("simpleZeroCopy is not supported on sbsa") if check_cmd_line_flag("help"): print("Usage: simpleZeroCopy [OPTION]\n", file=sys.stderr) @@ -84,7 +90,7 @@ def main(): device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(idev)) if not device_prop.canMapHostMemory: - pytest.skip(f"Device {idev} does not support mapping CPU host memory!") + requirement_not_met(f"Device {idev} does not support mapping CPU host memory!") check_cuda_errors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) @@ -131,7 +137,7 @@ def main(): grid.x = math.ceil(nelem / float(block.x)) grid.y = 1 grid.z = 1 - kernel_helper = common.KernelHelper(simple_zero_copy, idev) + kernel_helper = KernelHelper(simple_zero_copy, idev) _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") kernel_args = ( (d_a, d_b, d_c, nelem), diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index ed4a13e686..0d7a6341a5 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -7,16 +7,19 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import os import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device, requirement_not_met system_wide_atomics = """\ #define LOOP_NUM 50 @@ -172,26 +175,24 @@ def verify(test_data, length): def main(): - import pytest - if os.name == "nt": - pytest.skip("Atomics not supported on Windows") + requirement_not_met("Atomics not supported on Windows") # set device dev_id = find_cuda_device() device_prop = 
check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) if not device_prop.managedMemory: - pytest.skip("Unified Memory not supported on this device") + requirement_not_met("Unified Memory not supported on this device") compute_mode = check_cuda_errors( cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id) ) if compute_mode == cudart.cudaComputeMode.cudaComputeModeProhibited: - pytest.skip("This sample requires a device in either default or process exclusive mode") + requirement_not_met("This sample requires a device in either default or process exclusive mode") if device_prop.major < 6: - pytest.skip("Requires a minimum CUDA compute 6.0 capability") + requirement_not_met("Requires a minimum CUDA compute 6.0 capability") num_threads = 256 num_blocks = 64 @@ -214,7 +215,7 @@ def main(): # To make the AND and XOR tests generate something other than 0... atom_arr_h[7] = atom_arr_h[9] = 0xFF - kernel_helper = common.KernelHelper(system_wide_atomics, dev_id) + kernel_helper = KernelHelper(system_wide_atomics, dev_id) _atomic_kernel = kernel_helper.get_function(b"atomicKernel") kernel_args = ((atom_arr,), (ctypes.c_void_p,)) check_cuda_errors( diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 0a29b8c0ca..a6f65b9c81 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -8,15 +8,18 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from cuda.bindings import driver as cuda +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device_drv, requirement_not_met vector_add_drv = 
"""\ /* Vector addition: C = A + B. @@ -52,11 +55,9 @@ def main(): cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cu_device) ) if not uva_supported: - import pytest - - pytest.skip("Accessing pageable memory directly requires UVA") + requirement_not_met("Accessing pageable memory directly requires UVA") - kernel_helper = common.KernelHelper(vector_add_drv, int(cu_device)) + kernel_helper = KernelHelper(vector_add_drv, int(cu_device)) _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") # Allocate input vectors h_A and h_B in host memory @@ -73,31 +74,28 @@ def main(): check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, nbytes)) check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, nbytes)) - if True: - # Grid/Block configuration - threads_per_block = 256 - blocks_per_grid = (n + threads_per_block - 1) / threads_per_block - - kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) - - # Launch the CUDA kernel - check_cuda_errors( - cuda.cuLaunchKernel( - _vec_add_kernel, - blocks_per_grid, - 1, - 1, - threads_per_block, - 1, - 1, - 0, - 0, - kernel_args, - 0, - ) + # Grid/Block configuration + threads_per_block = 256 + blocks_per_grid = (n + threads_per_block - 1) / threads_per_block + + kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) + + # Launch the CUDA kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _vec_add_kernel, + blocks_per_grid, + 1, + 1, + threads_per_block, + 1, + 1, + 0, + 0, + kernel_args, + 0, ) - else: - pass + ) # Copy result from device memory to host memory # h_C contains the result in host memory diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index 55178f1abd..f1e9617166 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -8,16 +8,19 @@ # # 
################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from cuda.bindings import driver as cuda +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device_drv, requirement_not_met vector_add_mmap = """\ /* Vector addition: C = A + B. @@ -197,19 +200,17 @@ def simple_free_multi_device_mmap(dptr, size): def main(): - import pytest - if platform.system() == "Darwin": - pytest.skip("vectorAddMMAP is not supported on Mac OSX") + requirement_not_met("vectorAddMMAP is not supported on Mac OSX") if platform.machine() == "armv7l": - pytest.skip("vectorAddMMAP is not supported on ARMv7") + requirement_not_met("vectorAddMMAP is not supported on ARMv7") if platform.machine() == "aarch64": - pytest.skip("vectorAddMMAP is not supported on aarch64") + requirement_not_met("vectorAddMMAP is not supported on aarch64") if platform.machine() == "sbsa": - pytest.skip("vectorAddMMAP is not supported on sbsa") + requirement_not_met("vectorAddMMAP is not supported on sbsa") n = 50000 size = n * np.dtype(np.float32).itemsize @@ -228,7 +229,7 @@ def main(): ) print(f"Device {cu_device} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attribute_val}.") if not attribute_val: - pytest.skip(f"Device {cu_device} doesn't support VIRTUAL ADDRESS MANAGEMENT.") + requirement_not_met(f"Device {cu_device} doesn't support VIRTUAL ADDRESS MANAGEMENT.") # The vector addition happens on cuDevice, so the allocations need to be mapped there. 
mapping_devices = [cu_device] @@ -239,7 +240,7 @@ def main(): # Create context cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device)) - kernel_helper = common.KernelHelper(vector_add_mmap, int(cu_device)) + kernel_helper = KernelHelper(vector_add_mmap, int(cu_device)) _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") # Allocate input vectors h_A and h_B in host memory diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index 407079ad43..d9094a8a70 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform @@ -15,12 +19,16 @@ import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device -from common.helper_string import check_cmd_line_flag from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import ( + KernelHelper, + check_cmd_line_flag, + check_cuda_errors, + find_cuda_device, + requirement_not_met, +) stream_ordered_allocation = """\ /* Add two vectors on the GPU */ @@ -205,10 +213,8 @@ def stream_ordered_allocation_post_sync(dev, nelem, a, b, c): def main(): - import pytest - if platform.system() == "Darwin": - pytest.skip("streamOrderedAllocation is not supported on Mac OSX") + requirement_not_met("streamOrderedAllocation is not supported on Mac OSX") cuda.cuInit(0) if check_cmd_line_flag("help"): @@ -227,10 +233,10 @@ def main(): cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev) ) if not 
is_mem_pool_supported: - pytest.skip("Waiving execution as device does not support Memory Pools") + requirement_not_met("Waiving execution as device does not support Memory Pools") global _vector_add_gpu - kernel_helper = common.KernelHelper(stream_ordered_allocation, dev) + kernel_helper = KernelHelper(stream_ordered_allocation, dev) _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") # Allocate CPU memory diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index 00ed5cdfd4..18f5c88e9d 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform @@ -15,12 +19,18 @@ from enum import Enum import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device -from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import ( + KernelHelper, + check_cmd_line_flag, + check_compute_capability_too_low, + check_cuda_errors, + find_cuda_device, + get_cmd_line_argument_int, + requirement_not_met, +) block_size = 16 @@ -1130,16 +1140,14 @@ def matrix_multiply(dims_a, dims_b, kernel_number): def main(): - import pytest - - common.pytest_skipif_compute_capability_too_low(find_cuda_device(), (7, 0)) + check_compute_capability_too_low(find_cuda_device(), (7, 0)) if platform.machine() == "qnx": - pytest.skip("globalToShmemAsyncCopy is not supported on QNX") + requirement_not_met("globalToShmemAsyncCopy is not supported on QNX") version = 
check_cuda_errors(cuda.cuDriverGetVersion()) if version < 11010: - pytest.skip("CUDA Toolkit 11.1 or greater is required") + requirement_not_met("CUDA Toolkit 11.1 or greater is required") if check_cmd_line_flag("help") or check_cmd_line_flag("?"): print("Usage device=n (n >= 0 for deviceID)", file=sys.stderr) @@ -1207,7 +1215,7 @@ def main(): cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) if major < 7: - pytest.skip("globalToShmemAsyncCopy requires SM 7.0 or higher.") + requirement_not_met("globalToShmemAsyncCopy requires SM 7.0 or higher.") print(f"MatrixA({dims_a.x},{dims_a.y}), MatrixB({dims_b.x},{dims_b.y})") @@ -1219,7 +1227,7 @@ def main(): global _MatrixMulAsyncCopySingleStage global _MatrixMulNaive global _MatrixMulNaiveLargeChunk - kernel_helper = common.KernelHelper(global_to_shmem_async_copy, dev_id) + kernel_helper = KernelHelper(global_to_shmem_async_copy, dev_id) _MatrixMulAsyncCopyMultiStageLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageLargeChunk") _MatrixMulAsyncCopyLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunk") _MatrixMulAsyncCopyLargeChunkAWBarrier = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunkAWBarrier") diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py index 9fff51767e..bb749065f2 100644 --- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py @@ -8,15 +8,22 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import random as rnd import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from 
cuda.bindings._example_helpers import ( + KernelHelper, + check_cuda_errors, + find_cuda_device, +) THREADS_PER_BLOCK = 512 GRAPH_LAUNCH_ITERATIONS = 3 @@ -378,7 +385,7 @@ def main(): global _reduce global _reduceFinal - kernel_helper = common.KernelHelper(simple_cuda_graphs, dev_id) + kernel_helper = KernelHelper(simple_cuda_graphs, dev_id) _reduce = kernel_helper.get_function(b"reduce") _reduceFinal = kernel_helper.get_function(b"reduceFinal") diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index a2d4cdca40..44ff57c6d7 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform @@ -15,11 +19,14 @@ from random import random import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import ( + KernelHelper, + check_cuda_errors, + find_cuda_device, +) conjugate_gradient_multi_block_cg = """\ #line __LINE__ @@ -238,7 +245,7 @@ def main(): ) # Get kernel - kernel_helper = common.KernelHelper(conjugate_gradient_multi_block_cg, dev_id) + kernel_helper = KernelHelper(conjugate_gradient_multi_block_cg, dev_id) _gpu_conjugate_gradient = kernel_helper.get_function(b"gpuConjugateGradient") # Generate a random tridiagonal symmetric matrix in CSR format diff --git a/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py b/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py index 8290e491c6..ca229e5268 100644 --- 
a/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py +++ b/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py @@ -11,6 +11,10 @@ # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1"] +# /// + import sys from cuda.bindings import nvml diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index d5c48025d1..2bb4768a3c 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -8,14 +8,17 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy", "matplotlib"] +# /// + import time import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, requirement_not_met iso_propagator = """\ extern "C" @@ -222,7 +225,7 @@ def __init__(self, cntx): check_cuda_errors(cuda.cuCtxSetCurrent(cntx)) dev = check_cuda_errors(cuda.cuCtxGetDevice()) - self.kernel_helper = common.KernelHelper(iso_propagator, int(dev)) + self.kernel_helper = KernelHelper(iso_propagator, int(dev)) # kernel to create a source fnction with some max frequency self.creatSource = self.kernel_helper.get_function(b"createSource") @@ -627,8 +630,7 @@ def main(): print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: - print("Two or more GPUs with Peer-to-Peer access capability are required") - return + requirement_not_met("Two or more GPUs with Peer-to-Peer access capability are required") prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access @@ -659,9 +661,7 @@ def main(): break if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1: - 
print("Two or more GPUs with Peer-to-Peer access capability are required.") - print("Peer to Peer access is not available amongst GPUs in the system, waiving test.") - return + requirement_not_met("Two or more GPUs with Peer-to-Peer access capability are required") # Use first pair of p2p capable GPUs detected gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]] diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py index 892776dfd9..ec471ef9b3 100644 --- a/cuda_bindings/examples/extra/jit_program_test.py +++ b/cuda_bindings/examples/extra/jit_program_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import numpy as np diff --git a/cuda_bindings/examples/extra/numba_emm_plugin.py b/cuda_bindings/examples/extra/numba_emm_plugin.py index dcbf541321..cd8fcfcc55 100644 --- a/cuda_bindings/examples/extra/numba_emm_plugin.py +++ b/cuda_bindings/examples/extra/numba_emm_plugin.py @@ -1,6 +1,10 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numba-cuda"] +# /// + """Numba EMM Plugin using the CUDA Python Driver API. This example provides an External Memory Management (EMM) Plugin for Numba (see diff --git a/cuda_bindings/examples/pytest.ini b/cuda_bindings/examples/pytest.ini deleted file mode 100644 index e105585d5a..0000000000 --- a/cuda_bindings/examples/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -[pytest] -python_files = *_test.py -python_functions = main -pythonpath = . 
diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index 96cfb4dd07..f4866fc4f8 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -43,6 +43,7 @@ all = [ test = [ "cython>=3.2,<3.3", "setuptools>=77.0.0", + "matplotlib>=3.5.0", # Required by isoFDModelling_test.py "numpy>=1.21.1", "pytest>=6.2.4", "pytest-benchmark>=3.4.1", diff --git a/cuda_bindings/tests/test_examples.py b/cuda_bindings/tests/test_examples.py new file mode 100644 index 0000000000..171d7d37f2 --- /dev/null +++ b/cuda_bindings/tests/test_examples.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import glob +import os +import subprocess +import sys + +import pytest + +from cuda.bindings._test_helpers.pep723 import has_package_requirements_or_skip + +examples_path = os.path.join(os.path.dirname(__file__), "..", "examples") +examples_files = glob.glob(os.path.join(examples_path, "**/*.py"), recursive=True) + + +BROKEN_EXAMPLES = {"numba_emm_plugin.py"} + + +@pytest.mark.parametrize("example", examples_files) +def test_example(example): + if os.path.basename(example) in BROKEN_EXAMPLES: + pytest.skip(f"Skipping broken example: {example}") + + has_package_requirements_or_skip(example) + + env = os.environ.copy() + env["CUDA_BINDINGS_SKIP_EXAMPLE"] = "100" + + process = subprocess.run([sys.executable, example], capture_output=True, env=env) # noqa: S603 + # returncode is a special value used in the examples to indicate that system requirements are not met. 
+ if process.returncode == 100: + pytest.skip(process.stderr.decode(errors="replace").strip()) + elif process.returncode != 0: + if process.stdout: + print(process.stdout.decode(errors="replace")) + if process.stderr: + print(process.stderr.decode(errors="replace"), file=sys.stderr) + raise AssertionError(f"`{example}` failed ({process.returncode})") diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index d978bde2ea..75fa07bbfd 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -4,10 +4,8 @@ # If we have subcategories of examples in the future, this file can be split along those lines import glob -import importlib.metadata import os import platform -import re import subprocess import sys @@ -15,6 +13,13 @@ from cuda.core import Device, system +try: + from cuda.bindings._test_helpers.pep723 import has_package_requirements_or_skip +except ImportError: + # If the import fails, we define a dummy function that will cause all tests to be skipped. 
+ def has_package_requirements_or_skip(example): + pytest.skip("PEP 723 test helper is not available") + def has_compute_capability_9_or_higher() -> bool: return Device().compute_capability >= (9, 0) @@ -62,43 +67,6 @@ def has_cuda_path() -> bool: sample_files = [os.path.basename(x) for x in glob.glob(samples_path + "**/*.py", recursive=True)] -def has_package_requirements_or_skip(example): - example_name = os.path.basename(example) - - with open(example, encoding="utf-8") as f: - content = f.read() - - # The canonical regex as defined in PEP 723 - pep723 = re.search(r"(?m)^# /// (?P[a-zA-Z0-9-]+)$\s(?P(^#(| .*)$\s)+)^# ///$", content) - if not pep723: - raise ValueError(f"PEP 723 metadata not found in {example_name}") - - metadata = {} - for line in pep723.group("content").splitlines(): - line = line.lstrip("# ").rstrip() - if not line: - continue - key, value = line.split("=", 1) - key = key.strip() - value = value.strip() - metadata[key] = value - - if "dependencies" not in metadata: - raise ValueError(f"PEP 723 dependencies not found in {example_name}") - - missing_dependencies = [] - dependencies = eval(metadata["dependencies"]) # noqa: S307 - for dependency in dependencies: - name = re.match("[a-zA-Z0-9_-]+", dependency) - try: - importlib.metadata.distribution(name.string) - except importlib.metadata.PackageNotFoundError: - missing_dependencies.append(name.string) - - if missing_dependencies: - pytest.skip(f"Skipping {example} due to missing package requirement: {', '.join(missing_dependencies)}") - - @pytest.mark.parametrize("example", sample_files) def test_example(example): example_path = os.path.join(samples_path, example)