diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 4a089cefb2..0574094683 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -261,16 +261,6 @@ jobs: LOCAL_CTK: ${{ matrix.LOCAL_CTK }} run: run-tests bindings - - name: Run cuda.bindings examples - if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }} - env: - CUDA_VER: ${{ matrix.CUDA_VER }} - LOCAL_CTK: ${{ matrix.LOCAL_CTK }} - run: | - pushd cuda_bindings - ${SANITIZER_CMD} pytest -ra -s -vv examples/ - popd - - name: Run cuda.core tests env: CUDA_VER: ${{ matrix.CUDA_VER }} diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index fbe8bad1a5..5cfee3b892 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -245,17 +245,6 @@ jobs: shell: bash --noprofile --norc -xeuo pipefail {0} run: run-tests bindings - - name: Run cuda.bindings examples - if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }} - env: - CUDA_VER: ${{ matrix.CUDA_VER }} - LOCAL_CTK: ${{ matrix.LOCAL_CTK }} - shell: bash --noprofile --norc -xeuo pipefail {0} - run: | - pushd cuda_bindings - ${SANITIZER_CMD} pytest -ra -s -vv examples/ - popd - - name: Run cuda.core tests env: CUDA_VER: ${{ matrix.CUDA_VER }} diff --git a/cuda_bindings/cuda/bindings/_example_helpers/__init__.py b/cuda_bindings/cuda/bindings/_example_helpers/__init__.py new file mode 100644 index 0000000000..fa061cc346 --- /dev/null +++ b/cuda_bindings/cuda/bindings/_example_helpers/__init__.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +from .common import KernelHelper, check_compute_capability_too_low, requirement_not_met +from .helper_cuda import check_cuda_errors, find_cuda_device, find_cuda_device_drv +from .helper_string import check_cmd_line_flag, get_cmd_line_argument_int + +__all__ = [ + "KernelHelper", + "check_cmd_line_flag", + "check_compute_capability_too_low", + "check_cuda_errors", + "find_cuda_device", + "find_cuda_device_drv", + "get_cmd_line_argument_int", + "requirement_not_met", +] diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/cuda/bindings/_example_helpers/common.py similarity index 77% rename from cuda_bindings/examples/common/common.py rename to cuda_bindings/cuda/bindings/_example_helpers/common.py index 5b5151ef24..15317ace29 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/cuda/bindings/_example_helpers/common.py @@ -1,19 +1,27 @@ -# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +import os +import sys + import numpy as np -from common.helper_cuda import check_cuda_errors from cuda import pathfinder from cuda.bindings import driver as cuda from cuda.bindings import nvrtc from cuda.bindings import runtime as cudart +from .helper_cuda import check_cuda_errors + + +def requirement_not_met(message): + print(message, file=sys.stderr) # noqa: T201 + exitcode = os.environ.get("CUDA_BINDINGS_SKIP_EXAMPLE", "1") + return sys.exit(int(exitcode)) -def pytest_skipif_compute_capability_too_low(dev_id, required_cc_major_minor): - import pytest +def check_compute_capability_too_low(dev_id, required_cc_major_minor): cc_major = check_cuda_errors( cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) @@ -22,7 +30,9 @@ def pytest_skipif_compute_capability_too_low(dev_id, required_cc_major_minor): ) have_cc_major_minor = (cc_major, cc_minor) if have_cc_major_minor < required_cc_major_minor: - pytest.skip(f"cudaDevAttrComputeCapability too low: {have_cc_major_minor=!r}, {required_cc_major_minor=!r}") + requirement_not_met( + f"CUDA device compute capability too low: {have_cc_major_minor=!r}, {required_cc_major_minor=!r}" + ) class KernelHelper: @@ -31,9 +41,7 @@ def __init__(self, code, dev_id): for libname in ("cudart", "cccl"): hdr_dir = pathfinder.find_nvidia_header_directory(libname) if hdr_dir is None: - import pytest - - pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None') + requirement_not_met(f'pathfinder.find_nvidia_header_directory("{libname}") returned None') include_dirs.append(hdr_dir) prog = check_cuda_errors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) @@ -69,8 +77,8 @@ def __init__(self, code, dev_id): check_cuda_errors(nvrtc.nvrtcGetProgramLog(prog, log)) import sys - print(log.decode(), file=sys.stderr) - print(err, file=sys.stderr) + print(log.decode(), file=sys.stderr) # noqa: 
T201 + print(err, file=sys.stderr) # noqa: T201 sys.exit(1) if use_cubin: diff --git a/cuda_bindings/examples/common/helper_cuda.py b/cuda_bindings/cuda/bindings/_example_helpers/helper_cuda.py similarity index 88% rename from cuda_bindings/examples/common/helper_cuda.py rename to cuda_bindings/cuda/bindings/_example_helpers/helper_cuda.py index 9fbfe8c82f..0e56fa8fd1 100644 --- a/cuda_bindings/examples/common/helper_cuda.py +++ b/cuda_bindings/cuda/bindings/_example_helpers/helper_cuda.py @@ -1,12 +1,12 @@ -# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int - from cuda.bindings import driver as cuda from cuda.bindings import nvrtc from cuda.bindings import runtime as cudart +from .helper_string import check_cmd_line_flag, get_cmd_line_argument_int + def _cuda_get_error_enum(error): if isinstance(error, cuda.CUresult): diff --git a/cuda_bindings/examples/common/helper_string.py b/cuda_bindings/cuda/bindings/_example_helpers/helper_string.py similarity index 78% rename from cuda_bindings/examples/common/helper_string.py rename to cuda_bindings/cuda/bindings/_example_helpers/helper_string.py index 47d9d36569..1540db447a 100644 --- a/cuda_bindings/examples/common/helper_string.py +++ b/cuda_bindings/cuda/bindings/_example_helpers/helper_string.py @@ -1,4 +1,4 @@ -# Copyright 2021-2025 NVIDIA Corporation. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import sys diff --git a/cuda_bindings/cuda/bindings/_test_helpers/pep723.py b/cuda_bindings/cuda/bindings/_test_helpers/pep723.py new file mode 100644 index 0000000000..e1f6f920b7 --- /dev/null +++ b/cuda_bindings/cuda/bindings/_test_helpers/pep723.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + + +import importlib.metadata +import os +import re + +import pytest + + +def has_package_requirements_or_skip(example): + example_name = os.path.basename(example) + + with open(example, encoding="utf-8") as f: + content = f.read() + + # The canonical regex as defined in PEP 723 + pep723 = re.search(r"(?m)^# /// (?P<type>[a-zA-Z0-9-]+)$\s(?P<content>(^#(| .*)$\s)+)^# ///$", content) + if not pep723: + raise ValueError(f"PEP 723 metadata not found in {example_name}") + + metadata = {} + for line in pep723.group("content").splitlines(): + line = line.lstrip("# ").rstrip() + if not line: + continue + key, value = line.split("=", 1) + key = key.strip() + value = value.strip() + metadata[key] = value + + if "dependencies" not in metadata: + raise ValueError(f"PEP 723 dependencies not found in {example_name}") + + missing_dependencies = [] + dependencies = eval(metadata["dependencies"]) # noqa: S307 + for dependency in dependencies: + name = re.match("[a-zA-Z0-9_-]+", dependency) + try: + importlib.metadata.distribution(name.group(0)) + except importlib.metadata.PackageNotFoundError: + missing_dependencies.append(name.string) + + if missing_dependencies: + pytest.skip(f"Skipping {example} due to missing package requirement: {', '.join(missing_dependencies)}") diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index 540c9b4c11..26f02eba30 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ 
b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -8,13 +8,16 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import platform import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device, requirement_not_met clock_nvrtc = """\ extern "C" __global__ void timedReduction(const float *hinput, float *output, clock_t *timer) @@ -65,11 +68,13 @@ def elems_to_bytes(nelems, dt): return nelems * np.dtype(dt).itemsize -def main(): - import pytest - +def check_requirements(): if platform.machine() == "armv7l": - pytest.skip("clock_nvrtc is not supported on ARMv7") + requirement_not_met("clock_nvrtc is not supported on ARMv7") + + +def main(): + check_requirements() timer = np.empty(num_blocks * 2, dtype="int64") hinput = np.empty(num_threads * 2, dtype="float32") @@ -78,7 +83,7 @@ def main(): hinput[i] = i dev_id = find_cuda_device() - kernel_helper = common.KernelHelper(clock_nvrtc, dev_id) + kernel_helper = KernelHelper(clock_nvrtc, dev_id) kernel_addr = kernel_helper.get_function(b"timedReduction") dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes)) diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index c92d33e975..da8227a6c3 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -7,16 +7,21 @@ # # ################################################################################ + +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + + import ctypes import sys import time import numpy as np -from common import common -from 
common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device, requirement_not_met simple_cubemap_texture = """\ extern "C" @@ -97,9 +102,7 @@ def main(): f"CUDA device [{device_props.name}] has {device_props.multiProcessorCount} Multi-Processors SM {device_props.major}.{device_props.minor}" ) if device_props.major < 2: - import pytest - - pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.") + requirement_not_met("Test requires SM 2.0 or higher for support of Texture Arrays.") # Generate input data for layered texture width = 64 @@ -162,7 +165,7 @@ def main(): f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dim_grid.x} x {dim_grid.y}, each block has 8 x 8 threads" ) - kernel_helper = common.KernelHelper(simple_cubemap_texture, dev_id) + kernel_helper = KernelHelper(simple_cubemap_texture, dev_id) _transform_kernel = kernel_helper.get_function(b"transformKernel") kernel_args = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None)) check_cuda_errors( diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index 637c31cf0e..f1548adc25 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -8,16 +8,19 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import platform import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, requirement_not_met simplep2p = """\ 
extern "C" @@ -32,19 +35,17 @@ def main(): - import pytest - if platform.system() == "Darwin": - pytest.skip("simpleP2P is not supported on Mac OSX") + requirement_not_met("simpleP2P is not supported on Mac OSX") if platform.machine() == "armv7l": - pytest.skip("simpleP2P is not supported on ARMv7") + requirement_not_met("simpleP2P is not supported on ARMv7") if platform.machine() == "aarch64": - pytest.skip("simpleP2P is not supported on aarch64") + requirement_not_met("simpleP2P is not supported on aarch64") if platform.machine() == "sbsa": - pytest.skip("simpleP2P is not supported on sbsa") + requirement_not_met("simpleP2P is not supported on sbsa") # Number of GPUs print("Checking for multiple GPUs...") @@ -52,7 +53,7 @@ def main(): print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: - pytest.skip("Two or more GPUs with Peer-to-Peer access capability are required") + requirement_not_met("Two or more GPUs with Peer-to-Peer access capability are required") prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access @@ -83,7 +84,7 @@ def main(): break if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1: - pytest.skip("Peer to Peer access is not available amongst GPUs in the system") + requirement_not_met("Peer to Peer access is not available amongst GPUs in the system") # Use first pair of p2p capable GPUs detected gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]] @@ -158,7 +159,7 @@ def main(): _simple_kernel = [None] * 2 kernel_args = [None] * 2 - kernel_helper[1] = common.KernelHelper(simplep2p, gpuid[1]) + kernel_helper[1] = KernelHelper(simplep2p, gpuid[1]) _simple_kernel[1] = kernel_helper[1].get_function(b"SimpleKernel") kernel_args[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p)) check_cuda_errors( @@ -183,7 +184,7 @@ def main(): # output to the GPU 0 buffer print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...") 
check_cuda_errors(cudart.cudaSetDevice(gpuid[0])) - kernel_helper[0] = common.KernelHelper(simplep2p, gpuid[0]) + kernel_helper[0] = KernelHelper(simplep2p, gpuid[0]) _simple_kernel[0] = kernel_helper[0].get_function(b"SimpleKernel") kernel_args[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p)) check_cuda_errors( diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index e4dc439b9b..ff47696cb6 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform @@ -15,12 +19,16 @@ import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors -from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import ( + KernelHelper, + check_cmd_line_flag, + check_cuda_errors, + get_cmd_line_argument_int, + requirement_not_met, +) simple_zero_copy = """\ extern "C" @@ -40,19 +48,17 @@ def main(): idev = 0 b_pin_generic_memory = False - import pytest - if platform.system() == "Darwin": - pytest.skip("simpleZeroCopy is not supported on Mac OSX") + requirement_not_met("simpleZeroCopy is not supported on Mac OSX") if platform.machine() == "armv7l": - pytest.skip("simpleZeroCopy is not supported on ARMv7") + requirement_not_met("simpleZeroCopy is not supported on ARMv7") if platform.machine() == "aarch64": - pytest.skip("simpleZeroCopy is not supported on aarch64") + requirement_not_met("simpleZeroCopy is not supported on aarch64") if platform.machine() == "sbsa": - pytest.skip("simpleZeroCopy is not supported on sbsa") + 
requirement_not_met("simpleZeroCopy is not supported on sbsa") if check_cmd_line_flag("help"): print("Usage: simpleZeroCopy [OPTION]\n", file=sys.stderr) @@ -84,7 +90,7 @@ def main(): device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(idev)) if not device_prop.canMapHostMemory: - pytest.skip(f"Device {idev} does not support mapping CPU host memory!") + requirement_not_met(f"Device {idev} does not support mapping CPU host memory!") check_cuda_errors(cudart.cudaSetDeviceFlags(cudart.cudaDeviceMapHost)) @@ -131,7 +137,7 @@ def main(): grid.x = math.ceil(nelem / float(block.x)) grid.y = 1 grid.z = 1 - kernel_helper = common.KernelHelper(simple_zero_copy, idev) + kernel_helper = KernelHelper(simple_zero_copy, idev) _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") kernel_args = ( (d_a, d_b, d_c, nelem), diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index ed4a13e686..0d7a6341a5 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -7,16 +7,19 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import os import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device, requirement_not_met system_wide_atomics = """\ #define LOOP_NUM 50 @@ -172,26 +175,24 @@ def verify(test_data, length): def main(): - import pytest - if os.name == "nt": - pytest.skip("Atomics not supported on Windows") + requirement_not_met("Atomics not supported on Windows") # set device dev_id = find_cuda_device() device_prop = 
check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id)) if not device_prop.managedMemory: - pytest.skip("Unified Memory not supported on this device") + requirement_not_met("Unified Memory not supported on this device") compute_mode = check_cuda_errors( cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeMode, dev_id) ) if compute_mode == cudart.cudaComputeMode.cudaComputeModeProhibited: - pytest.skip("This sample requires a device in either default or process exclusive mode") + requirement_not_met("This sample requires a device in either default or process exclusive mode") if device_prop.major < 6: - pytest.skip("Requires a minimum CUDA compute 6.0 capability") + requirement_not_met("Requires a minimum CUDA compute 6.0 capability") num_threads = 256 num_blocks = 64 @@ -214,7 +215,7 @@ def main(): # To make the AND and XOR tests generate something other than 0... atom_arr_h[7] = atom_arr_h[9] = 0xFF - kernel_helper = common.KernelHelper(system_wide_atomics, dev_id) + kernel_helper = KernelHelper(system_wide_atomics, dev_id) _atomic_kernel = kernel_helper.get_function(b"atomicKernel") kernel_args = ((atom_arr,), (ctypes.c_void_p,)) check_cuda_errors( diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 0a29b8c0ca..a6f65b9c81 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -8,15 +8,18 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from cuda.bindings import driver as cuda +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device_drv, requirement_not_met vector_add_drv = 
"""\ /* Vector addition: C = A + B. @@ -52,11 +55,9 @@ def main(): cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, cu_device) ) if not uva_supported: - import pytest - - pytest.skip("Accessing pageable memory directly requires UVA") + requirement_not_met("Accessing pageable memory directly requires UVA") - kernel_helper = common.KernelHelper(vector_add_drv, int(cu_device)) + kernel_helper = KernelHelper(vector_add_drv, int(cu_device)) _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") # Allocate input vectors h_A and h_B in host memory @@ -73,31 +74,28 @@ def main(): check_cuda_errors(cuda.cuMemcpyHtoD(d_a, h_a, nbytes)) check_cuda_errors(cuda.cuMemcpyHtoD(d_b, h_b, nbytes)) - if True: - # Grid/Block configuration - threads_per_block = 256 - blocks_per_grid = (n + threads_per_block - 1) / threads_per_block - - kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) - - # Launch the CUDA kernel - check_cuda_errors( - cuda.cuLaunchKernel( - _vec_add_kernel, - blocks_per_grid, - 1, - 1, - threads_per_block, - 1, - 1, - 0, - 0, - kernel_args, - 0, - ) + # Grid/Block configuration + threads_per_block = 256 + blocks_per_grid = (n + threads_per_block - 1) / threads_per_block + + kernel_args = ((d_a, d_b, d_c, n), (None, None, None, ctypes.c_int)) + + # Launch the CUDA kernel + check_cuda_errors( + cuda.cuLaunchKernel( + _vec_add_kernel, + blocks_per_grid, + 1, + 1, + threads_per_block, + 1, + 1, + 0, + 0, + kernel_args, + 0, ) - else: - pass + ) # Copy result from device memory to host memory # h_C contains the result in host memory diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index 55178f1abd..f1e9617166 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -8,16 +8,19 @@ # # 
################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device_drv from cuda.bindings import driver as cuda +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device_drv, requirement_not_met vector_add_mmap = """\ /* Vector addition: C = A + B. @@ -197,19 +200,17 @@ def simple_free_multi_device_mmap(dptr, size): def main(): - import pytest - if platform.system() == "Darwin": - pytest.skip("vectorAddMMAP is not supported on Mac OSX") + requirement_not_met("vectorAddMMAP is not supported on Mac OSX") if platform.machine() == "armv7l": - pytest.skip("vectorAddMMAP is not supported on ARMv7") + requirement_not_met("vectorAddMMAP is not supported on ARMv7") if platform.machine() == "aarch64": - pytest.skip("vectorAddMMAP is not supported on aarch64") + requirement_not_met("vectorAddMMAP is not supported on aarch64") if platform.machine() == "sbsa": - pytest.skip("vectorAddMMAP is not supported on sbsa") + requirement_not_met("vectorAddMMAP is not supported on sbsa") n = 50000 size = n * np.dtype(np.float32).itemsize @@ -228,7 +229,7 @@ def main(): ) print(f"Device {cu_device} VIRTUAL ADDRESS MANAGEMENT SUPPORTED = {attribute_val}.") if not attribute_val: - pytest.skip(f"Device {cu_device} doesn't support VIRTUAL ADDRESS MANAGEMENT.") + requirement_not_met(f"Device {cu_device} doesn't support VIRTUAL ADDRESS MANAGEMENT.") # The vector addition happens on cuDevice, so the allocations need to be mapped there. 
mapping_devices = [cu_device] @@ -239,7 +240,7 @@ def main(): # Create context cu_context = check_cuda_errors(cuda.cuCtxCreate(None, 0, cu_device)) - kernel_helper = common.KernelHelper(vector_add_mmap, int(cu_device)) + kernel_helper = KernelHelper(vector_add_mmap, int(cu_device)) _vec_add_kernel = kernel_helper.get_function(b"VecAdd_kernel") # Allocate input vectors h_A and h_B in host memory diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index 407079ad43..d9094a8a70 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform @@ -15,12 +19,16 @@ import sys import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device -from common.helper_string import check_cmd_line_flag from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import ( + KernelHelper, + check_cmd_line_flag, + check_cuda_errors, + find_cuda_device, + requirement_not_met, +) stream_ordered_allocation = """\ /* Add two vectors on the GPU */ @@ -205,10 +213,8 @@ def stream_ordered_allocation_post_sync(dev, nelem, a, b, c): def main(): - import pytest - if platform.system() == "Darwin": - pytest.skip("streamOrderedAllocation is not supported on Mac OSX") + requirement_not_met("streamOrderedAllocation is not supported on Mac OSX") cuda.cuInit(0) if check_cmd_line_flag("help"): @@ -227,10 +233,10 @@ def main(): cudart.cudaDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev) ) if not 
is_mem_pool_supported: - pytest.skip("Waiving execution as device does not support Memory Pools") + requirement_not_met("Waiving execution as device does not support Memory Pools") global _vector_add_gpu - kernel_helper = common.KernelHelper(stream_ordered_allocation, dev) + kernel_helper = KernelHelper(stream_ordered_allocation, dev) _vector_add_gpu = kernel_helper.get_function(b"vectorAddGPU") # Allocate CPU memory diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index 00ed5cdfd4..18f5c88e9d 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform @@ -15,12 +19,18 @@ from enum import Enum import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device -from common.helper_string import check_cmd_line_flag, get_cmd_line_argument_int from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import ( + KernelHelper, + check_cmd_line_flag, + check_compute_capability_too_low, + check_cuda_errors, + find_cuda_device, + get_cmd_line_argument_int, + requirement_not_met, +) block_size = 16 @@ -1130,16 +1140,14 @@ def matrix_multiply(dims_a, dims_b, kernel_number): def main(): - import pytest - - common.pytest_skipif_compute_capability_too_low(find_cuda_device(), (7, 0)) + check_compute_capability_too_low(find_cuda_device(), (7, 0)) if platform.machine() == "qnx": - pytest.skip("globalToShmemAsyncCopy is not supported on QNX") + requirement_not_met("globalToShmemAsyncCopy is not supported on QNX") version = 
check_cuda_errors(cuda.cuDriverGetVersion()) if version < 11010: - pytest.skip("CUDA Toolkit 11.1 or greater is required") + requirement_not_met("CUDA Toolkit 11.1 or greater is required") if check_cmd_line_flag("help") or check_cmd_line_flag("?"): print("Usage device=n (n >= 0 for deviceID)", file=sys.stderr) @@ -1207,7 +1215,7 @@ def main(): cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, dev_id) ) if major < 7: - pytest.skip("globalToShmemAsyncCopy requires SM 7.0 or higher.") + requirement_not_met("globalToShmemAsyncCopy requires SM 7.0 or higher.") print(f"MatrixA({dims_a.x},{dims_a.y}), MatrixB({dims_b.x},{dims_b.y})") @@ -1219,7 +1227,7 @@ def main(): global _MatrixMulAsyncCopySingleStage global _MatrixMulNaive global _MatrixMulNaiveLargeChunk - kernel_helper = common.KernelHelper(global_to_shmem_async_copy, dev_id) + kernel_helper = KernelHelper(global_to_shmem_async_copy, dev_id) _MatrixMulAsyncCopyMultiStageLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyMultiStageLargeChunk") _MatrixMulAsyncCopyLargeChunk = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunk") _MatrixMulAsyncCopyLargeChunkAWBarrier = kernel_helper.get_function(b"MatrixMulAsyncCopyLargeChunkAWBarrier") diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py index 9fff51767e..bb749065f2 100644 --- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py @@ -8,15 +8,22 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import random as rnd import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from 
cuda.bindings._example_helpers import ( + KernelHelper, + check_cuda_errors, + find_cuda_device, +) THREADS_PER_BLOCK = 512 GRAPH_LAUNCH_ITERATIONS = 3 @@ -378,7 +385,7 @@ def main(): global _reduce global _reduceFinal - kernel_helper = common.KernelHelper(simple_cuda_graphs, dev_id) + kernel_helper = KernelHelper(simple_cuda_graphs, dev_id) _reduce = kernel_helper.get_function(b"reduce") _reduceFinal = kernel_helper.get_function(b"reduceFinal") diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index a2d4cdca40..44ff57c6d7 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import math import platform @@ -15,11 +19,14 @@ from random import random import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors, find_cuda_device from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import ( + KernelHelper, + check_cuda_errors, + find_cuda_device, +) conjugate_gradient_multi_block_cg = """\ #line __LINE__ @@ -238,7 +245,7 @@ def main(): ) # Get kernel - kernel_helper = common.KernelHelper(conjugate_gradient_multi_block_cg, dev_id) + kernel_helper = KernelHelper(conjugate_gradient_multi_block_cg, dev_id) _gpu_conjugate_gradient = kernel_helper.get_function(b"gpuConjugateGradient") # Generate a random tridiagonal symmetric matrix in CSR format diff --git a/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py b/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py index 8290e491c6..ca229e5268 100644 --- 
a/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py +++ b/cuda_bindings/examples/4_CUDA_Libraries/nvidia_smi.py @@ -11,6 +11,10 @@ # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1"] +# /// + import sys from cuda.bindings import nvml diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index d5c48025d1..2bb4768a3c 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -8,14 +8,17 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy", "matplotlib"] +# /// + import time import numpy as np -from common import common -from common.helper_cuda import check_cuda_errors from cuda.bindings import driver as cuda from cuda.bindings import runtime as cudart +from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, requirement_not_met iso_propagator = """\ extern "C" @@ -222,7 +225,7 @@ def __init__(self, cntx): check_cuda_errors(cuda.cuCtxSetCurrent(cntx)) dev = check_cuda_errors(cuda.cuCtxGetDevice()) - self.kernel_helper = common.KernelHelper(iso_propagator, int(dev)) + self.kernel_helper = KernelHelper(iso_propagator, int(dev)) # kernel to create a source fnction with some max frequency self.creatSource = self.kernel_helper.get_function(b"createSource") @@ -627,8 +630,7 @@ def main(): print(f"CUDA-capable device count: {gpu_n}") if gpu_n < 2: - print("Two or more GPUs with Peer-to-Peer access capability are required") - return + requirement_not_met("Two or more GPUs with Peer-to-Peer access capability are required") prop = [check_cuda_errors(cudart.cudaGetDeviceProperties(i)) for i in range(gpu_n)] # Check possibility for peer access @@ -659,9 +661,7 @@ def main(): break if p2p_capable_gp_us[0] == -1 or p2p_capable_gp_us[1] == -1: - 
print("Two or more GPUs with Peer-to-Peer access capability are required.") - print("Peer to Peer access is not available amongst GPUs in the system, waiving test.") - return + requirement_not_met("Two or more GPUs with Peer-to-Peer access capability are required") # Use first pair of p2p capable GPUs detected gpuid = [p2p_capable_gp_us[0], p2p_capable_gp_us[1]] diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py index 892776dfd9..ec471ef9b3 100644 --- a/cuda_bindings/examples/extra/jit_program_test.py +++ b/cuda_bindings/examples/extra/jit_program_test.py @@ -8,6 +8,10 @@ # # ################################################################################ +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numpy"] +# /// + import ctypes import numpy as np diff --git a/cuda_bindings/examples/extra/numba_emm_plugin.py b/cuda_bindings/examples/extra/numba_emm_plugin.py index dcbf541321..cd8fcfcc55 100644 --- a/cuda_bindings/examples/extra/numba_emm_plugin.py +++ b/cuda_bindings/examples/extra/numba_emm_plugin.py @@ -1,6 +1,10 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# /// script +# dependencies = ["cuda_bindings>13.2.1", "numba-cuda"] +# /// + """Numba EMM Plugin using the CUDA Python Driver API. This example provides an External Memory Management (EMM) Plugin for Numba (see diff --git a/cuda_bindings/examples/pytest.ini b/cuda_bindings/examples/pytest.ini deleted file mode 100644 index e105585d5a..0000000000 --- a/cuda_bindings/examples/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -[pytest] -python_files = *_test.py -python_functions = main -pythonpath = . 
diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index 96cfb4dd07..f4866fc4f8 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -43,6 +43,7 @@ all = [ test = [ "cython>=3.2,<3.3", "setuptools>=77.0.0", + "matplotlib>=3.5.0", # Required by isoFDModelling_test.py "numpy>=1.21.1", "pytest>=6.2.4", "pytest-benchmark>=3.4.1", diff --git a/cuda_bindings/tests/test_examples.py b/cuda_bindings/tests/test_examples.py new file mode 100644 index 0000000000..171d7d37f2 --- /dev/null +++ b/cuda_bindings/tests/test_examples.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import glob +import os +import subprocess +import sys + +import pytest + +from cuda.bindings._test_helpers.pep723 import has_package_requirements_or_skip + +examples_path = os.path.join(os.path.dirname(__file__), "..", "examples") +examples_files = glob.glob(os.path.join(examples_path, "**/*.py"), recursive=True) + + +BROKEN_EXAMPLES = {"numba_emm_plugin.py"} + + +@pytest.mark.parametrize("example", examples_files) +def test_example(example): + if os.path.basename(example) in BROKEN_EXAMPLES: + pytest.skip(f"Skipping broken example: {example}") + + has_package_requirements_or_skip(example) + + env = os.environ.copy() + env["CUDA_BINDINGS_SKIP_EXAMPLE"] = "100" + + process = subprocess.run([sys.executable, example], capture_output=True, env=env) # noqa: S603 + # returncode is a special value used in the examples to indicate that system requirements are not met. 
+ if process.returncode == 100: + pytest.skip(process.stderr.decode(errors="replace").strip()) + elif process.returncode != 0: + if process.stdout: + print(process.stdout.decode(errors="replace")) + if process.stderr: + print(process.stderr.decode(errors="replace"), file=sys.stderr) + raise AssertionError(f"`{example}` failed ({process.returncode})") diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index d978bde2ea..75fa07bbfd 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -4,10 +4,8 @@ # If we have subcategories of examples in the future, this file can be split along those lines import glob -import importlib.metadata import os import platform -import re import subprocess import sys @@ -15,6 +13,13 @@ from cuda.core import Device, system +try: + from cuda.bindings._test_helpers.pep723 import has_package_requirements_or_skip +except ImportError: + # If the import fails, we define a dummy function that will cause all tests to be skipped. 
+ def has_package_requirements_or_skip(example): + pytest.skip("PEP 723 test helper is not available") + def has_compute_capability_9_or_higher() -> bool: return Device().compute_capability >= (9, 0) @@ -62,43 +67,6 @@ def has_cuda_path() -> bool: sample_files = [os.path.basename(x) for x in glob.glob(samples_path + "**/*.py", recursive=True)] -def has_package_requirements_or_skip(example): - example_name = os.path.basename(example) - - with open(example, encoding="utf-8") as f: - content = f.read() - - # The canonical regex as defined in PEP 723 - pep723 = re.search(r"(?m)^# /// (?P[a-zA-Z0-9-]+)$\s(?P(^#(| .*)$\s)+)^# ///$", content) - if not pep723: - raise ValueError(f"PEP 723 metadata not found in {example_name}") - - metadata = {} - for line in pep723.group("content").splitlines(): - line = line.lstrip("# ").rstrip() - if not line: - continue - key, value = line.split("=", 1) - key = key.strip() - value = value.strip() - metadata[key] = value - - if "dependencies" not in metadata: - raise ValueError(f"PEP 723 dependencies not found in {example_name}") - - missing_dependencies = [] - dependencies = eval(metadata["dependencies"]) # noqa: S307 - for dependency in dependencies: - name = re.match("[a-zA-Z0-9_-]+", dependency) - try: - importlib.metadata.distribution(name.string) - except importlib.metadata.PackageNotFoundError: - missing_dependencies.append(name.string) - - if missing_dependencies: - pytest.skip(f"Skipping {example} due to missing package requirement: {', '.join(missing_dependencies)}") - - @pytest.mark.parametrize("example", sample_files) def test_example(example): example_path = os.path.join(samples_path, example)