diff --git a/.github/workflows/docs-build-pr.yaml b/.github/workflows/docs-build-pr.yaml index faf265e1e..afb6d4def 100644 --- a/.github/workflows/docs-build-pr.yaml +++ b/.github/workflows/docs-build-pr.yaml @@ -2,7 +2,7 @@ name: docs-build-pr on: pull_request: - branches: [ main ] + branches: [ main, release-* ] types: [ opened, synchronize ] env: diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index 3cabdaa50..77a57cc8f 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -2,7 +2,7 @@ name: docs-build on: push: - branches: [ main ] + branches: [ main, release-* ] tags: - v* workflow_dispatch: @@ -10,7 +10,7 @@ on: env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} - TAG: 0.5.0 + TAG: 0.5.1 GH_TOKEN: ${{ github.token }} concurrency: diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 630d73bff..fa2993909 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,7 +1,7 @@ variables: CONTAINER_TEST_IMAGE: "${CI_REGISTRY_IMAGE}:${CI_COMMIT_REF_SLUG}" - CONTAINER_RELEASE_IMAGE: "${CI_REGISTRY_IMAGE}:0.5.0" - BUILDER_IMAGE: ghcr.io/nvidia/cloud-native-docs:0.5.0 + CONTAINER_RELEASE_IMAGE: "${CI_REGISTRY_IMAGE}:0.5.1" + BUILDER_IMAGE: ghcr.io/nvidia/cloud-native-docs:0.5.1 PUBLISHER_IMAGE: "${CI_REGISTRY_PUBLISHER}/publisher:3.1.0" stages: diff --git a/README.md b/README.md index e1f9178bc..2673c0158 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ Refer to to find the most rec 1. Build the docs: + Use the alias `build-docs` or the full command: + ```bash ./repo docs ``` @@ -52,6 +54,8 @@ Refer to to find the most rec The resulting HTML pages are located in the `_build/docs/.../latest/` directory of your repository clone. +If you are using WSL on Windows, the URL looks like . + More information about the `repo docs` command is available from . 
@@ -139,6 +143,20 @@ Always update the openshift docset when there is a new gpu-operator docset versi The documentation for the older releases is not removed, readers are just less likely to browse the older releases. + GPU Operator has changed to minor-only version branches. + Consequently, patch releases are documented within the same branch for that minor version. + In the `/versions1.json` file, you can use just the first two fields of the semantic version. + For example: + + ```json + { + "url": "../25.10", + "version": "25.10" + }, + ``` + + The three most-recent minor versions are supported. + ### Tagging for Publication Changes to the default branch are not published on docs.nvidia.com. @@ -150,11 +168,21 @@ Only tags are published to docs.nvidia.com. *Example* ```text - gpu-operator-v23.3.1 + container-toolkit-v1.17.8 ``` The first three fields of the semantic version are used. - For a "do over," push a tag like `gpu-operator-v23.3.1-1`. + For a "do over," push a tag like `container-toolkit-v1.17.8-1`. + + For GPU Operator, use only the first two fields of the semantic version. + + *Example* + + ```text + gpu-operator-v25.10 + ``` + + For a "do over," push a tag like `gpu-operator-v25.10-2`. Always tag the openshift docset for each new gpu-operator docset release. 
diff --git a/css/custom.css b/css/custom.css index 7d075cce1..207757d00 100644 --- a/css/custom.css +++ b/css/custom.css @@ -4,4 +4,23 @@ */ html[data-theme=light] .highlight .go { font-style:unset -} \ No newline at end of file +} + +.bd-page-width { + max-width: 176rem; + } + + .bd-main { + flex: 1 1 auto; + } + + .bd-main .bd-content .bd-article-container { + max-width: 100%; + } + + .bd-sidebar-secondary { + /* flex: 0 0 auto; */ + flex-basis: 15%; + min-width: var(--pst-sidebar-secondary); + } + \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 547de0e73..e15da489e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -21,7 +21,10 @@ RUN --mount=type=bind,source=.,destination=/x,rw /x/tools/packman/python.sh -m p -t /tmp/extension \ sphinx-copybutton \ nvidia-sphinx-theme \ - pydata-sphinx-theme + pydata-sphinx-theme \ + linuxdoc RUN (cd /tmp/extension; tar cf - . ) | (cd /var/tmp/packman/chk/sphinx/4.5.0.2-py3.7-linux-x86_64/; tar xf -) RUN rm -rf /tmp/extension + +RUN --mount=type=bind,target=/work echo 'alias build-docs="./repo docs"' >> ~/.bashrc diff --git a/gpu-operator/cdi.rst b/gpu-operator/cdi.rst index 05212812b..5c8a9522f 100644 --- a/gpu-operator/cdi.rst +++ b/gpu-operator/cdi.rst @@ -16,86 +16,46 @@ .. headings # #, * *, =, -, ^, " -###################################################### -Container Device Interface Support in the GPU Operator -###################################################### +############################################################ +Container Device Interface (CDI) Support in the GPU Operator +############################################################ ************************************ About the Container Device Interface ************************************ -The Container Device Interface (CDI) is a specification for container runtimes -such as cri-o, containerd, and podman that standardizes access to complex -devices like NVIDIA GPUs by the container runtimes. 
-CDI support is provided by the NVIDIA Container Toolkit and the Operator extends -that support for Kubernetes clusters. +The `Container Device Interface (CDI) `_ +is an open specification for container runtimes that abstracts what access to a device, such as an NVIDIA GPU, means, +and standardizes access across container runtimes. Popular container runtimes can read and process the specification to +ensure that a device is available in a container. CDI simplifies adding support for devices such as NVIDIA GPUs because +the specification is applicable to all container runtimes that support CDI. + +Starting with GPU Operator v25.10.0, CDI is used by default for enabling GPU support in containers running on Kubernetes. +Specifically, CDI support in container runtimes, e.g. containerd and cri-o, is used to inject GPU(s) into workload +containers. This differs from prior GPU Operator releases where CDI was used via a CDI-enabled ``nvidia`` runtime class. Use of CDI is transparent to cluster administrators and application developers. The benefits of CDI are largely to reduce development and support for runtime-specific plugins. -When CDI is enabled, two runtime classes, nvidia-cdi and nvidia-legacy, become available. -These two runtime classes are in addition to the default runtime class, nvidia. - -If you do not set CDI as the default runtime, the runtime resolves to the -legacy runtime mode that the NVIDIA Container Toolkit provides on x86_64 -machines or any architecture that has NVML libraries installed. - -Optionally, you can specify the runtime class for a workload. -See :ref:`Optional: Specifying the Runtime Class for a Pod` for an example. - - -Support for Multi-Instance GPU -============================== - -Configuring CDI is supported with Multi-Instance GPU (MIG). -Both the ``single`` and ``mixed`` strategies are supported. - - -Limitations and Restrictions -============================ - -* CDI is not supported on Red Hat OpenShift Container Platform. 
- CDI is supported on all other platforms listed in :ref:`Supported Operating Systems and Kubernetes Platforms`. - -* Enabling CDI is not supported with Rancher Kubernetes Engine 2 (RKE2). - - ******************************** Enabling CDI During Installation ******************************** +CDI is enabled by default during installation in GPU Operator v25.10.0 and later. Follow the instructions for installing the Operator with Helm on the :doc:`getting-started` page. -When you install the Operator with Helm, specify the ``--set cdi.enabled=true`` argument. -Optionally, also specify the ``--set cdi.default=true`` argument to use the CDI runtime class by default for all pods. - +CDI is also enabled by default during a Helm upgrade to GPU Operator v25.10.0 and later. ******************************* Enabling CDI After Installation ******************************* -.. rubric:: Prerequisites - -* You installed version 22.3.0 or newer. -* (Optional) Confirm that the only runtime class is ``nvidia`` by running the following command: - - .. code-block:: console - - $ kubectl get runtimeclasses - - **Example Output** - - .. code-block:: output - - NAME HANDLER AGE - nvidia nvidia 47h - +CDI is enabled by default in GPU Operator v25.10.0 and later. +Use the following procedure to enable CDI if you disabled CDI during installation. .. rubric:: Procedure -To enable CDI support, perform the following steps: - #. Enable CDI by modifying the cluster policy: .. code-block:: console @@ -109,19 +69,6 @@ To enable CDI support, perform the following steps: clusterpolicy.nvidia.com/cluster-policy patched -#. (Optional) Set the default container runtime mode to CDI by modifying the cluster policy: - - .. code-block:: console - - $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ - -p='[{"op": "replace", "path": "/spec/cdi/default", "value":true}]' - - *Example Output* - - .. code-block:: output - - clusterpolicy.nvidia.com/cluster-policy patched - #. 
(Optional) Confirm that the container toolkit and device plugin pods restart: .. code-block:: console @@ -134,23 +81,13 @@ To enable CDI support, perform the following steps: :language: output :emphasize-lines: 6,9 -#. Verify that the runtime classes include nvidia-cdi and nvidia-legacy: - - .. code-block:: console - - $ kubectl get runtimeclasses - - *Example Output* - - .. literalinclude:: ./manifests/output/cdi-verify-get-runtime-classes.txt - :language: output - ************* Disabling CDI ************* -To disable CDI support, perform the following steps: +While CDI is the default and recommended mechanism for injecting GPU support into containers, you can +disable CDI and use the legacy NVIDIA Container Toolkit stack instead with the following procedure: #. If your nodes use the CRI-O container runtime, then temporarily disable the GPU Operator validator: @@ -188,93 +125,3 @@ To disable CDI support, perform the following steps: nvidia.com/gpu.deploy.operator-validator=true \ nvidia.com/gpu.present=true \ --overwrite - -#. (Optional) Verify that the ``nvidia-cdi`` and ``nvidia-legacy`` runtime classes - are no longer available: - - .. code-block:: console - - $ kubectl get runtimeclass - - *Example Output* - - .. code-block:: output - - NAME HANDLER AGE - nvidia nvidia 11d - - -************************************************ -Optional: Specifying the Runtime Class for a Pod -************************************************ - -If you enabled CDI mode for the default container runtime, then no action is required to use CDI. -However, you can use the following procedure to specify the legacy mode for a workload if you experience trouble. - -If you did not enable CDI mode for the default container runtime, then you can -use the following procedure to verify that CDI is enabled and as a -routine practice to use the CDI mode of the container runtime. - -#. Create a file, such as ``cuda-vectoradd-cdi.yaml``, with contents like the following example: - - .. 
literalinclude:: ./manifests/input/cuda-vectoradd-cdi.yaml - :language: yaml - :emphasize-lines: 7 - - As an alternative, specify ``nvidia-legacy`` to use the legacy mode of the container runtime. - -#. (Optional) Create a temporary namespace: - - .. code-block:: console - - $ kubectl create ns demo - - *Example Output* - - .. code-block:: output - - namespace/demo created - -#. Start the pod: - - .. code-block:: console - - $ kubectl apply -n demo -f cuda-vectoradd-cdi.yaml - - *Example Output* - - .. code-block:: output - - pod/cuda-vectoradd created - -#. View the logs from the pod: - - .. code-block:: console - - $ kubectl logs -n demo cuda-vectoradd - - *Example Output* - - .. literalinclude:: ./manifests/output/common-cuda-vectoradd-logs.txt - :language: output - -#. Delete the temporary namespace: - - .. code-block:: console - - $ kubectl delete ns demo - - *Example Output* - - .. code-block:: output - - namespace "demo" deleted - - -******************* -Related Information -******************* - -* For more information about CDI, see the container device interface - `repository `_ - on GitHub. diff --git a/gpu-operator/getting-started.rst b/gpu-operator/getting-started.rst index 4fa422858..cf29ebcf0 100644 --- a/gpu-operator/getting-started.rst +++ b/gpu-operator/getting-started.rst @@ -27,6 +27,9 @@ Installing the NVIDIA GPU Operator ================================== +.. admonition:: Version + + The current patch release of this version of the NVIDIA GPU Operator is ``${version}``. ************* Prerequisites @@ -139,18 +142,18 @@ To view all the options, run ``helm show values nvidia/gpu-operator``. - ``false`` * - ``cdi.enabled`` - - When set to ``true``, the Operator installs two additional runtime classes, - nvidia-cdi and nvidia-legacy, and enables the use of the Container Device Interface (CDI) - for making GPUs accessible to containers. 
+ - When set to ``true`` (default), the Container Device Interface (CDI) will be used for + injecting GPUs into workload containers. The Operator will no longer configure the `nvidia` + runtime class as the default runtime handler. Instead, native-CDI support in container runtimes + like containerd or cri-o will be leveraged for injecting GPUs into workload containers. Using CDI aligns the Operator with the recent efforts to standardize how complex devices like GPUs are exposed to containerized environments. + - ``true`` - Pods can specify ``spec.runtimeClassName`` as ``nvidia-cdi`` to use the functionality or - specify ``nvidia-legacy`` to prevent using CDI to perform device injection. - - ``false`` - - * - ``cdi.default`` - - When set to ``true``, the container runtime uses CDI to perform device injection by default. + * - ``cdi.default`` Deprecated. + - This field is deprecated as of v25.10.0 and will be ignored. + The ``cdi.enabled`` field is set to ``true`` by default in versions 25.10.0 and later. + When set to ``true``, the container runtime uses CDI to perform device injection by default. 
- ``false`` * - ``daemonsets.annotations`` @@ -479,10 +482,8 @@ options are used with the container-toolkit deployed with GPU Operator: value: /etc/containerd/config.toml - name: CONTAINERD_SOCKET value: /run/containerd/containerd.sock - - name: CONTAINERD_RUNTIME_CLASS - value: nvidia - - name: CONTAINERD_SET_AS_DEFAULT - value: true + - name: RUNTIME_CONFIG_SOURCE + value: "command, file" If you need to specify custom values, refer to the following sample command for the syntax: @@ -494,21 +495,18 @@ If you need to specify custom values, refer to the following sample command for nvidia/gpu-operator $HELM_OPTIONS \ --version=${version} \ --set toolkit.env[0].name=CONTAINERD_CONFIG \ - --set toolkit.env[0].value=/etc/containerd/config.toml \ + --set toolkit.env[0].value=/etc/containerd/containerd.toml \ --set toolkit.env[1].name=CONTAINERD_SOCKET \ --set toolkit.env[1].value=/run/containerd/containerd.sock \ - --set toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS \ - --set toolkit.env[2].value=nvidia \ - --set toolkit.env[3].name=CONTAINERD_SET_AS_DEFAULT \ - --set-string toolkit.env[3].value=true + --set toolkit.env[2].name=RUNTIME_CONFIG_SOURCE \ + --set toolkit.env[2].value="command, file" These options are defined as follows: CONTAINERD_CONFIG - The path on the host to the ``containerd`` config - you would like to have updated with support for the ``nvidia-container-runtime``. - By default this will point to ``/etc/containerd/config.toml`` (the default - location for ``containerd``). It should be customized if your ``containerd`` + The path on the host to the top-level ``containerd`` config file. + By default this will point to ``/etc/containerd/containerd.toml`` + (the default location for ``containerd``). It should be customized if your ``containerd`` installation is not in the default location. CONTAINERD_SOCKET @@ -519,20 +517,20 @@ CONTAINERD_SOCKET (the default location for ``containerd``). 
It should be customized if your ``containerd`` installation is not in the default location. -CONTAINERD_RUNTIME_CLASS - The name of the - `Runtime Class `_ - you would like to associate with the ``nvidia-container-runtime``. - Pods launched with a ``runtimeClassName`` equal to CONTAINERD_RUNTIME_CLASS - will always run with the ``nvidia-container-runtime``. The default - CONTAINERD_RUNTIME_CLASS is ``nvidia``. - -CONTAINERD_SET_AS_DEFAULT - A flag indicating whether you want to set - ``nvidia-container-runtime`` as the default runtime used to launch all - containers. When set to false, only containers in pods with a ``runtimeClassName`` - equal to CONTAINERD_RUNTIME_CLASS will be run with the ``nvidia-container-runtime``. - The default value is ``true``. +RUNTIME_CONFIG_SOURCE + The config source(s) that the container-toolkit uses when fetching + the current containerd configuration. A valid value for this setting is any + combination of [command | file]. By default this will be configured as + "command, file" which means the container-toolkit will attempt to fetch + the configuration via the containerd CLI before falling back to reading the + config from the top-level ``containerd`` config file (configured via + CONTAINERD_CONFIG). When ``file`` is specified, the absolute path to the file + to be used as a config source can be specified as ``file=/path/to/source/config.toml``. + +RUNTIME_DROP_IN_CONFIG + The path on the host where the NVIDIA-specific drop-in config file + will be created. By default this will point to ``/etc/containerd/conf.d/99-nvidia.toml``. + Rancher Kubernetes Engine 2 =========================== @@ -543,6 +541,8 @@ in the RKE2 documentation. Refer to the :ref:`v24.9.0-known-limitations`. +.. _microk8s-install-procedure: + MicroK8s ======== @@ -556,10 +556,8 @@ For MicroK8s, set the following in the ``ClusterPolicy``. 
value: /var/snap/microk8s/current/args/containerd-template.toml - name: CONTAINERD_SOCKET value: /var/snap/microk8s/common/run/containerd.sock - - name: CONTAINERD_RUNTIME_CLASS - value: nvidia - - name: CONTAINERD_SET_AS_DEFAULT - value: "true" + - name: RUNTIME_CONFIG_SOURCE + value: "file=/var/snap/microk8s/current/args/containerd.toml" These options can be passed to GPU Operator during install time as below. @@ -572,10 +570,8 @@ These options can be passed to GPU Operator during install time as below. --set toolkit.env[0].value=/var/snap/microk8s/current/args/containerd-template.toml \ --set toolkit.env[1].name=CONTAINERD_SOCKET \ --set toolkit.env[1].value=/var/snap/microk8s/common/run/containerd.sock \ - --set toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS \ - --set toolkit.env[2].value=nvidia \ - --set toolkit.env[3].name=CONTAINERD_SET_AS_DEFAULT \ - --set-string toolkit.env[3].value=true + --set toolkit.env[2].name=RUNTIME_CONFIG_SOURCE \ + --set-string toolkit.env[2].value=file=/var/snap/microk8s/current/args/containerd.toml .. _running sample gpu applications: .. _verify gpu operator install: diff --git a/gpu-operator/gpu-operator-kubevirt.rst b/gpu-operator/gpu-operator-kubevirt.rst index 4e97bcc32..967152c81 100644 --- a/gpu-operator/gpu-operator-kubevirt.rst +++ b/gpu-operator/gpu-operator-kubevirt.rst @@ -72,7 +72,6 @@ Assumptions, constraints, and dependencies * Users must manually add all passthrough GPU and vGPU resources to the ``permittedDevices`` list in the KubeVirt CR before assigning them to KubeVirt virtual machines. Refer to the `KubeVirt documentation `_ for more information. -* MIG-backed vGPUs are not supported. Prerequisites ============= @@ -452,10 +451,16 @@ It is recommended to set this node label prior to installing the GPU Operator if Switching vGPU device configuration after one has been successfully applied assumes that no virtual machines with vGPU are currently running on the node. 
Any existing virtual machines should be shutdown/migrated before you apply the new configuration. -To apply a new configuration after GPU Operator install, update the ``nvidia.com/vgpu.config`` node label. +To apply a new configuration after GPU Operator install, update the ``nvidia.com/vgpu.config`` node label. + +.. note:: + + On GPUs that support MIG, you have the option to select MIG-backed vGPU instances instead of time-sliced vGPU instances. + To select a MIG-backed vGPU profile, label the node with the name of the MIG-backed vGPU profile. The following example shows how to apply a new configuration on a system with two **A10** GPUs. + .. code-block:: console $ nvidia-smi -L diff --git a/gpu-operator/index.rst b/gpu-operator/index.rst index 5eecd4d9c..820ef4f5e 100644 --- a/gpu-operator/index.rst +++ b/gpu-operator/index.rst @@ -47,7 +47,7 @@ Custom GPU Driver Parameters precompiled-drivers.rst GPU Driver CRD - Container Device Interface Support + Container Device Interface (CDI) Support .. toctree:: :caption: Sandboxed Workloads diff --git a/gpu-operator/install-gpu-operator-nvaie.rst b/gpu-operator/install-gpu-operator-nvaie.rst index 09ec08cfb..787aeb4f7 100644 --- a/gpu-operator/install-gpu-operator-nvaie.rst +++ b/gpu-operator/install-gpu-operator-nvaie.rst @@ -117,7 +117,7 @@ Procedure Updating NLS Client License Token ********************************* -In case the NLS client license token needs to be updated, please use the following procedure: +In case the NLS client license token needs to be updated, use the following procedure: Create an empty vGPU license configuration file: @@ -125,16 +125,21 @@ Create an empty vGPU license configuration file: $ sudo touch gridd.conf -Generate and download a new NLS client license token. Please refer to Section 4.6 of the `NLS User Guide `_ for instructions. +Generate and download a new NLS client license token. Refer to Section 4.6 of the `NLS User Guide `_ for instructions. 
Rename the NLS client license token that you downloaded to ``client_configuration_token.tok``. -Create a new ``licensing-config-new`` ConfigMap object in the ``gpu-operator`` namespace (make sure the name of the configmap is not already used in the kubernetes cluster). Both the vGPU license configuration file and the NLS client license token will be added to this ConfigMap: +.. warning:: + + The ``configMap(configMapName)`` is **deprecated** and will be removed in a future release. + Use ``secrets(secretName)`` instead. + +Create a new ``licensing-config-new`` Secret object in the ``gpu-operator`` namespace (make sure the name of the secret is not already used in the kubernetes cluster). Both the vGPU license configuration file and the NLS client license token will be added to this Secret: .. code-block:: console - $ kubectl create configmap licensing-config-new \ + $ kubectl create secret generic licensing-config-new \ -n gpu-operator --from-file=gridd.conf --from-file=/client_configuration_token.tok @@ -150,14 +155,14 @@ Go to the driver section and replace the following argument: .. code-block:: console licensingConfig: - configMapName: licensing-config + secretName: licensing-config with .. code-block:: console licensingConfig: - configMapName: licensing-config-new + secretName: licensing-config-new Write and exit from the kubectl edit session (you can use :qw for instance if vi utility is used) diff --git a/gpu-operator/install-gpu-operator-vgpu.rst b/gpu-operator/install-gpu-operator-vgpu.rst index e80cd573d..66d2cbe92 100644 --- a/gpu-operator/install-gpu-operator-vgpu.rst +++ b/gpu-operator/install-gpu-operator-vgpu.rst @@ -207,11 +207,11 @@ Configure the Cluster with the vGPU License Information and the Driver Container $ kubectl create namespace gpu-operator -#. Create a config map that is named ``licensing-config`` using the ``gridd.conf`` and ``client_configuration_token.tok`` files: +#. 
Create a secret that is named ``licensing-config`` using the ``gridd.conf`` and ``client_configuration_token.tok`` files: .. code-block:: console - $ kubectl create configmap licensing-config \ + $ kubectl create secret generic licensing-config \ -n gpu-operator --from-file=gridd.conf --from-file=client_configuration_token.tok #. Create an image pull secret in the ``gpu-operator`` namespace with the registry secret and private registry. @@ -249,7 +249,7 @@ Install the Operator --set driver.repository=${PRIVATE_REGISTRY} \ --set driver.version=${VGPU_DRIVER_VERSION} \ --set driver.imagePullSecrets={$REGISTRY_SECRET_NAME} \ - --set driver.licensingConfig.configMapName=licensing-config + --set driver.licensingConfig.secretName=licensing-config The preceding command installs the Operator with the default configuration. Refer to :ref:`gpu-operator-helm-chart-options` for information about configuration options. diff --git a/gpu-operator/life-cycle-policy.rst b/gpu-operator/life-cycle-policy.rst index e00c4965d..00ab3de0a 100644 --- a/gpu-operator/life-cycle-policy.rst +++ b/gpu-operator/life-cycle-policy.rst @@ -47,7 +47,7 @@ The product life cycle and versioning are subject to change in the future. .. note:: - - Upgrades are only supported within a major release or to the next major release. + Upgrades are only supported within a major release or to the next major release. .. list-table:: Support Status for Releases :header-rows: 1 @@ -55,13 +55,13 @@ The product life cycle and versioning are subject to change in the future. * - GPU Operator Version - Status - * - 25.3.x + * - 25.10.x - Generally Available - * - 24.9.x + * - 25.3.x - Maintenance - * - 24.6.x and lower + * - 24.9.x and lower - EOL @@ -81,62 +81,61 @@ The following table shows the operands and default operand versions that corresp When post-release testing confirms support for newer versions of operands, these updates are identified as *recommended updates* to a GPU Operator version. 
Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. -.. list-table:: - :header-rows: 1 +**D** = Default driver, **R** = Recommended driver - * - Component - - Version +.. flat-table:: + :header-rows: 2 - * - NVIDIA GPU Operator - - ${version} + * - :rspan:`1` Component + - GPU Operator Version + + * - v25.10.0 * - NVIDIA GPU Driver |ki|_ - - | `580.82.07 `_ (default, recommended) - | `580.65.06 `_ + - | `580.95.05 `_ (**D**, **R**) + | `580.82.07 `_ | `575.57.08 `_ - | `570.172.08 `_ - | `570.158.01 `_ - | `570.148.08 `_ - | `535.261.03 `_ + | `570.195.03 `_ | `550.163.01 `_ - | `535.247.01 `_ + | `535.274.02 `_ + * - NVIDIA Driver Manager for Kubernetes - - `v0.8.1 `__ + - `v0.9.0 `__ * - NVIDIA Container Toolkit - - `1.17.8 `__ + - `1.18.0 `__ * - NVIDIA Kubernetes Device Plugin - - `0.17.4 `__ + - `0.18.0 `__ * - DCGM Exporter - - `4.3.1-4.4.0 `__ + - `v4.4.1-4.6.0 `__ * - Node Feature Discovery - - `v0.17.3 `__ + - `v0.18.2 `__ * - | NVIDIA GPU Feature Discovery | for Kubernetes - - `0.17.4 `__ + - `0.18.0 `__ * - NVIDIA MIG Manager for Kubernetes - - `0.12.3 `__ + - `0.13.0 `__ * - DCGM - - `4.3.1 `__ + - `4.4.1 `__ * - Validator for NVIDIA GPU Operator - - ${version} + - v25.10.0 * - NVIDIA KubeVirt GPU Device Plugin - `v1.4.0 `__ * - NVIDIA vGPU Device Manager - - `v0.4.0 `__ + - `v0.4.1 `__ * - NVIDIA GDS Driver |gds|_ - - `2.20.5 `__ + - `2.26.6 `__ * - NVIDIA Kata Manager for Kubernetes - `v0.2.3 `__ @@ -154,7 +153,7 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. Known Issue: For drivers 570.124.06, 570.133.20, 570.148.08, and 570.158.01, GPU workloads cannot be scheduled on nodes that have a mix of MIG slices and full GPUs. This manifests as GPU pods getting stuck indefinitely in the ``Pending`` state. - NVIDIA recommends that you downgrade the driver to version 570.86.15 to work around this issue. + NVIDIA recommends that you upgrade the driver to version 580.65.06 or later to resolve this issue. 
For more detailed information, see GitHub issue https://github.com/NVIDIA/gpu-operator/issues/1361. diff --git a/gpu-operator/manifests/input/cuda-vectoradd-cdi.yaml b/gpu-operator/manifests/input/cuda-vectoradd-cdi.yaml deleted file mode 100644 index 967d328e9..000000000 --- a/gpu-operator/manifests/input/cuda-vectoradd-cdi.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: cuda-vectoradd -spec: - restartPolicy: OnFailure - runtimeClassName: nvidia-cdi - containers: - - name: cuda-vectoradd - image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04" - resources: - limits: - nvidia.com/gpu: 1 - diff --git a/gpu-operator/manifests/output/cdi-verify-get-runtime-classes.txt b/gpu-operator/manifests/output/cdi-verify-get-runtime-classes.txt deleted file mode 100644 index 5443561ae..000000000 --- a/gpu-operator/manifests/output/cdi-verify-get-runtime-classes.txt +++ /dev/null @@ -1,4 +0,0 @@ -NAME HANDLER AGE -nvidia nvidia 2d -nvidia-cdi nvidia-cdi 5m7s -nvidia-legacy nvidia-legacy 5m7s diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst index d07c96305..1a128ad7a 100644 --- a/gpu-operator/platform-support.rst +++ b/gpu-operator/platform-support.rst @@ -49,9 +49,11 @@ The following NVIDIA data center GPUs are supported on x86 based platforms: * - Product - Architecture + - Notes * - NVIDIA GH200 |open-kern-module|_ - NVIDIA Grace Hopper + - .. _requires-open-kernel-module: @@ -64,60 +66,60 @@ The following NVIDIA data center GPUs are supported on x86 based platforms: .. 
tab-item:: A, H and L-series Products :selected: - +-------------------------+---------------------------+ - | Product | Architecture | - +=========================+===========================+ - | NVIDIA H800 | NVIDIA Hopper | - +-------------------------+---------------------------+ - | | NVIDIA H200, | NVIDIA Hopper | - | | NVIDIA H200 NVL | | - +-------------------------+---------------------------+ - | NVIDIA DGX H100 | NVIDIA Hopper and | - | | NVSwitch | - +-------------------------+---------------------------+ - | NVIDIA DGX H200 | NVIDIA Hopper and | - | | NVSwitch | - +-------------------------+---------------------------+ - | NVIDIA HGX H100 | NVIDIA Hopper and | - | | NVSwitch | - +-------------------------+---------------------------+ - | NVIDIA HGX H200 | NVIDIA Hopper and | - | | NVSwitch | - +-------------------------+---------------------------+ - | | NVIDIA H100, | NVIDIA Hopper | - | | NVIDIA H100 NVL | | - +-------------------------+---------------------------+ - | NVIDIA H20 | NVIDIA Hopper | - +-------------------------+---------------------------+ - | NVIDIA L20 | NVIDIA Ada | - +-------------------------+---------------------------+ - | | NVIDIA L40, | NVIDIA Ada | - | | NVIDIA L40S | | - +-------------------------+---------------------------+ - | NVIDIA L4 | NVIDIA Ada | - +-------------------------+---------------------------+ - | NVIDIA DGX A100 | A100 and NVSwitch | - +-------------------------+---------------------------+ - | NVIDIA HGX A100 | A100 and NVSwitch | - +-------------------------+---------------------------+ - | NVIDIA A800 | NVIDIA Ampere | - +-------------------------+---------------------------+ - | NVIDIA A100 | NVIDIA Ampere | - +-------------------------+---------------------------+ - | NVIDIA A100X | NVIDIA Ampere | - +-------------------------+---------------------------+ - | NVIDIA A40 | NVIDIA Ampere | - +-------------------------+---------------------------+ - | NVIDIA A30 | NVIDIA Ampere | - 
+-------------------------+---------------------------+ - | NVIDIA A30X | NVIDIA Ampere | - +-------------------------+---------------------------+ - | NVIDIA A16 | NVIDIA Ampere | - +-------------------------+---------------------------+ - | NVIDIA A10 | NVIDIA Ampere | - +-------------------------+---------------------------+ - | NVIDIA A2 | NVIDIA Ampere | - +-------------------------+---------------------------+ + +-------------------------+---------------------------+-------+ + | Product | Architecture | Notes | + +=========================+===========================+=======+ + | NVIDIA H800 | NVIDIA Hopper | | + +-------------------------+---------------------------+-------+ + | | NVIDIA H200, | NVIDIA Hopper | | + | | NVIDIA H200 NVL | | | + +-------------------------+---------------------------+-------+ + | NVIDIA DGX H100 | NVIDIA Hopper and | | + | | NVSwitch | | + +-------------------------+---------------------------+-------+ + | NVIDIA DGX H200 | NVIDIA Hopper and | | + | | NVSwitch | | + +-------------------------+---------------------------+-------+ + | NVIDIA HGX H100 | NVIDIA Hopper and | | + | | NVSwitch | | + +-------------------------+---------------------------+-------+ + | NVIDIA HGX H200 | NVIDIA Hopper and | | + | | NVSwitch | | + +-------------------------+---------------------------+-------+ + | | NVIDIA H100, | NVIDIA Hopper | | + | | NVIDIA H100 NVL | | | + +-------------------------+---------------------------+-------+ + | NVIDIA H20 | NVIDIA Hopper | | + +-------------------------+---------------------------+-------+ + | NVIDIA L20 | NVIDIA Ada | | + +-------------------------+---------------------------+-------+ + | | NVIDIA L40, | NVIDIA Ada | | + | | NVIDIA L40S | | | + +-------------------------+---------------------------+-------+ + | NVIDIA L4 | NVIDIA Ada | | + +-------------------------+---------------------------+-------+ + | NVIDIA DGX A100 | A100 and NVSwitch | | + 
+-------------------------+---------------------------+-------+ + | NVIDIA HGX A100 | A100 and NVSwitch | | + +-------------------------+---------------------------+-------+ + | NVIDIA A800 | NVIDIA Ampere | | + +-------------------------+---------------------------+-------+ + | NVIDIA A100 | NVIDIA Ampere | | + +-------------------------+---------------------------+-------+ + | NVIDIA A100X | NVIDIA Ampere | | + +-------------------------+---------------------------+-------+ + | NVIDIA A40 | NVIDIA Ampere | | + +-------------------------+---------------------------+-------+ + | NVIDIA A30 | NVIDIA Ampere | | + +-------------------------+---------------------------+-------+ + | NVIDIA A30X | NVIDIA Ampere | | + +-------------------------+---------------------------+-------+ + | NVIDIA A16 | NVIDIA Ampere | | + +-------------------------+---------------------------+-------+ + | NVIDIA A10 | NVIDIA Ampere | | + +-------------------------+---------------------------+-------+ + | NVIDIA A2 | NVIDIA Ampere | | + +-------------------------+---------------------------+-------+ .. note:: @@ -126,74 +128,78 @@ The following NVIDIA data center GPUs are supported on x86 based platforms: .. 
tab-item:: D,T and V-series Products - +-----------------------+------------------------+ - | Product | Architecture | - +=======================+========================+ - | NVIDIA T4 | Turing | - +-----------------------+------------------------+ - | NVIDIA V100 | Volta | - +-----------------------+------------------------+ - | NVIDIA P100 | Pascal | - +-----------------------+------------------------+ - | NVIDIA P40 | Pascal | - +-----------------------+------------------------+ - | NVIDIA P4 | Pascal | - +-----------------------+------------------------+ + +-----------------------+------------------------+-------+ + | Product | Architecture | Notes | + +=======================+========================+=======+ + | NVIDIA T4 | Turing | | + +-----------------------+------------------------+-------+ + | NVIDIA V100 | Volta | | + +-----------------------+------------------------+-------+ + | NVIDIA P100 | Pascal | | + +-----------------------+------------------------+-------+ + | NVIDIA P40 | Pascal | | + +-----------------------+------------------------+-------+ + | NVIDIA P4 | Pascal | | + +-----------------------+------------------------+-------+ .. 
tab-item:: RTX / T-series Products - +-------------------------+------------------------+ - | Product | Architecture | - +=========================+========================+ - | NVIDIA RTX PRO 6000 | NVIDIA Blackwell | - | Blackwell Server Edition| | - +-------------------------+------------------------+ - | NVIDIA RTX PRO 6000D | NVIDIA Blackwell | - +-------------------------+------------------------+ - | NVIDIA RTX A6000 | NVIDIA Ampere /Ada | - +-------------------------+------------------------+ - | NVIDIA RTX A5000 | NVIDIA Ampere | - +-------------------------+------------------------+ - | NVIDIA RTX A4500 | NVIDIA Ampere | - +-------------------------+------------------------+ - | NVIDIA RTX A4000 | NVIDIA Ampere | - +-------------------------+------------------------+ - | NVIDIA Quadro RTX 8000 | Turing | - +-------------------------+------------------------+ - | NVIDIA Quadro RTX 6000 | Turing | - +-------------------------+------------------------+ - | NVIDIA Quadro RTX 5000 | Turing | - +-------------------------+------------------------+ - | NVIDIA Quadro RTX 4000 | Turing | - +-------------------------+------------------------+ - | NVIDIA T1000 | Turing | - +-------------------------+------------------------+ - | NVIDIA T600 | Turing | - +-------------------------+------------------------+ - | NVIDIA T400 | Turing | - +-------------------------+------------------------+ + +-------------------------+------------------------+-------+ + | Product | Architecture | Notes | + +=========================+========================+=======+ + | NVIDIA RTX PRO 6000 | NVIDIA Blackwell | | + | Blackwell Server Edition| | | + +-------------------------+------------------------+-------+ + | NVIDIA RTX PRO 6000D | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ + | NVIDIA RTX A6000 | NVIDIA Ampere /Ada | | + +-------------------------+------------------------+-------+ + | NVIDIA RTX A5000 | NVIDIA Ampere | | + 
+-------------------------+------------------------+-------+ + | NVIDIA RTX A4500 | NVIDIA Ampere | | + +-------------------------+------------------------+-------+ + | NVIDIA RTX A4000 | NVIDIA Ampere | | + +-------------------------+------------------------+-------+ + | NVIDIA Quadro RTX 8000 | Turing | | + +-------------------------+------------------------+-------+ + | NVIDIA Quadro RTX 6000 | Turing | | + +-------------------------+------------------------+-------+ + | NVIDIA Quadro RTX 5000 | Turing | | + +-------------------------+------------------------+-------+ + | NVIDIA Quadro RTX 4000 | Turing | | + +-------------------------+------------------------+-------+ + | NVIDIA T1000 | Turing | | + +-------------------------+------------------------+-------+ + | NVIDIA T600 | Turing | | + +-------------------------+------------------------+-------+ + | NVIDIA T400 | Turing | | + +-------------------------+------------------------+-------+ .. note:: NVIDIA RTX PRO 6000 Blackwell Server Edition notes: - * Driver versions 575.57.08 or later is required. + * Driver version 575.57.08 or later is required. * MIG is not supported on the 575.57.08 driver release. * In cases where CUDA init fails, you may need to disable Heterogeneous Memory Management (HMM) in UVM by :ref:`Customizing NVIDIA GPU Driver Parameters during Installation`. ..
tab-item:: B-series Products - +-------------------------+------------------------+ - | Product | Architecture | - +=========================+========================+ - | NVIDIA DGX B200 | NVIDIA Blackwell | - +-------------------------+------------------------+ - | NVIDIA DGX Spark | NVIDIA Blackwell | - +-------------------------+------------------------+ - | NVIDIA HGX B200 | NVIDIA Blackwell | - +-------------------------+------------------------+ - | NVIDIA HGX GB200 NVL72 | NVIDIA Blackwell | - +-------------------------+------------------------+ + +-------------------------+------------------------+-------+ + | Product | Architecture | Notes | + +=========================+========================+=======+ + | NVIDIA DGX B200 | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ + | NVIDIA DGX Spark | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ + | NVIDIA HGX B200 | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ + | NVIDIA HGX B300 | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ + | NVIDIA HGX GB200 NVL72 | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ + | NVIDIA HGX GB300 NVL72 | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ .. 
note:: @@ -207,17 +213,23 @@ Supported ARM Based Platforms The following NVIDIA data center GPUs are supported: -+-------------------------+---------------------------+ -| Product | Architecture | -+=========================+===========================+ -| NVIDIA A100X | Ampere | -+-------------------------+---------------------------+ -| NVIDIA A30X | Ampere | -+-------------------------+---------------------------+ -| NVIDIA IGX Orin | Ampere | -+-------------------------+---------------------------+ -| AWS EC2 G5g instances | Turing | -+-------------------------+---------------------------+ ++-------------------------+---------------------------+-------+ +| Product | Architecture | Notes | ++=========================+===========================+=======+ +| NVIDIA A100X | Ampere | | ++-------------------------+---------------------------+-------+ +| NVIDIA A30X | Ampere | | ++-------------------------+---------------------------+-------+ +| NVIDIA IGX Orin | Ampere | | ++-------------------------+---------------------------+-------+ +| AWS EC2 G5g instances | Turing | | ++-------------------------+---------------------------+-------+ +| NVIDIA DGX Spark | Blackwell | | ++-------------------------+---------------------------+-------+ +| NVIDIA HGX GB200 NVL72 | Blackwell | | ++-------------------------+---------------------------+-------+ +| NVIDIA HGX GB300 NVL72 | Blackwell | | ++-------------------------+---------------------------+-------+ In addition to the products specified in the preceding table, any ARM based system that meets the following requirements is supported: @@ -285,44 +297,42 @@ The GPU Operator has been validated in the following scenarios: | with Tanzu - | Rancher Kubernetes | Engine 2 - - | HPE Ezmeral - | Runtime - | Enterprise + - | Mirantis k0s - | Canonical | MicroK8s - | Nutanix | NKP * - Ubuntu 20.04 LTS |fn2|_ - - 1.29---1.33 + - 1.30---1.34 - - 7.0 U3c, 8.0 U2, 8.0 U3 - - 1.29---1.33 + - 1.30---1.34 - - - 2.12, 2.13, 2.14 * - Ubuntu 
22.04 LTS |fn2|_ - - 1.29---1.33 + - 1.30---1.34 - - 8.0 U2, 8.0 U3 - - 1.29---1.33 - - - - 1.26 + - 1.30---1.34 + - 1.30---1.34 + - 1.33---1.34 - 2.12, 2.13, 2.14, 2.15 * - Ubuntu 24.04 LTS - - 1.29---1.33 - - - - - - 1.29---1.33 + - 1.30---1.34 - - + - 1.30---1.34 + - 1.30---1.34 + - 1.33---1.34 - * - Red Hat Core OS - - - | 4.12---4.19 + - | 4.14---4.20 - - - @@ -331,11 +341,11 @@ The GPU Operator has been validated in the following scenarios: * - | Red Hat | Enterprise - | Linux 9.2, 9.4, 9.5, 9.6 |fn3|_ - - 1.29---1.33 + | Linux 9.2, 9.4, 9.6 |fn3|_ + - 1.30---1.34 - - - - 1.29---1.33 + - 1.30---1.34 - - - @@ -344,14 +354,14 @@ The GPU Operator has been validated in the following scenarios: | Enterprise | Linux 8.8, | 8.10 - - 1.29---1.33 + - 1.30---1.34 - - - - 1.29---1.33 + - 1.30---1.34 - - - 2.12, 2.13, 2.14, 2.15 - + .. _kubernetes-version: :sup:`1` @@ -372,7 +382,7 @@ The GPU Operator has been validated in the following scenarios: .. _rhel-9: :sup:`3` - Non-precompiled driver containers for Red Hat Enterprise Linux 9.2, 9.4, 9.5, and 9.6 versions are available for x86 based platforms only. + Non-precompiled driver containers for Red Hat Enterprise Linux 9.2, 9.4, and 9.6 versions are available for x86 based platforms only. They are not available for ARM based systems. .. note:: @@ -395,19 +405,19 @@ The GPU Operator has been validated in the following scenarios: | Kubernetes Service * - Ubuntu 20.04 LTS - - 1.29---1.33 - - 1.29---1.33 - - 1.29---1.33 + - 1.30---1.34 + - 1.30---1.34 + - 1.30---1.34 * - Ubuntu 22.04 LTS - - 1.29---1.33 - - 1.29---1.33 - - 1.29---1.33 + - 1.30---1.34 + - 1.30---1.34 + - 1.30---1.34 * - Ubuntu 24.04 LTS - - 1.29---1.33 - - 1.29---1.33 - - 1.29---1.33 + - 1.30---1.34 + - 1.30---1.34 + - 1.30---1.34 .. 
tab-item:: Virtual Machines with NVIDIA vGPU @@ -428,29 +438,29 @@ The GPU Operator has been validated in the following scenarios: | NKP * - Ubuntu 20.04 LTS - - 1.29--1.33 + - 1.30--1.34 - - 7.0 U3c, 8.0 U2, 8.0 U3 - - 1.29--1.33 + - 1.30--1.34 - 2.12, 2.13 * - Ubuntu 22.04 LTS - - 1.29--1.33 + - 1.30--1.34 - - 8.0 U2, 8.0 U3 - - 1.29--1.33 + - 1.30--1.34 - 2.12, 2.13 * - Ubuntu 24.04 LTS - - 1.29--1.33 + - 1.30--1.34 + - + - + - 1.30--1.34 - - - - - 1.29--1.33 - - * - Red Hat Core OS - - - 4.12---4.19 + - 4.14--4.20 - - - @@ -458,11 +468,11 @@ The GPU Operator has been validated in the following scenarios: * - | Red Hat | Enterprise | Linux 8.4, - | 8.6---8.10 - - 1.29---1.33 + | 8.6--8.10 + - 1.30--1.34 - - - - 1.29---1.33 + - 1.30--1.34 - .. _supported-precompiled-drivers: @@ -494,7 +504,7 @@ Supported Container Runtimes The GPU Operator has been validated for the following container runtimes: +----------------------------+------------------------+----------------+ -| Operating System | Containerd 1.6 - 2.1 | CRI-O | +| Operating System | Containerd 1.7 - 2.1 | CRI-O | +============================+========================+================+ | Ubuntu 20.04 LTS | Yes | Yes | +----------------------------+------------------------+----------------+ @@ -522,9 +532,9 @@ Operating System Kubernetes KubeVirt OpenShift Virtual \ \ | GPU vGPU | GPU vGPU | Passthrough | Passthrough ================ =========== ============= ========= ============= =========== -Ubuntu 20.04 LTS 1.23---1.33 0.36+ 0.59.1+ -Ubuntu 22.04 LTS 1.23---1.33 0.36+ 0.59.1+ -Red Hat Core OS 4.12---4.19 4.13---4.19 +Ubuntu 20.04 LTS 1.30---1.34 0.36+ 0.59.1+ +Ubuntu 22.04 LTS 1.30---1.34 0.36+ 0.59.1+ +Red Hat Core OS 4.14---4.20 4.14---4.20 ================ =========== ============= ========= ============= =========== You can run GPU passthrough and NVIDIA vGPU in the same cluster as long as you use @@ -542,7 +552,7 @@ KubeVirt and OpenShift Virtualization with NVIDIA vGPU is supported on the follo - 
H200NVL -- H100 +- H100 - GA10x: A100, A40, RTX A6000, RTX A5500, RTX A5000, A30, A16, A10, A2. @@ -555,19 +565,19 @@ KubeVirt and OpenShift Virtualization with NVIDIA vGPU is supported on the follo Note that HGX platforms are not supported. .. note:: - + KubeVirt with NVIDIA vGPU is supported on ``nodes`` with Linux kernel < 6.0, such as Ubuntu 22.04 ``LTS``. -Support for GPUDirect RDMA + Support for GPUDirect RDMA -------------------------- Supported operating systems and NVIDIA GPU Drivers with GPUDirect RDMA. -- RHEL 8 with Network Operator 25.1.0. -- Ubuntu 24.04 LTS with Network Operator 25.1.0. -- Ubuntu 20.04 and 22.04 LTS with Network Operator 24.10.0. -- Red Hat Enterprise Linux 9.2, 9.4, 9.5, and 9.6 with Network Operator 25.1.0. -- Red Hat OpenShift 4.12 and higher with Network Operator 23.10.0. +- RHEL 8 with Network Operator 25.7.0. +- Ubuntu 24.04 LTS with Network Operator 25.7.0. +- Ubuntu 20.04 and 22.04 LTS with Network Operator 25.7.0. +- Red Hat Enterprise Linux 9.2, 9.4, and 9.6 with Network Operator 25.7.0. +- Red Hat OpenShift 4.14 and higher with Network Operator 25.7.0. For information about configuring GPUDirect RDMA, refer to :doc:`gpu-operator-rdma`. @@ -577,15 +587,15 @@ Support for GPUDirect Storage Supported operating systems and NVIDIA GPU Drivers with GPUDirect Storage. -- Ubuntu 24.04 LTS Network Operator 25.1.0 -- Ubuntu 20.04 and 22.04 LTS with Network Operator 24.10.0 -- Red Hat OpenShift Container Platform 4.12 and higher +- Ubuntu 24.04 LTS with Network Operator 25.7.0. +- Ubuntu 20.04 and 22.04 LTS with Network Operator 25.7.0. +- Red Hat OpenShift Container Platform 4.14 and higher. .. note:: Version v2.17.5 and higher of the NVIDIA GPUDirect Storage kernel driver, ``nvidia-fs``, requires the NVIDIA Open GPU Kernel module driver. - You can install the open kernel modules by specifying the ``driver.kernelModuleType=auto`` if you are using driver container version 570.86.15, 570.124.06 or later.
+ You can install the open kernel modules by specifying the ``driver.kernelModuleType=auto`` if you are using driver container version 570.86.15, 570.124.06 or later. Or use ``driver.kernelModuleType=open`` if you are using a different driver version or branch. Pass the ``driver.kernelModuleType`` setting as an argument to the ``helm`` command. Refer to :ref:`Common Chart Customization Options` for more information. diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index bf4f33deb..dcde1895d 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -33,6 +33,158 @@ Refer to the :ref:`GPU Operator Component Matrix` for a list of software compone ---- +.. _v25.10.0: + +25.10.0 +======= + +New Features +------------ + +* Updated software component versions: + + - NVIDIA Driver Manager for Kubernetes v0.9.0 + - NVIDIA Container Toolkit v1.18.0 + - NVIDIA DCGM v4.4.1 + - NVIDIA DCGM Exporter v4.4.1-4.6.0 + - Node Feature Discovery v0.18.2 + - NVIDIA GDS Driver v2.26.6 + - NVIDIA Kubernetes Device Plugin v0.18.0 + - NVIDIA MIG Manager for Kubernetes v0.13.0 + - NVIDIA vGPU Device Manager v0.4.1 + +* Added support for these NVIDIA Data Center GPU Driver versions: + + - 580.95.05 (default, recommended) + - 570.195.03 + - 535.274.02 + +* Container Device Interface (CDI) is now enabled by default when installing or upgrading (via helm) the GPU Operator to 25.10.0. + The ``cdi.enabled`` field in the ClusterPolicy is now set to ``true`` by default. + The ``cdi.default`` field is now deprecated and will be ignored. + + - When ``cdi.enabled`` is ``true`` the GPU Operator now leverages CDI support in container + runtimes, such as containerd and cri-o, for injecting GPU support into workload containers. + This differs from prior releases where CDI support in container runtimes was not used, and + instead, an ``nvidia`` runtime class configured in CDI mode was used.
+ - For OpenShift users upgrading to v25.10.0, we recommend updating the ``cdi.enabled`` + field in ClusterPolicy to ``true`` post-upgrade. This field will not automatically be + updated to ``true`` since the Operator Lifecycle Manager (OLM) does not mutate custom + resources on operator upgrades. + +* When using NVIDIA vGPU with KubeVirt / OpenShift Virtualization, on GPUs that support MIG, you now have the option to select MIG-backed vGPU instances instead of time-sliced vGPU instances. + To select a MIG-backed vGPU profile, label the node with the name of the MIG-backed vGPU profile. + +* Added support for NVIDIA Network Operator 25.7.0 integration. + Refer to :ref:`Support for GPUDirect RDMA` and :ref:`Support for GPUDirect Storage`. + +* Added support for Mirantis k0s. + +* Added support for Red Hat OpenShift GPU dashboard integration. + +* Added support for Red Hat OpenShift Container Platform 4.20. + +* Added support for Red Hat OpenShift with HGX GB200 NVL72. + +* Added support for Kubernetes v1.34. + +* Added support for NVIDIA HGX B300 and NVIDIA HGX GB300 NVL72. + +* Added support for new MIG profiles with NVIDIA HGX B300. + + * Supports these profiles: + + * ``1g.34gb`` + * ``1g.34gb+me`` + * ``1g.67gb`` + * ``2g.67gb`` + * ``3g.135gb`` + * ``4g.135gb`` + * ``7g.269gb`` + + * Added an ``all-balanced`` profile that creates the following GPU instances: + + * ``1g.34gb`` :math:`\times` 2 + * ``2g.67gb`` :math:`\times` 1 + * ``3g.135gb`` :math:`\times` 1 + + +* Added support for new MIG profiles with NVIDIA HGX GB300 NVL72. 
+ + * Supports these profiles: + + * ``1g.35gb`` + * ``1g.35gb+me`` + * ``1g.70gb`` + * ``2g.70gb`` + * ``3g.139gb`` + * ``4g.139gb`` + * ``7g.278gb`` + + * Added an ``all-balanced`` profile that creates the following GPU instances: + + * ``1g.35gb`` :math:`\times` 2 + * ``2g.70gb`` :math:`\times` 1 + * ``3g.139gb`` :math:`\times` 1 + +Improvements +------------ + +* The GPU Operator now configures containerd and cri-o to use drop-in files for container runtime config overrides by default. + As a consequence of this change, some of the install procedures for Kubernetes distributions + that use custom containerd installations have changed. + + - The install procedure for microk8s has changed. Refer to the latest :ref:`MicroK8s` install procedure. + +* Hardened the GPU Operator container image by using a distroless image as a base image. + +* Validator for NVIDIA GPU Operator is now included as part of the GPU Operator container image. + It is no longer a separate image. + +* The GPU Operator now supports passing the vGPU licensing token as a secret. + It is recommended that you migrate to using secrets instead of a configMap for improved security. + +* Enhanced the driver pod to allow resource requests and limits to be configurable for all containers in the driver pod. + +* Added support for specifying hostPID via the GPU Operator Helm charts. + +Fixed Issues +------------ + +* Fixed an issue where the vGPU Manager pod was terminated before it finished disabling VFs on all GPUs. + The terminationGracePeriodSeconds is now set to 120 seconds to ensure the vGPU Manager has enough time to finish its cleanup logic when the pod is terminated. + +* Added GDRCopy validation to the validator daemonset. When GDRCopy is enabled, this ensures that the GDRCopy driver is loaded before the k8s-device-plugin starts up. + +* Added required permissions when GPU Feature Discovery is configured to use the Node Feature API instead of feature files.
+ + +Known Issues +------------ + +* When using cri-o as the container runtime, several of the GPU Operator pods may be stuck in the ``RunContainerError`` state during installation of GPU Operator, upgrade of GPU Operator, or upgrade of the GPU driver daemonset. + The pods may be in this state for several minutes and restart several times. + The pods will recover from this state as soon as the container toolkit pod starts running. + +* NVIDIA Container Toolkit 1.18.0 will overwrite the `imports` field in the top-level containerd configuration file, so any previously imported paths will be lost. + + +* When using MIG-backed vGPU on the RTX Pro 6000 Blackwell Server Edition, the vgpu-device-manager will fail to configure nodes with the default vgpu-device-manager configuration. + To work around this, create a custom ConfigMap that adds the GFX suffix to the vGPU profile name. + All of the MIG-backed vGPU profiles are only supported on MIG instances created with the ``+gfx`` attribute. + Refer to the following example: + + .. code-block:: yaml + + version: v1 + vgpu-configs: + DC-1-2Q: + - devices: all + vgpu-devices: + DC-1-2QGFX: 48 + + Create the ConfigMap, then update the ClusterPolicy with the name of the configMap in the ``vgpuDeviceManager.config.name``, and restart the vgpu-device-manager pod. + .. 
_v25.3.4: 25.3.4 diff --git a/gpu-operator/versions.json b/gpu-operator/versions.json index eb9ca1765..c893de42f 100644 --- a/gpu-operator/versions.json +++ b/gpu-operator/versions.json @@ -1,24 +1,15 @@ { - "latest": "25.3.4", + "latest": "25.10", "versions": [ { - "version": "25.3.4" + "version": "25.10" }, { - "version": "25.3.3" + "version": "25.3" }, { - "version": "25.3.2" - }, - { - "version": "25.3.1" - }, - { - "version": "25.3.0" - }, - { - "version": "24.9.2" + "version": "24.9" } ] } diff --git a/gpu-operator/versions1.json b/gpu-operator/versions1.json index b1673edb6..3557db032 100644 --- a/gpu-operator/versions1.json +++ b/gpu-operator/versions1.json @@ -1,27 +1,15 @@ [ { "preferred": "true", - "url": "../25.3.4", - "version": "25.3.4" + "url": "../25.10", + "version": "25.10" }, { - "url": "../25.3.3", - "version": "25.3.3" + "url": "../25.3", + "version": "25.3" }, { - "url": "../25.3.2", - "version": "25.3.2" - }, - { - "url": "../25.3.1", - "version": "25.3.1" - }, - { - "url": "../25.3.0", - "version": "25.3.0" - }, - { - "url": "../24.9.2", - "version": "24.9.2" + "url": "../24.9", + "version": "24.9" } ] diff --git a/openshift/gpu-operator-with-precompiled-drivers.rst b/openshift/gpu-operator-with-precompiled-drivers.rst index de44cacf7..5084925a6 100644 --- a/openshift/gpu-operator-with-precompiled-drivers.rst +++ b/openshift/gpu-operator-with-precompiled-drivers.rst @@ -121,16 +121,6 @@ Perform the following steps to build a custom driver image for use with Red Hat export DRIVER_VERSION=525.105.17 export OS_TAG=rhcos4.12 - .. note:: The driver container image tag for OpenShift has changed after the OCP 4.19 release. - - - Before OCP 4.19: The driver image tag is formed with the suffix ``-rhcos4.17`` (such as with OCP 4.17). - - Starting OCP 4.19 and later: The driver image tag is formed with the suffix ``-rhel9.6`` (such as with OCP 4.19). 
- - Refer to `OpenShift Container Platform 4.19 Release Notes section 1.4.5 `_, - `RHEL Versions Utilized by RHEL CoreOS and OCP `_, - and `Split RHCOS into layers: /etc/os-release `_ - for more information. - #. Build and push the image: .. code-block:: console diff --git a/openshift/openshift-virtualization.rst b/openshift/openshift-virtualization.rst index 747626d8f..1cf782234 100644 --- a/openshift/openshift-virtualization.rst +++ b/openshift/openshift-virtualization.rst @@ -725,3 +725,8 @@ You should now see 12 **A10-4Q** devices on the node, as 6 **A10-4Q** devices ca { "nvidia.com/NVIDIA_A10-4Q": "12" } + +.. note:: + + On GPUs that support MIG, you have the option to select MIG-backed vGPU instances instead of time-sliced vGPU instances. + To select a MIG-backed vGPU profile, label the node with the name of the MIG-backed vGPU profile. diff --git a/openshift/steps-overview.rst b/openshift/steps-overview.rst index 904711b6d..e667d9d2e 100644 --- a/openshift/steps-overview.rst +++ b/openshift/steps-overview.rst @@ -16,6 +16,16 @@ Installation and Upgrade Overview on OpenShift Update your OCP cluster to version `4.18.24` or later, which includes a fix for the issue. Refer to `NVIDIA GPU Operator Validator Pod Error `_ in the Red Hat Knowledgebase for more information. +.. note:: The driver container image tag for OpenShift has changed after the OCP 4.19 release. + + - Before OCP 4.19: The driver image tag is formed with the suffix ``-rhcos4.17`` (such as with OCP 4.17). + - Starting OCP 4.19 and later: The driver image tag is formed with the suffix ``-rhel9.6`` (such as with OCP 4.19). + + Refer to `OpenShift Container Platform 4.19 Release Notes section 1.4.5 `_, + `RHEL Versions Utilized by RHEL CoreOS and OCP `_, + and `Split RHCOS into layers: /etc/os-release `_ + for more information. 
+ **************** High-Level Steps **************** diff --git a/openshift/versions.json b/openshift/versions.json index eb9ca1765..c893de42f 100644 --- a/openshift/versions.json +++ b/openshift/versions.json @@ -1,24 +1,15 @@ { - "latest": "25.3.4", + "latest": "25.10", "versions": [ { - "version": "25.3.4" + "version": "25.10" }, { - "version": "25.3.3" + "version": "25.3" }, { - "version": "25.3.2" - }, - { - "version": "25.3.1" - }, - { - "version": "25.3.0" - }, - { - "version": "24.9.2" + "version": "24.9" } ] } diff --git a/openshift/versions1.json b/openshift/versions1.json index b1673edb6..3557db032 100644 --- a/openshift/versions1.json +++ b/openshift/versions1.json @@ -1,27 +1,15 @@ [ { "preferred": "true", - "url": "../25.3.4", - "version": "25.3.4" + "url": "../25.10", + "version": "25.10" }, { - "url": "../25.3.3", - "version": "25.3.3" + "url": "../25.3", + "version": "25.3" }, { - "url": "../25.3.2", - "version": "25.3.2" - }, - { - "url": "../25.3.1", - "version": "25.3.1" - }, - { - "url": "../25.3.0", - "version": "25.3.0" - }, - { - "url": "../24.9.2", - "version": "24.9.2" + "url": "../24.9", + "version": "24.9" } ] diff --git a/repo.toml b/repo.toml index c5d87f62b..6127aca38 100644 --- a/repo.toml +++ b/repo.toml @@ -28,6 +28,7 @@ sphinx_conf_py_extra = """ ] templates_path = ['${root}/templates'] extensions.extend([ + "linuxdoc.rstFlatTable", "sphinx.ext.autosectionlabel", "sphinx_copybutton", ]) @@ -166,8 +167,8 @@ output_format = "linkcheck" docs_root = "${root}/gpu-operator" project = "gpu-operator" name = "NVIDIA GPU Operator" -version = "25.3.4" -source_substitutions = { version = "v25.3.4", recommended = "580.82.07" } +version = "25.10" # Update repo_docs.projects.openshift.version to match latest patch version maj.min.patch +source_substitutions = { minor_version = "25.10", version = "v25.10.0", recommended = "580.95.05" } copyright_start = 2020 sphinx_exclude_patterns = [ "life-cycle-policy.rst", @@ -197,6 +198,17 @@ 
build_by_default = false output_format = "linkcheck" +[repo_docs.projects.openshift] +docs_root = "${root}/openshift" +project = "gpu-operator-openshift" +name = "NVIDIA GPU Operator on Red Hat OpenShift Container Platform" +version = "25.10.0" # Check that this value matches maj.min version of repo_docs.projects.gpu-operator.version +copyright_start = 2020 +sphinx_exclude_patterns = [ + "get-entitlement.rst", +] + + [repo_docs.projects.gpu-telemetry] docs_root = "${root}/gpu-telemetry" project = "gpu-telemetry" @@ -221,16 +233,6 @@ build_by_default = false output_format = "linkcheck" -[repo_docs.projects.openshift] -docs_root = "${root}/openshift" -project = "gpu-operator-openshift" -name = "NVIDIA GPU Operator on Red Hat OpenShift Container Platform" -version = "25.3.4" -copyright_start = 2020 -sphinx_exclude_patterns = [ - "get-entitlement.rst", -] - [repo_docs.projects.openshift.builds.linkcheck] build_by_default = false output_format = "linkcheck"