From 4f49f650d172b865bbb753d37d760d19a36c3ddf Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 19 Dec 2024 18:34:05 -0800
Subject: [PATCH 001/130] [template] generate gpu/install_gpu_driver.sh from
 templates

---
 templates/common/template_disclaimer   |    5 +
 templates/common/util_functions        |  365 ++++++++
 templates/generate-action.pl           |   25 +
 templates/gpu/install_gpu_driver.sh.in |  280 ++++++
 templates/gpu/util_functions           | 1192 ++++++++++++++++++++++++
 templates/legal/license_header         |   11 +
 templates/secure-boot/util_functions   |  105 +++
 7 files changed, 1983 insertions(+)
 create mode 100644 templates/common/template_disclaimer
 create mode 100644 templates/common/util_functions
 create mode 100644 templates/generate-action.pl
 create mode 100644 templates/gpu/install_gpu_driver.sh.in
 create mode 100644 templates/gpu/util_functions
 create mode 100644 templates/legal/license_header
 create mode 100644 templates/secure-boot/util_functions

diff --git a/templates/common/template_disclaimer b/templates/common/template_disclaimer
new file mode 100644
index 000000000..3b417deff
--- /dev/null
+++ b/templates/common/template_disclaimer
@@ -0,0 +1,5 @@
+# This initialization action is generated from
+# initialization-actions/templates/[% template_path %]
+#
+# Modifications made directly to the generated file will be lost when
+# the template is re-evaluated
diff --git a/templates/common/util_functions b/templates/common/util_functions
new file mode 100644
index 000000000..5b85cad65
--- /dev/null
+++ b/templates/common/util_functions
@@ -0,0 +1,365 @@
+function os_id()       ( set +x ;  grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) # prints the ID= field of /etc/os-release
+function os_version()  ( set +x ;  grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) # prints the VERSION_ID= field
+function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) # prints the VERSION_CODENAME= field
+
+function version_ge() ( set +x ;  [ "$1" = "$(printf '%s\n' "$1" "$2" | sort -V | tail -n1)" ] ; ) # true when $1 >= $2 in 'sort -V' order
+function version_gt() ( set +x ;  [ "$1" != "$2" ] && version_ge "$1" "$2" ; )                     # true when $1 >  $2
+function version_le() ( set +x ;  [ "$1" = "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" ] ; ) # true when $1 <= $2 in 'sort -V' order
+function version_lt() ( set +x ;  [ "$1" != "$2" ] && version_le "$1" "$2" ; )                     # true when $1 <  $2
+
+readonly -A supported_os=(
+  ['debian']="10 11 12"
+  ['rocky']="8 9"
+  ['ubuntu']="18.04 20.04 22.04"
+)
+# supported_os (above) maps each distribution id to its space-separated list of supported releases
+# dynamically define OS version test utility functions: is_<id> and is_/ge_/le_<id><major>
+if [[ "$(os_id)" == "rocky" ]];
+then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
+else _os_version="$(os_version)"; fi
+for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
+  eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
+  # $(os_id) and ${_os_version} are expanded now, so the generated functions compare fixed strings
+  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
+    eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
+    eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
+    eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+  done
+done
+# is_debuntu: true on either Debian or Ubuntu
+function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
+
+function os_vercat()   ( set +x
+  # Ubuntu keeps only the digits, Rocky keeps the leading digits, anything else is printed unchanged
+  if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' ; return ; fi
+  if is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' ; else os_version ; fi ; )
+
+function repair_old_backports {
+  if ! is_debuntu ; then return ; fi
+  # This script uses 'apt-get update' and is therefore potentially dependent on
+  # backports repositories which have been archived.  In order to mitigate this
+  # problem, we will use archive.debian.org for the oldoldstable repo
+  local debdists filename ; local -a matched_files=()
+  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
+  debdists="https://deb.debian.org/debian/dists"
+  oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
+  oldstable=$(   curl -s "${debdists}/oldstable/Release"    | awk '/^Codename/ {print $2}');
+  stable=$(      curl -s "${debdists}/stable/Release"       | awk '/^Codename/ {print $2}');
+  # one array element per matching file, even when a path contains whitespace
+  mapfile -t matched_files < <(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:)
+
+  for filename in "${matched_files[@]}"; do
+    # Fetch from archive.debian.org for ${oldoldstable}-backports
+    perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
+                  {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}"
+  done
+}
+
+# Fetch metadata URL $1; print the body only on HTTP 200.  Returns curl's exit status.
+function print_metadata_value() {
+  local tmpfile http_code return_code
+  tmpfile=$(mktemp)
+  http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
+    -s -o "${tmpfile}" 2>/dev/null)
+  return_code=$?
+  # If the command completed successfully, print the metadata value to stdout.
+  if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then cat "${tmpfile}" ; fi
+  rm -f "${tmpfile}"
+  return ${return_code}
+}
+
+# Thin wrapper around print_metadata_value: prints the value found at URL $1
+# and returns that call's exit status unchanged.
+function print_metadata_value_if_exists() {
+  local -r url="$1"
+  print_metadata_value "${url}"
+  return $?
+}
+
+# replicates /usr/share/google/get_metadata_value: prints key $1 from instance metadata, else from project metadata
+function get_metadata_value() (
+  set +x
+  local -r varname="$1"
+  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
+  # Print the instance metadata value.
+  print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}"
+  return_code=$?
+  # If the instance doesn't have the value, try the project.
+  if [[ ${return_code} != 0 ]]; then
+    print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}"
+    return_code=$?
+  fi
+  # status of the last lookup that was attempted
+  return ${return_code}
+)
+
+# Print metadata attribute $1, or the fallback $2 (empty by default) when the lookup fails.
+function get_metadata_attribute() (
+  set +x
+  local -r key="attributes/$1" fallback="${2:-}"
+  if ! get_metadata_value "${key}" ; then echo -n "${fallback}" ; fi
+)
+
+function execute_with_retries() (
+  # Run "$*" via eval up to 3 times, 5s apart; each attempt's output goes to ${install_log}
+  set +x ; local -r cmd="$*"
+  # the =~ pattern must stay unquoted: a quoted pattern is matched literally and '^' never anchors
+  if [[ "$cmd" =~ ^apt-get[[:space:]]+install ]] ; then
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+  fi
+  for ((i = 0; i < 3; i++)); do
+    set -x
+    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
+    set +x
+    if [[ $retval == 0 ]] ; then return 0 ; fi
+    sleep 5
+  done
+  return 1
+)
+
+function cache_fetched_package() {
+  # $1: upstream URL, $2: GCS object used as cache, $3: local destination file
+  local upstream="$1" cached="$2" dest="$3"
+
+  if ! gsutil ls "${cached}" 2>&1 | grep -q "${cached}" ; then
+    # not cached yet: download from upstream, then copy the file into the bucket
+    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${upstream}" -o "${dest}" && \
+           gcloud storage cp "${dest}" "${cached}" ; )
+  else
+    time gcloud storage cp "${cached}" "${dest}"
+  fi
+}
+
+# Add the contrib component to the Debian apt sources; does nothing on other distributions.
+function add_contrib_component() {
+  if ! is_debian ; then return 0 ; fi
+  if ! ge_debian12 ; then
+    sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
+    return
+  fi
+  # Include in sources file components on which nvidia-kernel-open-dkms depends
+  local -r sources_file="/etc/apt/sources.list.d/debian.sources" wanted="main contrib"
+  sed -i -e "s/Components: .*$/Components: ${wanted}/" "${sources_file}"
+}
+
+# Set property $2 to value $3 in the Hadoop configuration file $1 by calling ${bdcfg} set_property.
+function set_hadoop_property() {
+  local -r config_file=$1 property=$2 value=$3
+  local -a bdcfg_args=(
+    --configuration_file "${HADOOP_CONF_DIR}/${config_file}"
+    --name "${property}" --value "${value}" --clobber
+  )
+  "${bdcfg}" set_property "${bdcfg_args[@]}"
+}
+
+# Register yarn.io/gpu as a YARN resource type; returns early when ${HADOOP_CONF_DIR} does not exist.
+function configure_yarn_resources() {
+  [[ -d "${HADOOP_CONF_DIR}" ]] || return 0 # pre-init scripts
+  local -r resource_types_xml="${HADOOP_CONF_DIR}/resource-types.xml"
+  if [[ ! -f "${resource_types_xml}" ]]; then
+    printf '<?xml version="1.0" ?>\n<configuration/>' >"${resource_types_xml}"
+  fi
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+}
+
+# This configuration should be applied only if GPU is attached to the node
+function configure_yarn_nodemanager() {
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.container-executor.class' \
+    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+
+  # Fix local dirs access permissions
+  local yarn_local_dirs=()
+  # readarray keeps the ',' delimiter on each element; it is stripped again in the chown call below
+  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
+    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
+    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
+  # [*] joins the elements into the single string that the comparison with "None" expects
+  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[*]}" != "None" ]]; then
+    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
+  fi
+}
+
+function clean_up_sources_lists() {
+  # Point apt repository entries at dedicated keyrings under /usr/share/keyrings via [signed-by=...]
+  # bigtop (primary): only when dataproc.list exists and has no signed-by entry yet
+  #
+  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
+
+  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
+    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
+    # the perl filter above drops everything up to the last '/' and a trailing "-<letters>" suffix
+    local regional_bigtop_repo_uri
+    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
+      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
+      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
+      cut -d ' ' -f 2 |
+      head -1)
+    # archive.key is fetched from the repository URI; avoid a doubled '/' when joining
+    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+    else
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+    fi
+
+    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
+    rm -f "${bigtop_kr_path}"
+    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
+
+    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  fi
+
+  #
+  # adoptium
+  # (the keyring and adoptium.list are rewritten unconditionally)
+  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
+  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
+  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
+  rm -f "${adoptium_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+   | gpg --dearmor -o "${adoptium_kr_path}"
+  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
+   > /etc/apt/sources.list.d/adoptium.list
+
+
+  #
+  # docker
+  # (the keyring and docker.list are rewritten unconditionally)
+  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
+  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
+  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
+
+  rm -f "${docker_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+    | gpg --dearmor -o "${docker_kr_path}"
+  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
+    > ${docker_repo_file}
+
+  #
+  # google cloud + logging/monitoring
+  # (only when at least one google-cloud*.list file exists)
+  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
+    rm -f /usr/share/keyrings/cloud.google.gpg
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
+      list_file="/etc/apt/sources.list.d/${list}.list"
+      if [[ -f "${list_file}" ]]; then
+        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
+      fi
+    done
+  fi
+
+  #
+  # cran-r
+  # (Ubuntu 18 uses a different key id from the other releases)
+  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
+    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
+    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
+    rm -f /usr/share/keyrings/cran-r.gpg
+    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
+    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
+  fi
+
+  #
+  # mysql
+  # (only when mysql.list exists)
+  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
+    rm -f /usr/share/keyrings/mysql.gpg
+    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
+    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
+  fi
+  # finally move /etc/apt/trusted.gpg aside when it exists
+  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
+
+}
+
+function set_proxy(){
+  # Export proxy settings taken from the 'http-proxy' metadata attribute; no-op when it is empty.
+  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
+  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
+  export METADATA_HTTP_PROXY
+  export http_proxy="${METADATA_HTTP_PROXY}"
+  export https_proxy="${METADATA_HTTP_PROXY}"
+  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
+  # hosts that bypass the proxy: loopback, the metadata server and the googleapis.com services below
+  export no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
+  local no_proxy_svc
+  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
+                      bigquery composer      pubsub bigquerydatatransfer dataflow \
+                      storage  datafusion    ; do
+    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
+  done
+  # export both spellings, as for the proxy variables above; tools differ in which one they read
+  export NO_PROXY="${no_proxy}"
+}
+
+function mount_ramdisk(){ # mount tmpfs over package download/cache directories when enough memory is free
+  local free_mem
+  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
+  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+
+  # Write to a ramdisk instead of churning the persistent disk
+  # tmpdir is deliberately not local: exit_handler compares it with "/mnt/shm" to decide on cleanup
+  tmpdir="/mnt/shm"
+  mkdir -p "${tmpdir}"
+  mount -t tmpfs tmpfs "${tmpdir}"
+
+  # Download conda packages to tmpfs
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+
+  # Clear pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
+  pip cache purge || echo "unable to purge pip cache"
+
+  # Download pip packages to tmpfs
+  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+
+  # Download OS packages to tmpfs
+  if is_debuntu ; then
+    mount -t tmpfs tmpfs /var/cache/apt/archives
+  else
+    mount -t tmpfs tmpfs /var/cache/dnf
+  fi
+}
+
+function check_os() { # exit 1 with an error message when the detected OS release is not a supported one
+  if is_debian && ! { is_debian10 || is_debian11 || is_debian12 ; } ; then
+      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+      exit 1
+  elif is_ubuntu && ! { is_ubuntu18 || is_ubuntu20 || is_ubuntu22 ; } ; then
+      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+  elif is_rocky && ! { is_rocky8 || is_rocky9 ; } ; then
+      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
+  fi
+}
+
+readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" # OS id followed by the leading digits of the OS version
+
+# Dataproc configuration directories (Hadoop, Hive, Spark)
+readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+readonly HIVE_CONF_DIR='/etc/hive/conf'
+readonly SPARK_CONF_DIR='/etc/spark/conf'
diff --git a/templates/generate-action.pl b/templates/generate-action.pl
new file mode 100644
index 000000000..407dfe310
--- /dev/null
+++ b/templates/generate-action.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/perl -w
+# -*-CPerl-*-
+# Process templates/<action>.in with the Template Toolkit (paths are resolved relative to $ENV{PWD}).
+# Usage: Run this script from the root directory of the git clone:
+# perl templates/generate-action.pl gpu/install_gpu_driver.sh
+
+use Template;
+use strict;
+use v5.10;
+
+my $tt = Template->new( {
+  INCLUDE_PATH => "$ENV{PWD}/templates",
+  INTERPOLATE  => 0,
+}) || die "$Template::ERROR$/";
+
+my $action = $ARGV[0];
+# the trailing newline keeps perl from appending "at <file> line <n>." to the usage text
+sub usage{
+  die "Usage: $0 <action>\n";
+}
+# check definedness first so that a missing argument yields the usage text without an "uninitialized" warning
+usage unless( defined $action && -f "$ENV{PWD}/templates/${action}.in" );
+
+$tt->process("${action}.in")
+    || die $tt->error(), "\n";
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
new file mode 100644
index 000000000..e4924f51e
--- /dev/null
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -0,0 +1,280 @@
+#!/bin/bash
+#
+[% template_path="gpu/install_gpu_driver.sh.in" %]
+[% INSERT legal/license_header %]
+#
+[% PROCESS common/template_disclaimer %]
+#
+# This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
+
+set -euxo pipefail
+
+[% INSERT common/util_functions %]
+
+[% INSERT gpu/util_functions %]
+
+[% INSERT 'secure-boot/util_functions' %]
+
+function main() { # configure YARN for GPUs; on NVIDIA nodes install the driver stack unless MIG is already enabled
+  # This configuration should be run on all nodes
+  # regardless if they have attached GPUs
+  configure_yarn_resources
+
+  # Detect NVIDIA GPU
+  if (lspci | grep -q NVIDIA); then
+    # if this is called without the MIG script then the drivers are not installed
+    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
+    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
+    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
+    # NOTE(review): the unquoted echo above always emits exactly one line, so NUM_MIG_GPUS is 1 here
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
+          fetch_mig_scripts
+        fi
+      fi
+    fi
+
+    # if mig is enabled drivers would have already been installed
+    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+      install_nvidia_gpu_driver
+      install_nvidia_container_toolkit
+      install_cuda
+      load_kernel_module
+
+      if [[ -n ${CUDNN_VERSION} ]]; then
+        install_nvidia_nccl
+        install_nvidia_cudnn
+      fi
+      #Install GPU metrics collection in Stackdriver if needed
+      if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+        #install_ops_agent
+        install_gpu_agent
+        echo 'GPU metrics agent successfully deployed.'
+      else
+        echo 'GPU metrics agent will not be installed.'
+      fi
+
+      # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
+      for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
+        rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+      done
+
+      MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")"
+      if test -n "$(nvsmi -L)" ; then
+        # cache the result of the gpu query
+        ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
+        echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
+      fi
+      NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
+      if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+        # enable MIG on every GPU (the list must stay quoted so awk sees one line per GPU)
+        for GPU_ID in $(echo "${MIG_GPU_LIST}" | awk -F'[: ]' -e '{print $2}') ; do
+          nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+        done
+
+        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+        MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
+        fetch_mig_scripts
+      else
+        configure_gpu_exclusive_mode
+      fi
+    fi
+
+    configure_yarn_nodemanager
+    configure_gpu_script
+    configure_gpu_isolation
+  elif [[ "${ROLE}" == "Master" ]]; then
+    configure_yarn_nodemanager
+    configure_gpu_script
+  fi
+
+  # Restart YARN services if they are running already
+  if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then
+    systemctl restart hadoop-yarn-resourcemanager.service
+  fi
+  if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then
+    systemctl restart hadoop-yarn-nodemanager.service
+  fi
+}
+
+function exit_handler() { # EXIT trap: purge caches, unmount tmpfs mounts, report disk usage; always returns 0
+  # Purge private key material until next grant
+  clear_dkms_key
+
+  set +ex
+  echo "Exit handler invoked"
+
+  # Clear pip cache
+  pip cache purge || echo "unable to purge pip cache"
+
+  # If system memory was sufficient to mount memory-backed filesystems
+  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+        umount -f ${shmdir}
+      fi
+    done
+
+    # restart services stopped during preparation stage
+    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+  fi
+
+  if is_debuntu ; then
+    # Clean up OS package cache
+    apt-get -y -qq clean
+    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
+    # re-hold systemd package
+    if ge_debian12 ; then
+    apt-mark hold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
+
+  # print disk usage statistics for large components
+  if is_ubuntu ; then
+    du -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3 | sort -h
+  elif is_debian ; then
+    du -x -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /var/lib/{docker,mysql,} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+      /usr/bin \
+      /usr \
+      /var \
+      / 2>/dev/null | sort -h
+  else
+    du -hs \
+      /var/lib/docker \
+      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
+      /usr/lib64/google-cloud-sdk \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3
+  fi
+
+  # Process disk usage logs from installation period
+  rm -f /run/keep-running-df
+  sync
+  sleep 5.01s
+  # compute maximum size of disk during installation
+  # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+  df / | tee -a "/run/disk-usage.log"
+  # numeric descending sort, so that $siz[0] is the largest sample and $siz[-1] the smallest
+  perl -e '@siz=( sort { $b <=> $a }
+                   map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } <STDIN> );
+$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+print( "    samples-taken: ", scalar @siz, $/,
+       "maximum-disk-used: $max", $/,
+       "minimum-disk-used: $min", $/,
+       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+
+  echo "exit_handler has completed"
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    dd if=/dev/zero of=/zero
+    sync
+    sleep 3s
+    rm -f /zero
+  fi
+
+  return 0
+}
+
+function prepare_to_install(){
+  # Verify OS compatibility and Secure boot state
+  check_os
+  check_secure_boot
+
+  prepare_gpu_env
+
+  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
+  readonly OS_NAME
+
+  # node role
+  ROLE="$(get_metadata_attribute dataproc-role)"
+  readonly ROLE
+
+  workdir=/opt/install-dpgce
+  tmpdir=/tmp/
+  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly temp_bucket
+  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  uname_r=$(uname -r)
+  readonly uname_r
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  export DEBIAN_FRONTEND=noninteractive
+
+  mkdir -p "${workdir}"
+  trap exit_handler EXIT
+  set_proxy
+  mount_ramdisk
+  # mount_ramdisk may have switched tmpdir to /mnt/shm, so the log path is derived only now
+  readonly install_log="${tmpdir}/install.log"
+
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
+  # everything below runs only once: the marker file is created at the end of this function
+  if test -f "${workdir}/prepare-complete" ; then return ; fi
+
+  repair_old_backports
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+    if ge_debian12 ; then
+    apt-mark unhold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
+    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
+  ) fi
+
+  install_dependencies
+
+  # Monitor disk usage in a screen session
+  df / > "/run/disk-usage.log"
+  touch "/run/keep-running-df"
+  screen -d -m -LUS keep-running-df \
+    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+  # the sampling loop above ends once exit_handler removes /run/keep-running-df
+  touch "${workdir}/prepare-complete"
+}
+
+prepare_to_install
+
+main
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
new file mode 100644
index 000000000..5727da537
--- /dev/null
+++ b/templates/gpu/util_functions
@@ -0,0 +1,1192 @@
+function set_support_matrix() {
+  # Define the read-only lookup tables of default component versions, keyed by
+  # CUDA major.minor version (DRIVER_SUBVER is keyed by driver major version).
+  # Globals written (readonly associative arrays): DRIVER_FOR_CUDA,
+  #   DRIVER_SUBVER, CUDNN_FOR_CUDA, NCCL_FOR_CUDA, CUDA_SUBVER
+  # CUDA version and Driver version
+  # https://docs.nvidia.com/deploy/cuda-compatibility/
+  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+  # https://developer.nvidia.com/cuda-downloads
+  # Minimum supported version for open kernel driver is 515.43.04
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags (Rocky8: 12.0: 525.147.05)
+  readonly -A DRIVER_FOR_CUDA=(
+          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
+          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+  )
+  readonly -A DRIVER_SUBVER=(
+          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
+          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
+  )
+  # https://developer.nvidia.com/cudnn-downloads
+  if is_debuntu ; then
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
+          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+  )
+  elif is_rocky ; then
+  # rocky:
+  #   12.0: 8.8.1.3
+  #   12.1: 8.9.3.28
+  #   12.2: 8.9.7.29
+  #   12.3: 9.0.0.312
+  #   12.4: 9.1.1.17
+  #   12.5: 9.2.1.18
+  #   12.6: 9.5.1.17
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
+          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
+  )
+  fi
+  # https://developer.nvidia.com/nccl/nccl-download
+  # 12.2: 2.19.3, 12.5: 2.21.5
+  readonly -A NCCL_FOR_CUDA=(
+          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
+          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
+  )
+  readonly -A CUDA_SUBVER=(
+          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
+          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
+  )
+}
+
+set_support_matrix
+
+function set_cuda_version() {
+  # Resolve CUDA_VERSION (major.minor) and CUDA_FULL_VERSION (major.minor.patch).
+  # The cuda-url metadata value, when it names a runfile, supplies the defaults.
+  local runfile_url
+  runfile_url=$(get_metadata_attribute 'cuda-url' '')
+  if [[ -n "${runfile_url}" ]] ; then
+    local url_cuda_version
+    url_cuda_version="$(perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}' <<< "${runfile_url}")"
+    if [[ "${url_cuda_version}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
+      CUDA_FULL_VERSION="${url_cuda_version}"
+      DEFAULT_CUDA_VERSION="${url_cuda_version%.*}"
+    fi
+  fi
+
+  # Without a usable cuda-url, fall back to the built-in default release
+  [[ -v DEFAULT_CUDA_VERSION ]] || DEFAULT_CUDA_VERSION='12.4'
+  readonly DEFAULT_CUDA_VERSION
+
+  # An explicit cuda-version metadata value overrides the default
+  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
+  readonly CUDA_VERSION
+  # The full version comes from the url when present, else from the CUDA_SUBVER table
+  [[ -v CUDA_FULL_VERSION ]] || CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
+  readonly CUDA_FULL_VERSION
+
+}
+
+set_cuda_version
+
+function is_cuda12() ( set +x ; test "${CUDA_VERSION%%.*}" = "12" ; )
+function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" '12' ; )
+function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" '12' ; )
+# Same predicates for CUDA major version 11 (subshell bodies keep "set +x" local)
+function is_cuda11() ( set +x ; test "${CUDA_VERSION%%.*}" = "11" ; )
+function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" '11' ; )
+function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" '11' ; )
+
+function set_driver_version() {
+  # Resolve DRIVER_VERSION (full) and DRIVER (major) and export both.
+  # Default precedence: gpu-driver-url metadata, then cuda-url metadata, then
+  # DRIVER_FOR_CUDA[CUDA_VERSION]; gpu-driver-version metadata overrides all.
+  # Driver versions in a gpu-driver-url may have two or three components.
+  local gpu_driver_url cuda_url DEFAULT_DRIVER
+  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+  # Take default from gpu-driver-url metadata value
+  if [[ -n "${gpu_driver_url}" ]] ; then
+    DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+(?:\.\d+)?)\.run$}{$1}')"
+    if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
+  # Take default from cuda-url metadata value as a backup
+  elif [[ -n "${cuda_url}" ]] ; then
+    local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
+      local major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
+      local driver_max_maj_version="${DRIVER_SUBVER[${major_driver_version}]:-}"
+      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the version indicated by the cuda url as the default if it exists
+        DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
+      elif [[ -n "${driver_max_maj_version}" ]] && curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the maximum sub-version available for the major version indicated in cuda url as the default
+        DEFAULT_DRIVER="${driver_max_maj_version}"
+      fi
+    fi
+  fi
+
+  if ( ! test -v DEFAULT_DRIVER ) ; then
+    # If a default driver version has not been extracted, use the default for this version of CUDA
+    DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]:-}"
+  fi
+
+  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
+
+  readonly DRIVER_VERSION
+  readonly DRIVER="${DRIVER_VERSION%%.*}"
+
+  export DRIVER_VERSION DRIVER
+  # Fail early when download.nvidia.com does not serve the selected version
+  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
+    exit 1
+  fi
+}
+
+set_driver_version
+
+readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+# Parameters for NVIDIA-provided cuDNN library
+readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") # metadata overrides the table default
+function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
+function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
+# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
+if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
+  CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
+  # cuDNN v8 is not distributed for ubuntu20+, debian12 ; switch to the default v9 release
+  CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
+  # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
+  CUDNN_VERSION="8.8.0.121"
+fi
+readonly CUDNN_VERSION
+
+readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
+readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+
+# Parameters for NVIDIA-provided Debian GPU driver
+readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+
+readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+
+USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
+readonly USERSPACE_FILENAME
+
+# Short name for urls
+if is_ubuntu22  ; then
+    # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
+    # https://developer.download.nvidia.com/compute/machine-learning/repos/
+    # use packages from previous release until such time as nvidia
+    # release ubuntu2204 builds
+
+    shortname="$(os_id)$(os_vercat)"
+    nccl_shortname="ubuntu2004"
+elif ge_rocky9 ; then
+    # use packages from previous release until such time as nvidia
+    # release rhel9 builds
+
+    shortname="rhel9"
+    nccl_shortname="rhel8"
+elif is_rocky ; then
+    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
+    nccl_shortname="${shortname}"
+else
+    shortname="$(os_id)$(os_vercat)"
+    nccl_shortname="${shortname}"
+fi
+
+# Parameters for NVIDIA-provided package repositories
+readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
+readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
+
+# Parameters for NVIDIA-provided NCCL library
+readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
+NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
+readonly NCCL_REPO_URL
+readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
+
+function set_cuda_runfile_url() {
+  # Derive CUDA_RUNFILE and NVIDIA_CUDA_URL (both readonly) for CUDA_FULL_VERSION,
+  # warn about unsupported OS/CUDA/driver combinations; exit 1 if the url is not served.
+  local MAX_DRIVER_VERSION MAX_CUDA_VERSION
+  local MIN_OPEN_DRIVER_VER="515.48.07"
+  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
+  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
+
+  if is_cuda12 ; then
+    if is_debian12 ; then
+      MIN_DRIVER_VERSION="545.23.06"
+      MIN_CUDA_VERSION="12.3.0"
+    elif is_debian10 ; then
+      MAX_DRIVER_VERSION="555.42.02"
+      MAX_CUDA_VERSION="12.5.0"
+    elif is_ubuntu18 ; then
+      MAX_DRIVER_VERSION="530.30.02"
+      MAX_CUDA_VERSION="12.1.1"
+    fi
+  elif version_ge "${CUDA_FULL_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    if le_debian10 ; then
+      # cuda 11 is not supported for <= debian10
+      MAX_CUDA_VERSION="0"
+      MAX_DRIVER_VERSION="0"
+    fi
+  else
+    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  fi
+  # The MIN/MAX bounds are full versions, so they are compared with CUDA_FULL_VERSION
+  if version_lt "${CUDA_FULL_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_FULL_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
+    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  fi
+  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
+    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
+    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  fi
+
+  # driver version named in cuda runfile filename
+  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
+  readonly -A drv_for_cuda=(
+          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
+          ["11.8.0"]="520.61.05"
+          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
+          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
+          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
+          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
+          ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
+          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
+          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+  )
+
+  # Verify that the file with the indicated combination exists
+  local drv_ver="${drv_for_cuda[${CUDA_FULL_VERSION}]:-}"
+  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
+  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
+  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
+
+  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
+  readonly NVIDIA_CUDA_URL
+
+  CUDA_RUNFILE="$(echo "${NVIDIA_CUDA_URL}" | perl -pe 's{^.+/}{}')"
+  readonly CUDA_RUNFILE
+
+  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
+    exit 1
+  fi
+
+  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
+    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
+  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
+    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
+    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
+    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
+  fi
+}
+
+set_cuda_runfile_url
+
+# Parameter for NVIDIA-provided Rocky Linux GPU driver
+readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+
+CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
+CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
+if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
+  # cuDNN 8.3.1.22 and later: archive name carries CUDA_VERSION with its last component removed
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
+  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
+    # Up to and including cuDNN 8.4.1.50 the archive name carries the complete CUDA_VERSION instead
+    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
+  fi
+  # Use legacy url format with one of the tarball name formats depending on version as above
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
+fi
+if ( version_ge "${CUDA_VERSION}" "12.0" ); then
+  # Use modern url format when cuda version is greater than or equal to 12.0 (overrides the choices above)
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
+fi
+readonly CUDNN_TARBALL
+readonly CUDNN_TARBALL_URL
+
+# Whether to install NVIDIA-provided or OS-provided GPU driver
+GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
+readonly GPU_DRIVER_PROVIDER
+
+# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
+INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
+readonly INSTALL_GPU_AGENT
+
+NVIDIA_SMI_PATH='/usr/bin'
+MIG_MAJOR_CAPS=0
+IS_MIG_ENABLED=0
+
+CUDA_KEYRING_PKG_INSTALLED="0"
+function install_cuda_keyring_pkg() {
+  # Download and install the cuda-keyring package at most once per script run
+  [[ "${CUDA_KEYRING_PKG_INSTALLED}" != "1" ]] || return 0
+  local -r keyring_ver=1.1 keyring_deb="${tmpdir}/cuda-keyring.deb"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    -o "${keyring_deb}" "${NVIDIA_REPO_URL}/cuda-keyring_${keyring_ver}-1_all.deb"
+  dpkg -i "${keyring_deb}"
+  rm -f "${keyring_deb}"
+  CUDA_KEYRING_PKG_INSTALLED="1"
+}
+
+function uninstall_cuda_keyring_pkg() {
+  apt-get purge --yes --quiet cuda-keyring # purge the package, then clear the installed flag
+  CUDA_KEYRING_PKG_INSTALLED="0"
+}
+
+function install_local_cuda_repo() {
+  # Install NVIDIA's local CUDA apt repository package once (marker file and in-process flag)
+  if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi
+  # The flag is not initialised in this block, so treat an unset value as "not installed"
+  if [[ "${CUDA_LOCAL_REPO_INSTALLED:-0}" == "1" ]]; then return ; fi
+  CUDA_LOCAL_REPO_INSTALLED="1"
+  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
+  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
+  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
+  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
+  readonly DIST_KEYRING_DIR="/var/${pkgname}"
+
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+
+  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  cp "${DIST_KEYRING_DIR}"/cuda-*-keyring.gpg /usr/share/keyrings/
+
+  if is_ubuntu ; then
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
+      -o /etc/apt/preferences.d/cuda-repository-pin-600
+  fi
+
+  touch "${workdir}/install-local-cuda-repo-complete"
+}
+function uninstall_local_cuda_repo() {
+  apt-get purge --yes --quiet "${CUDA_LOCAL_REPO_PKG_NAME}" # purge the repo package, then drop the marker
+  rm -f "${workdir}/install-local-cuda-repo-complete"
+}
+
+CUDNN_PKG_NAME=""
+function install_local_cudnn_repo() {
+  # Install NVIDIA's local cuDNN repository package; skipped when the marker file exists
+  if [[ -f "${workdir}/install-local-cudnn-repo-complete" ]] ; then return ; fi
+  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
+  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
+  CUDNN_PKG_NAME="${pkgname}"
+
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    -o "${tmpdir}/local-installer.deb" "${local_deb_url}"
+
+  # Register the repository, then discard the downloaded installer
+  dpkg -i "${tmpdir}/local-installer.deb"
+  rm -f "${tmpdir}/local-installer.deb"
+
+  # Copy the repository keyring shipped by the package into /usr/share/keyrings
+  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  touch "${workdir}/install-local-cudnn-repo-complete"
+}
+
+function uninstall_local_cudnn_repo() {
+  apt-get purge --yes --quiet "${CUDNN_PKG_NAME}" # purge the repo package, then drop the marker
+  rm -f "${workdir}/install-local-cudnn-repo-complete"
+}
+
+CUDNN8_LOCAL_REPO_INSTALLED="0"
+CUDNN8_PKG_NAME=""
+function install_local_cudnn8_repo() {
+  if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi
+  # Install the local cuDNN 8 apt repository package (Ubuntu/Debian only), fetched through the package cache
+  if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
+  elif is_debian ; then cudnn8_shortname="debian11"
+  else return 0 ; fi
+  if   is_cuda12 ; then CUDNN8_CUDA_VER=12.0
+  elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8
+  else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi
+  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" # no "local" here: overwrites install_nvidia_cudnn's local when called from it
+
+  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
+  CUDNN8_PKG_NAME="${pkgname}"
+
+  deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local_deb_fn="${tmpdir}/${deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
+
+  # cache the cudnn package
+  cache_fetched_package "${local_deb_url}" \
+                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
+                        "${local_deb_fn}"
+  # Read the /var/cudnn-local-repo-* directory name from the package listing before installing it
+  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
+  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
+  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
+    mkdir -p "${cudnn_path}"
+    mount -t tmpfs tmpfs "${cudnn_path}"
+  fi
+
+  dpkg -i "${local_deb_fn}"
+
+  rm -f "${local_deb_fn}"
+
+  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  touch "${workdir}/install-local-cudnn8-repo-complete"
+}
+
+function uninstall_local_cudnn8_repo() {
+  apt-get purge --yes --quiet "${CUDNN8_PKG_NAME}" # purge the repo package, then drop the marker
+  rm -f "${workdir}/install-local-cudnn8-repo-complete"
+}
+
+function install_nvidia_nccl() {
+  if test -f "${workdir}/nccl-complete" ; then return ; fi
+  # Build NCCL packages from source (or reuse a tarball cached under ${pkg_bucket}) and install them
+  if is_cuda11 && is_debian12 ; then
+    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
+    return
+  fi
+
+  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
+
+  # https://github.com/NVIDIA/nccl/blob/master/README.md
+  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Fermi:     SM_20,             compute_30
+  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+  # The following architectures are supported by open kernel driver
+  # Volta:     SM_70,SM_72,       compute_70,compute_72
+  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+  # The following architectures are supported by CUDA v11.8+
+  # Ada:       SM_89,             compute_89
+  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
+  # Blackwell: SM_100,            compute_100
+                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
+  if version_ge "${CUDA_VERSION}" "11.8" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
+  fi
+  if version_ge "${CUDA_VERSION}" "12.0" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
+  fi
+
+  mkdir -p "${workdir}"
+  pushd "${workdir}"
+
+  test -d "${workdir}/nccl" || {
+    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "nccl-${NCCL_VERSION}-1" nccl
+  }
+
+  local build_path
+  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
+                       build_path="nccl/build/pkg/rpm/x86_64" ; fi
+
+  test -d "${workdir}/nccl/build" || {
+    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
+
+    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+    if echo "${output}" | grep -q "${gcs_tarball}" ; then
+      # cache hit - unpack from cache
+      echo "cache hit"
+    else
+      # build and cache
+      pushd nccl
+      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
+      install_build_dependencies
+      if is_debuntu ; then
+        # These packages are required to build .deb packages from source
+        execute_with_retries \
+          apt-get install -y -qq build-essential devscripts debhelper fakeroot
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.debian.build
+      elif is_rocky ; then
+        # These packages are required to build .rpm packages from source
+        execute_with_retries \
+          dnf -y -q install rpm-build rpmdevtools
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.redhat.build
+      fi
+      tar czvf "/${local_tarball}" "../${build_path}"
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
+    fi
+    gcloud storage cat "${gcs_tarball}" | tar xz # both branches end by unpacking the cached tarball here
+  }
+
+  if is_debuntu ; then
+    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
+  elif is_rocky ; then
+    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
+  fi
+
+  popd
+  touch "${workdir}/nccl-complete"
+}
+
+function is_src_nvidia() ( set +x ; test "${GPU_DRIVER_PROVIDER}" = "NVIDIA" ; )
+function is_src_os()     ( set +x ; test "${GPU_DRIVER_PROVIDER}" = "OS" ; )
+
+function install_nvidia_cudnn() {
+  # Install cuDNN ${CUDNN_VERSION} for the detected OS; a marker file makes this a one-time step.
+  # install_local_cudnn8_repo assigns cudnn_pkg_version too and thereby updates the local below.
+  if test -f "${workdir}/cudnn-complete" ; then return ; fi
+  local major_version="${CUDNN_VERSION%%.*}"
+  local cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
+
+  if is_rocky ; then
+    if is_cudnn8 ; then
+      execute_with_retries dnf -y -q install \
+        "libcudnn${major_version}" \
+        "libcudnn${major_version}-devel"
+      sync
+    elif is_cudnn9 ; then
+      execute_with_retries dnf -y -q install \
+        "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \
+        "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}"
+      sync
+    else
+      echo "Unsupported cudnn version: '${major_version}'" ; return 0 # nothing installed: no success message, no marker
+    fi
+  elif is_debuntu; then
+    if ge_debian12 && is_src_os ; then
+      apt-get -y install nvidia-cudnn
+    else
+      if is_cudnn8 ; then
+        install_local_cudnn8_repo
+
+        apt-get update -qq
+
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+            "libcudnn8=${cudnn_pkg_version}" \
+            "libcudnn8-dev=${cudnn_pkg_version}"
+
+        uninstall_local_cudnn8_repo
+        sync
+      elif is_cudnn9 ; then
+        install_cuda_keyring_pkg
+
+        apt-get update -qq
+
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+          "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
+        sync
+      else
+        echo "Unsupported cudnn version: [${CUDNN_VERSION}]" ; return 0 # nothing installed: no success message, no marker
+      fi
+    fi
+  else
+    echo "Unsupported OS: '${_shortname}'"
+    exit 1
+  fi
+
+  ldconfig
+
+  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
+  touch "${workdir}/cudnn-complete"
+}
+
+function add_nonfree_components() {
+  # Enable the extra apt components on Debian; nothing to do for NVIDIA-sourced drivers
+  is_src_nvidia && return 0
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-open-kernel-dkms depends
+      local -r deb822_file="/etc/apt/sources.list.d/debian.sources"
+      local -r wanted="main contrib non-free non-free-firmware"
+      sed -i -e "s/Components: .*$/Components: ${wanted}/" "${deb822_file}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list
+  fi
+}
+
+function add_repo_nvidia_container_toolkit() {
+  if is_debuntu ; then
+      # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+      local -r keyring=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+      local -r apt_list=/etc/apt/sources.list.d/nvidia-container-toolkit.list
+      [[ -f "${keyring}" ]] ||
+        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+          | gpg --dearmor -o "${keyring}"
+      # Write the apt source, pinning each entry to that key, unless already present
+      [[ -f "${apt_list}" ]] ||
+        curl -sL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+          | perl -pe "s#deb https://#deb [signed-by=${keyring}] https://#g" \
+          | tee "${apt_list}"
+      apt-get update
+  else
+    curl -sL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
+      | tee /etc/yum.repos.d/nvidia-container-toolkit.repo
+  fi
+}
+
+function add_repo_cuda() {
+  # Debian/Ubuntu get the cuda-keyring package, Rocky gets the NVIDIA dnf repo definition
+  if is_debuntu ; then install_cuda_keyring_pkg ; return ; fi # 11.7+, 12.0+
+  if is_rocky ; then
+    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
+  fi
+}
+
+function build_driver_from_github() {
+  # Build, optionally sign, install and cache NVIDIA's open GPU kernel modules for ${DRIVER_VERSION}
+  # non-GPL driver will have been built on rocky8
+  if is_rocky8 ; then return 0 ; fi
+  pushd "${workdir}"
+  test -d "${workdir}/open-gpu-kernel-modules" || {
+    local tarball_fn="${DRIVER_VERSION}.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
+  }
+
+  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local build_dir
+    if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
+      then build_dir="${modulus_md5sum}"
+      else build_dir="unsigned" ; fi
+
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+      echo "cache hit"
+    else
+      # build the kernel modules
+      pushd open-gpu-kernel-modules
+      install_build_dependencies
+      if is_cuda11 && is_ubuntu22 ; then
+        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
+        exit 1
+      fi
+      execute_with_retries make -j$(nproc) modules \
+        >  kernel-open/build.log \
+        2> kernel-open/build_error.log
+      # Sign kernel modules
+      if [[ -n "${PSN:-}" ]]; then
+        for module in $(find kernel-open -name '*.ko'); do # cwd is the source tree (pushd above)
+          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
+          "${mok_key}" \
+          "${mok_der}" \
+          "${module}"
+        done
+      fi
+      make modules_install \
+        >>  kernel-open/build.log \
+        2>> kernel-open/build_error.log
+      # Collect build logs and installed binaries
+      tar czvf "${local_tarball}" \
+        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
+    fi
+    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+    depmod -a
+  }
+
+  popd
+}
+
+function build_driver_from_packages() {
+  # Install the open kernel driver for major version ${DRIVER} from distribution packages
+  if is_debuntu ; then
+    local pkglist=("nvidia-driver-${DRIVER}-open")
+    if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then pkglist=("nvidia-driver-${DRIVER}-server-open") ; fi
+    if is_debian ; then
+      local pkg_rev="${DRIVER_VERSION}-1"
+      pkglist=(
+        "firmware-nvidia-gsp=${pkg_rev}"
+        "nvidia-smi=${pkg_rev}"
+        "nvidia-alternative=${pkg_rev}"
+        "nvidia-kernel-open-dkms=${pkg_rev}"
+        "nvidia-kernel-support=${pkg_rev}"
+        "nvidia-modprobe=${pkg_rev}"
+        "libnvidia-ml1=${pkg_rev}"
+      )
+    fi
+    add_contrib_component
+    apt-get update -qq
+    execute_with_retries apt-get install -y -qq --no-install-recommends dkms
+    #configure_dkms_certs
+    execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
+    sync
+  elif is_rocky ; then
+    #configure_dkms_certs
+    if ! execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
+      execute_with_retries dnf -y -q module install 'nvidia-driver:latest'
+    else
+      echo "nvidia-driver:${DRIVER}-dkms installed successfully"
+    fi
+    sync
+  fi
+  #clear_dkms_key
+}
+
+function install_nvidia_userspace_runfile() {
+  # Run NVIDIA's driver .run installer; on rocky8 it also builds or restores cached kernel modules
+  # This .run file contains NV's OpenGL implementation as well as
+  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
+  # including glib (https://docs.gtk.org/glib/), and what appears to
+  # be a copy of the source from the kernel-open directory of for
+  # example DRIVER_VERSION=560.35.03
+  #
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
+  #
+  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
+  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
+  if test -f "${workdir}/userspace-complete" ; then return ; fi
+  local local_fn="${tmpdir}/userspace.run"
+
+  cache_fetched_package "${USERSPACE_URL}" \
+                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
+                        "${local_fn}"
+
+  local runfile_args
+  runfile_args=""
+  local cache_hit="0"
+  local local_tarball
+
+  if is_rocky8 ; then
+    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+      local_tarball="${workdir}/${build_tarball}"
+      local build_dir
+      if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
+        then build_dir="${modulus_md5sum}"
+        else build_dir="unsigned" ; fi
+
+      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+        cache_hit="1"
+        runfile_args="--no-kernel-modules"
+        echo "cache hit"
+      else
+        install_build_dependencies
+
+        local signing_options
+        signing_options=""
+        if [[ -n "${PSN}" ]]; then
+          signing_options="--module-signing-hash sha256 \
+          --module-signing-x509-hash sha256 \
+          --module-signing-secret-key \"${mok_key}\" \
+          --module-signing-public-key \"${mok_der}\" \
+          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
+          "
+        fi
+
+        runfile_args="--no-dkms ${signing_options}"
+      fi
+    }
+  else
+    runfile_args="--no-kernel-modules" # other platforms install only the userspace components here
+  fi
+
+  execute_with_retries bash "${local_fn}" -e -q \
+    ${runfile_args} \
+    --ui=none \
+    --install-libglvnd \
+    --tmpdir="${tmpdir}"
+
+  if is_rocky8 ; then
+    if [[ "${cache_hit}" == "1" ]] ; then
+      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+      depmod -a
+    else # NOTE(review): local_tarball/gcs_tarball are only assigned above when nvidia.ko was missing - confirm
+      tar czvf "${local_tarball}" \
+        /var/log/nvidia-installer.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    fi
+  fi
+
+  rm -f "${local_fn}"
+  touch "${workdir}/userspace-complete"
+  sync
+}
+
+function install_cuda_runfile() {  # run the CUDA .run installer with --toolkit; skipped once the sentinel exists
+  if test -f "${workdir}/cuda-complete" ; then return ; fi  # already done on a previous run
+  local local_fn="${tmpdir}/cuda.run"  # local path handed to cache_fetched_package below
+
+  cache_fetched_package "${NVIDIA_CUDA_URL}" \
+			"${pkg_bucket}/${CUDA_RUNFILE}" \
+                        "${local_fn}"
+
+  execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
+  rm -f "${local_fn}"  # the runfile is no longer needed once it has been executed
+  touch "${workdir}/cuda-complete"  # sentinel tested at the top of this function
+  sync
+}
+
+function install_cuda_toolkit() {  # install the CUDA toolkit packages with apt-get or dnf
+  local cudatk_package=cuda-toolkit
+  if ge_debian12 && is_src_os ; then
+    cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1"  # apt "name=version" pin
+  elif [[ -n "${CUDA_VERSION}" ]]; then
+    cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}"  # dots in CUDA_VERSION become dashes in the package name
+  fi
+  cuda_package="cuda=${CUDA_FULL_VERSION}-1"  # NOTE(review): not declared local here, so the assignment is visible outside this function
+  readonly cudatk_package
+  if is_debuntu ; then
+#    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
+    execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
+  elif is_rocky ; then
+    # rocky9: cuda-11-[7,8], cuda-12-[1..6]
+    execute_with_retries dnf -y -q install "${cudatk_package}"  # only the toolkit package is installed in this branch
+  fi
+  sync
+}
+
+function load_kernel_module() {  # unload the nvidia kernel modules, then load them again
+  # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
+  for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
+    rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"  # best effort: a failed rmmod is only reported
+  done
+
+  depmod -a
+  modprobe nvidia  # base module first, then the nvidia-* modules in the loop below
+  for suffix in uvm modeset drm; do
+    modprobe "nvidia-${suffix}"
+  done
+  # TODO: if peermem is available, also modprobe nvidia-peermem
+}
+
+function install_cuda(){  # run install_cuda_runfile, then add_repo_cuda; skipped once the sentinel exists
+  if test -f "${workdir}/cuda-repo-complete" ; then return ; fi  # already done on a previous run
+
+  if ( ge_debian12 && is_src_os ) ; then
+    echo "installed with the driver on ${_shortname}"
+    return 0  # returns before the sentinel below is written
+  fi
+
+  # The OS package distributions are unreliable
+  install_cuda_runfile
+
+  # Includes CUDA packages
+  add_repo_cuda
+
+  touch "${workdir}/cuda-repo-complete"
+}
+
+function install_nvidia_container_toolkit() {  # install nvidia-container-toolkit and configure it for the selected container runtime
+  local container_runtime_default
+    if command -v docker     ; then container_runtime_default='docker'  # the first runtime found on PATH becomes the default
+  elif command -v containerd ; then container_runtime_default='containerd'
+  elif command -v crio       ; then container_runtime_default='crio'
+                               else container_runtime_default='' ; fi
+  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")  # presumably the metadata value wins over the detected default; confirm in get_metadata_attribute
+
+  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi  # no runtime selected: nothing to install or configure
+
+  add_repo_nvidia_container_toolkit
+  if is_debuntu ; then
+    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
+    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
+  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
+  systemctl restart "${CONTAINER_RUNTIME}"  # restart the unit named after the runtime once it has been configured
+}
+
+# Install the NVIDIA GPU driver: OS packages when ge_debian12 && is_src_os, otherwise the NVIDIA runfile plus build_driver_from_github
+function install_nvidia_gpu_driver() {
+  if test -f "${workdir}/gpu-driver-complete" ; then return ; fi  # already done on a previous run
+
+  if ( ge_debian12 && is_src_os ) ; then  # this branch installs the distribution's own driver packages
+    add_nonfree_components
+    apt-get update -qq
+    apt-get -yq install \
+        dkms \
+        nvidia-open-kernel-dkms \
+        nvidia-open-kernel-support \
+        nvidia-smi \
+        libglvnd0 \
+        libcuda1
+    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
+    return 0  # returns before the gpu-driver-complete sentinel is written
+  fi
+
+  # OS driver packages do not produce reliable driver ; use runfile
+  install_nvidia_userspace_runfile
+
+  build_driver_from_github
+
+  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+  touch "${workdir}/gpu-driver-complete"
+}
+
+function install_ops_agent(){  # install the Google Cloud Ops Agent via Google's repo script; skipped once the sentinel exists
+  if test -f "${workdir}/ops-agent-complete" ; then return ; fi  # already done on a previous run
+
+  mkdir -p /opt/google
+  cd /opt/google  # curl -O below saves the script into this directory
+  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
+  curl -fsSO --retry-connrefused --retry 10 --retry-max-time 30 https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh  # -f: fail on HTTP errors instead of saving the error page as the script
+  execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
+
+  touch "${workdir}/ops-agent-complete"
+}
+
+# Install the GPU utilization agent as a systemd service; it collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
+function install_gpu_agent() {
+  # Stackdriver GPU agent parameters
+#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
+  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
+  if ( ! command -v pip && is_debuntu ) ; then  # pip missing on a debian/ubuntu host: install python3-pip
+    execute_with_retries "apt-get install -y -qq python3-pip"
+  fi
+  local install_dir=/opt/gpu-utilization-agent
+  mkdir -p "${install_dir}"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
+    | sed -e 's/-u --format=/--format=/' \
+    | dd status=none of="${install_dir}/report_gpu_metrics.py"  # the sed stage drops the "-u " that precedes "--format=" in the downloaded script
+  local venv="${install_dir}/venv"
+  python3 -m venv "${venv}"
+(  # subshell: the venv activation below does not affect the rest of this script
+  source "${venv}/bin/activate"
+  python3 -m pip install --upgrade pip
+  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
+)
+  sync
+
+  # Generate GPU service. The heredoc delimiter is unquoted, so ${venv} and ${install_dir} are expanded when the unit is written.
+  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
+[Unit]
+Description=GPU Utilization Metric Agent
+
+[Service]
+Type=simple
+PIDFile=/run/gpu_agent.pid
+ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
+User=root
+Group=root
+WorkingDirectory=/
+Restart=always
+
+[Install]
+WantedBy=multi-user.target
+EOF
+  # Reload systemd manager configuration
+  systemctl daemon-reload
+  # Enable gpu-utilization-agent service
+  systemctl --no-reload --now enable gpu-utilization-agent.service
+}
+
+function configure_gpu_exclusive_mode() {  # set the EXCLUSIVE_PROCESS compute mode unless spark-submit reports version 3.x
+  # check if running spark 3, if not, enable GPU exclusive mode
+  local spark_version
+  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)  # first "<major>.<digit>" that follows the word "version"
+  if [[ ${spark_version} != 3.* ]]; then  # pattern match: anything that does not start with "3."
+    # include exclusive mode on GPU
+    nvidia-smi -c EXCLUSIVE_PROCESS
+  fi
+}
+
+function fetch_mig_scripts() {  # download the two MIG helper scripts into /usr/local/yarn-mig-scripts
+  mkdir -p /usr/local/yarn-mig-scripts
+  sudo chmod 755 /usr/local/yarn-mig-scripts
+  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
+  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
+  sudo chmod 755 /usr/local/yarn-mig-scripts/*  # mode 755 on everything in the directory, including the two downloads
+}
+
+function configure_gpu_script() {
+  # Write the GPU discovery script and, for Spark >= 3.0, append RAPIDS defaults to spark-defaults.conf
+  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
+  mkdir -p ${spark_gpu_script_dir}
+  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
+  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
+  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
+  local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh"
+  cat > "${gpus_resources_script}" <<'EOF'
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
+
+echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
+EOF
+
+  chmod a+rx "${gpus_resources_script}"
+
+  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
+  if version_ge "${SPARK_VERSION}" "3.0" ; then
+    local gpu_count
+    gpu_count="$(lspci | grep NVIDIA | wc -l)"  # number of lspci lines that mention NVIDIA
+    local executor_cores
+    executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"  # 75% of nproc, rounded down to an even number
+    local executor_memory_gb  # was "local executor_memory", which left executor_memory_gb outside the function's scope
+    executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"  # 75% of MemFree (kB), in whole GiB
+    local task_cpus=2
+    local gpu_amount
+    gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"  # NOTE(review): divides by zero when executor_cores is 0 (nproc <= 2)
+
+    cat >>"${spark_defaults_conf}" <<EOF
+###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
+# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
+# query explain output won't show GPU operator, if the user has doubts
+# they can uncomment the line before seeing the GPU plan explain;
+# having AQE enabled gives user the best performance.
+spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
+spark.executor.resource.gpu.amount=${gpu_count}
+spark.executor.cores=${executor_cores}
+spark.executor.memory=${executor_memory_gb}G
+spark.dynamicAllocation.enabled=false
+# please update this config according to your application
+spark.task.resource.gpu.amount=${gpu_amount}
+spark.task.cpus=${task_cpus}
+spark.yarn.unmanagedAM.enabled=false
+###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
+EOF
+  fi
+}
+
+function configure_gpu_isolation() {  # edit container-executor.cfg for GPU isolation and install a cgroup-permissions systemd unit
+  # enable GPU isolation: set the container-executor group to yarn in container-executor.cfg
+  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
+  if [[ $IS_MIG_ENABLED -ne 0 ]]; then  # IS_MIG_ENABLED and MIG_MAJOR_CAPS are not assigned in this function
+    # configure the container-executor.cfg to have major caps
+    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
+    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
+  else
+    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+  fi
+
+  # Configure a systemd unit to ensure that permissions are set on restart
+  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
+[Unit]
+Description=Set permissions to allow YARN to access device directories
+
+[Service]
+ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+  systemctl enable dataproc-cgroup-device-permissions  # enabled for later boots, then started now
+  systemctl start dataproc-cgroup-device-permissions
+}
+
+function nvsmi() {  # nvidia-smi wrapper: returns 0 with a stderr note when the tool is absent or failing; caches "-L" output
+  local nvsmi="/usr/bin/nvidia-smi"
+  if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
+  elif [[ ! -f "${nvsmi}" ]]         ; then echo "nvidia-smi not installed" >&2 ; return 0
+  elif ! "${nvsmi}" > /dev/null      ; then echo "nvidia-smi fails" >&2 ; return 0
+  else nvsmi_works="1" ; fi  # remember the successful probe so later calls skip it
+
+  if [[ "${1:-}" == "-L" ]] ; then  # ${1:-}: a call with no arguments must not trip set -u
+    local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
+    if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
+    else "${nvsmi}" "$@" | tee "${NV_SMI_L_CACHE_FILE}" ; fi
+
+    return 0
+  fi
+
+  "${nvsmi}" "$@"  # "$@" passes each argument through unchanged
+}
+
+function install_build_dependencies() {  # install the compiler and kernel headers used to build the kernel module
+  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi  # already done on a previous run
+
+  if is_debuntu ; then
+    if is_ubuntu22 && is_cuda12 ; then
+      # On ubuntu22, the default compiler does not build some kernel module versions
+      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
+      execute_with_retries apt-get install -y -qq gcc-12
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-12
+    fi
+
+  elif is_rocky ; then
+    execute_with_retries dnf -y -q install gcc
+
+    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
+    set +e  # a failed first attempt is inspected below rather than aborting the script
+    eval "${dnf_cmd}" > "${install_log}" 2>&1
+    local retval="$?"
+    set -e
+
+    if [[ "${retval}" == "0" ]] ; then touch "${workdir}/build-dependencies-complete" ; return ; fi  # record success before the early return
+
+    if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
+      # this kernel-devel may have been migrated to the vault
+      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
+      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
+      dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
+        "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
+       )"
+    fi
+
+    execute_with_retries "${dnf_cmd}"  # either the vault command built above or a retry of the original one
+  fi
+  touch "${workdir}/build-dependencies-complete"
+}
+
+function install_dependencies() {  # install pciutils and screen with the OS package manager
+  pkg_list="pciutils screen"  # NOTE(review): not declared local; expanded unquoted below so it splits into two package names
+  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
+  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
+}
+
+function prepare_gpu_env(){
+  # Verify SPARK compatability
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+  readonly SPARK_VERSION
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    exit 1
+  fi
+
+  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
+  nvsmi_works="0"
+
+  if   is_cuda11 ; then gcc_ver="11"
+  elif is_cuda12 ; then gcc_ver="12" ; fi
+}
diff --git a/templates/legal/license_header b/templates/legal/license_header
new file mode 100644
index 000000000..4c05ecc74
--- /dev/null
+++ b/templates/legal/license_header
@@ -0,0 +1,11 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/templates/secure-boot/util_functions b/templates/secure-boot/util_functions
new file mode 100644
index 000000000..f96a48200
--- /dev/null
+++ b/templates/secure-boot/util_functions
@@ -0,0 +1,105 @@
+function configure_dkms_certs() {  # stage the module signing key and cert under CA_TMPDIR and the MOK paths
+  if [[ -z "${PSN:-}" ]]; then  # unset or empty PSN: no signing secret was configured
+      echo "No signing secret provided.  skipping";
+      return 0
+  fi
+
+  mkdir -p "${CA_TMPDIR}"
+
+  # If the private key exists, verify it
+  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
+    echo "Private key material exists"
+
+    local expected_modulus_md5sum
+    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
+      modulus_md5sum="${expected_modulus_md5sum}"
+
+      # Verify that the private key modulus md5sum matches the expected md5sum (a mismatch is only reported)
+      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key"
+      fi
+
+      # Verify that the x509 cert modulus md5sum matches the expected md5sum (a mismatch is only reported)
+      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in "${mok_der}" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert"
+      fi
+    else
+      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
+    fi
+    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+
+    return
+  fi
+
+  # Retrieve cloud secrets keys
+  local sig_priv_secret_name
+  sig_priv_secret_name="${PSN}"
+  local sig_pub_secret_name
+  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
+  local sig_secret_project
+  sig_secret_project="$(get_metadata_attribute secret_project)"
+  local sig_secret_version
+  sig_secret_version="$(get_metadata_attribute secret_version)"
+
+  # If metadata values are not set, do not write mok keys
+  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
+
+  # Write private material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_priv_secret_name}" \
+      | dd status=none of="${CA_TMPDIR}/db.rsa"
+
+  # Write public material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_pub_secret_name}" \
+      | base64 --decode \
+      | dd status=none of="${CA_TMPDIR}/db.der"
+
+  local mok_directory="$(dirname "${mok_key}")"
+  mkdir -p "${mok_directory}"
+
+  # symlink private key and copy public cert from volatile storage to DKMS directory
+  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"  # not local: other functions test modulus_md5sum
+}
+
+function clear_dkms_key {  # delete the key material that configure_dkms_certs staged
+  if [[ -z "${PSN}" ]]; then  # no signing secret configured: nothing was staged
+      echo "No signing secret provided.  skipping" >&2
+      return 0
+  fi
+  rm -rf "${CA_TMPDIR}" "${mok_key}"  # removes the CA_TMPDIR directory and the mok_key path
+}
+
+function check_secure_boot() {  # read the Secure Boot state, validate signing prerequisites, then call configure_dkms_certs
+  local SECURE_BOOT="disabled"
+  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') || SECURE_BOOT="disabled"  # keep the default when mokutil is missing or fails
+
+  PSN="$(get_metadata_attribute private_secret_name)"
+  readonly PSN
+
+  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
+    exit 1
+  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
+    echo "Secure boot is enabled, but no signing material provided."
+    echo "Please either disable secure boot or provide signing material as per"
+    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+    return 1
+  fi
+
+  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"  # -u only prints a name; configure_dkms_certs creates the directory
+  readonly CA_TMPDIR
+
+  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
+                      mok_der=/var/lib/shim-signed/mok/MOK.der
+                 else mok_key=/var/lib/dkms/mok.key
+                      mok_der=/var/lib/dkms/mok.pub ; fi
+
+  configure_dkms_certs
+}

From 1dae02baddd6dfe86f2b131dee816b052cda53ef Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 19 Dec 2024 19:17:12 -0800
Subject: [PATCH 002/130] new hold nvidia packages function ; moved variable
 definition around a bit

---
 templates/common/util_functions        | 20 ++++++++++++++++++++
 templates/gpu/install_gpu_driver.sh.in | 13 +------------
 templates/gpu/util_functions           | 17 ++++++++++-------
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 5b85cad65..df84feff5 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -355,6 +355,26 @@ function check_os() {
       echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
       exit 1
   fi
+
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+  readonly SPARK_VERSION
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    exit 1
+  fi
+
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
 }
 
 readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index e4924f51e..23ae59d8f 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -134,6 +134,7 @@ function exit_handler() {
     # re-hold systemd package
     if ge_debian12 ; then
     apt-mark hold systemd libsystemd0 ; fi
+    hold_nvidia_packages
   else
     dnf clean all
   fi
@@ -232,18 +233,6 @@ function prepare_to_install(){
 
   readonly install_log="${tmpdir}/install.log"
 
-  # Detect dataproc image version
-  if (! test -v DATAPROC_IMAGE_VERSION) ; then
-    if test -v DATAPROC_VERSION ; then
-      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
-    else
-      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
-      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
-      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
-      else echo "Unknown dataproc image version" ; exit 1 ; fi
-    fi
-  fi
-
   if test -f "${workdir}/prepare-complete" ; then return ; fi
 
   repair_old_backports
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 5727da537..17e38f8ca 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1176,13 +1176,6 @@ function install_dependencies() {
 function prepare_gpu_env(){
   # Verify SPARK compatability
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
-  readonly SPARK_VERSION
-  if version_lt "${SPARK_VERSION}" "3.1" || \
-     version_ge "${SPARK_VERSION}" "4.0" ; then
-    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
-    exit 1
-  fi
 
   readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
   nvsmi_works="0"
@@ -1190,3 +1183,13 @@ function prepare_gpu_env(){
   if   is_cuda11 ; then gcc_ver="11"
   elif is_cuda12 ; then gcc_ver="12" ; fi
 }
+
+# Hold all NVIDIA-related packages so they are not upgraded unintentionally, e.g. by services like unattended-upgrades
+# Users should run apt-mark unhold before they wish to upgrade these packages
+function hold_nvidia_packages() {
+  apt-mark hold 'nvidia-*'  # quoted: apt-mark must receive the pattern, not files in the cwd that happen to match
+  apt-mark hold 'libnvidia-*'
+  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
+    apt-mark hold xserver-xorg-video-nvidia*
+  fi
+}

From e97e376b528d403e159d015332ac15d05f649f2d Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 19 Dec 2024 19:34:32 -0800
Subject: [PATCH 003/130] added two new gpu functions: configure_mig_cgi and
 enable_mig

---
 templates/gpu/util_functions | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 17e38f8ca..eb7584745 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1193,3 +1193,17 @@ function hold_nvidia_packages() {
     apt-mark hold xserver-xorg-video-nvidia*
   fi
 }
+
+function configure_mig_cgi() {  # create MIG instances with "nvidia-smi mig -cgi <profiles> -C", profiles taken from metadata
+  # Query the MIG_CGI attribute once; a failed lookup or an empty value selects the default profiles below
+  if META_MIG_CGI_VALUE="$(/usr/share/google/get_metadata_value attributes/MIG_CGI)" && [[ -n "${META_MIG_CGI_VALUE}" ]]; then
+    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
+  else
+    # Dataproc only supports A100's right now split in 2 if not specified
+    nvidia-smi mig -cgi 9,9  -C
+  fi
+}
+
+function enable_mig() {  # request MIG mode through nvidia-smi
+  nvidia-smi -mig 1  # 1 = enable; no -i option is passed to select a specific GPU
+}

From 310bb9d10eb47db1be705273c8f001b1491538a7 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 19 Dec 2024 19:34:58 -0800
Subject: [PATCH 004/130] templatized version of mig.sh

---
 templates/spark-rapids/mig.sh.in | 240 +++++++++++++++++++++++++++++++
 1 file changed, 240 insertions(+)
 create mode 100644 templates/spark-rapids/mig.sh.in

diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in
new file mode 100644
index 000000000..f77b232fa
--- /dev/null
+++ b/templates/spark-rapids/mig.sh.in
@@ -0,0 +1,240 @@
+#!/bin/bash
+#
+[% template_path="spark-rapids/mig.sh.in" %]
+[% INSERT legal/license_header %]
+#
+# This script should be specified in --metadata=startup-script-url= option and
+# --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it.
+# The script does a reboot to fully enable MIG and then configures the MIG device based on the
+# user specified MIG_CGI profiles specified via: --metadata=^:^MIG_CGI='9,9'. If MIG_CGI
+# is not specified it assumes it's using an A100 and configures 2 instances with profile id 9.
+# It is assumed this script is used in conjunction with install_gpu_driver.sh, which does the
+# YARN setup to fully utilize the MIG instances on YARN.
+#
+[% PROCESS common/template_disclaimer %]
+#
+# This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
+
+set -euxo pipefail
+
+[% INSERT common/util_functions %]
+
+[% INSERT gpu/util_functions %]
+
+[% INSERT 'secure-boot/util_functions' %]
+
+function exit_handler() {  # EXIT trap: purge key material, clean caches, report disk usage
+  # Purge private key material until next grant
+  clear_dkms_key
+
+  set +ex  # from here on a failing command no longer aborts the handler, and tracing is off
+  echo "Exit handler invoked"
+
+  # Clear pip cache
+  pip cache purge || echo "unable to purge pip cache"
+
+  # If system memory was sufficient to mount memory-backed filesystems
+  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then  # mounted tmpfs that is not listed in fstab
+        umount -f ${shmdir}
+      fi
+    done
+
+    # restart services stopped during preparation stage
+    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+  fi
+
+  if is_debuntu ; then
+    # Clean up OS package cache
+    apt-get -y -qq clean
+    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
+    # re-hold systemd package
+    if ge_debian12 ; then
+    apt-mark hold systemd libsystemd0 ; fi
+    hold_nvidia_packages
+  else
+    dnf clean all
+  fi
+
+  # print disk usage statistics for large components
+  if is_ubuntu ; then
+    du -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3 | sort -h
+  elif is_debian ; then
+    du -x -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /var/lib/{docker,mysql,} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+      /usr/bin \
+      /usr \
+      /var \
+      / 2>/dev/null | sort -h
+  else
+    du -hs \
+      /var/lib/docker \
+      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
+      /usr/lib64/google-cloud-sdk \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3
+  fi
+
+  # Process disk usage logs from installation period
+  rm -f /run/keep-running-df  # the sampling loop started by prepare_to_install stops once this file is gone
+  sync
+  sleep 5.01s
+  # compute maximum size of disk during installation (samples sorted numerically, largest first)
+  # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+  df / | tee -a "/run/disk-usage.log"
+
+  perl -e '@siz=( sort { $b <=> $a }
+                   map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } <STDIN> );
+$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+print( "    samples-taken: ", scalar @siz, $/,
+       "maximum-disk-used: $max", $/,
+       "minimum-disk-used: $min", $/,
+       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+
+  echo "exit_handler has completed"
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    dd if=/dev/zero of=/zero  # writes zeroes until the disk is full; the file is removed right after
+    sync
+    sleep 3s
+    rm -f /zero
+  fi
+
+  return 0
+}
+
+function prepare_to_install(){  # one-time environment setup: checks, globals, EXIT trap, package cleanup, disk-usage sampling
+  # Verify OS compatibility and Secure boot state
+  check_os
+  check_secure_boot
+
+  prepare_gpu_env
+
+  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"  # distributor id, lower-cased
+  readonly OS_NAME
+
+  # node role
+  ROLE="$(get_metadata_attribute dataproc-role)"
+  readonly ROLE
+
+  workdir=/opt/install-dpgce  # holds the *-complete sentinel files
+  tmpdir=/tmp/  # NOTE(review): exit_handler compares tmpdir with /mnt/shm, so mount_ramdisk below presumably may change it; confirm
+  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly temp_bucket
+  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  uname_r=$(uname -r)
+  readonly uname_r
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  export DEBIAN_FRONTEND=noninteractive
+
+  mkdir -p "${workdir}"
+  trap exit_handler EXIT  # exit_handler runs on every exit path from here on
+  set_proxy
+  mount_ramdisk
+
+  readonly install_log="${tmpdir}/install.log"
+
+  if test -f "${workdir}/prepare-complete" ; then return ; fi  # the steps below already ran on a previous invocation
+
+  repair_old_backports
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+    if ge_debian12 ; then
+    apt-mark unhold systemd libsystemd0 ; fi
+    hold_nvidia_packages
+  else
+    dnf clean all
+  fi
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
+    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
+  ) fi
+
+  install_dependencies
+
+  # Monitor disk usage in a screen session
+  df / > "/run/disk-usage.log"
+  touch "/run/keep-running-df"  # the loop below keeps sampling while this file exists
+  screen -d -m -LUS keep-running-df \
+    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+
+  touch "${workdir}/prepare-complete"
+}
+
+function main() {  # install the driver if needed, enable MIG and create the MIG instances
+  # default MIG to on when this script is used
+  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
+
+  if (lspci | grep -q NVIDIA); then  # nothing is done on hosts without an NVIDIA device
+    if [[ $META_MIG_VALUE -ne 0 ]]; then
+      # if the first invocation, the NVIDIA drivers and tools are not installed
+      if [[ -f "/usr/bin/nvidia-smi" ]]; then
+        # check to see if we already enabled mig mode and rebooted so we don't end
+        # up in infinite reboot loop
+        NUM_GPUS_WITH_DIFF_MIG_MODES="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l)"
+        if [[ ${NUM_GPUS_WITH_DIFF_MIG_MODES} -eq 1 ]]; then  # every GPU reports the same MIG mode
+          if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
+            echo "MIG is enabled on all GPUs, configuring instances"
+            configure_mig_cgi
+            exit 0
+          else
+            echo "GPUs present but MIG is not enabled"
+          fi
+        else
+          echo "More than 1 GPU with MIG configured differently between them"
+        fi
+      fi
+    fi
+
+    install_nvidia_gpu_driver
+
+    if [[ ${META_MIG_VALUE} -ne 0 ]]; then
+      enable_mig
+      NUM_GPUS_WITH_DIFF_MIG_MODES="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l)"
+      if [[ ${NUM_GPUS_WITH_DIFF_MIG_MODES} -eq 1 ]]; then  # same check as above, now after enable_mig
+        if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
+          echo "MIG is fully enabled, we don't need to reboot"
+          configure_mig_cgi
+        else
+          echo "MIG is configured on but NOT enabled.  Failing"
+          exit 1
+        fi
+      else
+        echo "MIG is NOT enabled all on GPUs.  Failing"
+        exit 1
+      fi
+    else
+      echo "Not enabling MIG"
+    fi
+  fi
+}
+
+prepare_to_install  # runs first: it sets workdir/tmpdir and installs the EXIT trap
+
+main  # GPU driver installation and MIG configuration

From 912ebe7f44cdbeae3a41dbf1e49a26ebd6e83254 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 19 Dec 2024 19:43:38 -0800
Subject: [PATCH 005/130] comment fix-up

---
 templates/spark-rapids/mig.sh.in | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in
index f77b232fa..815065965 100644
--- a/templates/spark-rapids/mig.sh.in
+++ b/templates/spark-rapids/mig.sh.in
@@ -2,6 +2,7 @@
 #
 [% template_path="spark-rapids/mig.sh.in" %]
 [% INSERT legal/license_header %]
+# This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures.
 #
 # This script should be specified in --metadata=startup-script-url= option and
 # --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it.
@@ -12,8 +13,6 @@
 # YARN setup to fully utilize the MIG instances on YARN.
 #
 [% PROCESS common/template_disclaimer %]
-#
-# This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
 
 set -euxo pipefail
 

From 87965de1b6af8e7599d149ff141fdccd66028e90 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 20 Dec 2024 15:40:51 -0800
Subject: [PATCH 006/130] nvidia-container-toolkit repo setup changes are
 working on rocky8

---
 templates/common/util_functions        | 97 +++++++++++++++++++++-----
 templates/gpu/install_gpu_driver.sh.in | 11 ++-
 templates/gpu/util_functions           | 32 ++++-----
 3 files changed, 99 insertions(+), 41 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index df84feff5..8cc3ede9e 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -7,25 +7,30 @@ function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
 function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
 
-readonly -A supported_os=(
-  ['debian']="10 11 12"
-  ['rocky']="8 9"
-  ['ubuntu']="18.04 20.04 22.04"
-)
-
-# dynamically define OS version test utility functions
-if [[ "$(os_id)" == "rocky" ]];
-then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
-else _os_version="$(os_version)"; fi
-for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
-  eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
-
-  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
-    eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
-    eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
-    eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+function define_os_comparison_functions() {
+
+  readonly -A supported_os=(
+    ['debian']="10 11 12"
+    ['rocky']="8 9"
+    ['ubuntu']="18.04 20.04 22.04"
+  )
+
+  # dynamically define OS version test utility functions
+  if [[ "$(os_id)" == "rocky" ]];
+  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
+  else _os_version="$(os_version)"; fi
+  for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
+    eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
+
+    for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
+      eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
+      eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
+      eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+    done
   done
-done
+}
+
+define_os_comparison_functions
 
 function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
 
@@ -132,6 +137,7 @@ function cache_fetched_package() {
 }
 
 function add_contrib_component() {
+  if ! is_debuntu ; then return ; fi
   if ge_debian12 ; then
       # Include in sources file components on which nvidia-kernel-open-dkms depends
       local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
@@ -377,6 +383,61 @@ function check_os() {
   fi
 }
 
+#
+# Generate repo file under /etc/apt/sources.list.d/
+#
+function apt_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local -r include_src="${4:-yes}"
+  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
+
+  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
+  if [[ "${include_src}" == "yes" ]] ; then
+    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
+  fi
+}
+
+#
+# Generate repo file under /etc/yum.repos.d/
+#
+function dnf_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
+  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
+
+  curl -s -L "${repo_url}" \
+    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
+    | dd of="${repo_path}" status=progress
+}
+
+#
+# Install package signing key and add corresponding repository
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+#
+# Keyrings default to
+# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
+# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
+#
+function os_add_repo() {
+  local -r repo_name="$1"
+  local -r signing_key_url="$2"
+  local kr_path
+  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
+
+  mkdir -p "$(dirname "${kr_path}")"
+
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
+    | gpg --import --no-default-keyring --keyring "${kr_path}"
+
+  if is_debuntu ; then apt_add_repo $*
+                  else dnf_add_repo $* ; fi
+}
+
+
 readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
 
 # Dataproc configurations
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 23ae59d8f..c52f79675 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -93,12 +93,11 @@ function main() {
   fi
 
   # Restart YARN services if they are running already
-  if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then
-    systemctl restart hadoop-yarn-resourcemanager.service
-  fi
-  if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then
-    systemctl restart hadoop-yarn-nodemanager.service
-  fi
+  for svc in resourcemanager nodemanager; do
+    if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
+      systemctl restart hadoop-yarn-${svc}.service
+    fi
+  done
 }
 
 function exit_handler() {
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index eb7584745..7faed760c 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -70,6 +70,10 @@ function set_cuda_version() {
   readonly DEFAULT_CUDA_VERSION
 
   CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
+  if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then
+    CUDA_FULL_VERSION="${CUDA_VERSION}"
+    CUDA_VERSION="${CUDA_VERSION%.*}"
+  fi
   readonly CUDA_VERSION
   if ( ! test -v CUDA_FULL_VERSION ) ; then
     CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
@@ -614,23 +618,17 @@ function add_nonfree_components() {
 }
 
 function add_repo_nvidia_container_toolkit() {
-  if is_debuntu ; then
-      local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
-      local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list
-      # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
-      test -f "${kr_path}" ||
-        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
-          | gpg --dearmor -o "${kr_path}"
-
-      test -f "${sources_list_path}" ||
-        curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
-          | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \
-          | tee "${sources_list_path}"
-      apt-get update
-  else
-    curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \
-      tee /etc/yum.repos.d/nvidia-container-toolkit.repo
-  fi
+  local nvctk_root="https://nvidia.github.io/libnvidia-container"
+  local signing_key_url="${nvctk_root}/gpgkey"
+  local repo_data
+
+  if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
+                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
+
+  os_add_repo nvidia-container-toolkit \
+              "${signing_key_url}" \
+              "${repo_data}" \
+              "no"
 }
 
 function add_repo_cuda() {

From 93fe4cc6242b1c06df8920b8ee289954c23321f2 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 20 Dec 2024 16:15:32 -0800
Subject: [PATCH 007/130] defining variables in the generator script instead of
 duplicating in the root template ; do not hold nvidia packages in the prepare
 function

---
 templates/generate-action.pl           | 19 +++++++++----------
 templates/gpu/install_gpu_driver.sh.in |  1 -
 templates/spark-rapids/mig.sh.in       |  2 --
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/templates/generate-action.pl b/templates/generate-action.pl
index 407dfe310..7cc954a67 100644
--- a/templates/generate-action.pl
+++ b/templates/generate-action.pl
@@ -6,20 +6,19 @@
 
 use Template;
 use strict;
-use v5.10;
+
+my $action = $ARGV[0];
+my $v = { template_path => "${action}.in" };
+
+sub usage{ die "Usage: $0 <action>" }
+
+usage unless( $action && -f "$ENV{PWD}/templates/$v->{template_path}" );
 
 my $tt = Template->new( {
   INCLUDE_PATH => "$ENV{PWD}/templates",
+  VARIABLES => $v,
   INTERPOLATE  => 0,
 }) || die "$Template::ERROR$/";
 
-my $action = $ARGV[0];
-
-sub usage{
-  die "Usage: $0 <action>";
-}
-
-usage unless( -f "$ENV{PWD}/templates/${action}.in" );
 
-$tt->process("${action}.in")
-    || die $tt->error(), "\n";
+$tt->process($v->{template_path}) or die( $tt->error(), "\n" );
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index c52f79675..a5d4172dd 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -1,6 +1,5 @@
 #!/bin/bash
 #
-[% template_path="gpu/install_gpu_driver.sh.in" %]
 [% INSERT legal/license_header %]
 #
 [% PROCESS common/template_disclaimer %]
diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in
index 815065965..fff1186dc 100644
--- a/templates/spark-rapids/mig.sh.in
+++ b/templates/spark-rapids/mig.sh.in
@@ -1,6 +1,5 @@
 #!/bin/bash
 #
-[% template_path="spark-rapids/mig.sh.in" %]
 [% INSERT legal/license_header %]
 # This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures.
 #
@@ -165,7 +164,6 @@ function prepare_to_install(){
     apt-get -o DPkg::Lock::Timeout=60 -y autoremove
     if ge_debian12 ; then
     apt-mark unhold systemd libsystemd0 ; fi
-    hold_nvidia_packages
   else
     dnf clean all
   fi

From b82aadc2e18e8905f3e2c71f1802c5abfb4f9a6e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 20 Dec 2024 17:36:16 -0800
Subject: [PATCH 008/130] tested with debian12

---
 templates/common/util_functions | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 8cc3ede9e..b777968e5 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -397,6 +397,8 @@ function apt_add_repo() {
   if [[ "${include_src}" == "yes" ]] ; then
     echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
   fi
+
+  apt-get update -qq
 }
 
 #
@@ -424,6 +426,7 @@ function dnf_add_repo() {
 function os_add_repo() {
   local -r repo_name="$1"
   local -r signing_key_url="$2"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
   local kr_path
   if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
                   else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
@@ -433,8 +436,8 @@ function os_add_repo() {
   curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
     | gpg --import --no-default-keyring --keyring "${kr_path}"
 
-  if is_debuntu ; then apt_add_repo $*
-                  else dnf_add_repo $* ; fi
+  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
+                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
 }
 
 

From dd98436cd728ce5e1366a6d3e602b4231024a105 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 20 Dec 2024 20:57:34 -0800
Subject: [PATCH 009/130] tested on 8x H100s with bookworm

---
 templates/gpu/util_functions | 53 ++++++++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 7faed760c..f2f3e2a9c 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1192,13 +1192,56 @@ function hold_nvidia_packages() {
   fi
 }
 
+function delete_mig_instances() (
+  # Best-effort removal of all MIG compute instances, then all GPU instances.
+  set +e  # nvidia-smi exits non-zero when nothing exists to delete; report it, don't abort
+  nvidia-smi mig -dci
+
+  case "${?}" in
+    "0" ) echo "compute instances deleted"            ;;
+    "2" ) echo "invalid argument"                     ;;
+    "6" ) echo "No compute instances found to delete" ;;
+    *   ) echo "unrecognized return code"             ;;
+  esac
+
+  nvidia-smi mig -dgi
+  case "${?}" in
+    "0" ) echo "GPU instances deleted"            ;;
+    "2" ) echo "invalid argument"                 ;;
+    "6" ) echo "No GPU instances found to delete" ;;
+    *   ) echo "unrecognized return code"         ;;
+  esac
+)
+
+# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles
 function configure_mig_cgi() {
-  if (/usr/share/google/get_metadata_value attributes/MIG_CGI); then
-    META_MIG_CGI_VALUE=$(/usr/share/google/get_metadata_value attributes/MIG_CGI)
-    nvidia-smi mig -cgi $META_MIG_CGI_VALUE -C
+  delete_mig_instances
+  META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')"
+  if test -n "${META_MIG_CGI_VALUE}"; then
+    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
   else
-    # Dataproc only supports A100's right now split in 2 if not specified
-    nvidia-smi mig -cgi 9,9  -C
+    if lspci | grep -q H100 ; then
+      # run the following command to list placement profiles
+      # nvidia-smi mig -lgipp
+      #
+      # This is the result when using H100 instances on 20241220
+      # GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
+      # GPU  0 Profile ID 14 Placements: {0,2,4}:2
+      # GPU  0 Profile ID  9 Placements: {0,4}:4
+      # GPU  0 Profile ID  5 Placement : {0}:4
+      # GPU  0 Profile ID  0 Placement : {0}:8
+
+      # For H100 3D controllers, use profile 19, 7x1G instances
+      nvidia-smi mig -cgi 19 -C
+    elif lspci | grep -q A100 ; then
+      # Default for A100 when MIG_CGI is not specified: split in 2 (two profile-9 instances)
+      # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
+      nvidia-smi mig -cgi 9,9 -C
+    else
+      echo "unrecognized 3D controller"
+    fi
   fi
 }
 

From b4dabad7eaf0f05ae6a889a7538f9362affd1111 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 21 Dec 2024 18:13:58 -0800
Subject: [PATCH 010/130] created and called function enable_and_configure_mig

---
 spark-rapids/mig.sh              | 2201 ++++++++++++++++++++++++++----
 templates/spark-rapids/mig.sh.in |   68 +-
 2 files changed, 1965 insertions(+), 304 deletions(-)

diff --git a/spark-rapids/mig.sh b/spark-rapids/mig.sh
index 85300348d..473513438 100644
--- a/spark-rapids/mig.sh
+++ b/spark-rapids/mig.sh
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 # This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures.
+#
 # This script should be specified in --metadata=startup-script-url= option and
 # --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it.
 # The script does a reboot to fully enable MIG and then configures the MIG device based on the
@@ -21,370 +22,2030 @@
 # It is assumed this script is used in conjuntion with install_gpu_driver.sh, which does the
 # YARN setup to fully utilize the MIG instances on YARN.
 #
-# Much of this code is copied from install_gpu_driver.sh to do the driver and CUDA installation.
-# It's copied in order to not affect the existing scripts when not using MIG.
+# This initialization action is generated from
+# initialization-actions/templates/spark-rapids/mig.sh.in
+#
+# Modifications made directly to the generated file will be lost when
+# the template is re-evaluated
+
 
 set -euxo pipefail
 
-function get_metadata_attribute() {
-  local -r attribute_name=$1
-  local -r default_value=$2
-  /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
-}
-
-# Fetch Linux Family distro and Dataproc Image version
-readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
-readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
-DATAPROC_IMAGE_VERSION=$(/usr/share/google/get_metadata_value image|grep -Eo 'dataproc-[0-9]-[0-9]'|grep -Eo '[0-9]-[0-9]'|sed -e 's/-/./g')
-echo "${DATAPROC_IMAGE_VERSION}" >> /usr/local/share/startup-mig-log
-
-# CUDA version and Driver version config
-CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.2.2')  #12.2.2
-NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '535.104.05') #535.104.05
-CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.2
-
-# Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
-if [[ "${OS_NAME}" == "ubuntu" ]]; then
-    UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04
-    UBUNTU_VERSION=${UBUNTU_VERSION%.*}
-    if [[ "${UBUNTU_VERSION}" == "18" ]]; then
-      CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1')  #12.1.1
-      NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02
-      CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.1
-    fi
-fi
+function os_id()       ( set +x ;  grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; )
+function os_version()  ( set +x ;  grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; )
+function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; )
 
-SECURE_BOOT="disabled"
-SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
+function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
+function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
+function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
+function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
 
-function execute_with_retries() {
-  local -r cmd=$1
-  for ((i = 0; i < 10; i++)); do
-    if eval "$cmd"; then
-      return 0
-    fi
+function define_os_comparison_functions() {
+
+  readonly -A supported_os=(
+    ['debian']="10 11 12"
+    ['rocky']="8 9"
+    ['ubuntu']="18.04 20.04 22.04"
+  )
+
+  # dynamically define OS version test utility functions
+  if [[ "$(os_id)" == "rocky" ]];
+  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
+  else _os_version="$(os_version)"; fi
+  for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
+    eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
+
+    for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
+      eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
+      eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
+      eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+    done
+  done
+}
+
+define_os_comparison_functions
+
+function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
+
+function os_vercat()   ( set +x
+  if   is_ubuntu ; then os_version | sed -e 's/[^0-9]//g'
+  elif is_rocky  ; then os_version | sed -e 's/[^0-9].*$//g'
+                   else os_version ; fi ; )
+
+function repair_old_backports {
+  if ! is_debuntu ; then return ; fi
+  # This script uses 'apt-get update' and is therefore potentially dependent on
+  # backports repositories which have been archived.  In order to mitigate this
+  # problem, we will use archive.debian.org for the oldoldstable repo
+
+  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
+  debdists="https://deb.debian.org/debian/dists"
+  oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
+  oldstable=$(   curl -s "${debdists}/oldstable/Release"    | awk '/^Codename/ {print $2}');
+  stable=$(      curl -s "${debdists}/stable/Release"       | awk '/^Codename/ {print $2}');
+
+  matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )
+
+  for filename in "${matched_files[@]}"; do
+    # Fetch from archive.debian.org for ${oldoldstable}-backports
+    perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
+                  {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}"
+  done
+}
+
+function print_metadata_value() {
+  local readonly tmpfile=$(mktemp)
+  http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
+    -s -o ${tmpfile} 2>/dev/null)
+  local readonly return_code=$?
+  # If the command completed successfully, print the metadata value to stdout.
+  if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then
+    cat ${tmpfile}
+  fi
+  rm -f ${tmpfile}
+  return ${return_code}
+}
+
+function print_metadata_value_if_exists() {
+  local return_code=1
+  local readonly url=$1
+  print_metadata_value ${url}
+  return_code=$?
+  return ${return_code}
+}
+
+# replicates /usr/share/google/get_metadata_value
+function get_metadata_value() (
+  set +x
+  local readonly varname=$1
+  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
+  # Print the instance metadata value.
+  print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname}
+  return_code=$?
+  # If the instance doesn't have the value, try the project.
+  if [[ ${return_code} != 0 ]]; then
+    print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname}
+    return_code=$?
+  fi
+
+  return ${return_code}
+)
+
+function get_metadata_attribute() (
+  set +x
+  local -r attribute_name="$1"
+  local -r default_value="${2:-}"
+  get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
+)
+
+function execute_with_retries() (
+  set +x
+  local -r cmd="$*"
+
+  if [[ "$cmd" =~ "^apt-get install" ]] ; then
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+  fi
+  for ((i = 0; i < 3; i++)); do
+    set -x
+    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
+    set +x
+    if [[ $retval == 0 ]] ; then return 0 ; fi
     sleep 5
   done
   return 1
+)
+
+function cache_fetched_package() {
+  local src_url="$1"
+  local gcs_fn="$2"
+  local local_fn="$3"
+
+  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
+    time gcloud storage cp "${gcs_fn}" "${local_fn}"
+  else
+    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
+           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
+  fi
 }
 
-# Enables a systemd service on bootup to install new headers.
-# This service recompiles kernel modules for Ubuntu and Debian, which are necessary for the functioning of nvidia-smi.
-function setup_systemd_update_headers() {
-  cat <<EOF >/lib/systemd/system/install-headers.service
-[Unit]
-Description=Install Linux headers for the current kernel
-After=network-online.target
+function add_contrib_component() {
+  if ! is_debuntu ; then return ; fi
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-kernel-open-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib"
 
-[Service]
-ExecStart=/bin/bash -c 'count=0; while [ \$count -lt 3 ]; do /usr/bin/apt-get install -y -q linux-headers-\$(/bin/uname -r) && break; count=\$((count+1)); sleep 5; done'
-Type=oneshot
-RemainAfterExit=yes
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
+  fi
+}
 
-[Install]
-WantedBy=multi-user.target
-EOF
+function set_hadoop_property() {
+  local -r config_file=$1
+  local -r property=$2
+  local -r value=$3
+  "${bdcfg}" set_property \
+    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
+    --name "${property}" --value "${value}" \
+    --clobber
+}
 
-  # Reload systemd to recognize the new unit file
-  systemctl daemon-reload
+function configure_yarn_resources() {
+  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
+  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
+    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
+  fi
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
+
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
 
-  # Enable and start the service
-  systemctl enable --now install-headers.service
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
 }
 
-# Install NVIDIA GPU driver provided by NVIDIA
-function install_nvidia_gpu_driver() {
+# This configuration should be applied only if GPU is attached to the node
+function configure_yarn_nodemanager() {
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.container-executor.class' \
+    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
 
-  ## common steps for all linux family distros
-  readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*}
+  # Fix local dirs access permissions
+  local yarn_local_dirs=()
 
-  ## installation steps based OS_NAME
-  if [[ ${OS_NAME} == "debian" ]]; then
+  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
+    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
+    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
 
-    DEBIAN_VERSION=$(lsb_release -r|awk '{print $2}') # 10 or 11
-    export DEBIAN_FRONTEND=noninteractive
+  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
+    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
+  fi
+}
 
-    execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
+function clean_up_sources_lists() {
+  #
+  # bigtop (primary)
+  #
+  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
 
-    readonly LOCAL_INSTALLER_DEB="cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb"
-    curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-      "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb
+  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
+    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
 
-    dpkg -i /tmp/local-installer.deb
-    cp /var/cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/
-    add-apt-repository contrib
-    execute_with_retries "apt-get update"
+    local regional_bigtop_repo_uri
+    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
+      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
+      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
+      cut -d ' ' -f 2 |
+      head -1)
 
-    if [[ ${DEBIAN_VERSION} == 10 ]]; then
-      apt remove -y libglvnd0
+    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+    else
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
     fi
 
-    execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
-    execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"
+    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
+    rm -f "${bigtop_kr_path}"
+    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
 
-    # enable a systemd service that updates kernel headers after reboot
-    setup_systemd_update_headers
-   
-  elif [[ ${OS_NAME} == "ubuntu" ]]; then
+    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  fi
 
-    UBUNTU_VERSION=$(lsb_release -r|awk '{print $2}') # 20.04 or 22.04
-    UBUNTU_VERSION=${UBUNTU_VERSION%.*} # 20 or 22
+  #
+  # adoptium
+  #
+  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
+  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
+  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
+  rm -f "${adoptium_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+   | gpg --dearmor -o "${adoptium_kr_path}"
+  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
+   > /etc/apt/sources.list.d/adoptium.list
 
-    execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
 
-    readonly UBUNTU_REPO_CUDA_PIN="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-ubuntu${UBUNTU_VERSION}04.pin"
-    curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-      "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600
+  #
+  # docker
+  #
+  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
+  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
+  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
 
-    readonly LOCAL_INSTALLER_DEB="cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb"
-    curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-      "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb
+  rm -f "${docker_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+    | gpg --dearmor -o "${docker_kr_path}"
+  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
+    > ${docker_repo_file}
 
-    dpkg -i /tmp/local-installer.deb
-    cp /var/cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/
-    execute_with_retries "apt-get update"    
-    
-    execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
-    execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"
+  #
+  # google cloud + logging/monitoring
+  #
+  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
+    rm -f /usr/share/keyrings/cloud.google.gpg
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
+      list_file="/etc/apt/sources.list.d/${list}.list"
+      if [[ -f "${list_file}" ]]; then
+        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
+      fi
+    done
+  fi
 
-    # enable a systemd service that updates kernel headers after reboot
-    setup_systemd_update_headers
+  #
+  # cran-r
+  #
+  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
+    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
+    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
+    rm -f /usr/share/keyrings/cran-r.gpg
+    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
+    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
+  fi
 
-  elif [[ ${OS_NAME} == "rocky" ]]; then
+  #
+  # mysql
+  #
+  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
+    rm -f /usr/share/keyrings/mysql.gpg
+    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
+    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
+  fi
 
-    ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8.8 or 9.1
-    ROCKY_VERSION=${ROCKY_VERSION%.*} # 8 or 9
+  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
 
-    readonly NVIDIA_ROCKY_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/rhel${ROCKY_VERSION}/x86_64/cuda-rhel${ROCKY_VERSION}.repo"
-    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
-    execute_with_retries "dnf clean all"
-    execute_with_retries "dnf -y -q module install nvidia-driver:${NVIDIA_DRIVER_VERSION_PREFIX}"
-    execute_with_retries "dnf -y -q install cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"
-    modprobe nvidia
+}
+
+# Export proxy settings taken from the 'http-proxy' metadata attribute; does nothing when it is empty
+function set_proxy(){
+  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
+
+  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
+
+  export METADATA_HTTP_PROXY
+  export http_proxy="${METADATA_HTTP_PROXY}"
+  export https_proxy="${METADATA_HTTP_PROXY}"
+  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
+  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
+  local no_proxy_svc
+  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
+                      bigquery composer      pubsub bigquerydatatransfer dataflow \
+                      storage  datafusion    ; do
+    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
+  done
+  # Export both spellings, as with http_proxy/HTTP_PROXY above; some tools read only the lowercase name
+  export no_proxy NO_PROXY="${no_proxy}"
+}
+
+# Mount tmpfs filesystems for scratch files and for conda, pip and OS package downloads; returns early when MemFree is below 10500000
+function mount_ramdisk(){
+  local free_mem
+  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
+  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
 
+  # Write to a ramdisk instead of churning the persistent disk
+  # (tmpdir is assigned without 'local', so it remains set after this function returns)
+  tmpdir="/mnt/shm"
+  mkdir -p "${tmpdir}"
+  mount -t tmpfs tmpfs "${tmpdir}"
+  # Download conda packages to tmpfs
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+
+  # Clear pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
+  pip cache purge || echo "unable to purge pip cache"
+
+  # Download pip packages to tmpfs
+  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+
+  # Download OS packages to tmpfs
+  if is_debuntu ; then
+    mount -t tmpfs tmpfs /var/cache/apt/archives
   else
-    echo "Unsupported OS: '${OS_NAME}'"
+    mount -t tmpfs tmpfs /var/cache/dnf
+  fi
+}
+
+# Exit 1 on an unsupported OS release or Spark version; sets readonly SPARK_VERSION and, when unset, DATAPROC_IMAGE_VERSION
+function check_os() {
+  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
+      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+      exit 1
+  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
+      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
+      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
+  fi
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+  readonly SPARK_VERSION
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
     exit 1
   fi
-  ldconfig
-  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+
+  # Detect dataproc image version: keep DATAPROC_IMAGE_VERSION if set, else DATAPROC_VERSION, else infer it from the Spark version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
 }
 
-function enable_mig() {
-  nvidia-smi -mig 1
+# Generate repo file under /etc/apt/sources.list.d/ and then run 'apt-get update'.
+# Arguments: $1 repo name; $2 is not read by this function; $3 repo data; $4 "yes" (default) to
+# also write a deb-src entry; $5 keyring path; $6 repo file path ($5 and $6 default from the repo name)
+function apt_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local -r include_src="${4:-yes}"
+  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
+
+  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
+  if [[ "${include_src}" == "yes" ]] ; then
+    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
+  fi
+
+  apt-get update -qq
 }
 
-function configure_mig_cgi() {
-  if (/usr/share/google/get_metadata_value attributes/MIG_CGI); then
-    META_MIG_CGI_VALUE=$(/usr/share/google/get_metadata_value attributes/MIG_CGI)
-    nvidia-smi mig -cgi $META_MIG_CGI_VALUE -C
-  else
-    # Dataproc only supports A100's right now split in 2 if not specified
-    nvidia-smi mig -cgi 9,9  -C
+# Fetch a .repo file and store it under /etc/yum.repos.d/, rewriting every
+# line that starts with gpgkey= so that it points at the local keyring file.
+# Arguments: $1 repo name; $3 URL of the .repo file; $5 keyring path; $6 destination path
+function dnf_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_file_url="$3" # "http(s)://host/path/filename.repo"
+  local -r keyring_file="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
+  local -r repo_file="${6:-/etc/yum.repos.d/${repo_name}.repo}"
+
+  curl -s -L "${repo_file_url}" |
+    perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${keyring_file}}" |
+    dd of="${repo_file}" status=progress
+}
+
+#
+# Import a repository signing key into a keyring file, then register the
+# repository through apt_add_repo (debian/ubuntu) or dnf_add_repo (otherwise)
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+#
+# Keyring location when $5 is not given:
+#   /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu), /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL)
+#
+function os_add_repo() {
+  local -r repo_name="$1"
+  local -r key_url="$2"
+  local -r repo_spec="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local keyring_file="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
+  if is_debuntu ; then keyring_file="${5:-/usr/share/keyrings/${repo_name}.gpg}" ; fi
+
+  mkdir -p "$(dirname "${keyring_file}")"
+
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" |
+    gpg --import --no-default-keyring --keyring "${keyring_file}"
+
+  local add_repo_fn="dnf_add_repo"
+  if is_debuntu ; then add_repo_fn="apt_add_repo" ; fi
+  "${add_repo_fn}" "${repo_name}" "${key_url}" "${repo_spec}" "${4:-yes}" "${keyring_file}" "${6:-}"
+}
+
+
+readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+
+# Dataproc configurations
+readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+readonly HIVE_CONF_DIR='/etc/hive/conf'
+readonly SPARK_CONF_DIR='/etc/spark/conf'
+
+
+# Define the read-only version lookup tables: DRIVER_FOR_CUDA, DRIVER_SUBVER, CUDNN_FOR_CUDA, NCCL_FOR_CUDA, CUDA_SUBVER
+function set_support_matrix() {
+  # CUDA version and Driver version
+  # https://docs.nvidia.com/deploy/cuda-compatibility/
+  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+  # https://developer.nvidia.com/cuda-downloads
+
+  # Minimum supported version for open kernel driver is 515.43.04
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
+  # Rocky8: 12.0: 525.147.05
+  # The tables are static; nothing is fetched from the network here
+  readonly -A DRIVER_FOR_CUDA=(
+          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
+          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+  )
+  readonly -A DRIVER_SUBVER=(
+          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
+          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
+  )
+  # https://developer.nvidia.com/cudnn-downloads
+  if is_debuntu ; then
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
+          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+  )
+  elif is_rocky ; then
+  # rocky:
+  #   12.0: 8.8.1.3
+  #   12.1: 8.9.3.28
+  #   12.2: 8.9.7.29
+  #   12.3: 9.0.0.312
+  #   12.4: 9.1.1.17
+  #   12.5: 9.2.1.18
+  #   12.6: 9.5.1.17
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
+          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
+  )
   fi
+  # https://developer.nvidia.com/nccl/nccl-download
+  # 12.2: 2.19.3, 12.5: 2.21.5
+  readonly -A NCCL_FOR_CUDA=(
+          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
+          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
+  )
+  readonly -A CUDA_SUBVER=(
+          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
+          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
+  )
 }
 
-function upgrade_kernel() {
-  # Determine which kernel is installed
-  if [[ "${OS_NAME}" == "debian" ]]; then
-    CURRENT_KERNEL_VERSION=`cat /proc/version  | perl -ne 'print( / Debian (\S+) / )'`
-  elif [[ "${OS_NAME}" == "ubuntu" ]]; then
-    CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( /^Linux version (\S+) / )'`
-  elif [[ ${OS_NAME} == rocky ]]; then
-    KERN_VER=$(yum info --installed kernel | awk '/^Version/ {print $3}')
-    KERN_REL=$(yum info --installed kernel | awk '/^Release/ {print $3}')
-    CURRENT_KERNEL_VERSION="${KERN_VER}-${KERN_REL}"
-  else
-    echo "unsupported OS: ${OS_NAME}!"
-    exit -1
-  fi
-
-  # Get latest version available in repos
-  if [[ "${OS_NAME}" == "debian" ]]; then
-    apt-get -qq update
-    TARGET_VERSION=$(apt-cache show --no-all-versions linux-image-amd64 | awk '/^Version/ {print $2}')
-  elif [[ "${OS_NAME}" == "ubuntu" ]]; then
-    apt-get -qq update
-    LATEST_VERSION=$(apt-cache show --no-all-versions linux-image-gcp | awk '/^Version/ {print $2}')
-    TARGET_VERSION=`echo ${LATEST_VERSION} | perl -ne 'printf(q{%s-%s-gcp},/(\d+\.\d+\.\d+)\.(\d+)/)'`
-  elif [[ "${OS_NAME}" == "rocky" ]]; then
-    if yum info --available kernel ; then
-      KERN_VER=$(yum info --available kernel | awk '/^Version/ {print $3}')
-      KERN_REL=$(yum info --available kernel | awk '/^Release/ {print $3}')
-      TARGET_VERSION="${KERN_VER}-${KERN_REL}"
-    else
-      TARGET_VERSION="${CURRENT_KERNEL_VERSION}"
+set_support_matrix
+# Set readonly DEFAULT_CUDA_VERSION, CUDA_VERSION and CUDA_FULL_VERSION from the 'cuda-url' and 'cuda-version' metadata attributes (default 12.4)
+function set_cuda_version() {
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+  if [[ -n "${cuda_url}" ]] ; then
+    # if cuda-url metadata variable has been passed, extract default version from url
+    local CUDA_URL_VERSION
+    CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
+      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
+      CUDA_FULL_VERSION="${CUDA_URL_VERSION}"
     fi
   fi
 
-  # Skip this script if we are already on the target version
-  if [[ "${CURRENT_KERNEL_VERSION}" == "${TARGET_VERSION}" ]]; then
-    echo "target kernel version [${TARGET_VERSION}] is installed"
+  if ( ! test -v DEFAULT_CUDA_VERSION ) ; then
+    DEFAULT_CUDA_VERSION='12.4'
+  fi
+  readonly DEFAULT_CUDA_VERSION
 
-    # Reboot may have interrupted dpkg.  Bring package system to a good state
-    if [[ "${OS_NAME}" == "debian" || "${OS_NAME}" == "ubuntu" ]]; then
-      dpkg --configure -a
-    fi
+  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
+  if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then
+    CUDA_FULL_VERSION="${CUDA_VERSION}"
+    CUDA_VERSION="${CUDA_VERSION%.*}"
+  fi
+  readonly CUDA_VERSION
+  if ( ! test -v CUDA_FULL_VERSION ) ; then
+    CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
+  fi
+  readonly CUDA_FULL_VERSION
 
-    return 0
+}
+
+set_cuda_version
+
+function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
+function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
+function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )
+
+function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
+function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
+function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
+
+# Choose DRIVER_VERSION ('gpu-driver-version' metadata, else a default from gpu-driver-url, cuda-url or the CUDA version); exit 1 if no such driver can be downloaded
+function set_driver_version() {
+  local gpu_driver_url
+  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+
+  local DEFAULT_DRIVER
+  # Take default from gpu-driver-url metadata value (accepts two- and three-part versions, e.g. 550.135 or 535.216.01)
+  if [[ -n "${gpu_driver_url}" ]] ; then
+    DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+(?:\.\d+)?)\.run$}{$1}')"
+    if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
+  # Take default from cuda-url metadata value as a backup
+  elif [[ -n "${cuda_url}" ]] ; then
+    local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
+      major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
+      driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]:-} # empty when the major version is not in the table
+      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the version indicated by the cuda url as the default if it exists
+        DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
+      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the maximum sub-version available for the major version indicated in cuda url as the default
+        DEFAULT_DRIVER="${driver_max_maj_version}"
+      fi
+    fi
   fi
 
-  # Install the latest kernel
-  if [[ ${OS_NAME} == debian ]]; then
-    apt-get install -y linux-image-amd64
-  elif [[ "${OS_NAME}" == "ubuntu" ]]; then
-    apt-get install -y linux-image-gcp
-  elif [[ "${OS_NAME}" == "rocky" ]]; then
-    dnf -y -q install kernel
+  if ( ! test -v DEFAULT_DRIVER ) ; then
+    # If a default driver version has not been extracted, use the default for this version of CUDA
+    DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
   fi
 
-  # Make it possible to reboot before init actions are complete - #1033
-  DP_ROOT=/usr/local/share/google/dataproc
-  STARTUP_SCRIPT="${DP_ROOT}/startup-script.sh"
-  POST_HDFS_STARTUP_SCRIPT="${DP_ROOT}/post-hdfs-startup-script.sh"
+  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
 
-  for startup_script in ${STARTUP_SCRIPT} ${POST_HDFS_STARTUP_SCRIPT} ; do
-    sed -i -e 's:/usr/bin/env bash:/usr/bin/env bash\nexit 0:' ${startup_script}
-  done
+  readonly DRIVER_VERSION
+  readonly DRIVER="${DRIVER_VERSION%%.*}"
 
-  cp /var/log/dataproc-initialization-script-0.log /var/log/dataproc-initialization-script-0.log.0
+  export DRIVER_VERSION DRIVER
 
-  systemctl reboot
+  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
+    exit 1
+  fi
 }
 
-# Verify if compatible linux distros and secure boot options are used
-function check_os_and_secure_boot() {
-  if [[ "${OS_NAME}" == "debian" ]]; then
-    DEBIAN_VERSION=$(lsb_release -r | awk '{print $2}') # 10 or 11
-    if [[ "${DEBIAN_VERSION}" != "10" && "${DEBIAN_VERSION}" != "11" ]]; then
-      echo "Error: The Debian version (${DEBIAN_VERSION}) is not supported. Please use a compatible Debian version."
-      exit 1
-    fi
-  elif [[ "${OS_NAME}" == "ubuntu" ]]; then
-    UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04
-    UBUNTU_VERSION=${UBUNTU_VERSION%.*}
-    if [[ "${UBUNTU_VERSION}" != "18" && "${UBUNTU_VERSION}" != "20" && "${UBUNTU_VERSION}" != "22" ]]; then
-      echo "Error: The Ubuntu version (${UBUNTU_VERSION}) is not supported. Please use a compatible Ubuntu version."
-      exit 1
+set_driver_version
+
+readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+# Parameters for NVIDIA-provided cuDNN library
+readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
+function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
+# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
+if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
+  CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
+  # cuDNN v8 is not distributed for ubuntu20+, debian12
+  CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
+  # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
+  CUDNN_VERSION="8.8.0.121"
+fi
+readonly CUDNN_VERSION
+
+readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
+readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' "${DEFAULT_NCCL_VERSION}") # quoted so an empty default is still passed as the second argument
+
+# Parameters for NVIDIA-provided Debian GPU driver
+readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+
+readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+
+USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
+readonly USERSPACE_FILENAME
+
+# Short name for urls
+if is_ubuntu22  ; then
+    # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
+    # https://developer.download.nvidia.com/compute/machine-learning/repos/
+    # use packages from previous release until such time as nvidia
+    # release ubuntu2204 builds
+
+    shortname="$(os_id)$(os_vercat)"
+    nccl_shortname="ubuntu2004"
+elif ge_rocky9 ; then
+    # use packages from previous release until such time as nvidia
+    # release rhel9 builds
+
+    shortname="rhel9"
+    nccl_shortname="rhel8"
+elif is_rocky ; then
+    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
+    nccl_shortname="${shortname}"
+else
+    shortname="$(os_id)$(os_vercat)"
+    nccl_shortname="${shortname}"
+fi
+
+# Parameters for NVIDIA-provided package repositories
+readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
+readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
+
+# Parameters for NVIDIA-provided NCCL library
+readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
+NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
+readonly NCCL_REPO_URL
+readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
+
+# Set readonly CUDA_RUNFILE and NVIDIA_CUDA_URL ('cuda-url' metadata overrides the default), print version-compatibility warnings, exit 1 if the URL does not answer 200
+function set_cuda_runfile_url() {
+  local MAX_DRIVER_VERSION
+  local MAX_CUDA_VERSION
+  local MIN_OPEN_DRIVER_VER="515.48.07"
+  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
+  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
+  # Bounds are x.y.z, so compare CUDA_FULL_VERSION: version_ge orders with sort -V, where a bare x.y sorts below x.y.z
+  if is_cuda12 ; then
+    if is_debian12 ; then
+      MIN_DRIVER_VERSION="545.23.06"
+      MIN_CUDA_VERSION="12.3.0"
+    elif is_debian10 ; then
+      MAX_DRIVER_VERSION="555.42.02"
+      MAX_CUDA_VERSION="12.5.0"
+    elif is_ubuntu18 ; then
+      MAX_DRIVER_VERSION="530.30.02"
+      MAX_CUDA_VERSION="12.1.1"
     fi
-  elif [[ "${OS_NAME}" == "rocky" ]]; then
-    ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8 or 9
-    ROCKY_VERSION=${ROCKY_VERSION%.*}
-    if [[ "${ROCKY_VERSION}" != "8" && "${ROCKY_VERSION}" != "9" ]]; then
-      echo "Error: The Rocky Linux version (${ROCKY_VERSION}) is not supported. Please use a compatible Rocky Linux version."
-      exit 1
+  elif version_ge "${CUDA_FULL_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    if le_debian10 ; then
+      # cuda 11 is not supported for <= debian10
+      MAX_CUDA_VERSION="0"
+      MAX_DRIVER_VERSION="0"
     fi
+  else
+    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_FULL_VERSION}"
+  fi
+
+  if version_lt "${CUDA_FULL_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_FULL_VERSION}"
+  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_FULL_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
+    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_FULL_VERSION}"
+  fi
+  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
+    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
+    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
   fi
 
-  if [[ "${SECURE_BOOT}" == "enabled" ]]; then 
-    echo "Error: Secure Boot is enabled. Please disable Secure Boot while creating the cluster."
+  # driver version named in cuda runfile filename
+  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
+  readonly -A drv_for_cuda=(
+          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
+          ["11.8.0"]="520.61.05"
+          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
+          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
+          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
+          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
+          ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
+          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
+          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+  )
+
+  # Verify that the file with the indicated combination exists (drv_ver is empty for a version missing from the table)
+  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]:-}
+  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
+  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
+  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
+
+  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
+  readonly NVIDIA_CUDA_URL
+
+  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
+  readonly CUDA_RUNFILE
+
+  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
     exit 1
   fi
+
+  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
+    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
+  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
+    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
+    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
+    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
+  fi
 }
 
-# Detect dataproc image version from its various names
-if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then
-  DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+set_cuda_runfile_url
+
+# Parameter for NVIDIA-provided Rocky Linux GPU driver
+readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+
+CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
+CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
+if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
+  # From cuDNN 8.3.1.22 on, use the -archive.tar.xz name; by default it carries CUDA_VERSION with its last component removed
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
+  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
+    # When cuDNN version is less than or equal to 8.4.1.50 the name carries the full CUDA_VERSION instead
+    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
+  fi
+  # Use legacy url format with one of the tarball name formats depending on version as above
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
+fi
+if ( version_ge "${CUDA_VERSION}" "12.0" ); then
+  # Use modern url format When cuda version is greater than or equal to 12.0
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
 fi
+readonly CUDNN_TARBALL
+readonly CUDNN_TARBALL_URL
 
-function remove_old_backports {
-  # This script uses 'apt-get update' and is therefore potentially dependent on
-  # backports repositories which have been archived.  In order to mitigate this
-  # problem, we will remove any reference to backports repos older than oldstable
+# Whether to install NVIDIA-provided or OS-provided GPU driver
+GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
+readonly GPU_DRIVER_PROVIDER
 
-  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
-  oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}');
-  stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}');
-
-  matched_files="$(grep -rsil '\-backports' /etc/apt/sources.list*)"
-  if [[ -n "$matched_files" ]]; then
-    for filename in "$matched_files"; do
-      grep -e "$oldstable-backports" -e "$stable-backports" "$filename" || \
-        sed -i -e 's/^.*-backports.*$//' "$filename"
-    done
+# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
+INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
+readonly INSTALL_GPU_AGENT
+
+NVIDIA_SMI_PATH='/usr/bin'
+MIG_MAJOR_CAPS=0
+IS_MIG_ENABLED=0
+
+CUDA_KEYRING_PKG_INSTALLED="0"
+# Download and install the cuda-keyring .deb from NVIDIA_REPO_URL unless the installed flag is already "1"
+function install_cuda_keyring_pkg() {
+  if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
+  local -r keyring_deb="${tmpdir}/cuda-keyring.deb"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${NVIDIA_REPO_URL}/cuda-keyring_1.1-1_all.deb" -o "${keyring_deb}"
+  dpkg -i "${keyring_deb}"
+  rm -f "${keyring_deb}"
+  CUDA_KEYRING_PKG_INSTALLED="1"
+}
+
+function uninstall_cuda_keyring_pkg() {
+  apt-get purge -yq cuda-keyring
+  CUDA_KEYRING_PKG_INSTALLED="0"
+}
+
+# Download and install the CUDA local-repo .deb, copy its keyring into /usr/share/keyrings and, on Ubuntu, fetch the apt pin file
+function install_local_cuda_repo() {
+  if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi
+  # Default to "0" so that an unset flag does not abort the script when nounset (set -u) is in effect
+  if [[ "${CUDA_LOCAL_REPO_INSTALLED:-0}" == "1" ]]; then return ; fi
+  CUDA_LOCAL_REPO_INSTALLED="1"
+  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
+  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
+  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
+  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
+  readonly DIST_KEYRING_DIR="/var/${pkgname}"
+
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  cp "${DIST_KEYRING_DIR}"/cuda-*-keyring.gpg /usr/share/keyrings/
+
+  if is_ubuntu ; then
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
+      -o /etc/apt/preferences.d/cuda-repository-pin-600
   fi
+
+  touch "${workdir}/install-local-cuda-repo-complete"
+}
+function uninstall_local_cuda_repo(){
+  apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
+  rm -f "${workdir}/install-local-cuda-repo-complete"
 }
 
-function main() {
-  if [[ ${OS_NAME} == debian ]] && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then
-    remove_old_backports
+CUDNN_PKG_NAME=""
+# Download and install the cuDNN local-repo .deb and copy its keyring into /usr/share/keyrings; skipped when the completion marker file exists
+function install_local_cudnn_repo() {
+  if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi
+  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
+  CUDNN_PKG_NAME="${pkgname}"
+  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
+  # NOTE(review): the path on the next line is a .tar.xz location, not the .deb fetched below; confirm it is still wanted
+  # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
+  dpkg -i "${tmpdir}/local-installer.deb"
+
+  rm -f "${tmpdir}/local-installer.deb"
+
+  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+
+  touch "${workdir}/install-local-cudnn-repo-complete"
+}
+
+function uninstall_local_cudnn_repo() {
+  apt-get purge -yq "${CUDNN_PKG_NAME}"
+  rm -f "${workdir}/install-local-cudnn-repo-complete"
+}
+
+CUDNN8_LOCAL_REPO_INSTALLED="0"
+CUDNN8_PKG_NAME=""
+function install_local_cudnn8_repo() {
+  if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi
+
+  if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
+  elif is_debian ; then cudnn8_shortname="debian11"
+  else return 0 ; fi
+  if   is_cuda12 ; then CUDNN8_CUDA_VER=12.0
+  elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8
+  else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi
+  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"
+
+  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
+  CUDNN8_PKG_NAME="${pkgname}"
+
+  deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local_deb_fn="${tmpdir}/${deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
+
+  # cache the cudnn package
+  cache_fetched_package "${local_deb_url}" \
+                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
+                        "${local_deb_fn}"
+
+  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
+  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
+  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
+    mkdir -p "${cudnn_path}"
+    mount -t tmpfs tmpfs "${cudnn_path}"
+  fi
+
+  dpkg -i "${local_deb_fn}"
+
+  rm -f "${local_deb_fn}"
+
+  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  touch "${workdir}/install-local-cudnn8-repo-complete"
+}
+
+function uninstall_local_cudnn8_repo() {
+  apt-get purge -yq "${CUDNN8_PKG_NAME}"
+  rm -f "${workdir}/install-local-cudnn8-repo-complete"
+}
+
+function install_nvidia_nccl() {
+  if test -f "${workdir}/nccl-complete" ; then return ; fi
+
+  if is_cuda11 && is_debian12 ; then
+    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
+    return
+  fi
+
+  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
+
+  # https://github.com/NVIDIA/nccl/blob/master/README.md
+  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Fermi:     SM_20,             compute_30
+  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+  # The following architectures are supported by the open kernel driver
+  # Volta:     SM_70,SM_72,       compute_70,compute_72
+  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+  # The following architectures are supported by CUDA v11.8+
+  # Ada:       SM_89,             compute_89
+  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
+  # Blackwell: SM_100,            compute_100
+                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
+  if version_ge "${CUDA_VERSION}" "11.8" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
+  fi
+  if version_ge "${CUDA_VERSION}" "12.0" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
+  fi
+
+  mkdir -p "${workdir}"
+  pushd "${workdir}"
+
+  test -d "${workdir}/nccl" || {
+    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "nccl-${NCCL_VERSION}-1" nccl
+  }
+
+  local build_path
+  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
+                       build_path="nccl/build/pkg/rpm/x86_64" ; fi
+
+  test -d "${workdir}/nccl/build" || {
+    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
+
+    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+    if echo "${output}" | grep -q "${gcs_tarball}" ; then
+      # cache hit - unpack from cache
+      echo "cache hit"
+    else
+      # build and cache
+      pushd nccl
+      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
+      install_build_dependencies
+      if is_debuntu ; then
+        # These packages are required to build .deb packages from source
+        execute_with_retries \
+          apt-get install -y -qq build-essential devscripts debhelper fakeroot
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.debian.build
+      elif is_rocky ; then
+        # These packages are required to build .rpm packages from source
+        execute_with_retries \
+          dnf -y -q install rpm-build rpmdevtools
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.redhat.build
+      fi
+      tar czvf "/${local_tarball}" "../${build_path}"
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
+    fi
+    gcloud storage cat "${gcs_tarball}" | tar xz
+  }
+
+  if is_debuntu ; then
+    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
+  elif is_rocky ; then
+    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
   fi
 
-  check_os_and_secure_boot
-    
-  if [[ "${OS_NAME}" == "rocky" ]]; then
-    if dnf list kernel-devel-$(uname -r) && dnf list kernel-headers-$(uname -r); then
-      echo "kernel devel and headers packages are available.  Proceed without kernel upgrade."
+  popd
+  touch "${workdir}/nccl-complete"
+}
+
+function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
+function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
+
+function install_nvidia_cudnn() {
+  if test -f "${workdir}/cudnn-complete" ; then return ; fi
+  local major_version
+  major_version="${CUDNN_VERSION%%.*}"
+  local cudnn_pkg_version
+  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
+
+  if is_rocky ; then
+    if is_cudnn8 ; then
+      execute_with_retries dnf -y -q install \
+        "libcudnn${major_version}" \
+        "libcudnn${major_version}-devel"
+      sync
+    elif is_cudnn9 ; then
+      execute_with_retries dnf -y -q install \
+        "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \
+        "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}"
+      sync
     else
-      upgrade_kernel
+      echo "Unsupported cudnn version: '${major_version}'"
+    fi
+  elif is_debuntu; then
+    if ge_debian12 && is_src_os ; then
+      apt-get -y install nvidia-cudnn
+    else
+      if is_cudnn8 ; then
+        install_local_cudnn8_repo
+
+        apt-get update -qq
+
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+            "libcudnn8=${cudnn_pkg_version}" \
+            "libcudnn8-dev=${cudnn_pkg_version}"
+
+        uninstall_local_cudnn8_repo
+	sync
+      elif is_cudnn9 ; then
+	install_cuda_keyring_pkg
+
+        apt-get update -qq
+
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+          "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
+	sync
+      else
+        echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
+      fi
     fi
-  fi  
-  
-  if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then
-    export DEBIAN_FRONTEND=noninteractive
-    execute_with_retries "apt-get update"
-    execute_with_retries "apt-get install -y -q pciutils"
-  elif [[ ${OS_NAME} == rocky ]] ; then
-    execute_with_retries "dnf -y -q install pciutils"
+  else
+    echo "Unsupported OS: '${_shortname}'"
+    exit 1
   fi
 
-  # default MIG to on when this script is used
-  META_MIG_VALUE=1
-  if (/usr/share/google/get_metadata_value attributes/ENABLE_MIG); then
-    META_MIG_VALUE=$(/usr/share/google/get_metadata_value attributes/ENABLE_MIG)
-  fi
-
-  if (lspci | grep -q NVIDIA); then
-    if [[ $META_MIG_VALUE -ne 0 ]]; then
-      # if the first invocation, the NVIDIA drivers and tools are not installed
-      if [[ -f "/usr/bin/nvidia-smi" ]]; then
-        # check to see if we already enabled mig mode and rebooted so we don't end
-        # up in infinite reboot loop
-        NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l`
-        if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
-          if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
-            echo "MIG is enabled on all GPUs, configuring instances"
-            configure_mig_cgi
-            exit 0
-          else
-            echo "GPUs present but MIG is not enabled"
-          fi
-        else
-          echo "More than 1 GPU with MIG configured differently between them"
+  ldconfig
+
+  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
+  touch "${workdir}/cudnn-complete"
+}
+
+function add_nonfree_components() {
+  if is_src_nvidia ; then return; fi
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-open-kernel-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib non-free non-free-firmware"
+
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list
+  fi
+}
+
+function add_repo_nvidia_container_toolkit() {
+  local nvctk_root="https://nvidia.github.io/libnvidia-container"
+  local signing_key_url="${nvctk_root}/gpgkey"
+  local repo_data
+
+  if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
+                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
+
+  os_add_repo nvidia-container-toolkit \
+              "${signing_key_url}" \
+              "${repo_data}" \
+              "no"
+}
+
+function add_repo_cuda() {
+  if is_debuntu ; then
+    install_cuda_keyring_pkg # 11.7+, 12.0+
+  elif is_rocky ; then
+    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
+  fi
+}
+
+function build_driver_from_github() {
+  # Build, optionally sign, and install the open GPU kernel modules, reusing a cached build from ${pkg_bucket} when present; skipped on rocky8, where the non-GPL driver will have been built
+  if is_rocky8 ; then return 0 ; fi
+  pushd "${workdir}"
+
+  test -d "${workdir}/open-gpu-kernel-modules" || {
+    local tarball_fn="${DRIVER_VERSION}.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
+  }
+
+  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local build_dir
+    if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
+      then build_dir="${modulus_md5sum}"
+      else build_dir="unsigned" ; fi
+
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+      echo "cache hit"
+    else
+      # build the kernel modules
+      pushd open-gpu-kernel-modules
+      install_build_dependencies
+      if is_cuda11 && is_ubuntu22 ; then
+        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
+        exit 1
+      fi
+      execute_with_retries make -j$(nproc) modules \
+        >  kernel-open/build.log \
+        2> kernel-open/build_error.log
+      # Sign kernel modules; cwd is already the source tree, so locate them by absolute path
+      if [[ -n "${PSN}" ]]; then
+        for module in $(find "${workdir}/open-gpu-kernel-modules/kernel-open" -name '*.ko'); do
+          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
+          "${mok_key}" \
+          "${mok_der}" \
+          "${module}"
+        done
+      fi
+      make modules_install \
+        >>  kernel-open/build.log \
+        2>> kernel-open/build_error.log
+      # Collect build logs and installed binaries
+      tar czvf "${local_tarball}" \
+        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
+    fi
+    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+    depmod -a
+  }
+
+  popd
+}
+
+function build_driver_from_packages() {
+  if is_debuntu ; then
+    if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then
+      local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else
+      local pkglist=("nvidia-driver-${DRIVER}-open") ; fi
+    if is_debian ; then
+      pkglist=(
+        "firmware-nvidia-gsp=${DRIVER_VERSION}-1"
+        "nvidia-smi=${DRIVER_VERSION}-1"
+        "nvidia-alternative=${DRIVER_VERSION}-1"
+        "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1"
+        "nvidia-kernel-support=${DRIVER_VERSION}-1"
+        "nvidia-modprobe=${DRIVER_VERSION}-1"
+        "libnvidia-ml1=${DRIVER_VERSION}-1"
+      )
+    fi
+    add_contrib_component
+    apt-get update -qq
+    execute_with_retries apt-get install -y -qq --no-install-recommends dkms
+    #configure_dkms_certs
+    execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
+    sync
+
+  elif is_rocky ; then
+    #configure_dkms_certs
+    if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
+      echo "nvidia-driver:${DRIVER}-dkms installed successfully"
+    else
+      execute_with_retries dnf -y -q module install 'nvidia-driver:latest'
+    fi
+    sync
+  fi
+  #clear_dkms_key
+}
+
+function install_nvidia_userspace_runfile() {
+
+  # This .run file contains NV's OpenGL implementation as well as
+  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
+  # including glib (https://docs.gtk.org/glib/), and what appears to
+  # be a copy of the source from the kernel-open directory of for
+  # example DRIVER_VERSION=560.35.03
+  #
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
+  #
+  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
+  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
+  if test -f "${workdir}/userspace-complete" ; then return ; fi
+  local local_fn="${tmpdir}/userspace.run"
+
+  cache_fetched_package "${USERSPACE_URL}" \
+                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
+                        "${local_fn}"
+
+  local runfile_args
+  runfile_args=""
+  local cache_hit="0"
+  local local_tarball
+
+  if is_rocky8 ; then
+    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+      local_tarball="${workdir}/${build_tarball}"
+      local build_dir
+      if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
+        then build_dir="${modulus_md5sum}"
+        else build_dir="unsigned" ; fi
+
+      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+        cache_hit="1"
+        runfile_args="--no-kernel-modules"
+        echo "cache hit"
+      else
+        install_build_dependencies
+
+        local signing_options
+        signing_options=""
+        if [[ -n "${PSN}" ]]; then
+          signing_options="--module-signing-hash sha256 \
+          --module-signing-x509-hash sha256 \
+          --module-signing-secret-key \"${mok_key}\" \
+          --module-signing-public-key \"${mok_der}\" \
+          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
+          "
         fi
+
+        runfile_args="--no-dkms ${signing_options}"
       fi
+    }
+  else
+    runfile_args="--no-kernel-modules"
+  fi
+
+  execute_with_retries bash "${local_fn}" -e -q \
+    ${runfile_args} \
+    --ui=none \
+    --install-libglvnd \
+    --tmpdir="${tmpdir}"
+
+  if is_rocky8 ; then
+    if [[ "${cache_hit}" == "1" ]] ; then
+      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+      depmod -a
+    else
+      tar czvf "${local_tarball}" \
+        /var/log/nvidia-installer.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
     fi
   fi
-  
-  # Detect NVIDIA GPU
-  if (lspci | grep -q NVIDIA); then
-    if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then
-      execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
-    elif [[ ${OS_NAME} == rocky ]]; then
-      echo "kernel devel and headers not required on rocky.  installing from binary"
+
+  rm -f "${local_fn}"
+  touch "${workdir}/userspace-complete"
+  sync
+}
+
+function install_cuda_runfile() {
+  if test -f "${workdir}/cuda-complete" ; then return ; fi
+  local local_fn="${tmpdir}/cuda.run"
+
+  cache_fetched_package "${NVIDIA_CUDA_URL}" \
+			"${pkg_bucket}/${CUDA_RUNFILE}" \
+                        "${local_fn}"
+
+  execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
+  rm -f "${local_fn}"
+  touch "${workdir}/cuda-complete"
+  sync
+}
+
+function install_cuda_toolkit() {
+  local cudatk_package=cuda-toolkit  # becomes "cuda-toolkit=<full version>-1" or "cuda-toolkit-<major>-<minor>" below
+  if ge_debian12 && is_src_os ; then
+    cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1"
+  elif [[ -n "${CUDA_VERSION}" ]]; then
+    cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}"
+  fi
+  local -r cuda_package="cuda=${CUDA_FULL_VERSION}-1"  # kept function-local so it does not leak into the global scope
+  readonly cudatk_package
+  if is_debuntu ; then
+#    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
+    execute_with_retries apt-get install -y -qq --no-install-recommends "${cuda_package}" "${cudatk_package}"
+  elif is_rocky ; then
+    # rocky9: cuda-11-[7,8], cuda-12-[1..6]
+    execute_with_retries dnf -y -q install "${cudatk_package}"
+  fi
+  sync
+}
+
+function load_kernel_module() {
+  # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
+  for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
+    rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+  done
+
+  depmod -a
+  modprobe nvidia
+  for suffix in uvm modeset drm; do
+    modprobe "nvidia-${suffix}"
+  done
+  # TODO: if peermem is available, also modprobe nvidia-peermem
+}
+
+function install_cuda(){
+  if test -f "${workdir}/cuda-repo-complete" ; then return ; fi
+
+  if ( ge_debian12 && is_src_os ) ; then
+    echo "installed with the driver on ${_shortname}"
+    return 0
+  fi
+
+  # The OS package distributions are unreliable
+  install_cuda_runfile
+
+  # Includes CUDA packages
+  add_repo_cuda
+
+  touch "${workdir}/cuda-repo-complete"
+}
+
+function install_nvidia_container_toolkit() {
+  local container_runtime_default
+    if command -v docker     ; then container_runtime_default='docker'
+  elif command -v containerd ; then container_runtime_default='containerd'
+  elif command -v crio       ; then container_runtime_default='crio'
+                               else container_runtime_default='' ; fi
+  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
+
+  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi
+
+  add_repo_nvidia_container_toolkit
+  if is_debuntu ; then
+    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
+    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
+  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
+  systemctl restart "${CONTAINER_RUNTIME}"
+}
+
+# Install NVIDIA GPU driver provided by NVIDIA
+function install_nvidia_gpu_driver() {
+  if test -f "${workdir}/gpu-driver-complete" ; then return ; fi
+
+  if ( ge_debian12 && is_src_os ) ; then
+    add_nonfree_components
+    apt-get update -qq
+    apt-get -yq install \
+        dkms \
+        nvidia-open-kernel-dkms \
+        nvidia-open-kernel-support \
+        nvidia-smi \
+        libglvnd0 \
+        libcuda1
+    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
+    return 0
+  fi
+
+  # OS driver packages do not produce reliable driver ; use runfile
+  install_nvidia_userspace_runfile
+
+  build_driver_from_github
+
+  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+  touch "${workdir}/gpu-driver-complete"
+}
+
+function install_ops_agent(){
+  if test -f "${workdir}/ops-agent-complete" ; then return ; fi
+
+  mkdir -p /opt/google
+  local -r repo_script="/opt/google/add-google-cloud-ops-agent-repo.sh"  # absolute path: the caller's cwd is left untouched
+  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
+  curl -fsSL https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh -o "${repo_script}"  # -f: never save an HTTP error page as the script
+  execute_with_retries bash "${repo_script}" --also-install
+
+  touch "${workdir}/ops-agent-complete"
+}
+
+# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
+function install_gpu_agent() {
+  # Stackdriver GPU agent parameters
+#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
+  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
+  if ( ! command -v pip && is_debuntu ) ; then
+    execute_with_retries "apt-get install -y -qq python3-pip"
+  fi
+  local install_dir=/opt/gpu-utilization-agent
+  mkdir -p "${install_dir}"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
+    | sed -e 's/-u --format=/--format=/' \
+    | dd status=none of="${install_dir}/report_gpu_metrics.py"
+  local venv="${install_dir}/venv"
+  python3 -m venv "${venv}"
+(
+  source "${venv}/bin/activate"
+  python3 -m pip install --upgrade pip
+  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
+)
+  sync
+
+  # Generate GPU service.
+  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
+[Unit]
+Description=GPU Utilization Metric Agent
+
+[Service]
+Type=simple
+PIDFile=/run/gpu_agent.pid
+ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
+User=root
+Group=root
+WorkingDirectory=/
+Restart=always
+
+[Install]
+WantedBy=multi-user.target
+EOF
+  # Reload systemd manager configuration
+  systemctl daemon-reload
+  # Enable gpu-utilization-agent service
+  systemctl --no-reload --now enable gpu-utilization-agent.service
+}
+
+function configure_gpu_exclusive_mode() {
+  # check if running spark 3, if not, enable GPU exclusive mode
+  local spark_version
+  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
+  if [[ ${spark_version} != 3.* ]]; then
+    # include exclusive mode on GPU
+    nvidia-smi -c EXCLUSIVE_PROCESS
+  fi
+}
+
+function fetch_mig_scripts() {
+  mkdir -p /usr/local/yarn-mig-scripts
+  sudo chmod 755 /usr/local/yarn-mig-scripts
+  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
+  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
+  sudo chmod 755 /usr/local/yarn-mig-scripts/*
+}
+
+function configure_gpu_script() {
+  # Download GPU discovery script
+  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
+  mkdir -p ${spark_gpu_script_dir}
+  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
+  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
+  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
+  local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh"
+  cat > "${gpus_resources_script}" <<'EOF'
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
+
+echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
+EOF
+
+  chmod a+rx "${gpus_resources_script}"
+
+  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
+  if version_ge "${SPARK_VERSION}" "3.0" ; then
+    local gpu_count
+    gpu_count="$(lspci | grep NVIDIA | wc -l)"
+    local executor_cores
+    executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
+    local executor_memory_gb
+    executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
+    local task_cpus=2
+    local gpu_amount
+    gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+
+    cat >>"${spark_defaults_conf}" <<EOF
+###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
+# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
+# query explain output won't show GPU operator, if the user has doubts
+# they can uncomment the line before seeing the GPU plan explain;
+# having AQE enabled gives user the best performance.
+spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
+spark.executor.resource.gpu.amount=${gpu_count}
+spark.executor.cores=${executor_cores}
+spark.executor.memory=${executor_memory_gb}G
+spark.dynamicAllocation.enabled=false
+# please update this config according to your application
+spark.task.resource.gpu.amount=${gpu_amount}
+spark.task.cpus=${task_cpus}
+spark.yarn.unmanagedAM.enabled=false
+###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
+EOF
+  fi
+}
+
+function configure_gpu_isolation() {
+  # enable GPU isolation
+  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
+  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
+    # configure the container-executor.cfg to have major caps
+    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
+    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
+  else
+    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+  fi
+
+  # Configure a systemd unit to ensure that permissions are set on restart
+  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
+[Unit]
+Description=Set permissions to allow YARN to access device directories
+
+[Service]
+ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+  systemctl enable dataproc-cgroup-device-permissions
+  systemctl start dataproc-cgroup-device-permissions
+}
+
+function nvsmi() {
+  local nvsmi="/usr/bin/nvidia-smi"
+  if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
+  elif [[ ! -f "${nvsmi}" ]]         ; then echo "nvidia-smi not installed" >&2 ; return 0
+  elif ! "${nvsmi}" > /dev/null      ; then echo "nvidia-smi fails" >&2 ; return 0
+  else nvsmi_works="1" ; fi
+  # 'nvsmi -L' output is served from a cache file once it has been captured
+  if [[ "${1:-}" == "-L" ]] ; then
+    local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
+    if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
+    else "${nvsmi}" "$@" | tee "${NV_SMI_L_CACHE_FILE}" ; fi
+
+    return 0
+  fi
+  # "$@" forwards every argument as its own word, without re-splitting or globbing
+  "${nvsmi}" "$@"
+}
+
+function install_build_dependencies() {
+  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi
+
+  if is_debuntu ; then
+    if is_ubuntu22 && is_cuda12 ; then
+      # On ubuntu22, the default compiler does not build some kernel module versions
+      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
+      execute_with_retries apt-get install -y -qq gcc-12
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-12
+    fi
+
+  elif is_rocky ; then
+    execute_with_retries dnf -y -q install gcc
+
+    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
+    set +e
+    eval "${dnf_cmd}" > "${install_log}" 2>&1
+    local retval="$?"
+    set -e
+    # first attempt succeeded: record completion here too, so later calls skip the dnf work
+    if [[ "${retval}" == "0" ]] ; then touch "${workdir}/build-dependencies-complete" ; return ; fi
+
+    if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
+      # this kernel-devel may have been migrated to the vault
+      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
+      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
+      dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
+        "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
+       )"
+    fi
+
+    execute_with_retries "${dnf_cmd}"
+  fi
+  touch "${workdir}/build-dependencies-complete"
+}
+
+function install_dependencies() {
+  local -ra pkg_list=(pciutils screen)  # function-local array; no reliance on word-splitting a string
+  if is_debuntu ; then execute_with_retries apt-get -y -q install "${pkg_list[@]}"
+  elif is_rocky ; then execute_with_retries dnf     -y -q install "${pkg_list[@]}" ; fi
+}
+
+function prepare_gpu_env(){
+  # Verify SPARK compatibility
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+
+  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
+  nvsmi_works="0"
+
+  if   is_cuda11 ; then gcc_ver="11"
+  elif is_cuda12 ; then gcc_ver="12" ; fi
+}
+
+# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
+# Users should run apt-mark unhold before they wish to upgrade these packages
+function hold_nvidia_packages() {
+  apt-mark hold nvidia-*
+  apt-mark hold libnvidia-*
+  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
+    apt-mark hold xserver-xorg-video-nvidia*
+  fi
+}
+
+function delete_mig_instances() (
+  # delete all instances
+  set +e
+  nvidia-smi mig -dci
+
+  case "${?}" in
+    "0" ) echo "compute instances deleted"            ;;
+    "2" ) echo "invalid argument"                     ;;
+    "6" ) echo "No compute instances found to delete" ;;
+    *   ) echo "unrecognized return code"             ;;
+  esac
+
+  nvidia-smi mig -dgi
+  case "${?}" in
+    "0" ) echo "compute instances deleted"        ;;
+    "2" ) echo "invalid argument"                 ;;
+    "6" ) echo "No GPU instances found to delete" ;;
+    *   ) echo "unrecognized return code"         ;;
+  esac
+)
+
+# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles
+function configure_mig_cgi() {
+  delete_mig_instances
+  META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')"
+  if test -n "${META_MIG_CGI_VALUE}"; then
+    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
+  else
+    if lspci | grep -q H100 ; then
+      # run the following command to list placement profiles
+      # nvidia-smi mig -lgipp
+      #
+      # This is the result when using H100 instances on 20241220
+      # GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
+      # GPU  0 Profile ID 14 Placements: {0,2,4}:2
+      # GPU  0 Profile ID  9 Placements: {0,4}:4
+      # GPU  0 Profile ID  5 Placement : {0}:4
+      # GPU  0 Profile ID  0 Placement : {0}:8
+
+      # For H100 3D controllers, use profile 19, 7x1G instances
+      nvidia-smi mig -cgi 19 -C
+    elif lspci | grep -q A100 ; then
+      # Dataproc only supports A100s right now split in 2 if not specified
+      # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
+      nvidia-smi mig -cgi 9,9 -C
+    else
+      echo "unrecognized 3D controller"
+    fi
+  fi
+}
+
+function enable_mig() {
+  nvidia-smi -mig 1
+}
+
+
+function configure_dkms_certs() {
+  if test -v PSN && [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping";
+      return 0
+  fi
+
+  mkdir -p "${CA_TMPDIR}"
+
+  # If the private key exists, verify it
+  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
+    echo "Private key material exists"
+
+    local expected_modulus_md5sum
+    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
+      modulus_md5sum="${expected_modulus_md5sum}"
+
+      # Verify that the private key's modulus md5sum matches the expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key"
+      fi
+
+      # Verify that the x509 certificate's modulus md5sum matches the expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert"
+      fi
+    else
+      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
     fi
+    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+
+    return
+  fi
 
-    install_nvidia_gpu_driver
+  # Retrieve cloud secrets keys
+  local sig_priv_secret_name
+  sig_priv_secret_name="${PSN}"
+  local sig_pub_secret_name
+  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
+  local sig_secret_project
+  sig_secret_project="$(get_metadata_attribute secret_project)"
+  local sig_secret_version
+  sig_secret_version="$(get_metadata_attribute secret_version)"
 
-    if [[ ${META_MIG_VALUE} -ne 0 ]]; then
-      enable_mig
-      NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l`
-      if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
-        if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
-          echo "MIG is fully enabled, we don't need to reboot"
+  # If metadata values are not set, do not write mok keys
+  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
+
+  # Write private material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_priv_secret_name}" \
+      | dd status=none of="${CA_TMPDIR}/db.rsa"
+
+  # Write public material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_pub_secret_name}" \
+      | base64 --decode \
+      | dd status=none of="${CA_TMPDIR}/db.der"
+
+  local mok_directory="$(dirname "${mok_key}")"
+  mkdir -p "${mok_directory}"
+
+  # symlink private key and copy public cert from volatile storage to DKMS directory
+  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
+}
+
+function clear_dkms_key {
+  if [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping" >&2
+      return 0
+  fi
+  rm -rf "${CA_TMPDIR}" "${mok_key}"
+}
+
+function check_secure_boot() {
+  local SECURE_BOOT="disabled"
+  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
+
+  PSN="$(get_metadata_attribute private_secret_name)"
+  readonly PSN
+
+  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
+    exit 1
+  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
+    echo "Secure boot is enabled, but no signing material provided."
+    echo "Please either disable secure boot or provide signing material as per"
+    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+    return 1
+  fi
+
+  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
+  readonly CA_TMPDIR
+
+  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
+                      mok_der=/var/lib/shim-signed/mok/MOK.der
+                 else mok_key=/var/lib/dkms/mok.key
+                      mok_der=/var/lib/dkms/mok.pub ; fi
+
+  configure_dkms_certs
+}
+
+
+function exit_handler() {
+  # Purge private key material until next grant
+  clear_dkms_key
+
+  set +ex
+  echo "Exit handler invoked"
+
+  # Clear pip cache
+  pip cache purge || echo "unable to purge pip cache"
+
+  # If system memory was sufficient to mount memory-backed filesystems
+  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+        umount -f ${shmdir}
+      fi
+    done
+
+    # restart services stopped during preparation stage
+    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+  fi
+
+  if is_debuntu ; then
+    # Clean up OS package cache
+    apt-get -y -qq clean
+    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
+    # re-hold systemd package
+    if ge_debian12 ; then
+    apt-mark hold systemd libsystemd0 ; fi
+    hold_nvidia_packages
+  else
+    dnf clean all
+  fi
+
+  # print disk usage statistics for large components
+  if is_ubuntu ; then
+    du -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3 | sort -h
+  elif is_debian ; then
+    du -x -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /var/lib/{docker,mysql,} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+      /usr/bin \
+      /usr \
+      /var \
+      / 2>/dev/null | sort -h
+  else
+    du -hs \
+      /var/lib/docker \
+      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
+      /usr/lib64/google-cloud-sdk \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3
+  fi
+
+  # Process disk usage logs from installation period
+  rm -f /run/keep-running-df
+  sync
+  sleep 5.01s
+  # compute maximum size of disk during installation
+  # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+  df / | tee -a "/run/disk-usage.log"
+
+  perl -e '@siz=( sort { $a => $b }
+                   map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } <STDIN> );
+$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+print( "    samples-taken: ", scalar @siz, $/,
+       "maximum-disk-used: $max", $/,
+       "minimum-disk-used: $min", $/,
+       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+
+  echo "exit_handler has completed"
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    dd if=/dev/zero of=/zero
+    sync
+    sleep 3s
+    rm -f /zero
+  fi
+
+  return 0
+}
+
+function prepare_to_install(){
+  # Verify OS compatibility and Secure Boot state
+  check_os
+  check_secure_boot
+
+  prepare_gpu_env
+
+  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
+  readonly OS_NAME
+
+  # node role
+  ROLE="$(get_metadata_attribute dataproc-role)"
+  readonly ROLE
+
+  workdir=/opt/install-dpgce
+  tmpdir=/tmp/
+  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly temp_bucket
+  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  uname_r=$(uname -r)
+  readonly uname_r
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  export DEBIAN_FRONTEND=noninteractive
+
+  mkdir -p "${workdir}"
+  trap exit_handler EXIT
+  set_proxy
+  mount_ramdisk
+
+  readonly install_log="${tmpdir}/install.log"
+
+  if test -f "${workdir}/prepare-complete" ; then return ; fi
+
+  repair_old_backports
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+    if ge_debian12 ; then
+    apt-mark unhold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
+    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
+  ) fi
+
+  install_dependencies
+
+  # Monitor disk usage in a screen session
+  df / > "/run/disk-usage.log"
+  touch "/run/keep-running-df"
+  screen -d -m -LUS keep-running-df \
+    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+
+  touch "${workdir}/prepare-complete"
+}
+
+function main() {
+  # default MIG to on when this script is used
+  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
+
+  if ! (lspci | grep -q NVIDIA) ; then return ; fi
+  if [[ $META_MIG_VALUE -ne 0 ]]; then
+    # if the first invocation, the NVIDIA drivers and tools are not installed
+    if [[ -f "/usr/bin/nvidia-smi" ]]; then
+      # check to see if we already enabled mig mode and rebooted so we don't end
+      # up in infinite reboot loop
+      mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)"
+      NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)"
+      if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
+        if (echo "${mig_mode_current}" | grep Enabled); then
+          echo "MIG is enabled on all GPUs, configuring instances"
           configure_mig_cgi
+          exit 0
         else
-          echo "MIG is configured on but NOT enabled, we need to reboot"
-          reboot
+          echo "GPUs present but MIG is not enabled"
         fi
       else
-        echo "MIG is NOT enabled all on GPUs, we need to reboot"
-        reboot
+        echo "More than 1 GPU with MIG configured differently between them"
       fi
-    else
-      echo "Not enabling MIG"
     fi
   fi
+
+  install_nvidia_gpu_driver
+
+  if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
+
+  enable_mig
+
+  mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)"
+
+  NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)"
+  if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -ne 1 ]]      ; then echo "MIG is NOT enabled all on GPUs.  Failing"       ; exit 1 ; fi
+  if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured on but NOT enabled.  Failing" ; exit 1 ; fi
+
+  echo "MIG is fully enabled"
+  configure_mig_cgi
 }
 
+prepare_to_install
+
 main
diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in
index fff1186dc..0779a1c28 100644
--- a/templates/spark-rapids/mig.sh.in
+++ b/templates/spark-rapids/mig.sh.in
@@ -184,52 +184,52 @@ function prepare_to_install(){
   touch "${workdir}/prepare-complete"
 }
 
-function main() {
+function enable_and_configure_mig() {
   # default MIG to on when this script is used
   META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
 
-  if (lspci | grep -q NVIDIA); then
-    if [[ $META_MIG_VALUE -ne 0 ]]; then
-      # if the first invocation, the NVIDIA drivers and tools are not installed
-      if [[ -f "/usr/bin/nvidia-smi" ]]; then
-        # check to see if we already enabled mig mode and rebooted so we don't end
-        # up in infinite reboot loop
-        NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l`
-        if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
-          if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
-            echo "MIG is enabled on all GPUs, configuring instances"
-            configure_mig_cgi
-            exit 0
-          else
-            echo "GPUs present but MIG is not enabled"
-          fi
-        else
-          echo "More than 1 GPU with MIG configured differently between them"
-        fi
-      fi
-    fi
+  if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
+
+  enable_mig
+
+  mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)"
 
-    install_nvidia_gpu_driver
+  NUM_GPUS_WITH_DIFF_MIG_MODES=
+  if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled all on GPUs.  Failing"       ; exit 1 ; fi
+  if ! (echo "${mig_mode_current}" | grep Enabled)                ; then echo "MIG is configured on but NOT enabled.  Failing" ; exit 1 ; fi
 
-    if [[ ${META_MIG_VALUE} -ne 0 ]]; then
-      enable_mig
-      NUM_GPUS_WITH_DIFF_MIG_MODES="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l)"
-      if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
-        if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
-          echo "MIG is fully enabled, we don't need to reboot"
+  echo "MIG is fully enabled"
+  configure_mig_cgi
+}
+
+function main() {
+  # default MIG to on when this script is used
+  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
+
+  if ! (lspci | grep -q NVIDIA) ; then return ; fi
+  if [[ $META_MIG_VALUE -ne 0 ]]; then
+    # if the first invocation, the NVIDIA drivers and tools are not installed
+    if [[ -f "/usr/bin/nvidia-smi" ]]; then
+      # check to see if we already enabled mig mode and rebooted so we don't end
+      # up in infinite reboot loop
+      mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)"
+      NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)"
+      if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
+        if (echo "${mig_mode_current}" | grep Enabled); then
+          echo "MIG is enabled on all GPUs, configuring instances"
           configure_mig_cgi
+          exit 0
         else
-          echo "MIG is configured on but NOT enabled.  Failing"
-          exit 1
+          echo "GPUs present but MIG is not enabled"
         fi
       else
-        echo "MIG is NOT enabled all on GPUs.  Failing"
-        exit 1
+        echo "More than 1 GPU with MIG configured differently between them"
       fi
-    else
-      echo "Not enabling MIG"
     fi
   fi
+
+  install_nvidia_gpu_driver
+  enable_and_configure_mig
 }
 
 prepare_to_install

From edeab284b81404e3148e5cbd12c4c928d23bd50c Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 23 Dec 2024 15:19:12 -0800
Subject: [PATCH 011/130] moved comment to correct function

---
 templates/common/util_functions | 3 ---
 templates/gpu/util_functions    | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index b777968e5..9133072d9 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -415,9 +415,6 @@ function dnf_add_repo() {
     | dd of="${repo_path}" status=progress
 }
 
-#
-# Install package signing key and add corresponding repository
-# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
 #
 # Keyrings default to
 # /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index f2f3e2a9c..c475cc269 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -617,6 +617,9 @@ function add_nonfree_components() {
   fi
 }
 
+#
+# Install package signing key and add corresponding repository
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
 function add_repo_nvidia_container_toolkit() {
   local nvctk_root="https://nvidia.github.io/libnvidia-container"
   local signing_key_url="${nvctk_root}/gpgkey"

From 0e8946cdbdc5cd31ff8d2eb3960bfb368e8264fa Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 23 Dec 2024 20:25:58 -0800
Subject: [PATCH 012/130] do not point to local rpm pgp key

---
 templates/common/util_functions | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 9133072d9..3373fb24e 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -411,8 +411,8 @@ function dnf_add_repo() {
   local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
 
   curl -s -L "${repo_url}" \
-    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
     | dd of="${repo_path}" status=progress
+#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
 }
 
 #

From c44195a913fb3604fcf9d66a744620e41caa1f7f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 21:01:03 -0800
Subject: [PATCH 013/130] store completion signal files in their own directory

---
 templates/gpu/util_functions | 52 +++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index c475cc269..5631ab414 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -342,7 +342,7 @@ function uninstall_cuda_keyring_pkg() {
 }
 
 function install_local_cuda_repo() {
-  if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi
+  if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi
 
   if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
   CUDA_LOCAL_REPO_INSTALLED="1"
@@ -365,16 +365,16 @@ function install_local_cuda_repo() {
       -o /etc/apt/preferences.d/cuda-repository-pin-600
   fi
 
-  touch "${workdir}/install-local-cuda-repo-complete"
+  touch "${workdir}/complete/install-local-cuda-repo"
 }
 function uninstall_local_cuda_repo(){
   apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  rm -f "${workdir}/install-local-cuda-repo-complete"
+  rm -f "${workdir}/complete/install-local-cuda-repo"
 }
 
 CUDNN_PKG_NAME=""
 function install_local_cudnn_repo() {
-  if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi
+  if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi
   pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
   CUDNN_PKG_NAME="${pkgname}"
   local_deb_fn="${pkgname}_1.0-1_amd64.deb"
@@ -390,18 +390,18 @@ function install_local_cudnn_repo() {
 
   cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
 
-  touch "${workdir}/install-local-cudnn-repo-complete"
+  touch "${workdir}/complete/install-local-cudnn-repo"
 }
 
 function uninstall_local_cudnn_repo() {
   apt-get purge -yq "${CUDNN_PKG_NAME}"
-  rm -f "${workdir}/install-local-cudnn-repo-complete"
+  rm -f "${workdir}/complete/install-local-cudnn-repo"
 }
 
 CUDNN8_LOCAL_REPO_INSTALLED="0"
 CUDNN8_PKG_NAME=""
 function install_local_cudnn8_repo() {
-  if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi
+  if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi
 
   if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
   elif is_debian ; then cudnn8_shortname="debian11"
@@ -435,16 +435,16 @@ function install_local_cudnn8_repo() {
   rm -f "${local_deb_fn}"
 
   cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  touch "${workdir}/install-local-cudnn8-repo-complete"
+  touch "${workdir}/complete/install-local-cudnn8-repo"
 }
 
 function uninstall_local_cudnn8_repo() {
   apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  rm -f "${workdir}/install-local-cudnn8-repo-complete"
+  rm -f "${workdir}/complete/install-local-cudnn8-repo"
 }
 
 function install_nvidia_nccl() {
-  if test -f "${workdir}/nccl-complete" ; then return ; fi
+  if test -f "${workdir}/complete/nccl" ; then return ; fi
 
   if is_cuda11 && is_debian12 ; then
     echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
@@ -535,14 +535,14 @@ function install_nvidia_nccl() {
   fi
 
   popd
-  touch "${workdir}/nccl-complete"
+  touch "${workdir}/complete/nccl"
 }
 
 function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
 function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
 
 function install_nvidia_cudnn() {
-  if test -f "${workdir}/cudnn-complete" ; then return ; fi
+  if test -f "${workdir}/complete/cudnn" ; then return ; fi
   local major_version
   major_version="${CUDNN_VERSION%%.*}"
   local cudnn_pkg_version
@@ -601,7 +601,7 @@ function install_nvidia_cudnn() {
   ldconfig
 
   echo "NVIDIA cuDNN successfully installed for ${_shortname}."
-  touch "${workdir}/cudnn-complete"
+  touch "${workdir}/complete/cudnn"
 }
 
 function add_nonfree_components() {
@@ -754,7 +754,7 @@ function install_nvidia_userspace_runfile() {
   #
   # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
   # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
-  if test -f "${workdir}/userspace-complete" ; then return ; fi
+  if test -f "${workdir}/complete/userspace" ; then return ; fi
   local local_fn="${tmpdir}/userspace.run"
 
   cache_fetched_package "${USERSPACE_URL}" \
@@ -822,12 +822,12 @@ function install_nvidia_userspace_runfile() {
   fi
 
   rm -f "${local_fn}"
-  touch "${workdir}/userspace-complete"
+  touch "${workdir}/complete/userspace"
   sync
 }
 
 function install_cuda_runfile() {
-  if test -f "${workdir}/cuda-complete" ; then return ; fi
+  if test -f "${workdir}/complete/cuda" ; then return ; fi
   local local_fn="${tmpdir}/cuda.run"
 
   cache_fetched_package "${NVIDIA_CUDA_URL}" \
@@ -836,7 +836,7 @@ function install_cuda_runfile() {
 
   execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
   rm -f "${local_fn}"
-  touch "${workdir}/cuda-complete"
+  touch "${workdir}/complete/cuda"
   sync
 }
 
@@ -874,7 +874,7 @@ function load_kernel_module() {
 }
 
 function install_cuda(){
-  if test -f "${workdir}/cuda-repo-complete" ; then return ; fi
+  if test -f "${workdir}/complete/cuda-repo" ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     echo "installed with the driver on ${_shortname}"
@@ -887,7 +887,7 @@ function install_cuda(){
   # Includes CUDA packages
   add_repo_cuda
 
-  touch "${workdir}/cuda-repo-complete"
+  touch "${workdir}/complete/cuda-repo"
 }
 
 function install_nvidia_container_toolkit() {
@@ -910,7 +910,7 @@ function install_nvidia_container_toolkit() {
 
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
-  if test -f "${workdir}/gpu-driver-complete" ; then return ; fi
+  if test -f "${workdir}/complete/gpu-driver" ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
@@ -932,11 +932,11 @@ function install_nvidia_gpu_driver() {
   build_driver_from_github
 
   echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  touch "${workdir}/gpu-driver-complete"
+  touch "${workdir}/complete/gpu-driver"
 }
 
 function install_ops_agent(){
-  if test -f "${workdir}/ops-agent-complete" ; then return ; fi
+  if test -f "${workdir}/complete/ops-agent" ; then return ; fi
 
   mkdir -p /opt/google
   cd /opt/google
@@ -944,7 +944,7 @@ function install_ops_agent(){
   curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
   execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
 
-  touch "${workdir}/ops-agent-complete"
+  touch "${workdir}/complete/ops-agent"
 }
 
 # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
@@ -1127,7 +1127,7 @@ function nvsmi() {
 }
 
 function install_build_dependencies() {
-  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi
+  if test -f "${workdir}/complete/build-dependencies" ; then return ; fi
 
   if is_debuntu ; then
     if is_ubuntu22 && is_cuda12 ; then
@@ -1165,7 +1165,7 @@ function install_build_dependencies() {
 
     execute_with_retries "${dnf_cmd}"
   fi
-  touch "${workdir}/build-dependencies-complete"
+  touch "${workdir}/complete/build-dependencies"
 }
 
 function install_dependencies() {
@@ -1183,6 +1183,8 @@ function prepare_gpu_env(){
 
   if   is_cuda11 ; then gcc_ver="11"
   elif is_cuda12 ; then gcc_ver="12" ; fi
+
+  mkdir -p "${workdir}/complete"
 }
 
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades

From 31d1a9e65425cffb32e09d1a5599b622058c5d97 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 21:47:53 -0800
Subject: [PATCH 014/130] excessive sudo

---
 templates/gpu/util_functions | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 5631ab414..4ce22f01a 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1007,10 +1007,10 @@ function configure_gpu_exclusive_mode() {
 
 function fetch_mig_scripts() {
   mkdir -p /usr/local/yarn-mig-scripts
-  sudo chmod 755 /usr/local/yarn-mig-scripts
+  chmod 755 /usr/local/yarn-mig-scripts
   wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
   wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
-  sudo chmod 755 /usr/local/yarn-mig-scripts/*
+  chmod 755 /usr/local/yarn-mig-scripts/*
 }
 
 function configure_gpu_script() {

From 4a3a8cdbc1e6cad61efa4b8072ff4279700f65b2 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 22:23:50 -0800
Subject: [PATCH 015/130] install spark rapids in all cases

---
 templates/gpu/install_gpu_driver.sh.in    |   2 +
 templates/spark-rapids/spark-rapids.sh.in | 300 ++++++++++++++++++++++
 templates/spark-rapids/util_functions     |  49 ++++
 3 files changed, 351 insertions(+)
 create mode 100644 templates/spark-rapids/spark-rapids.sh.in
 create mode 100644 templates/spark-rapids/util_functions

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index a5d4172dd..e852ed73c 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -84,10 +84,12 @@ function main() {
     fi
 
     configure_yarn_nodemanager
+    install_spark_rapids
     configure_gpu_script
     configure_gpu_isolation
   elif [[ "${ROLE}" == "Master" ]]; then
     configure_yarn_nodemanager
+    install_spark_rapids
     configure_gpu_script
   fi
 
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
new file mode 100644
index 000000000..fc37f109f
--- /dev/null
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -0,0 +1,300 @@
+#!/bin/bash
+#
+[% INSERT legal/license_header %]
+# This script installs NVIDIA GPU drivers (version 535.104.05) along with CUDA 12.2.
+# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only
+# Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions.
+# Note that the script is designed to work when secure boot is disabled during cluster creation.
+# It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu.
+#
+[% PROCESS common/template_disclaimer %]
+
+set -euxo pipefail
+
+[% INSERT common/util_functions %]
+
+[% INSERT 'secure-boot/util_functions' %]
+
+[% INSERT gpu/util_functions %]
+
+[% INSERT 'spark-rapids/util_functions' %]
+
+check_secure_boot
+
+# Stackdriver GPU agent parameters
+# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
+INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
+readonly INSTALL_GPU_AGENT
+
+# Dataproc configurations
+readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+readonly HIVE_CONF_DIR='/etc/hive/conf'
+readonly SPARK_CONF_DIR='/etc/spark/conf'
+
+NVIDIA_SMI_PATH='/usr/bin'
+MIG_MAJOR_CAPS=0
+IS_MIG_ENABLED=0
+
+function setup_gpu_yarn() {
+  # This configuration should be run on all nodes
+  # regardless if they have attached GPUs
+  configure_yarn_resources
+
+  # Detect NVIDIA GPU
+  if (lspci | grep -q NVIDIA); then
+    # if this is called without the MIG script then the drivers are not installed
+    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
+    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
+    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
+
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
+          fetch_mig_scripts
+        fi
+      fi
+    fi
+
+    if is_debuntu ; then
+      execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
+    elif is_rocky ; then
+      echo "kernel devel and headers not required on rocky.  installing from binary"
+    fi
+
+    # if mig is enabled drivers would have already been installed
+    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+      install_nvidia_gpu_driver
+      install_cuda
+      load_kernel_module
+
+      #Install GPU metrics collection in Stackdriver if needed
+      if [[ ${INSTALL_GPU_AGENT} == true ]]; then
+        #install_gpu_agent
+        install_gpu_monitoring_agent
+
+        echo 'GPU metrics agent successfully deployed.'
+      else
+        echo 'GPU metrics agent will not be installed.'
+      fi
+      configure_gpu_exclusive_mode
+    fi
+
+    configure_yarn_nodemanager
+    configure_gpu_script
+    configure_gpu_isolation
+  elif [[ "${ROLE}" == "Master" ]]; then
+    configure_yarn_nodemanager
+    configure_gpu_script
+  fi
+
+  # Restart YARN services if they are running already
+  for svc in resourcemanager nodemanager; do
+    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
+      systemctl restart "hadoop-yarn-${svc}.service"
+    fi
+  done
+}
+
+function main() {
+  repair_old_backports
+  check_os
+  check_secure_boot
+
+  setup_gpu_yarn
+  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
+    install_spark_rapids
+    configure_gpu_script
+    echo "RAPIDS initialized with Spark runtime"
+  else
+    echo "Unsupported RAPIDS Runtime: ${RUNTIME}"
+    exit 1
+  fi
+
+  for svc in resourcemanager nodemanager; do
+    if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
+      systemctl restart hadoop-yarn-${svc}.service
+    fi
+  done
+}
+
+function exit_handler() {
+  # Purge private key material until next grant
+  clear_dkms_key
+
+  set +ex
+  echo "Exit handler invoked"
+
+  # Clear pip cache
+  pip cache purge || echo "unable to purge pip cache"
+
+  # If system memory was sufficient to mount memory-backed filesystems
+  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+        umount -f ${shmdir}
+      fi
+    done
+
+    # restart services stopped during preparation stage
+    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+  fi
+
+  if is_debuntu ; then
+    # Clean up OS package cache
+    apt-get -y -qq clean
+    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
+    # re-hold systemd package
+    if ge_debian12 ; then
+    apt-mark hold systemd libsystemd0 ; fi
+    hold_nvidia_packages
+  else
+    dnf clean all
+  fi
+
+  # print disk usage statistics for large components
+  if is_ubuntu ; then
+    du -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3 | sort -h
+  elif is_debian ; then
+    du -x -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /var/lib/{docker,mysql,} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+      /usr/bin \
+      /usr \
+      /var \
+      / 2>/dev/null | sort -h
+  else
+    du -hs \
+      /var/lib/docker \
+      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
+      /usr/lib64/google-cloud-sdk \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3
+  fi
+
+  # Process disk usage logs from installation period
+  rm -f /run/keep-running-df
+  sync
+  sleep 5.01s
+  # compute max/min disk used during installation (samples sorted numerically, descending)
+  # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+  df / | tee -a "/run/disk-usage.log"
+
+  perl -e '@siz=( sort { $b <=> $a }
+                   map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } <STDIN> );
+$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+print( "    samples-taken: ", scalar @siz, $/,
+       "maximum-disk-used: $max", $/,
+       "minimum-disk-used: $min", $/,
+       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+
+  echo "exit_handler has completed"
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    dd if=/dev/zero of=/zero
+    sync
+    sleep 3s
+    rm -f /zero
+  fi
+
+  return 0
+}
+
+# Update SPARK RAPIDS config
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
+readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
+
+# Fetch instance roles and runtime
+readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
+readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
+
+# CUDA version and Driver version config
+CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.4.1')  #12.2.2
+NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '550.54.15') #535.104.05
+CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.2
+
+function prepare_to_install(){
+  # Verify OS compatability and Secure boot state
+  check_os
+  check_secure_boot
+
+  prepare_gpu_env
+
+  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
+  readonly OS_NAME
+
+  # node role
+  ROLE="$(get_metadata_attribute dataproc-role)"
+  readonly ROLE
+
+  workdir=/opt/install-dpgce
+  tmpdir=/tmp/
+  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly temp_bucket
+  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  uname_r=$(uname -r)
+  readonly uname_r
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  export DEBIAN_FRONTEND=noninteractive
+
+  mkdir -p "${workdir}"
+  trap exit_handler EXIT
+  set_proxy
+  mount_ramdisk
+
+  readonly install_log="${tmpdir}/install.log"
+
+  if test -f "${workdir}/prepare-complete" ; then return ; fi
+
+  repair_old_backports
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+    if ge_debian12 ; then
+    apt-mark unhold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
+    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
+  ) fi
+
+  install_dependencies
+
+  # Monitor disk usage in a screen session
+  df / > "/run/disk-usage.log"
+  touch "/run/keep-running-df"
+  screen -d -m -LUS keep-running-df \
+    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+
+  touch "${workdir}/prepare-complete"
+}
+
+main
diff --git a/templates/spark-rapids/util_functions b/templates/spark-rapids/util_functions
new file mode 100644
index 000000000..93c87db8a
--- /dev/null
+++ b/templates/spark-rapids/util_functions
@@ -0,0 +1,49 @@
+function install_spark_rapids() {
+  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
+  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
+  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
+
+  wget -nv --timeout=30 --tries=5 --retry-connrefused \
+    "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \
+    -P /usr/lib/spark/jars/
+  wget -nv --timeout=30 --tries=5 --retry-connrefused \
+    "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \
+    -P /usr/lib/spark/jars/
+  wget -nv --timeout=30 --tries=5 --retry-connrefused \
+    "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \
+    -P /usr/lib/spark/jars/
+}
+
+# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
+function install_gpu_monitoring_agent() {
+  download_gpu_monitoring_agent
+  install_gpu_monitoring_agent_dependency
+  start_gpu_monitoring_agent_service
+}
+
+function download_gpu_monitoring_agent(){
+  if [[ ${OS_NAME} == rocky ]]; then
+    execute_with_retries "dnf -y -q install git"
+  else
+    execute_with_retries "apt-get install git -y"
+  fi
+  mkdir -p /opt/google
+  chmod 777 /opt/google
+  cd /opt/google
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+}
+
+function install_gpu_monitoring_agent_dependency(){
+  cd /opt/google/compute-gpu-monitoring/linux
+  python3 -m venv venv
+  venv/bin/pip install wheel
+  venv/bin/pip install -Ur requirements.txt
+}
+
+function start_gpu_monitoring_agent_service(){
+  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
+  systemctl daemon-reload
+  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service
+}
+

From 6ab36a5868be4190a1c4c0bb235c07bfe3b31331 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 25 Dec 2024 12:36:49 -0800
Subject: [PATCH 016/130] merged spark-rapids functions into general gpu
 util_functions template

---
 templates/gpu/util_functions          | 152 ++++++++++++++++++++++----
 templates/spark-rapids/util_functions |  49 ---------
 2 files changed, 129 insertions(+), 72 deletions(-)
 delete mode 100644 templates/spark-rapids/util_functions

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 4ce22f01a..53e7daa93 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -141,25 +141,29 @@ function set_driver_version() {
 
 set_driver_version
 
-readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
-readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+function set_cudnn_version() {
+  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+  # Parameters for NVIDIA-provided cuDNN library; default is looked up in CUDNN_FOR_CUDA by CUDA_VERSION
+  readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
+  if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
+    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
+    # cuDNN v8 is not distributed for ubuntu20+, debian12 ; move up to 9
+    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+  elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then
+    # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
+    CUDNN_VERSION="8.8.0.121"
+  fi
+  readonly CUDNN_VERSION
+}
+set_cudnn_version
 
-# Parameters for NVIDIA-provided cuDNN library
-readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
-CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
 function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
 function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
-# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-  CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
-elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
-  # cuDNN v8 is not distribution for ubuntu20+, debian12
-  CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
-elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
-  # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
-  CUDNN_VERSION="8.8.0.121"
-fi
-readonly CUDNN_VERSION
 
 readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
 readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
@@ -947,6 +951,39 @@ function install_ops_agent(){
   touch "${workdir}/complete/ops-agent"
 }
 
+# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
+function install_gpu_monitoring_agent() {
+  download_gpu_monitoring_agent
+  install_gpu_monitoring_agent_dependency
+  start_gpu_monitoring_agent_service
+}
+
+function download_gpu_monitoring_agent(){
+  if is_rocky ; then
+    execute_with_retries "dnf -y -q install git"
+  else
+    execute_with_retries "apt-get install git -y"
+  fi
+  mkdir -p /opt/google
+  chmod 755 /opt/google  # holds code run by a root service; must not be world-writable
+  cd /opt/google
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+}
+
+function install_gpu_monitoring_agent_dependency(){
+  cd /opt/google/compute-gpu-monitoring/linux
+  python3 -m venv venv
+  venv/bin/pip install wheel
+  venv/bin/pip install -Ur requirements.txt
+}
+
+function start_gpu_monitoring_agent_service(){
+  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
+  systemctl daemon-reload
+  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service
+}
+
 # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
 function install_gpu_agent() {
   # Stackdriver GPU agent parameters
@@ -1013,6 +1050,28 @@ function fetch_mig_scripts() {
   chmod 755 /usr/local/yarn-mig-scripts/*
 }
 
+function install_spark_rapids() {
+  # Update SPARK RAPIDS config
+  readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
+  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
+  readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
+
+  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
+  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
+  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
+
+  wget -nv --timeout=30 --tries=5 --retry-connrefused \
+    "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \
+    -P /usr/lib/spark/jars/
+  wget -nv --timeout=30 --tries=5 --retry-connrefused \
+    "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \
+    -P /usr/lib/spark/jars/
+  wget -nv --timeout=30 --tries=5 --retry-connrefused \
+    "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \
+    -P /usr/lib/spark/jars/
+}
+
 function configure_gpu_script() {
   # Download GPU discovery script
   local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
@@ -1049,9 +1108,9 @@ EOF
   chmod a+rx "${gpus_resources_script}"
 
   local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
-  if version_ge "${SPARK_VERSION}" "3.0" ; then
-    local gpu_count
-    gpu_count="$(lspci | grep NVIDIA | wc -l)"
+  local gpu_count
+  gpu_count="$(lspci | grep NVIDIA | wc -l)"
+  if version_ge "${gpu_count}" "1" ; then
     local executor_cores
     executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
     local executor_memory
@@ -1066,8 +1125,9 @@ EOF
 # query explain output won't show GPU operator, if the user has doubts
 # they can uncomment the line before seeing the GPU plan explain;
 # having AQE enabled gives user the best performance.
-spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
 spark.executor.resource.gpu.amount=${gpu_count}
+spark.plugins=com.nvidia.spark.SQLPlugin
+spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
 spark.executor.cores=${executor_cores}
 spark.executor.memory=${executor_memory_gb}G
 spark.dynamicAllocation.enabled=false
@@ -1178,13 +1238,10 @@ function prepare_gpu_env(){
   # Verify SPARK compatability
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
 
-  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
   nvsmi_works="0"
 
   if   is_cuda11 ; then gcc_ver="11"
   elif is_cuda12 ; then gcc_ver="12" ; fi
-
-  mkdir -p "${workdir}/complete"
 }
 
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
@@ -1253,3 +1310,52 @@ function configure_mig_cgi() {
 function enable_mig() {
   nvidia-smi -mig 1
 }
+
+function setup_gpu_yarn() {
+  # This configuration should be run on all nodes
+  # regardless if they have attached GPUs
+  configure_yarn_resources
+
+  # Detect NVIDIA GPU
+  if (lspci | grep -q NVIDIA); then
+    # if this is called without the MIG script then the drivers are not installed
+    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
+    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
+    NUM_MIG_GPUS="$(echo "${migquery_result}" | uniq | wc -l)"  # quoted: keep one line per GPU
+
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then  # a single distinct MIG mode was reported
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
+          fetch_mig_scripts
+        fi
+      fi
+    fi
+
+    # if mig is enabled drivers would have already been installed
+    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+      install_nvidia_gpu_driver
+      install_cuda
+      load_kernel_module
+
+      #Install GPU metrics collection in Stackdriver if needed
+      if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+        install_gpu_agent
+#        install_gpu_monitoring_agent
+        echo 'GPU metrics agent successfully deployed.'
+      else
+        echo 'GPU metrics agent has not been installed.'
+      fi
+      configure_gpu_exclusive_mode
+    fi
+
+    configure_yarn_nodemanager
+    configure_gpu_script
+    configure_gpu_isolation
+  elif [[ "${ROLE}" == "Master" ]]; then
+    configure_yarn_nodemanager
+    configure_gpu_script
+  fi
+}
diff --git a/templates/spark-rapids/util_functions b/templates/spark-rapids/util_functions
deleted file mode 100644
index 93c87db8a..000000000
--- a/templates/spark-rapids/util_functions
+++ /dev/null
@@ -1,49 +0,0 @@
-function install_spark_rapids() {
-  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
-  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
-  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
-
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-}
-
-# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
-function install_gpu_monitoring_agent() {
-  download_gpu_monitoring_agent
-  install_gpu_monitoring_agent_dependency
-  start_gpu_monitoring_agent_service
-}
-
-function download_gpu_monitoring_agent(){
-  if [[ ${OS_NAME} == rocky ]]; then
-    execute_with_retries "dnf -y -q install git"
-  else
-    execute_with_retries "apt-get install git -y"
-  fi
-  mkdir -p /opt/google
-  chmod 777 /opt/google
-  cd /opt/google
-  test -d compute-gpu-monitoring || \
-    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
-}
-
-function install_gpu_monitoring_agent_dependency(){
-  cd /opt/google/compute-gpu-monitoring/linux
-  python3 -m venv venv
-  venv/bin/pip install wheel
-  venv/bin/pip install -Ur requirements.txt
-}
-
-function start_gpu_monitoring_agent_service(){
-  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
-  systemctl daemon-reload
-  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service
-}
-

From ef366947c871f69e185b262892615bb87f974ea1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 25 Dec 2024 12:43:13 -0800
Subject: [PATCH 017/130] correcting variable name

---
 templates/gpu/install_gpu_driver.sh.in    | 102 ++++------------
 templates/spark-rapids/spark-rapids.sh.in | 139 +++++-----------------
 2 files changed, 51 insertions(+), 190 deletions(-)

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index e852ed73c..616fc5eb2 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -10,93 +10,33 @@ set -euxo pipefail
 
 [% INSERT common/util_functions %]
 
-[% INSERT gpu/util_functions %]
-
 [% INSERT 'secure-boot/util_functions' %]
 
-function main() {
-  # This configuration should be run on all nodes
-  # regardless if they have attached GPUs
-  configure_yarn_resources
-
-  # Detect NVIDIA GPU
-  if (lspci | grep -q NVIDIA); then
-    # if this is called without the MIG script then the drivers are not installed
-    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
-    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
-    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-
-    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-        if (echo "${migquery_result}" | grep Enabled); then
-          IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
-        fi
-      fi
-    fi
-
-    # if mig is enabled drivers would have already been installed
-    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
-      install_nvidia_gpu_driver
-      install_nvidia_container_toolkit
-      install_cuda
-      load_kernel_module
-
-      if [[ -n ${CUDNN_VERSION} ]]; then
-        install_nvidia_nccl
-        install_nvidia_cudnn
-      fi
-      #Install GPU metrics collection in Stackdriver if needed
-      if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-        #install_ops_agent
-	install_gpu_agent
-        echo 'GPU metrics agent successfully deployed.'
-      else
-        echo 'GPU metrics agent will not be installed.'
-      fi
-
-      # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
-      for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
-        rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
-      done
+[% INSERT gpu/util_functions %]
 
-      MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")"
-      if test -n "$(nvsmi -L)" ; then
-	# cache the result of the gpu query
-        ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
-        echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
-      fi
-      NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
-      if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-        # enable MIG on every GPU
-	for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do
-	  nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
-	done
-
-        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-        MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
-        fetch_mig_scripts
-      else
-        configure_gpu_exclusive_mode
-      fi
-    fi
+function main() {
+  setup_gpu_yarn
+  if [[ -n ${CUDNN_VERSION} ]]; then
+    install_nvidia_nccl
+    install_nvidia_cudnn
+  fi
+  install_nvidia_container_toolkit
 
-    configure_yarn_nodemanager
+  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
     install_spark_rapids
-    configure_gpu_script
-    configure_gpu_isolation
-  elif [[ "${ROLE}" == "Master" ]]; then
-    configure_yarn_nodemanager
-    install_spark_rapids
-    configure_gpu_script
+    configure_spark
+    echo "RAPIDS initialized with Spark runtime"
+  elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
+    # we are not currently tooled for installing dask in this action.
+    echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh"
+  else
+    echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
   fi
 
   # Restart YARN services if they are running already
   for svc in resourcemanager nodemanager; do
-    if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
-      systemctl restart hadoop-yarn-${svc}.service
+    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
+      systemctl restart "hadoop-yarn-${svc}.service"
     fi
   done
 }
@@ -226,14 +166,14 @@ function prepare_to_install(){
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
-  mkdir -p "${workdir}"
+  mkdir -p "${workdir}/complete"
   trap exit_handler EXIT
   set_proxy
   mount_ramdisk
 
   readonly install_log="${tmpdir}/install.log"
 
-  if test -f "${workdir}/prepare-complete" ; then return ; fi
+  if test -f "${workdir}/complete/prepare" ; then return ; fi
 
   repair_old_backports
 
@@ -261,7 +201,7 @@ function prepare_to_install(){
   screen -d -m -LUS keep-running-df \
     bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
 
-  touch "${workdir}/prepare-complete"
+  touch "${workdir}/complete/prepare"
 }
 
 prepare_to_install
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index fc37f109f..1781909d2 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -1,13 +1,24 @@
 #!/bin/bash
 #
 [% INSERT legal/license_header %]
-# This script installs NVIDIA GPU drivers (version 535.104.05) along with CUDA 12.2.
-# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only
-# Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions.
-# Note that the script is designed to work when secure boot is disabled during cluster creation.
-# It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu.
 #
 [% PROCESS common/template_disclaimer %]
+#
+# This script installs NVIDIA GPU drivers (version 550.135) along with
+# CUDA 12.4.
+#
+# Additionally, it installs the RAPIDS Spark plugin, configures Spark
+# and YARN, and installs an agent to collect GPU utilization metrics.
+# The installer is compatible with Debian, Ubuntu, and Rocky Linux
+# distributions.
+#
+# Note that the script is designed to work both when secure boot is
+# enabled with a custom image and when disabled during cluster
+# creation.
+#
+# For details see
+# github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot
+#
 
 set -euxo pipefail
 
@@ -17,105 +28,24 @@ set -euxo pipefail
 
 [% INSERT gpu/util_functions %]
 
-[% INSERT 'spark-rapids/util_functions' %]
-
-check_secure_boot
-
-# Stackdriver GPU agent parameters
-# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
-INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
-readonly INSTALL_GPU_AGENT
-
-# Dataproc configurations
-readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-readonly HIVE_CONF_DIR='/etc/hive/conf'
-readonly SPARK_CONF_DIR='/etc/spark/conf'
-
-NVIDIA_SMI_PATH='/usr/bin'
-MIG_MAJOR_CAPS=0
-IS_MIG_ENABLED=0
-
-function setup_gpu_yarn() {
-  # This configuration should be run on all nodes
-  # regardless if they have attached GPUs
-  configure_yarn_resources
-
-  # Detect NVIDIA GPU
-  if (lspci | grep -q NVIDIA); then
-    # if this is called without the MIG script then the drivers are not installed
-    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
-    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
-    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-
-    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-        if (echo "${migquery_result}" | grep Enabled); then
-          IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
-        fi
-      fi
-    fi
-
-    if is_debuntu ; then
-      execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
-    elif is_rocky ; then
-      echo "kernel devel and headers not required on rocky.  installing from binary"
-    fi
-
-    # if mig is enabled drivers would have already been installed
-    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
-      install_nvidia_gpu_driver
-      install_cuda
-      load_kernel_module
-
-      #Install GPU metrics collection in Stackdriver if needed
-      if [[ ${INSTALL_GPU_AGENT} == true ]]; then
-        #install_gpu_agent
-        install_gpu_monitoring_agent
-
-        echo 'GPU metrics agent successfully deployed.'
-      else
-        echo 'GPU metrics agent will not be installed.'
-      fi
-      configure_gpu_exclusive_mode
-    fi
-
-    configure_yarn_nodemanager
-    configure_gpu_script
-    configure_gpu_isolation
-  elif [[ "${ROLE}" == "Master" ]]; then
-    configure_yarn_nodemanager
-    configure_gpu_script
-  fi
-
-  # Restart YARN services if they are running already
-  for svc in resourcemanager nodemanager; do
-    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
-      systemctl restart "hadoop-yarn-${svc}.service"
-    fi
-  done
-}
-
 function main() {
-  repair_old_backports
-  check_os
-  check_secure_boot
-
   setup_gpu_yarn
+
   if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
     install_spark_rapids
-    configure_gpu_script
+    configure_spark
     echo "RAPIDS initialized with Spark runtime"
+  elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
+    # we are not currently tooled for installing dask in this action.
+    echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh"
   else
-    echo "Unsupported RAPIDS Runtime: ${RUNTIME}"
-    exit 1
+    echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
   fi
 
+  # Restart YARN services if they are running already
   for svc in resourcemanager nodemanager; do
-    if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
-      systemctl restart hadoop-yarn-${svc}.service
+    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
+      systemctl restart "hadoop-yarn-${svc}.service"
     fi
   done
 }
@@ -221,20 +151,9 @@ print( "    samples-taken: ", scalar @siz, $/,
   return 0
 }
 
-# Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
-readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
-readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
-
 # Fetch instance roles and runtime
-readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
 readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
 
-# CUDA version and Driver version config
-CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.4.1')  #12.2.2
-NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '550.54.15') #535.104.05
-CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.2
-
 function prepare_to_install(){
   # Verify OS compatability and Secure boot state
   check_os
@@ -259,14 +178,14 @@ function prepare_to_install(){
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
-  mkdir -p "${workdir}"
+  mkdir -p "${workdir}/complete"
   trap exit_handler EXIT
   set_proxy
   mount_ramdisk
 
   readonly install_log="${tmpdir}/install.log"
 
-  if test -f "${workdir}/prepare-complete" ; then return ; fi
+  if test -f "${workdir}/complete/prepare" ; then return ; fi
 
   repair_old_backports
 
@@ -294,7 +213,9 @@ function prepare_to_install(){
   screen -d -m -LUS keep-running-df \
     bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
 
-  touch "${workdir}/prepare-complete"
+  touch "${workdir}/complete/prepare"
 }
 
+prepare_to_install
+
 main

From af69141efd78d9edb4dee2793938be664552a837 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 25 Dec 2024 12:55:46 -0800
Subject: [PATCH 018/130] using new function name

---
 templates/gpu/install_gpu_driver.sh.in    | 2 +-
 templates/spark-rapids/spark-rapids.sh.in | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 616fc5eb2..09ba877ba 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -24,7 +24,7 @@ function main() {
 
   if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
     install_spark_rapids
-    configure_spark
+    configure_gpu_script
     echo "RAPIDS initialized with Spark runtime"
   elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
     # we are not currently tooled for installing dask in this action.
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 1781909d2..73e360c42 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -33,7 +33,7 @@ function main() {
 
   if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
     install_spark_rapids
-    configure_spark
+    configure_gpu_script
     echo "RAPIDS initialized with Spark runtime"
   elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
     # we are not currently tooled for installing dask in this action.

From d59d5e6d90a12f1f142b9d281e42be3054e5a721 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 25 Dec 2024 16:51:44 -0800
Subject: [PATCH 019/130] driver version for 12.4.0 had not been tested in a
 while and had become incorrect

---
 templates/gpu/util_functions | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 53e7daa93..8836a0caa 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -258,7 +258,7 @@ function set_cuda_runfile_url() {
           ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
           ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
           ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
-          ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
+          ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
           ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not
           ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
   )

From 8a9e00a08197f38621d87bb8b436ef923a4010fc Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 25 Dec 2024 19:33:19 -0800
Subject: [PATCH 020/130] expanding non-default version tests ; adding utility
 function to verify pyspark

---
 spark-rapids/test_spark_rapids.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 6e03f2d62..b8e0fe133 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -20,6 +20,10 @@ class SparkRapidsTestCase(DataprocTestCase):
   def verify_spark_instance(self, name):
     self.assert_instance_command(name, "nvidia-smi")
 
+  def verify_pyspark(self, name):
+    # Verify that pyspark works
+    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
+
   def verify_mig_instance(self, name):
     self.assert_instance_command(name,
         "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
@@ -114,13 +118,22 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
     # Only need to do this once
     self.verify_spark_job_sql()
 
-  @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"))
+  @parameterized.parameters(
+    ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"),
+    ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"),
+    ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03")
+  )
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
                                      accelerator, cuda_version, driver_version):
 
     if self.getImageOs() == "rocky":
       self.skipTest("Not supported for Rocky OS")
 
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
+          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
+      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+
     if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
       self.skipTest("Not supported in 2.0 and earlier images")
 
@@ -134,7 +147,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="60GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:

From 1113855ba26bfa29e2e734ac2b5475633a96aec3 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 25 Dec 2024 19:41:57 -0800
Subject: [PATCH 021/130] reduced boot disk size to 50GB

---
 spark-rapids/test_spark_rapids.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index b8e0fe133..ce78f3f52 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -147,7 +147,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="60GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:

From 7034739d34c13339ab87c41cc25cb262e3573274 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 25 Dec 2024 20:57:43 -0800
Subject: [PATCH 022/130] skipping old cuda on new images ; sizing instances to
 build

---
 spark-rapids/test_spark_rapids.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index ce78f3f52..f3aa19c6f 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -76,7 +76,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         self.INIT_ACTIONS,
         optional_components=optional_components,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type="n1-standard-32",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
         boot_disk_size="50GB",
@@ -106,7 +106,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
       self.INIT_ACTIONS,
       optional_components=optional_components,
       metadata=metadata,
-      machine_type="n1-standard-4",
+      machine_type="n1-standard-32",
       master_accelerator=accelerator if configuration == "SINGLE" else None,
       worker_accelerator=accelerator,
       boot_disk_size="50GB",
@@ -134,6 +134,11 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
       self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
 
+    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
+    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+
     if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
       self.skipTest("Not supported in 2.0 and earlier images")
 
@@ -144,7 +149,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         configuration,
         self.INIT_ACTIONS,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type="n1-standard-32",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
         boot_disk_size="50GB",

From 2873f490553080ff892086eab106db7d4ceb7f93 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 25 Dec 2024 21:25:11 -0800
Subject: [PATCH 023/130] skipping older debuntu when cuda version not
 specified

---
 spark-rapids/test_spark_rapids.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index f3aa19c6f..2d67a0df2 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -62,12 +62,13 @@ def verify_spark_job_sql(self):
                             ("STANDARD", ["w-0"], GPU_T4))
   def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
     if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
       self.skipTest("Not supported in 2.0 and earlier images")
 
+    if ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
+         ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
+      self.skipTest("CUDA 12.4 (default) not supported on older debian/ubuntu releases")
+
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -92,12 +93,13 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
                             ("STANDARD", ["w-0"], GPU_T4))
   def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
     if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
       self.skipTest("Not supported in 2.0 and earlier images")
 
+    if ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
+         ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
+      self.skipTest("CUDA 12.4 (default) not supported on older debian/ubuntu releases")
+
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -120,16 +122,14 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
 
   @parameterized.parameters(
     ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"),
+    ("STANDARD", ["w-0"], GPU_T4, "12.0.1", "525.147.05"),
     ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"),
     ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03")
   )
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
                                      accelerator, cuda_version, driver_version):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0.1") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
       self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")

From 576b32f6417248899d8f1784f241066eedf7b9fe Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 14:51:40 -0800
Subject: [PATCH 024/130] refactor into functions

---
 templates/gpu/util_functions | 134 ++++++++++++++++++-----------------
 1 file changed, 69 insertions(+), 65 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 8836a0caa..cdf0d847f 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -65,7 +65,23 @@ function set_cuda_version() {
   fi
 
   if ( ! test -v DEFAULT_CUDA_VERSION ) ; then
-    DEFAULT_CUDA_VERSION='12.4'
+    DEFAULT_CUDA_VERSION='12.4.1'
+  fi
+  # EXCEPTIONS
+  # Change default CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
+  case "${DATAPROC_IMAGE_VERSION}" in
+    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;;
+    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    *   )
+      echo "unrecognized Dataproc image version"
+      exit 1
+      ;;
+  esac
+
+  if le_ubuntu18 ; then
+    DEFAULT_CUDA_VERSION="12.1.1"
+    CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.1
   fi
   readonly DEFAULT_CUDA_VERSION
 
@@ -82,8 +98,6 @@ function set_cuda_version() {
 
 }
 
-set_cuda_version
-
 function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
 function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
 function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )
@@ -139,8 +153,6 @@ function set_driver_version() {
   fi
 }
 
-set_driver_version
-
 function set_cudnn_version() {
   readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
   readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
@@ -160,54 +172,29 @@ function set_cudnn_version() {
   fi
   readonly CUDNN_VERSION
 }
-set_cudnn_version
+
 
 function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
 function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
 
-readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
-readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
-
-# Parameters for NVIDIA-provided Debian GPU driver
-readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-
-readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
-
-USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
-readonly USERSPACE_FILENAME
-
+function set_cuda_repo_shortname() {
 # Short name for urls
-if is_ubuntu22  ; then
-    # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
-    # https://developer.download.nvidia.com/compute/machine-learning/repos/
-    # use packages from previous release until such time as nvidia
-    # release ubuntu2204 builds
-
-    shortname="$(os_id)$(os_vercat)"
-    nccl_shortname="ubuntu2004"
-elif ge_rocky9 ; then
-    # use packages from previous release until such time as nvidia
-    # release rhel9 builds
-
-    shortname="rhel9"
-    nccl_shortname="rhel8"
-elif is_rocky ; then
+# https://developer.download.nvidia.com/compute/cuda/repos/${shortname}
+  if is_rocky ; then
     shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
-    nccl_shortname="${shortname}"
-else
+  else
     shortname="$(os_id)$(os_vercat)"
-    nccl_shortname="${shortname}"
-fi
+  fi
+}
 
-# Parameters for NVIDIA-provided package repositories
-readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
-readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
+function set_nv_urls() {
+  # Parameters for NVIDIA-provided package repositories
+  readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
+  readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
 
-# Parameters for NVIDIA-provided NCCL library
-readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
-NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
-readonly NCCL_REPO_URL
-readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
+  # Parameter for NVIDIA-provided Rocky Linux GPU driver
+  readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+}
 
 function set_cuda_runfile_url() {
   local MAX_DRIVER_VERSION
@@ -291,11 +278,7 @@ function set_cuda_runfile_url() {
   fi
 }
 
-set_cuda_runfile_url
-
-# Parameter for NVIDIA-provided Rocky Linux GPU driver
-readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
-
+function set_cudnn_tarball_url() {
 CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
 CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
 if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
@@ -315,20 +298,8 @@ if ( version_ge "${CUDA_VERSION}" "12.0" ); then
 fi
 readonly CUDNN_TARBALL
 readonly CUDNN_TARBALL_URL
+}
 
-# Whether to install NVIDIA-provided or OS-provided GPU driver
-GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
-readonly GPU_DRIVER_PROVIDER
-
-# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
-INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
-readonly INSTALL_GPU_AGENT
-
-NVIDIA_SMI_PATH='/usr/bin'
-MIG_MAJOR_CAPS=0
-IS_MIG_ENABLED=0
-
-CUDA_KEYRING_PKG_INSTALLED="0"
 function install_cuda_keyring_pkg() {
   if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
   local kr_ver=1.1
@@ -376,7 +347,6 @@ function uninstall_local_cuda_repo(){
   rm -f "${workdir}/complete/install-local-cuda-repo"
 }
 
-CUDNN_PKG_NAME=""
 function install_local_cudnn_repo() {
   if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi
   pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
@@ -402,8 +372,6 @@ function uninstall_local_cudnn_repo() {
   rm -f "${workdir}/complete/install-local-cudnn-repo"
 }
 
-CUDNN8_LOCAL_REPO_INSTALLED="0"
-CUDNN8_PKG_NAME=""
 function install_local_cudnn8_repo() {
   if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi
 
@@ -448,6 +416,9 @@ function uninstall_local_cudnn8_repo() {
 }
 
 function install_nvidia_nccl() {
+  readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+
   if test -f "${workdir}/complete/nccl" ; then return ; fi
 
   if is_cuda11 && is_debian12 ; then
@@ -747,6 +718,13 @@ function build_driver_from_packages() {
 }
 
 function install_nvidia_userspace_runfile() {
+  # Parameters for NVIDIA-provided Debian GPU driver
+  readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+
+  readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+
+  USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
+  readonly USERSPACE_FILENAME
 
   # This .run file contains NV's OpenGL implementation as well as
   # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
@@ -1118,6 +1096,7 @@ EOF
     local task_cpus=2
     local gpu_amount
     gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+    if version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ; then gpu_amount="0.5" ; fi
 
     cat >>"${spark_defaults_conf}" <<EOF
 ###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
@@ -1235,6 +1214,31 @@ function install_dependencies() {
 }
 
 function prepare_gpu_env(){
+  set_cuda_version
+  set_driver_version
+  set_cuda_repo_shortname
+  set_nv_urls
+  set_cuda_runfile_url
+  set_cudnn_version
+  set_cudnn_tarball_url
+
+  # Whether to install NVIDIA-provided or OS-provided GPU driver
+  GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
+  readonly GPU_DRIVER_PROVIDER
+
+  # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
+  INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
+  readonly INSTALL_GPU_AGENT
+
+  NVIDIA_SMI_PATH='/usr/bin'
+  MIG_MAJOR_CAPS=0
+  IS_MIG_ENABLED=0
+
+  CUDA_KEYRING_PKG_INSTALLED="0"
+  CUDNN_PKG_NAME=""
+  CUDNN8_LOCAL_REPO_INSTALLED="0"
+  CUDNN8_PKG_NAME=""
+
   # Verify SPARK compatability
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
 

From b03dc5726d330290c47916e3ad71bd7f59c9d8d3 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 15:22:53 -0800
Subject: [PATCH 025/130] moved secure-boot utility functions and common
 environment setup into common/util_functions

---
 templates/common/util_functions           | 179 +++++++++++++++++++++-
 templates/gpu/install_gpu_driver.sh.in    |  61 +-------
 templates/secure-boot/util_functions      | 105 -------------
 templates/spark-rapids/spark-rapids.sh.in |  65 +-------
 4 files changed, 176 insertions(+), 234 deletions(-)
 delete mode 100644 templates/secure-boot/util_functions

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 3373fb24e..4ae90e722 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -30,8 +30,6 @@ function define_os_comparison_functions() {
   done
 }
 
-define_os_comparison_functions
-
 function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
 
 function os_vercat()   ( set +x
@@ -437,10 +435,177 @@ function os_add_repo() {
                   else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
 }
 
+function configure_dkms_certs() {
+  if test -v PSN && [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping";
+      return 0
+  fi
+
+  mkdir -p "${CA_TMPDIR}"
+
+  # If the private key exists, verify it
+  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
+    echo "Private key material exists"
+
+    local expected_modulus_md5sum
+    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
+      modulus_md5sum="${expected_modulus_md5sum}"
+
+      # Verify that the private key's modulus md5sum matches the expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key"
+      fi
+
+      # Verify that the x509 cert's modulus md5sum matches the expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert"
+      fi
+    else
+      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
+    fi
+    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+
+    return
+  fi
+
+  # Retrieve cloud secrets keys
+  local sig_priv_secret_name
+  sig_priv_secret_name="${PSN}"
+  local sig_pub_secret_name
+  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
+  local sig_secret_project
+  sig_secret_project="$(get_metadata_attribute secret_project)"
+  local sig_secret_version
+  sig_secret_version="$(get_metadata_attribute secret_version)"
+
+  # If metadata values are not set, do not write mok keys
+  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
+
+  # Write private material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_priv_secret_name}" \
+      | dd status=none of="${CA_TMPDIR}/db.rsa"
+
+  # Write public material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_pub_secret_name}" \
+      | base64 --decode \
+      | dd status=none of="${CA_TMPDIR}/db.der"
+
+  local mok_directory="$(dirname "${mok_key}")"
+  mkdir -p "${mok_directory}"
+
+  # symlink private key and copy public cert from volatile storage to DKMS directory
+  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
+}
+
+function clear_dkms_key {
+  if [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping" >&2
+      return 0
+  fi
+  rm -rf "${CA_TMPDIR}" "${mok_key}"
+}
+
+function check_secure_boot() {
+  local SECURE_BOOT="disabled"
+  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
+
+  PSN="$(get_metadata_attribute private_secret_name)"
+  readonly PSN
+
+  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
+    exit 1
+  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
+    echo "Secure boot is enabled, but no signing material provided."
+    echo "Please either disable secure boot or provide signing material as per"
+    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+    return 1
+  fi
+
+  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
+  readonly CA_TMPDIR
 
-readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
+                      mok_der=/var/lib/shim-signed/mok/MOK.der
+                 else mok_key=/var/lib/dkms/mok.key
+                      mok_der=/var/lib/dkms/mok.pub ; fi
 
-# Dataproc configurations
-readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-readonly HIVE_CONF_DIR='/etc/hive/conf'
-readonly SPARK_CONF_DIR='/etc/spark/conf'
+  configure_dkms_certs
+}
+
+function prepare_common_env() {
+  define_os_comparison_functions
+
+  # Verify OS compatibility and Secure boot state
+  check_os
+  check_secure_boot
+
+  readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+
+  # Dataproc configurations
+  readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+  readonly HIVE_CONF_DIR='/etc/hive/conf'
+  readonly SPARK_CONF_DIR='/etc/spark/conf'
+
+  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
+  readonly OS_NAME
+
+  # node role
+  ROLE="$(get_metadata_attribute dataproc-role)"
+  readonly ROLE
+
+  workdir=/opt/install-dpgce
+  tmpdir=/tmp/
+  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly temp_bucket
+  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  uname_r=$(uname -r)
+  readonly uname_r
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  export DEBIAN_FRONTEND=noninteractive
+
+  mkdir -p "${workdir}/complete"
+  trap exit_handler EXIT
+  set_proxy
+  mount_ramdisk
+
+  readonly install_log="${tmpdir}/install.log"
+
+  if test -f "${workdir}/complete/prepare.common" ; then return ; fi
+
+  repair_old_backports
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+    if ge_debian12 ; then
+    apt-mark unhold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
+    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
+  ) fi
+
+  install_dependencies
+
+  # Monitor disk usage in a screen session
+  df / > "/run/disk-usage.log"
+  touch "/run/keep-running-df"
+  screen -d -m -LUS keep-running-df \
+    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+
+  touch "${workdir}/complete/prepare.common"
+}
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 09ba877ba..bb17b2ab6 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -10,8 +10,6 @@ set -euxo pipefail
 
 [% INSERT common/util_functions %]
 
-[% INSERT 'secure-boot/util_functions' %]
-
 [% INSERT gpu/util_functions %]
 
 function main() {
@@ -143,65 +141,8 @@ print( "    samples-taken: ", scalar @siz, $/,
 }
 
 function prepare_to_install(){
-  # Verify OS compatability and Secure boot state
-  check_os
-  check_secure_boot
-
+  prepare_common_env
   prepare_gpu_env
-
-  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-  readonly OS_NAME
-
-  # node role
-  ROLE="$(get_metadata_attribute dataproc-role)"
-  readonly ROLE
-
-  workdir=/opt/install-dpgce
-  tmpdir=/tmp/
-  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
-  readonly temp_bucket
-  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
-  uname_r=$(uname -r)
-  readonly uname_r
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  export DEBIAN_FRONTEND=noninteractive
-
-  mkdir -p "${workdir}/complete"
-  trap exit_handler EXIT
-  set_proxy
-  mount_ramdisk
-
-  readonly install_log="${tmpdir}/install.log"
-
-  if test -f "${workdir}/complete/prepare" ; then return ; fi
-
-  repair_old_backports
-
-  if is_debuntu ; then
-    clean_up_sources_lists
-    apt-get update -qq
-    apt-get -y clean
-    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
-    if ge_debian12 ; then
-    apt-mark unhold systemd libsystemd0 ; fi
-  else
-    dnf clean all
-  fi
-
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
-    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
-  ) fi
-
-  install_dependencies
-
-  # Monitor disk usage in a screen session
-  df / > "/run/disk-usage.log"
-  touch "/run/keep-running-df"
-  screen -d -m -LUS keep-running-df \
-    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
-
-  touch "${workdir}/complete/prepare"
 }
 
 prepare_to_install
diff --git a/templates/secure-boot/util_functions b/templates/secure-boot/util_functions
deleted file mode 100644
index f96a48200..000000000
--- a/templates/secure-boot/util_functions
+++ /dev/null
@@ -1,105 +0,0 @@
-function configure_dkms_certs() {
-  if test -v PSN && [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping";
-      return 0
-  fi
-
-  mkdir -p "${CA_TMPDIR}"
-
-  # If the private key exists, verify it
-  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
-    echo "Private key material exists"
-
-    local expected_modulus_md5sum
-    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
-    if [[ -n "${expected_modulus_md5sum}" ]]; then
-      modulus_md5sum="${expected_modulus_md5sum}"
-
-      # Verify that cert md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched rsa key"
-      fi
-
-      # Verify that key md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched x509 cert"
-      fi
-    else
-      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
-    fi
-    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-
-    return
-  fi
-
-  # Retrieve cloud secrets keys
-  local sig_priv_secret_name
-  sig_priv_secret_name="${PSN}"
-  local sig_pub_secret_name
-  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
-  local sig_secret_project
-  sig_secret_project="$(get_metadata_attribute secret_project)"
-  local sig_secret_version
-  sig_secret_version="$(get_metadata_attribute secret_version)"
-
-  # If metadata values are not set, do not write mok keys
-  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
-
-  # Write private material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_priv_secret_name}" \
-      | dd status=none of="${CA_TMPDIR}/db.rsa"
-
-  # Write public material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_pub_secret_name}" \
-      | base64 --decode \
-      | dd status=none of="${CA_TMPDIR}/db.der"
-
-  local mok_directory="$(dirname "${mok_key}")"
-  mkdir -p "${mok_directory}"
-
-  # symlink private key and copy public cert from volatile storage to DKMS directory
-  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
-
-  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
-}
-
-function clear_dkms_key {
-  if [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping" >&2
-      return 0
-  fi
-  rm -rf "${CA_TMPDIR}" "${mok_key}"
-}
-
-function check_secure_boot() {
-  local SECURE_BOOT="disabled"
-  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
-
-  PSN="$(get_metadata_attribute private_secret_name)"
-  readonly PSN
-
-  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
-    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
-    exit 1
-  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
-    echo "Secure boot is enabled, but no signing material provided."
-    echo "Please either disable secure boot or provide signing material as per"
-    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
-    return 1
-  fi
-
-  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
-  readonly CA_TMPDIR
-
-  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
-                      mok_der=/var/lib/shim-signed/mok/MOK.der
-                 else mok_key=/var/lib/dkms/mok.key
-                      mok_der=/var/lib/dkms/mok.pub ; fi
-
-  configure_dkms_certs
-}
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 73e360c42..729c556ed 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -24,8 +24,6 @@ set -euxo pipefail
 
 [% INSERT common/util_functions %]
 
-[% INSERT 'secure-boot/util_functions' %]
-
 [% INSERT gpu/util_functions %]
 
 function main() {
@@ -151,69 +149,12 @@ print( "    samples-taken: ", scalar @siz, $/,
   return 0
 }
 
-# Fetch instance roles and runtime
-readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
-
 function prepare_to_install(){
-  # Verify OS compatability and Secure boot state
-  check_os
-  check_secure_boot
-
+  prepare_common_env
   prepare_gpu_env
 
-  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-  readonly OS_NAME
-
-  # node role
-  ROLE="$(get_metadata_attribute dataproc-role)"
-  readonly ROLE
-
-  workdir=/opt/install-dpgce
-  tmpdir=/tmp/
-  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
-  readonly temp_bucket
-  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
-  uname_r=$(uname -r)
-  readonly uname_r
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  export DEBIAN_FRONTEND=noninteractive
-
-  mkdir -p "${workdir}/complete"
-  trap exit_handler EXIT
-  set_proxy
-  mount_ramdisk
-
-  readonly install_log="${tmpdir}/install.log"
-
-  if test -f "${workdir}/complete/prepare" ; then return ; fi
-
-  repair_old_backports
-
-  if is_debuntu ; then
-    clean_up_sources_lists
-    apt-get update -qq
-    apt-get -y clean
-    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
-    if ge_debian12 ; then
-    apt-mark unhold systemd libsystemd0 ; fi
-  else
-    dnf clean all
-  fi
-
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
-    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
-  ) fi
-
-  install_dependencies
-
-  # Monitor disk usage in a screen session
-  df / > "/run/disk-usage.log"
-  touch "/run/keep-running-df"
-  screen -d -m -LUS keep-running-df \
-    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
-
-  touch "${workdir}/complete/prepare"
+  # Fetch instance roles and runtime
+  readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
 }
 
 prepare_to_install

From bf98d8591dfe59582bd043e43867adb02b05e995 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 15:31:21 -0800
Subject: [PATCH 026/130] refactored exit_handler

---
 templates/common/util_functions           | 94 +++++++++++++++++++++
 templates/gpu/install_gpu_driver.sh.in    | 99 +----------------------
 templates/gpu/util_functions              | 10 +++
 templates/spark-rapids/spark-rapids.sh.in | 99 +----------------------
 4 files changed, 108 insertions(+), 194 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 4ae90e722..929eff37a 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -609,3 +609,97 @@ function prepare_common_env() {
 
   touch "${workdir}/complete/prepare.common"
 }
+
+function common_exit_handler() {
+  # Cleanup shared by the init actions' exit_handler: purge signing keys and caches,
+  # unmount temporary tmpfs mounts, then report disk usage sampled during the install.
+  # Purge private key material until next grant
+  clear_dkms_key
+
+  set +ex
+  echo "Exit handler invoked"
+
+  # Clear pip cache
+  pip cache purge || echo "unable to purge pip cache"
+
+  # If system memory was sufficient to mount memory-backed filesystems
+  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+        umount -f ${shmdir}
+      fi
+    done
+
+    # restart services stopped during preparation stage
+    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+  fi
+
+  if is_debuntu ; then
+    # Clean up OS package cache
+    apt-get -y -qq clean
+    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
+    # re-hold systemd package
+    if ge_debian12 ; then apt-mark hold systemd libsystemd0 ; fi
+    hold_nvidia_packages
+  else
+    dnf clean all
+  fi
+
+  # print disk usage statistics for large components
+  if is_ubuntu ; then
+    du -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /opt/conda/miniconda3 | sort -h
+  elif is_debian ; then
+    du -x -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
+      /var/lib/{docker,mysql,} \
+      /opt/nvidia/* \
+      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+      /usr/bin \
+      /usr \
+      /var \
+      / 2>/dev/null | sort -h
+  else
+    du -hs \
+      /var/lib/docker \
+      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
+      /usr/lib64/google-cloud-sdk \
+      /opt/nvidia/* \
+      /opt/conda/miniconda3
+  fi
+
+  # Process disk usage logs from installation period
+  rm -f /run/keep-running-df
+  sync ; sleep 5.01s
+  # compute maximum size of disk during installation
+  # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+  df / | tee -a "/run/disk-usage.log"
+
+  # sort the Used column numerically, largest first, so $siz[0] is max and $siz[-1] is min
+  perl -e '@siz=( sort { $b <=> $a }
+                   map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } <STDIN> );
+$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+print( "    samples-taken: ", scalar @siz, $/,
+       "maximum-disk-used: $max", $/,
+       "minimum-disk-used: $min", $/,
+       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+
+  echo "exit_handler has completed"
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    dd if=/dev/zero of=/zero
+    sync ; sleep 3s
+    rm -f /zero
+  fi
+}
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index bb17b2ab6..b2fa93b35 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -40,103 +40,8 @@ function main() {
 }
 
 function exit_handler() {
-  # Purge private key material until next grant
-  clear_dkms_key
-
-  set +ex
-  echo "Exit handler invoked"
-
-  # Clear pip cache
-  pip cache purge || echo "unable to purge pip cache"
-
-  # If system memory was sufficient to mount memory-backed filesystems
-  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    # remove the tmpfs pip cache-dir
-    pip config unset global.cache-dir || echo "unable to unset global pip cache"
-
-    # Clean up shared memory mounts
-    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
-      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
-        umount -f ${shmdir}
-      fi
-    done
-
-    # restart services stopped during preparation stage
-    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
-  fi
-
-  if is_debuntu ; then
-    # Clean up OS package cache
-    apt-get -y -qq clean
-    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
-    # re-hold systemd package
-    if ge_debian12 ; then
-    apt-mark hold systemd libsystemd0 ; fi
-    hold_nvidia_packages
-  else
-    dnf clean all
-  fi
-
-  # print disk usage statistics for large components
-  if is_ubuntu ; then
-    du -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3 | sort -h
-  elif is_debian ; then
-    du -x -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /var/lib/{docker,mysql,} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
-      /usr/bin \
-      /usr \
-      /var \
-      / 2>/dev/null | sort -h
-  else
-    du -hs \
-      /var/lib/docker \
-      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
-      /usr/lib64/google-cloud-sdk \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3
-  fi
-
-  # Process disk usage logs from installation period
-  rm -f /run/keep-running-df
-  sync
-  sleep 5.01s
-  # compute maximum size of disk during installation
-  # Log file contains logs like the following (minus the preceeding #):
-#Filesystem     1K-blocks    Used Available Use% Mounted on
-#/dev/vda2        7096908 2611344   4182932  39% /
-  df / | tee -a "/run/disk-usage.log"
-
-  perl -e '@siz=( sort { $a => $b }
-                   map { (split)[2] =~ /^(\d+)/ }
-                  grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
-print( "    samples-taken: ", scalar @siz, $/,
-       "maximum-disk-used: $max", $/,
-       "minimum-disk-used: $min", $/,
-       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
-
-  echo "exit_handler has completed"
-
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    dd if=/dev/zero of=/zero
-    sync
-    sleep 3s
-    rm -f /zero
-  fi
-
+  gpu_exit_handler
+  common_exit_handler
   return 0
 }
 
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index cdf0d847f..1b204622e 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1363,3 +1363,13 @@ function setup_gpu_yarn() {
     configure_gpu_script
   fi
 }
+
+function gpu_exit_handler() {
+  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
+    for shmdir in /var/cudnn-local ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+        umount -f ${shmdir}
+      fi
+    fi
+  fi
+}
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 729c556ed..bc1d98e94 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -49,103 +49,8 @@ function main() {
 }
 
 function exit_handler() {
-  # Purge private key material until next grant
-  clear_dkms_key
-
-  set +ex
-  echo "Exit handler invoked"
-
-  # Clear pip cache
-  pip cache purge || echo "unable to purge pip cache"
-
-  # If system memory was sufficient to mount memory-backed filesystems
-  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    # remove the tmpfs pip cache-dir
-    pip config unset global.cache-dir || echo "unable to unset global pip cache"
-
-    # Clean up shared memory mounts
-    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
-      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
-        umount -f ${shmdir}
-      fi
-    done
-
-    # restart services stopped during preparation stage
-    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
-  fi
-
-  if is_debuntu ; then
-    # Clean up OS package cache
-    apt-get -y -qq clean
-    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
-    # re-hold systemd package
-    if ge_debian12 ; then
-    apt-mark hold systemd libsystemd0 ; fi
-    hold_nvidia_packages
-  else
-    dnf clean all
-  fi
-
-  # print disk usage statistics for large components
-  if is_ubuntu ; then
-    du -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3 | sort -h
-  elif is_debian ; then
-    du -x -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /var/lib/{docker,mysql,} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
-      /usr/bin \
-      /usr \
-      /var \
-      / 2>/dev/null | sort -h
-  else
-    du -hs \
-      /var/lib/docker \
-      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
-      /usr/lib64/google-cloud-sdk \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3
-  fi
-
-  # Process disk usage logs from installation period
-  rm -f /run/keep-running-df
-  sync
-  sleep 5.01s
-  # compute maximum size of disk during installation
-  # Log file contains logs like the following (minus the preceeding #):
-#Filesystem     1K-blocks    Used Available Use% Mounted on
-#/dev/vda2        7096908 2611344   4182932  39% /
-  df / | tee -a "/run/disk-usage.log"
-
-  perl -e '@siz=( sort { $a => $b }
-                   map { (split)[2] =~ /^(\d+)/ }
-                  grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
-print( "    samples-taken: ", scalar @siz, $/,
-       "maximum-disk-used: $max", $/,
-       "minimum-disk-used: $min", $/,
-       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
-
-  echo "exit_handler has completed"
-
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    dd if=/dev/zero of=/zero
-    sync
-    sleep 3s
-    rm -f /zero
-  fi
-
+  gpu_exit_handler
+  common_exit_handler
   return 0
 }
 

From 4320953abb720da5e4ee452580de6c67cf3d4b57 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 15:51:58 -0800
Subject: [PATCH 027/130] declaring constants prior to running functions

---
 templates/gpu/util_functions | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 1b204622e..5489ea33d 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1214,13 +1214,14 @@ function install_dependencies() {
 }
 
 function prepare_gpu_env(){
-  set_cuda_version
-  set_driver_version
-  set_cuda_repo_shortname
-  set_nv_urls
-  set_cuda_runfile_url
-  set_cudnn_version
-  set_cudnn_tarball_url
+  nvsmi_works="0"
+  NVIDIA_SMI_PATH='/usr/bin'
+  MIG_MAJOR_CAPS=0
+  IS_MIG_ENABLED=0
+  CUDA_KEYRING_PKG_INSTALLED="0"
+  CUDNN_PKG_NAME=""
+  CUDNN8_LOCAL_REPO_INSTALLED="0"
+  CUDNN8_PKG_NAME=""
 
   # Whether to install NVIDIA-provided or OS-provided GPU driver
   GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
@@ -1230,19 +1231,17 @@ function prepare_gpu_env(){
   INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
   readonly INSTALL_GPU_AGENT
 
-  NVIDIA_SMI_PATH='/usr/bin'
-  MIG_MAJOR_CAPS=0
-  IS_MIG_ENABLED=0
-
-  CUDA_KEYRING_PKG_INSTALLED="0"
-  CUDNN_PKG_NAME=""
-  CUDNN8_LOCAL_REPO_INSTALLED="0"
-  CUDNN8_PKG_NAME=""
-
   # Verify SPARK compatability
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+  readonly RAPIDS_RUNTIME
 
-  nvsmi_works="0"
+  set_cuda_version
+  set_driver_version
+  set_cuda_repo_shortname
+  set_nv_urls
+  set_cuda_runfile_url
+  set_cudnn_version
+  set_cudnn_tarball_url
 
   if   is_cuda11 ; then gcc_ver="11"
   elif is_cuda12 ; then gcc_ver="12" ; fi
@@ -1370,6 +1369,6 @@ function gpu_exit_handler() {
       if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
         umount -f ${shmdir}
       fi
-    fi
+    done
   fi
 }

From 2b0947bb194d856855e8ae0da5b9b12619d97fd3 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 16:10:15 -0800
Subject: [PATCH 028/130] removed old variables, included a current one which
 does not get exercised

---
 templates/gpu/util_functions | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 5489ea33d..07d6f92ec 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1218,10 +1218,9 @@ function prepare_gpu_env(){
   NVIDIA_SMI_PATH='/usr/bin'
   MIG_MAJOR_CAPS=0
   IS_MIG_ENABLED=0
-  CUDA_KEYRING_PKG_INSTALLED="0"
   CUDNN_PKG_NAME=""
-  CUDNN8_LOCAL_REPO_INSTALLED="0"
   CUDNN8_PKG_NAME=""
+  CUDA_LOCAL_REPO_INSTALLED="0"
 
   # Whether to install NVIDIA-provided or OS-provided GPU driver
   GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')

From 5dbc1f28ea24ae2699b71d865b230f26776b9478 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 16:23:52 -0800
Subject: [PATCH 029/130] do not break if variable undefined

---
 templates/gpu/util_functions | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 07d6f92ec..6516ad948 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -301,7 +301,8 @@ readonly CUDNN_TARBALL_URL
 }
 
 function install_cuda_keyring_pkg() {
-  if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
+  if ( test -v CUDA_KEYRING_PKG_INSTALLED &&
+       [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi
   local kr_ver=1.1
   curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \

From c5d46d3e62c76783b839f4a04d876e93f7403393 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 19:00:08 -0800
Subject: [PATCH 030/130] order of operations error fixed with parentheses.

---
 templates/gpu/util_functions | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 6516ad948..55b241989 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -81,7 +81,7 @@ function set_cuda_version() {
 
   if le_ubuntu18 ; then
     DEFAULT_CUDA_VERSION="12.1.1"
-    CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.1
+    CUDA_VERSION_MAJOR="${DEFAULT_CUDA_VERSION%.*}"  #12.1
   fi
   readonly DEFAULT_CUDA_VERSION
 
@@ -648,7 +648,7 @@ function build_driver_from_github() {
       # build the kernel modules
       pushd open-gpu-kernel-modules
       install_build_dependencies
-      if is_cuda11 && is_ubuntu22 ; then
+      if ( is_cuda11 && is_ubuntu22 ) ; then
         echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
         exit 1
       fi

From 7be62b33afb27561e0e8908dae6ff8f7bc4b60e1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 19:29:46 -0800
Subject: [PATCH 031/130] using lower xgboost version for older dataproc images

---
 templates/gpu/util_functions | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 55b241989..f093c00dc 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1033,7 +1033,14 @@ function install_spark_rapids() {
   # Update SPARK RAPIDS config
   readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
-  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
+
+  if version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
+    DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
+  elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then
+    DEFAULT_XGBOOST_VERSION="1.6.2"
+  fi
+
+  readonly DEFAULT_XGBOOST_VERSION
   readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
   local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'

From 41c327a6c7d2422e9c1d165db79d8bac29dd5465 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 20:24:09 -0800
Subject: [PATCH 032/130] test whether the variable is defined before testing
 its value

---
 templates/gpu/install_gpu_driver.sh.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index b2fa93b35..963becaf7 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -14,7 +14,7 @@ set -euxo pipefail
 
 function main() {
   setup_gpu_yarn
-  if [[ -n ${CUDNN_VERSION} ]]; then
+  if ( test -v CUDNN_VERSION && [[ -n ${CUDNN_VERSION} ]] ) ; then
     install_nvidia_nccl
     install_nvidia_cudnn
   fi

From b70477bf1f2efebc810e02de86cc7c96daf3ff77 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 20:34:59 -0800
Subject: [PATCH 033/130] refactor the xgboost installer a little

---
 templates/gpu/util_functions | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index f093c00dc..8af157063 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1034,8 +1034,12 @@ function install_spark_rapids() {
   readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
 
+  # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
+  local -r scala_ver="2.12"
+  if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then
+    DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.3
   if version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
-    DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
+    DEFAULT_XGBOOST_VERSION="1.7.6"
   elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then
     DEFAULT_XGBOOST_VERSION="1.6.2"
   fi
@@ -1048,13 +1052,13 @@ function install_spark_rapids() {
   local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
 
   wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \
+    "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \
     -P /usr/lib/spark/jars/
   wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \
+    "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \
     -P /usr/lib/spark/jars/
   wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \
+    "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" \
     -P /usr/lib/spark/jars/
 }
 

From 073ed1f6521dc5db3e15bc2a289fc40ee673e700 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 23:09:25 -0800
Subject: [PATCH 034/130] only minor changes

---
 spark-rapids/spark-rapids.sh | 10 ++++------
 templates/gpu/util_functions | 28 ++++++++++++++--------------
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index 0b4aabd57..6fdfbb78c 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -232,12 +232,10 @@ CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.2
 
 # EXCEPTIONS
 # Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
-if [[ "${OS_NAME}" == "ubuntu" ]]; then
-    if is_ubuntu18 ; then
-      CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1')  #12.1.1
-      NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02
-      CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.1
-    fi
+if is_ubuntu18 ; then
+  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1')  #12.1.1
+  NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02
+  CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.1
 fi
 
 # Verify Secure boot
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 8af157063..82f05d6e8 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1036,9 +1036,9 @@ function install_spark_rapids() {
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
-  if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then
+  if   version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then
     DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.3
-  if version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
+  elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
     DEFAULT_XGBOOST_VERSION="1.7.6"
   elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then
     DEFAULT_XGBOOST_VERSION="1.6.2"
@@ -1100,17 +1100,18 @@ EOF
   local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
   local gpu_count
   gpu_count="$(lspci | grep NVIDIA | wc -l)"
-  if version_ge "${gpu_count}" "1" ; then
-    local executor_cores
-    executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
-    local executor_memory
-    executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
-    local task_cpus=2
-    local gpu_amount
-    gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
-    if version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ; then gpu_amount="0.5" ; fi
-
-    cat >>"${spark_defaults_conf}" <<EOF
+  if version_lt "${gpu_count}" "1" ; then return ; fi
+
+  local executor_cores
+  executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
+  local executor_memory
+  executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
+  local task_cpus=2
+  local gpu_amount
+  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+  if version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ; then gpu_amount="0.5" ; fi
+
+  cat >>"${spark_defaults_conf}" <<EOF
 ###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
 # Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
 # query explain output won't show GPU operator, if the user has doubts
@@ -1128,7 +1129,6 @@ spark.task.cpus=2
 spark.yarn.unmanagedAM.enabled=false
 ###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
 EOF
-  fi
 }
 
 function configure_gpu_isolation() {

From 4f66a51c56c3d1ade992fba94eaeef42abe48e5d Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 23:10:18 -0800
Subject: [PATCH 035/130] explicitly notifying at the completion of the main
 function

---
 templates/gpu/install_gpu_driver.sh.in    | 5 ++++-
 templates/spark-rapids/spark-rapids.sh.in | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 963becaf7..53e2f33c7 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -14,7 +14,7 @@ set -euxo pipefail
 
 function main() {
   setup_gpu_yarn
-  if ( test -v CUDNN_VERSION && [[ -n ${CUDNN_VERSION} ]] ) ; then
+  if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then
     install_nvidia_nccl
     install_nvidia_cudnn
   fi
@@ -37,6 +37,8 @@ function main() {
       systemctl restart "hadoop-yarn-${svc}.service"
     fi
   done
+  echo "main complete"
+  return 0
 }
 
 function exit_handler() {
@@ -48,6 +50,7 @@ function exit_handler() {
 function prepare_to_install(){
   prepare_common_env
   prepare_gpu_env
+  trap exit_handler EXIT
 }
 
 prepare_to_install
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index bc1d98e94..c5691fea9 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -46,6 +46,8 @@ function main() {
       systemctl restart "hadoop-yarn-${svc}.service"
     fi
   done
+  echo "main complete"
+  return 0
 }
 
 function exit_handler() {
@@ -57,6 +59,7 @@ function exit_handler() {
 function prepare_to_install(){
   prepare_common_env
   prepare_gpu_env
+  trap exit_handler EXIT
 
   # Fetch instance roles and runtime
   readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master)

From f8a9b7dc82f00a43acda6ac5cc7efd8a04c6f53a Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 23:26:37 -0800
Subject: [PATCH 036/130] moved trap outside of the template

---
 templates/common/util_functions | 1 -
 1 file changed, 1 deletion(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 929eff37a..6a490ad71 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -573,7 +573,6 @@ function prepare_common_env() {
   export DEBIAN_FRONTEND=noninteractive
 
   mkdir -p "${workdir}/complete"
-  trap exit_handler EXIT
   set_proxy
   mount_ramdisk
 

From 19520b4f92fe5c3978c77d6aece88aabacaf9291 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 26 Dec 2024 23:42:16 -0800
Subject: [PATCH 037/130] stop / start instead of restart

---
 templates/gpu/install_gpu_driver.sh.in    | 5 ++++-
 templates/spark-rapids/spark-rapids.sh.in | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 53e2f33c7..6a9fde18e 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -32,9 +32,12 @@ function main() {
   fi
 
   # Restart YARN services if they are running already
+  nodes_include_gcs="gs:/$(get_metadata_attribute dataproc-bucket)/google-cloud-dataproc-metainfo/$(get_metadata_attribute dataproc-cluster-uuid)/nodes_include"
+  gsutil ls "${nodes_include_gcs}"
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
-      systemctl restart "hadoop-yarn-${svc}.service"
+      systemctl  stop "hadoop-yarn-${svc}.service"
+      systemctl start "hadoop-yarn-${svc}.service"
     fi
   done
   echo "main complete"
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index c5691fea9..f20689cf0 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -43,7 +43,8 @@ function main() {
   # Restart YARN services if they are running already
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
-      systemctl restart "hadoop-yarn-${svc}.service"
+      systemctl  stop "hadoop-yarn-${svc}.service"
+      systemctl start "hadoop-yarn-${svc}.service"
     fi
   done
   echo "main complete"

From f659ec55fb2dec937a3e3b9223168f86dc7fe8a4 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 01:06:52 -0800
Subject: [PATCH 038/130] skipping install on gpu-less systems more quickly

---
 templates/common/util_functions           | 2 --
 templates/gpu/install_gpu_driver.sh.in    | 3 +++
 templates/gpu/util_functions              | 4 +++-
 templates/spark-rapids/spark-rapids.sh.in | 2 ++
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 6a490ad71..114fce6a5 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -598,8 +598,6 @@ function prepare_common_env() {
     time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
   ) fi
 
-  install_dependencies
-
   # Monitor disk usage in a screen session
   df / > "/run/disk-usage.log"
   touch "/run/keep-running-df"
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 6a9fde18e..8ce2088d7 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -14,6 +14,9 @@ set -euxo pipefail
 
 function main() {
   setup_gpu_yarn
+
+  echo "yarn setup complete"
+
   if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then
     install_nvidia_nccl
     install_nvidia_cudnn
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 82f05d6e8..cea01b03d 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1100,7 +1100,6 @@ EOF
   local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
   local gpu_count
   gpu_count="$(lspci | grep NVIDIA | wc -l)"
-  if version_lt "${gpu_count}" "1" ; then return ; fi
 
   local executor_cores
   executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
@@ -1223,6 +1222,7 @@ function install_dependencies() {
   pkg_list="pciutils screen"
   if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
   elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
+  lspci | grep -q NVIDIA || exit 0
 }
 
 function prepare_gpu_env(){
@@ -1246,6 +1246,8 @@ function prepare_gpu_env(){
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
   readonly RAPIDS_RUNTIME
 
+  install_dependencies
+
   set_cuda_version
   set_driver_version
   set_cuda_repo_shortname
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index f20689cf0..c5a204703 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -29,6 +29,8 @@ set -euxo pipefail
 function main() {
   setup_gpu_yarn
 
+  echo "yarn setup complete"
+
   if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
     install_spark_rapids
     configure_gpu_script

From af817f0c49b4a8365072a40cf226183b306e6fb1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 01:58:07 -0800
Subject: [PATCH 039/130] install_dependencies is called from base template
 prep function

---
 templates/gpu/install_gpu_driver.sh.in    | 1 +
 templates/gpu/util_functions              | 2 --
 templates/spark-rapids/spark-rapids.sh.in | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 8ce2088d7..7974640ec 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -54,6 +54,7 @@ function exit_handler() {
 }
 
 function prepare_to_install(){
+  install_dependencies
   prepare_common_env
   prepare_gpu_env
   trap exit_handler EXIT
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index cea01b03d..8c7de3b48 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1246,8 +1246,6 @@ function prepare_gpu_env(){
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
   readonly RAPIDS_RUNTIME
 
-  install_dependencies
-
   set_cuda_version
   set_driver_version
   set_cuda_repo_shortname
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index c5a204703..77649fb19 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -60,6 +60,7 @@ function exit_handler() {
 }
 
 function prepare_to_install(){
+  install_dependencies
   prepare_common_env
   prepare_gpu_env
   trap exit_handler EXIT

From 2e7441ba75c245d0e719aa4b911df6718455bff6 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 02:29:29 -0800
Subject: [PATCH 040/130] move install_dependencies into common/util_functions

---
 templates/common/util_functions           | 9 +++++++++
 templates/gpu/install_gpu_driver.sh.in    | 1 -
 templates/gpu/util_functions              | 7 -------
 templates/spark-rapids/spark-rapids.sh.in | 1 -
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 114fce6a5..bb5073cd6 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -541,6 +541,13 @@ function check_secure_boot() {
   configure_dkms_certs
 }
 
+function install_dependencies() {
+  pkg_list="pciutils screen"
+  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
+  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
+  lspci | grep -q NVIDIA || exit 0
+}
+
 function prepare_common_env() {
   define_os_comparison_functions
 
@@ -598,6 +605,8 @@ function prepare_common_env() {
     time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
   ) fi
 
+  install_dependencies
+
   # Monitor disk usage in a screen session
   df / > "/run/disk-usage.log"
   touch "/run/keep-running-df"
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 7974640ec..8ce2088d7 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -54,7 +54,6 @@ function exit_handler() {
 }
 
 function prepare_to_install(){
-  install_dependencies
   prepare_common_env
   prepare_gpu_env
   trap exit_handler EXIT
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 8c7de3b48..d8371a258 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1218,13 +1218,6 @@ function install_build_dependencies() {
   touch "${workdir}/complete/build-dependencies"
 }
 
-function install_dependencies() {
-  pkg_list="pciutils screen"
-  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
-  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
-  lspci | grep -q NVIDIA || exit 0
-}
-
 function prepare_gpu_env(){
   nvsmi_works="0"
   NVIDIA_SMI_PATH='/usr/bin'
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 77649fb19..c5a204703 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -60,7 +60,6 @@ function exit_handler() {
 }
 
 function prepare_to_install(){
-  install_dependencies
   prepare_common_env
   prepare_gpu_env
   trap exit_handler EXIT

From 29631a0c615a4730c87f50fc09910ddcf8b8a22f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 14:20:38 -0800
Subject: [PATCH 041/130] refactored configure_gpu_exclusive_mode to fewer
 lines

---
 templates/gpu/util_functions | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index d8371a258..03417020b 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -981,6 +981,7 @@ function install_gpu_agent() {
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
   local venv="${install_dir}/venv"
   python3 -m venv "${venv}"
+
 (
   source "${venv}/bin/activate"
   python3 -m pip install --upgrade pip
@@ -1012,13 +1013,10 @@ EOF
 }
 
 function configure_gpu_exclusive_mode() {
-  # check if running spark 3, if not, enable GPU exclusive mode
-  local spark_version
-  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
-  if [[ ${spark_version} != 3.* ]]; then
-    # include exclusive mode on GPU
-    nvidia-smi -c EXCLUSIVE_PROCESS
-  fi
+  # only run this function when spark < 3.0
+  if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
+  # include exclusive mode on GPU
+  nvidia-smi -c EXCLUSIVE_PROCESS
 }
 
 function fetch_mig_scripts() {

From 7ea7653f47fa9bb2c2c6bdaa6c8dc21bdfaf59d5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 15:01:17 -0800
Subject: [PATCH 042/130] refactored gpu-related code out of common function
 library ; less reactive to not having GPU

---
 templates/common/util_functions        |  8 +--
 templates/gpu/install_gpu_driver.sh.in |  1 -
 templates/gpu/util_functions           | 87 +++++++++++++++-----------
 3 files changed, 51 insertions(+), 45 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index bb5073cd6..9c2e89372 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -173,11 +173,6 @@ function configure_yarn_resources() {
 
 # This configuration should be applied only if GPU is attached to the node
 function configure_yarn_nodemanager() {
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
   set_hadoop_property 'yarn-site.xml' \
     'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
   set_hadoop_property 'yarn-site.xml' \
@@ -542,10 +537,11 @@ function check_secure_boot() {
 }
 
 function install_dependencies() {
+  test -f "${workdir}/complete/install-dependencies" && return 0
   pkg_list="pciutils screen"
   if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
   elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
-  lspci | grep -q NVIDIA || exit 0
+  touch "${workdir}/complete/install-dependencies"
 }
 
 function prepare_common_env() {
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 8ce2088d7..ae4693f16 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -21,7 +21,6 @@ function main() {
     install_nvidia_nccl
     install_nvidia_cudnn
   fi
-  install_nvidia_container_toolkit
 
   if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
     install_spark_rapids
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 03417020b..71b5c89d0 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1096,8 +1096,6 @@ EOF
   chmod a+rx "${gpus_resources_script}"
 
   local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
-  local gpu_count
-  gpu_count="$(lspci | grep NVIDIA | wc -l)"
 
   local executor_cores
   executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
@@ -1128,6 +1126,15 @@ spark.yarn.unmanagedAM.enabled=false
 EOF
 }
 
+function configure_yarn_nodemanager_gpu() {
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
+  configure_yarn_nodemanager
+}
+
 function configure_gpu_isolation() {
   # enable GPU isolation
   sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
@@ -1217,6 +1224,7 @@ function install_build_dependencies() {
 }
 
 function prepare_gpu_env(){
+  gpu_count="$(lspci | grep -q NVIDIA | wc -l)"
   nvsmi_works="0"
   NVIDIA_SMI_PATH='/usr/bin'
   MIG_MAJOR_CAPS=0
@@ -1321,48 +1329,51 @@ function setup_gpu_yarn() {
   # regardless if they have attached GPUs
   configure_yarn_resources
 
-  # Detect NVIDIA GPU
-  if (lspci | grep -q NVIDIA); then
-    # if this is called without the MIG script then the drivers are not installed
-    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
-    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
-    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-
-    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-        if (echo "${migquery_result}" | grep Enabled); then
-          IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
-        fi
-      fi
+  # When there is no GPU, but the installer is executing on a master node:
+  if [[ "${gpu_count}" == "0" ]] ; then
+    if [[ "${ROLE}" == "Master" ]]; then
+      configure_yarn_nodemanager
     fi
+    return 0
+  fi
 
-    # if mig is enabled drivers would have already been installed
-    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
-      install_nvidia_gpu_driver
-      install_cuda
-      load_kernel_module
-
-      #Install GPU metrics collection in Stackdriver if needed
-      if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-        install_gpu_agent
-#        install_gpu_monitoring_agent
-        echo 'GPU metrics agent successfully deployed.'
-      else
-        echo 'GPU metrics agent has not been installed.'
+  # if this is called without the MIG script then the drivers are not installed
+  migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
+  if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
+  NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
+
+  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+    if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+      if (echo "${migquery_result}" | grep Enabled); then
+        IS_MIG_ENABLED=1
+        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+        MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
+        fetch_mig_scripts
       fi
-      configure_gpu_exclusive_mode
     fi
+  fi
 
-    configure_yarn_nodemanager
-    configure_gpu_script
-    configure_gpu_isolation
-  elif [[ "${ROLE}" == "Master" ]]; then
-    configure_yarn_nodemanager
-    configure_gpu_script
+  # if mig is enabled drivers would have already been installed
+  if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+    install_nvidia_gpu_driver
+    install_cuda
+    load_kernel_module
+
+    #Install GPU metrics collection in Stackdriver if needed
+    if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+      install_gpu_agent
+#      install_gpu_monitoring_agent
+      echo 'GPU metrics agent successfully deployed.'
+    else
+      echo 'GPU metrics agent has not been installed.'
+    fi
+    configure_gpu_exclusive_mode
   fi
+
+  install_nvidia_container_toolkit
+  configure_yarn_nodemanager_gpu
+  configure_gpu_script
+  configure_gpu_isolation
 }
 
 function gpu_exit_handler() {

From 70349a655fda55974339a765f371b7cf95289403 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 15:11:24 -0800
Subject: [PATCH 043/130] being more surgical about signing material usage

---
 templates/common/util_functions | 5 -----
 templates/gpu/util_functions    | 9 ++++-----
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 9c2e89372..c093b43f3 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -532,8 +532,6 @@ function check_secure_boot() {
                       mok_der=/var/lib/shim-signed/mok/MOK.der
                  else mok_key=/var/lib/dkms/mok.key
                       mok_der=/var/lib/dkms/mok.pub ; fi
-
-  configure_dkms_certs
 }
 
 function install_dependencies() {
@@ -613,9 +611,6 @@ function prepare_common_env() {
 }
 
 function common_exit_handler() {
-  # Purge private key material until next grant
-  clear_dkms_key
-
   set +ex
   echo "Exit handler invoked"
 
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 71b5c89d0..d5975e566 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -657,12 +657,14 @@ function build_driver_from_github() {
         2> kernel-open/build_error.log
       # Sign kernel modules
       if [[ -n "${PSN}" ]]; then
+        configure_dkms_certs
         for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do
           "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
           "${mok_key}" \
           "${mok_der}" \
           "${module}"
         done
+	clear_dkms_key
       fi
       make modules_install \
         >>  kernel-open/build.log \
@@ -702,12 +704,10 @@ function build_driver_from_packages() {
     add_contrib_component
     apt-get update -qq
     execute_with_retries apt-get install -y -qq --no-install-recommends dkms
-    #configure_dkms_certs
     execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
     sync
 
   elif is_rocky ; then
-    #configure_dkms_certs
     if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
       echo "nvidia-driver:${DRIVER}-dkms installed successfully"
     else
@@ -715,7 +715,6 @@ function build_driver_from_packages() {
     fi
     sync
   fi
-  #clear_dkms_key
 }
 
 function install_nvidia_userspace_runfile() {
@@ -767,7 +766,7 @@ function install_nvidia_userspace_runfile() {
         echo "cache hit"
       else
         install_build_dependencies
-
+        configure_dkms_certs
         local signing_options
         signing_options=""
         if [[ -n "${PSN}" ]]; then
@@ -778,7 +777,6 @@ function install_nvidia_userspace_runfile() {
           --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
           "
         fi
-
         runfile_args="--no-dkms ${signing_options}"
       fi
     }
@@ -797,6 +795,7 @@ function install_nvidia_userspace_runfile() {
       gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
       depmod -a
     else
+      clear_dkms_key
       tar czvf "${local_tarball}" \
         /var/log/nvidia-installer.log \
         $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')

From b5473c58f7fcedf7f6e74d822c706dbed4b00b7c Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 15:48:35 -0800
Subject: [PATCH 044/130] removed dependency on pciutils ; defined is_debuntu
 with other os comparison functions

---
 templates/common/util_functions | 5 ++---
 templates/gpu/util_functions    | 9 ++++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index c093b43f3..b35407074 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -28,10 +28,9 @@ function define_os_comparison_functions() {
       eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
     done
   done
+  eval "function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )"
 }
 
-function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
-
 function os_vercat()   ( set +x
   if   is_ubuntu ; then os_version | sed -e 's/[^0-9]//g'
   elif is_rocky  ; then os_version | sed -e 's/[^0-9].*$//g'
@@ -536,7 +535,7 @@ function check_secure_boot() {
 
 function install_dependencies() {
   test -f "${workdir}/complete/install-dependencies" && return 0
-  pkg_list="pciutils screen"
+  pkg_list="screen"
   if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
   elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
   touch "${workdir}/complete/install-dependencies"
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index d5975e566..f10c15f06 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1223,7 +1223,8 @@ function install_build_dependencies() {
 }
 
 function prepare_gpu_env(){
-  gpu_count="$(lspci | grep -q NVIDIA | wc -l)"
+  gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
+  echo "gpu_count=[${gpu_count}]"
   nvsmi_works="0"
   NVIDIA_SMI_PATH='/usr/bin'
   MIG_MAJOR_CAPS=0
@@ -1294,7 +1295,9 @@ function configure_mig_cgi() {
   if test -n "${META_MIG_CGI_VALUE}"; then
     nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
   else
-    if lspci | grep -q H100 ; then
+    # https://pci-ids.ucw.cz/v2.2/pci.ids
+    local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)"
+    if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then
       # run the following command to list placement profiles
       # nvidia-smi mig -lgipp
       #
@@ -1309,7 +1312,7 @@ function configure_mig_cgi() {
 
       # For H100 3D controllers, use profile 19, 7x1G instances
       nvidia-smi mig -cgi 19 -C
-    elif lspci | grep -q A100 ; then
+    if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then
       # Dataproc only supports A100s right now split in 2 if not specified
       # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
       nvidia-smi mig -cgi 9,9 -C

From be3dbf665a7df16b638ac452e0622f36b68dfae4 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 15:55:03 -0800
Subject: [PATCH 045/130] use elif for the A100 branch in configure_mig_cgi

---
 templates/gpu/util_functions | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index f10c15f06..e6d3f38fa 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1312,7 +1312,7 @@ function configure_mig_cgi() {
 
       # For H100 3D controllers, use profile 19, 7x1G instances
       nvidia-smi mig -cgi 19 -C
-    if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then
+    elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then
       # Dataproc only supports A100s right now split in 2 if not specified
       # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
       nvidia-smi mig -cgi 9,9 -C

From 3ca8c913c28935ba95ab980b67bc0f6b0920973f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 16:12:02 -0800
Subject: [PATCH 046/130] fall back on metadata value if modulus_md5sum
 variable undefined

---
 templates/gpu/util_functions | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index e6d3f38fa..90e02b4b8 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -753,10 +753,8 @@ function install_nvidia_userspace_runfile() {
     test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
       local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
       local_tarball="${workdir}/${build_tarball}"
-      local build_dir
-      if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
-        then build_dir="${modulus_md5sum}"
-        else build_dir="unsigned" ; fi
+      local def_dir="${modulus_md5sum:-unsigned}"
+      local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
 
       local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
@@ -980,7 +978,6 @@ function install_gpu_agent() {
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
   local venv="${install_dir}/venv"
   python3 -m venv "${venv}"
-
 (
   source "${venv}/bin/activate"
   python3 -m pip install --upgrade pip

From 83d5ccc91af28e0218d119224fe0822b49ccec12 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 16:21:11 -0800
Subject: [PATCH 047/130] switch to other build_dir variable assignment

---
 templates/gpu/util_functions | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 90e02b4b8..529ea30ea 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -635,10 +635,8 @@ function build_driver_from_github() {
   test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
     local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
     local local_tarball="${workdir}/${build_tarball}"
-    local build_dir
-    if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
-      then build_dir="${modulus_md5sum}"
-      else build_dir="unsigned" ; fi
+    local def_dir="${modulus_md5sum:-unsigned}"
+    local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
 
     local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 

From 93fdb304bdf2311821192dc47f6435db5addd3b5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 16:37:52 -0800
Subject: [PATCH 048/130] wrap gpu_amount range check in parentheses

---
 templates/gpu/util_functions | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 529ea30ea..d2c2fe32e 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1098,7 +1098,7 @@ EOF
   local task_cpus=2
   local gpu_amount
   gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
-  if version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ; then gpu_amount="0.5" ; fi
+  if ( version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ) ; then gpu_amount="0.5" ; fi
 
   cat >>"${spark_defaults_conf}" <<EOF
 ###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######

From 917f4b673876aa84933dea4b83cf9ca234edaba5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 17:01:52 -0800
Subject: [PATCH 049/130] allow failure when grepping PCI devices for 10DE

---
 templates/common/util_functions | 92 +++++++++++++++++----------------
 templates/gpu/util_functions    |  2 +
 2 files changed, 50 insertions(+), 44 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index b35407074..cd26dbf83 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -594,17 +594,20 @@ function prepare_common_env() {
   fi
 
   # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+
+ ( set +e
     time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
-  ) fi
+  )
 
-  install_dependencies
+    install_dependencies
 
-  # Monitor disk usage in a screen session
-  df / > "/run/disk-usage.log"
-  touch "/run/keep-running-df"
-  screen -d -m -LUS keep-running-df \
-    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+    # Monitor disk usage in a screen session
+    df / > "/run/disk-usage.log"
+    touch "/run/keep-running-df"
+    screen -d -m -LUS keep-running-df \
+      bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+ fi
 
   touch "${workdir}/complete/prepare.common"
 }
@@ -644,43 +647,45 @@ function common_exit_handler() {
     dnf clean all
   fi
 
-  # print disk usage statistics for large components
-  if is_ubuntu ; then
-    du -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /opt/conda/miniconda3 | sort -h
-  elif is_debian ; then
-    du -x -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
-      /var/lib/{docker,mysql,} \
-      /opt/nvidia/* \
-      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
-      /usr/bin \
-      /usr \
-      /var \
-      / 2>/dev/null | sort -h
-  else
-    du -hs \
-      /var/lib/docker \
-      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
-      /usr/lib64/google-cloud-sdk \
-      /opt/nvidia/* \
-      /opt/conda/miniconda3
-  fi
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    # print disk usage statistics for large components
+    if is_ubuntu ; then
+      du -hs \
+        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+        /usr/lib \
+        /opt/nvidia/* \
+        /opt/conda/miniconda3 | sort -h
+    elif is_debian ; then
+      du -x -hs \
+        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
+        /var/lib/{docker,mysql,} \
+        /opt/nvidia/* \
+        /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+        /usr/bin \
+        /usr \
+        /var \
+        / 2>/dev/null | sort -h
+    else
+      du -hs \
+        /var/lib/docker \
+        /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
+        /usr/lib64/google-cloud-sdk \
+        /opt/nvidia/* \
+        /opt/conda/miniconda3
+    fi
 
-  # Process disk usage logs from installation period
-  rm -f /run/keep-running-df
-  sync
-  sleep 5.01s
-  # compute maximum size of disk during installation
-  # Log file contains logs like the following (minus the preceeding #):
+    # Process disk usage logs from installation period
+    rm -f /run/keep-running-df
+    sync
+    sleep 5.01s
+    # compute maximum size of disk during installation
+    # Log file contains logs like the following (minus the preceding #):
 #Filesystem     1K-blocks    Used Available Use% Mounted on
 #/dev/vda2        7096908 2611344   4182932  39% /
-  df / | tee -a "/run/disk-usage.log"
+    df / | tee -a "/run/disk-usage.log"
 
-  perl -e '@siz=( sort { $a => $b }
+    perl -e \
+          '@siz=( sort { $b <=> $a }   # numeric, descending: $siz[0] is the largest sample
                    map { (split)[2] =~ /^(\d+)/ }
                   grep { m:^/: } <STDIN> );
 $max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
@@ -689,13 +694,12 @@ print( "    samples-taken: ", scalar @siz, $/,
        "minimum-disk-used: $min", $/,
        "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
 
-  echo "exit_handler has completed"
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    # zero free disk space
     dd if=/dev/zero of=/zero
     sync
     sleep 3s
     rm -f /zero
   fi
+  echo "exit_handler has completed"
 }
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index d2c2fe32e..a7f611902 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1218,7 +1218,9 @@ function install_build_dependencies() {
 }
 
 function prepare_gpu_env(){
+  set +e
   gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
+  set -e
   echo "gpu_count=[${gpu_count}]"
   nvsmi_works="0"
   NVIDIA_SMI_PATH='/usr/bin'

From 668db727d8078117c2b1fff93e9e52201c891754 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 17:36:38 -0800
Subject: [PATCH 050/130] removed listing of nodes_include ; does not work in
 custom-images context

---
 templates/gpu/install_gpu_driver.sh.in | 2 --
 1 file changed, 2 deletions(-)

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index ae4693f16..be3baca89 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -34,8 +34,6 @@ function main() {
   fi
 
   # Restart YARN services if they are running already
-  nodes_include_gcs="gs:/$(get_metadata_attribute dataproc-bucket)/google-cloud-dataproc-metainfo/$(get_metadata_attribute dataproc-cluster-uuid)/nodes_include"
-  gsutil ls "${nodes_include_gcs}"
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
       systemctl  stop "hadoop-yarn-${svc}.service"

From 2193c2883671594e712597a6981952b024724c2c Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 19:10:10 -0800
Subject: [PATCH 051/130] min spark version supported by newer rapids is
 insufficient ; xgboost version is fine where it is at

---
 templates/gpu/util_functions | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index a7f611902..3c5bb7f06 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1023,8 +1023,7 @@ function fetch_mig_scripts() {
 
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
-  readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
-  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
+  DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
@@ -1033,9 +1032,12 @@ function install_spark_rapids() {
   elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
     DEFAULT_XGBOOST_VERSION="1.7.6"
   elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then
-    DEFAULT_XGBOOST_VERSION="1.6.2"
+    DEFAULT_XGBOOST_VERSION="1.7.6"
+    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
   fi
 
+  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
+
   readonly DEFAULT_XGBOOST_VERSION
   readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 

From 992d83acd3a1cac2a2607f31d46573b1691741ab Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 20:48:57 -0800
Subject: [PATCH 052/130] skipping fewer tests

---
 spark-rapids/test_spark_rapids.py | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 2d67a0df2..9b9481716 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -62,13 +62,6 @@ def verify_spark_job_sql(self):
                             ("STANDARD", ["w-0"], GPU_T4))
   def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
-    if ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
-         ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA 12.4 (default) not supported on older debian/ubuntu releases")
-
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -93,13 +86,6 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
                             ("STANDARD", ["w-0"], GPU_T4))
   def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
-    if ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
-         ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA 12.4 (default) not supported on older debian/ubuntu releases")
-
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -129,19 +115,16 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
                                      accelerator, cuda_version, driver_version):
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0.1") \
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.1.1") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+      self.skipTest("CUDA > 12.1.1 not supported on older debian/ubuntu releases")
 
     if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
     and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
 
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
     metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
                 ",cuda-version={0},driver-version={1}".format(cuda_version, driver_version))
 

From 7170872b973851103545811ed04a61964781357a Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 21:15:49 -0800
Subject: [PATCH 053/130] simplified rapids / xgboost default version logic

---
 templates/gpu/util_functions | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 3c5bb7f06..1f6672051 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1023,22 +1023,17 @@ function fetch_mig_scripts() {
 
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
-  DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_XGBOOST_VERSION="1.7.6"
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
-  if   version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then
-    DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.3
-  elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
-    DEFAULT_XGBOOST_VERSION="1.7.6"
-  elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then
-    DEFAULT_XGBOOST_VERSION="1.7.6"
+
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
     DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
   fi
 
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
-
-  readonly DEFAULT_XGBOOST_VERSION
   readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
   local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'

From b33cb27b68c09d0591682779a3e0f92beadcf03b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 27 Dec 2024 23:59:40 -0800
Subject: [PATCH 054/130] ubuntu sometimes takes a while to bring gcloud online

---
 templates/common/util_functions | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index cd26dbf83..9f7075f0b 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -589,6 +589,9 @@ function prepare_common_env() {
     apt-get -o DPkg::Lock::Timeout=60 -y autoremove
     if ge_debian12 ; then
     apt-mark unhold systemd libsystemd0 ; fi
+    if is_ubuntu ; then
+      while ! command -v gcloud ; do sleep 5s ; done
+    fi
   else
     dnf clean all
   fi

From c56440a80ab3dc2533021d6bea8473d57b15b482 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 28 Dec 2024 00:00:14 -0800
Subject: [PATCH 055/130] only using 24.08.1 on 2.2 images ; fix a typo in a
 comment

---
 templates/gpu/util_functions | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 1f6672051..26c4d02f9 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -246,7 +246,7 @@ function set_cuda_runfile_url() {
           ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
           ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
           ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
-          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not
+          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
           ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
   )
 
@@ -1023,14 +1023,14 @@ function fetch_mig_scripts() {
 
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
-  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
   local DEFAULT_XGBOOST_VERSION="1.7.6"
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
 
-  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
-    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.2" ]] ; then
+    DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
   fi
 
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})

From f10df49a93603a746274a8eb88fb20c682cd7133 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 29 Dec 2024 15:21:11 -0800
Subject: [PATCH 056/130] refactored ; these files should be quite similar now

---
 templates/gpu/install_gpu_driver.sh.in    |   7 -
 templates/spark-rapids/mig.sh.in          | 212 ++--------------------
 templates/spark-rapids/spark-rapids.sh.in |   8 +-
 3 files changed, 12 insertions(+), 215 deletions(-)

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index be3baca89..ffdda45e4 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -33,13 +33,6 @@ function main() {
     echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
   fi
 
-  # Restart YARN services if they are running already
-  for svc in resourcemanager nodemanager; do
-    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
-      systemctl  stop "hadoop-yarn-${svc}.service"
-      systemctl start "hadoop-yarn-${svc}.service"
-    fi
-  done
   echo "main complete"
   return 0
 }
diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in
index 0779a1c28..27da6ffd0 100644
--- a/templates/spark-rapids/mig.sh.in
+++ b/templates/spark-rapids/mig.sh.in
@@ -19,217 +19,27 @@ set -euxo pipefail
 
 [% INSERT gpu/util_functions %]
 
-[% INSERT 'secure-boot/util_functions' %]
-
-function exit_handler() {
-  # Purge private key material until next grant
-  clear_dkms_key
-
-  set +ex
-  echo "Exit handler invoked"
-
-  # Clear pip cache
-  pip cache purge || echo "unable to purge pip cache"
-
-  # If system memory was sufficient to mount memory-backed filesystems
-  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    # remove the tmpfs pip cache-dir
-    pip config unset global.cache-dir || echo "unable to unset global pip cache"
-
-    # Clean up shared memory mounts
-    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
-      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
-        umount -f ${shmdir}
-      fi
-    done
-
-    # restart services stopped during preparation stage
-    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
-  fi
-
-  if is_debuntu ; then
-    # Clean up OS package cache
-    apt-get -y -qq clean
-    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
-    # re-hold systemd package
-    if ge_debian12 ; then
-    apt-mark hold systemd libsystemd0 ; fi
-    hold_nvidia_packages
-  else
-    dnf clean all
-  fi
-
-  # print disk usage statistics for large components
-  if is_ubuntu ; then
-    du -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3 | sort -h
-  elif is_debian ; then
-    du -x -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /var/lib/{docker,mysql,} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
-      /usr/bin \
-      /usr \
-      /var \
-      / 2>/dev/null | sort -h
-  else
-    du -hs \
-      /var/lib/docker \
-      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
-      /usr/lib64/google-cloud-sdk \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3
-  fi
-
-  # Process disk usage logs from installation period
-  rm -f /run/keep-running-df
-  sync
-  sleep 5.01s
-  # compute maximum size of disk during installation
-  # Log file contains logs like the following (minus the preceeding #):
-#Filesystem     1K-blocks    Used Available Use% Mounted on
-#/dev/vda2        7096908 2611344   4182932  39% /
-  df / | tee -a "/run/disk-usage.log"
+function main() {
+  setup_gpu_yarn
 
-  perl -e '@siz=( sort { $a => $b }
-                   map { (split)[2] =~ /^(\d+)/ }
-                  grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
-print( "    samples-taken: ", scalar @siz, $/,
-       "maximum-disk-used: $max", $/,
-       "minimum-disk-used: $min", $/,
-       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+  echo "yarn setup complete"
 
-  echo "exit_handler has completed"
+  enable_and_configure_mig
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    dd if=/dev/zero of=/zero
-    sync
-    sleep 3s
-    rm -f /zero
-  fi
+  echo "main complete"
+  return 0
+}
 
+function exit_handler() {
+  gpu_exit_handler
+  common_exit_handler
   return 0
 }
 
 function prepare_to_install(){
-  # Verify OS compatability and Secure boot state
-  check_os
-  check_secure_boot
-
+  prepare_common_env
   prepare_gpu_env
-
-  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-  readonly OS_NAME
-
-  # node role
-  ROLE="$(get_metadata_attribute dataproc-role)"
-  readonly ROLE
-
-  workdir=/opt/install-dpgce
-  tmpdir=/tmp/
-  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
-  readonly temp_bucket
-  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
-  uname_r=$(uname -r)
-  readonly uname_r
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  export DEBIAN_FRONTEND=noninteractive
-
-  mkdir -p "${workdir}"
   trap exit_handler EXIT
-  set_proxy
-  mount_ramdisk
-
-  readonly install_log="${tmpdir}/install.log"
-
-  if test -f "${workdir}/prepare-complete" ; then return ; fi
-
-  repair_old_backports
-
-  if is_debuntu ; then
-    clean_up_sources_lists
-    apt-get update -qq
-    apt-get -y clean
-    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
-    if ge_debian12 ; then
-    apt-mark unhold systemd libsystemd0 ; fi
-  else
-    dnf clean all
-  fi
-
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
-    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
-  ) fi
-
-  install_dependencies
-
-  # Monitor disk usage in a screen session
-  df / > "/run/disk-usage.log"
-  touch "/run/keep-running-df"
-  screen -d -m -LUS keep-running-df \
-    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
-
-  touch "${workdir}/prepare-complete"
-}
-
-function enable_and_configure_mig() {
-  # default MIG to on when this script is used
-  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
-
-  if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
-
-  enable_mig
-
-  mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)"
-
-  NUM_GPUS_WITH_DIFF_MIG_MODES=
-  if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled all on GPUs.  Failing"       ; exit 1 ; fi
-  if ! (echo "${mig_mode_current}" | grep Enabled)                ; then echo "MIG is configured on but NOT enabled.  Failing" ; exit 1 ; fi
-
-  echo "MIG is fully enabled"
-  configure_mig_cgi
-}
-
-function main() {
-  # default MIG to on when this script is used
-  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
-
-  if ! (lspci | grep -q NVIDIA) ; then return ; fi
-  if [[ $META_MIG_VALUE -ne 0 ]]; then
-    # if the first invocation, the NVIDIA drivers and tools are not installed
-    if [[ -f "/usr/bin/nvidia-smi" ]]; then
-      # check to see if we already enabled mig mode and rebooted so we don't end
-      # up in infinite reboot loop
-      mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)"
-      NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)"
-      if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
-        if (echo "${mig_mode_current}" | grep Enabled); then
-          echo "MIG is enabled on all GPUs, configuring instances"
-          configure_mig_cgi
-          exit 0
-        else
-          echo "GPUs present but MIG is not enabled"
-        fi
-      else
-        echo "More than 1 GPU with MIG configured differently between them"
-      fi
-    fi
-  fi
-
-  install_nvidia_gpu_driver
-  enable_and_configure_mig
 }
 
 prepare_to_install
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index c5a204703..ac8ec5c3f 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -19,6 +19,7 @@
 # For details see
 # github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot
 #
+[% PROCESS common/template_disclaimer %]
 
 set -euxo pipefail
 
@@ -42,13 +43,6 @@ function main() {
     echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
   fi
 
-  # Restart YARN services if they are running already
-  for svc in resourcemanager nodemanager; do
-    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
-      systemctl  stop "hadoop-yarn-${svc}.service"
-      systemctl start "hadoop-yarn-${svc}.service"
-    fi
-  done
   echo "main complete"
   return 0
 }

From f3a103ec996f7079a21769ca2d49cb4931285468 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 12:01:51 -0800
Subject: [PATCH 057/130] returning spark-rapids/* to master ; this version of
 these templates was used to generate a spark-rapids/spark-rapids.sh which
 passes all master tests

---
 gpu/install_gpu_driver.sh       | 2418 ++++++++++++++++++++-----------
 spark-rapids/mig.sh             | 2201 ++++------------------------
 spark-rapids/spark-rapids.sh    |   10 +-
 templates/common/util_functions |    8 +
 templates/gpu/util_functions    |   56 +-
 5 files changed, 1917 insertions(+), 2776 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 25efb2a49..8d3d5aa84 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -11,6 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+#
+# This initialization action is generated from
+# initialization-actions/templates/gpu/install_gpu_driver.sh.in
+#
+# Modifications made directly to the generated file will be lost when
+# the template is re-evaluated
+
 #
 # This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
 
@@ -25,27 +33,29 @@ function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
 function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
 
-readonly -A supported_os=(
-  ['debian']="10 11 12"
-  ['rocky']="8 9"
-  ['ubuntu']="18.04 20.04 22.04"
-)
-
-# dynamically define OS version test utility functions
-if [[ "$(os_id)" == "rocky" ]];
-then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
-else _os_version="$(os_version)"; fi
-for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
-  eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
-
-  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
-    eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
-    eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
-    eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+function define_os_comparison_functions() {
+
+  readonly -A supported_os=(
+    ['debian']="10 11 12"
+    ['rocky']="8 9"
+    ['ubuntu']="18.04 20.04 22.04"
+  )
+
+  # dynamically define OS version test utility functions
+  if [[ "$(os_id)" == "rocky" ]];
+  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
+  else _os_version="$(os_version)"; fi
+  for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
+    eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
+
+    for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
+      eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
+      eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
+      eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+    done
   done
-done
-
-function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
+  eval "function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )"
+}
 
 function os_vercat()   ( set +x
   if   is_ubuntu ; then os_version | sed -e 's/[^0-9]//g'
@@ -53,7 +63,7 @@ function os_vercat()   ( set +x
                    else os_version ; fi ; )
 
 function repair_old_backports {
-  if ge_debian12 || ! is_debuntu ; then return ; fi
+  if ! is_debuntu ; then return ; fi
   # This script uses 'apt-get update' and is therefore potentially dependent on
   # backports repositories which have been archived.  In order to mitigate this
   # problem, we will use archive.debian.org for the oldoldstable repo
@@ -94,6 +104,7 @@ function print_metadata_value_if_exists() {
   return ${return_code}
 }
 
+# replicates /usr/share/google/get_metadata_value
 function get_metadata_value() (
   set +x
   local readonly varname=$1
@@ -117,67 +128,719 @@ function get_metadata_attribute() (
   get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
 )
 
-OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
-distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
-readonly OS_NAME
-
-# node role
-ROLE="$(get_metadata_attribute dataproc-role)"
-readonly ROLE
-
-# CUDA version and Driver version
-# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
-# https://developer.nvidia.com/cuda-downloads
-# Rocky8: 12.0: 525.147.05
-readonly -A DRIVER_FOR_CUDA=(
-          ["11.8"]="560.35.03"
-          ["12.0"]="525.60.13"  ["12.4"]="560.35.03"  ["12.6"]="560.35.03"
-)
-# https://developer.nvidia.com/cudnn-downloads
-if is_debuntu ; then
-readonly -A CUDNN_FOR_CUDA=(
-          ["11.8"]="9.5.1.17"
-          ["12.0"]="9.5.1.17"   ["12.4"]="9.5.1.17"   ["12.6"]="9.5.1.17"
-)
-elif is_rocky ; then
-# rocky:
-#   12.0: 8.8.1.3
-#   12.1: 8.9.3.28
-#   12.2: 8.9.7.29
-#   12.3: 9.0.0.312
-#   12.4: 9.1.1.17
-#   12.5: 9.2.1.18
-#   12.6: 9.5.1.17
-readonly -A CUDNN_FOR_CUDA=(
-          ["11.8"]="9.5.1.17"
-          ["12.0"]="8.8.1.3"   ["12.4"]="9.1.1.17"   ["12.6"]="9.5.1.17"
-)
-fi
-# https://developer.nvidia.com/nccl/nccl-download
-# 12.2: 2.19.3, 12.5: 2.21.5
-readonly -A NCCL_FOR_CUDA=(
-          ["11.8"]="2.15.5"
-          ["12.0"]="2.16.5"  ["12.4"]="2.23.4"     ["12.6"]="2.23.4"
-)
-readonly -A CUDA_SUBVER=(
-          ["11.8"]="11.8.0"
-          ["12.0"]="12.0.0"  ["12.4"]="12.4.1"     ["12.6"]="12.6.2"
+function execute_with_retries() (  # run "$*" through eval, up to 3 attempts; returns 0 on first success, 1 otherwise
+  set +x
+  local -r cmd="$*"  # all arguments are joined into a single command string
+  local -r apt_install_re='^apt-get[[:space:]](.*[[:space:]])?install([[:space:]]|$)'  # must stay unquoted on the =~ RHS to act as a regex
+  if [[ "$cmd" =~ ${apt_install_re} ]] ; then
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+  fi
+  for ((i = 0; i < 3; i++)); do  # the function body is a subshell, so i and retval do not leak to the caller
+    set -x
+    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }  # log is printed only on failure
+    set +x
+    if [[ $retval == 0 ]] ; then return 0 ; fi
+    sleep 5
+  done
+  return 1
 )
 
-RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-readonly DEFAULT_CUDA_VERSION='12.4'
-CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
-if ( ( ge_debian12 || ge_rocky9 ) && version_le "${CUDA_VERSION%%.*}" "11" ) ; then
-  # CUDA 11 no longer supported on debian12 - 2024-11-22, rocky9 - 2024-11-27
-  CUDA_VERSION="${DEFAULT_CUDA_VERSION}"
-fi
+function cache_fetched_package() {
+  local src_url="$1"
+  local gcs_fn="$2"
+  local local_fn="$3"
 
-if ( version_ge "${CUDA_VERSION}" "12" && (le_debian11 || le_ubuntu18) ) ; then
-  # Only CUDA 12.0 supported on older debuntu
-  CUDA_VERSION="12.0"
-fi
-readonly CUDA_VERSION
-readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}"
+  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
+    time gcloud storage cp "${gcs_fn}" "${local_fn}"
+  else
+    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
+           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
+  fi
+}
+
+function add_contrib_component() {
+  if ! is_debuntu ; then return ; fi
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-kernel-open-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib"
+
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
+  fi
+}
+
+function set_hadoop_property() {
+  local -r config_file=$1
+  local -r property=$2
+  local -r value=$3
+  "${bdcfg}" set_property \
+    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
+    --name "${property}" --value "${value}" \
+    --clobber
+}
+
+function configure_yarn_resources() {
+  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
+  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
+    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
+  fi
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
+
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+}
+
+# This configuration should be applied only if GPU is attached to the node
+function configure_yarn_nodemanager() {
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.container-executor.class' \
+    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+
+  # Fix local dirs access permissions
+  local yarn_local_dirs=()
+
+  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
+    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
+    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
+
+  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
+    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
+  fi
+}
+
+function clean_up_sources_lists() {
+  #
+  # bigtop (primary)
+  #
+  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
+
+  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
+    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
+
+    local regional_bigtop_repo_uri
+    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
+      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
+      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
+      cut -d ' ' -f 2 |
+      head -1)
+
+    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+    else
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+    fi
+
+    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
+    rm -f "${bigtop_kr_path}"
+    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
+
+    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  fi
+
+  #
+  # adoptium
+  #
+  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
+  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
+  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
+  rm -f "${adoptium_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+   | gpg --dearmor -o "${adoptium_kr_path}"
+  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
+   > /etc/apt/sources.list.d/adoptium.list
+
+
+  #
+  # docker
+  #
+  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
+  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
+  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
+
+  rm -f "${docker_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+    | gpg --dearmor -o "${docker_kr_path}"
+  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
+    > ${docker_repo_file}
+
+  #
+  # google cloud + logging/monitoring
+  #
+  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
+    rm -f /usr/share/keyrings/cloud.google.gpg
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
+      list_file="/etc/apt/sources.list.d/${list}.list"
+      if [[ -f "${list_file}" ]]; then
+        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
+      fi
+    done
+  fi
+
+  #
+  # cran-r
+  #
+  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
+    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
+    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
+    rm -f /usr/share/keyrings/cran-r.gpg
+    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
+    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
+  fi
+
+  #
+  # mysql
+  #
+  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
+    rm -f /usr/share/keyrings/mysql.gpg
+    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
+    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
+  fi
+
+  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
+
+}
+
+function set_proxy(){
+  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
+
+  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
+
+  export METADATA_HTTP_PROXY
+  export http_proxy="${METADATA_HTTP_PROXY}"
+  export https_proxy="${METADATA_HTTP_PROXY}"
+  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
+  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
+  local no_proxy_svc
+  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
+                      bigquery composer      pubsub bigquerydatatransfer dataflow \
+                      storage  datafusion    ; do
+    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
+  done
+
+  export NO_PROXY="${no_proxy}"
+}
+
+function mount_ramdisk(){
+  local free_mem
+  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
+  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+
+  # Write to a ramdisk instead of churning the persistent disk
+
+  tmpdir="/mnt/shm"
+  mkdir -p "${tmpdir}"
+  mount -t tmpfs tmpfs "${tmpdir}"
+
+  # Download conda packages to tmpfs
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+
+  # Clear pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
+  pip cache purge || echo "unable to purge pip cache"
+
+  # Download pip packages to tmpfs
+  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+
+  # Download OS packages to tmpfs
+  if is_debuntu ; then
+    mount -t tmpfs tmpfs /var/cache/apt/archives
+  else
+    mount -t tmpfs tmpfs /var/cache/dnf
+  fi
+}
+
+function check_os() {
+  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
+      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+      exit 1
+  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
+      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
+      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
+  fi
+
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+  readonly SPARK_VERSION
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    exit 1
+  fi
+
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
+}
+
+# apt_add_repo NAME KEY_URL REPO_DATA [INCLUDE_SRC=yes] [KEYRING_PATH] [LIST_PATH]
+# Generate repo file under /etc/apt/sources.list.d/ and run apt-get update.
+# KEY_URL ($2) is not read here; the keyring path is only referenced, never written.
+function apt_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local -r include_src="${4:-yes}"
+  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
+
+  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
+  if [[ "${include_src}" == "yes" ]] ; then
+    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
+  fi
+
+  apt-get update -qq
+}
+
+# dnf_add_repo NAME KEY_URL REPO_URL [INCLUDE_SRC] [KEYRING_PATH] [REPO_PATH]
+# Generate repo file under /etc/yum.repos.d/ by downloading REPO_URL as-is.
+# $2 and $4 are not read; kr_path is used only by the commented-out gpgkey rewrite.
+function dnf_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
+  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
+
+  curl -s -L "${repo_url}" \
+    | dd of="${repo_path}" status=progress
+#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
+}
+
+#
+# Keyrings default to
+# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
+# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
+#
+function os_add_repo() {
+  local -r repo_name="$1"
+  local -r signing_key_url="$2"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local kr_path
+  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
+
+  mkdir -p "$(dirname "${kr_path}")"
+
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
+    | gpg --import --no-default-keyring --keyring "${kr_path}"
+
+  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
+                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
+}
+
+function configure_dkms_certs() {  # place MOK signing key/cert for DKMS; reads PSN, CA_TMPDIR, mok_key, mok_der; sets modulus_md5sum
+  if test -v PSN && [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping";
+      return 0
+  fi
+
+  mkdir -p "${CA_TMPDIR}"
+
+  # If the private key exists, verify it
+  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
+    echo "Private key material exists"
+
+    local expected_modulus_md5sum
+    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
+      modulus_md5sum="${expected_modulus_md5sum}"
+
+      # Verify that the private key's modulus md5sum matches expected md5sum (mismatch is reported, not fatal)
+      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key"
+      fi
+
+      # Verify that the x509 certificate's modulus md5sum matches expected md5sum (mismatch is reported, not fatal)
+      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in "${mok_der}" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert"
+      fi
+    else
+      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
+    fi
+    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+
+    return
+  fi
+
+  # Retrieve cloud secrets keys
+  local sig_priv_secret_name
+  sig_priv_secret_name="${PSN}"
+  local sig_pub_secret_name
+  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
+  local sig_secret_project
+  sig_secret_project="$(get_metadata_attribute secret_project)"
+  local sig_secret_version
+  sig_secret_version="$(get_metadata_attribute secret_version)"
+
+  # If metadata values are not set, do not write mok keys
+  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
+
+  # Write private material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_priv_secret_name}" \
+      | dd status=none of="${CA_TMPDIR}/db.rsa"
+
+  # Write public material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_pub_secret_name}" \
+      | base64 --decode \
+      | dd status=none of="${CA_TMPDIR}/db.der"
+
+  local mok_directory="$(dirname "${mok_key}")"
+  mkdir -p "${mok_directory}"
+
+  # symlink private key and copy public cert from volatile storage to DKMS directory
+  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
+}
+
+function clear_dkms_key {
+  if [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping" >&2
+      return 0
+  fi
+  rm -rf "${CA_TMPDIR}" "${mok_key}"
+}
+
+function check_secure_boot() {
+  local SECURE_BOOT="disabled"
+  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
+
+  PSN="$(get_metadata_attribute private_secret_name)"
+  readonly PSN
+
+  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
+    exit 1
+  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
+    echo "Secure boot is enabled, but no signing material provided."
+    echo "Please either disable secure boot or provide signing material as per"
+    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+    return 1
+  fi
+
+  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
+  readonly CA_TMPDIR
+
+  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
+                      mok_der=/var/lib/shim-signed/mok/MOK.der
+                 else mok_key=/var/lib/dkms/mok.key
+                      mok_der=/var/lib/dkms/mok.pub ; fi
+}
+
+function install_dependencies() {
+  test -f "${workdir}/complete/install-dependencies" && return 0
+  pkg_list="screen"
+  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
+  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
+  touch "${workdir}/complete/install-dependencies"
+}
+
+function prepare_common_env() {
+  define_os_comparison_functions
+
+  # Verify OS compatibility and Secure boot state
+  check_os
+  check_secure_boot
+
+  readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+
+  # Dataproc configurations
+  readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+  readonly HIVE_CONF_DIR='/etc/hive/conf'
+  readonly SPARK_CONF_DIR='/etc/spark/conf'
+
+  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
+  readonly OS_NAME
+
+  # node role
+  ROLE="$(get_metadata_attribute dataproc-role)"
+  readonly ROLE
+
+  workdir=/opt/install-dpgce
+  tmpdir=/tmp/
+  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly temp_bucket
+  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  uname_r=$(uname -r)
+  readonly uname_r
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  export DEBIAN_FRONTEND=noninteractive
+
+  mkdir -p "${workdir}/complete"
+  set_proxy
+  mount_ramdisk
+
+  readonly install_log="${tmpdir}/install.log"
+
+  if test -f "${workdir}/complete/prepare.common" ; then return ; fi
+
+  repair_old_backports
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+    if ge_debian12 ; then
+    apt-mark unhold systemd libsystemd0 ; fi
+    if is_ubuntu ; then
+      while ! command -v gcloud ; do sleep 5s ; done
+    fi
+  else
+    dnf clean all
+  fi
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+
+ ( set +e
+    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
+  )
+
+    install_dependencies
+
+    # Monitor disk usage in a screen session
+    df / > "/run/disk-usage.log"
+    touch "/run/keep-running-df"
+    screen -d -m -LUS keep-running-df \
+      bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+ fi
+
+  touch "${workdir}/complete/prepare.common"
+}
+
+function common_exit_handler() {  # best-effort cleanup at exit: caches, YARN restart, tmpfs unmounts, disk-usage report
+  set +ex
+  echo "Exit handler invoked"
+
+  # Clear pip cache
+  pip cache purge || echo "unable to purge pip cache"
+
+  # Restart YARN services if they are running already
+  for svc in resourcemanager nodemanager; do
+    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
+      systemctl  stop "hadoop-yarn-${svc}.service"
+      systemctl start "hadoop-yarn-${svc}.service"
+    fi
+  done
+
+  # If system memory was sufficient to mount memory-backed filesystems
+  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+        umount -f "${shmdir}"
+      fi
+    done
+
+    # restart services stopped during preparation stage
+    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+  fi
+
+  if is_debuntu ; then
+    # Clean up OS package cache
+    apt-get -y -qq clean
+    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
+    # re-hold systemd package
+    if ge_debian12 ; then
+    apt-mark hold systemd libsystemd0 ; fi
+    hold_nvidia_packages
+  else
+    dnf clean all
+  fi
+
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    # print disk usage statistics for large components
+    if is_ubuntu ; then
+      du -hs \
+        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+        /usr/lib \
+        /opt/nvidia/* \
+        /opt/conda/miniconda3 | sort -h
+    elif is_debian ; then
+      du -x -hs \
+        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
+        /var/lib/{docker,mysql,} \
+        /opt/nvidia/* \
+        /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+        /usr/bin \
+        /usr \
+        /var \
+        / 2>/dev/null | sort -h
+    else
+      du -hs \
+        /var/lib/docker \
+        /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
+        /usr/lib64/google-cloud-sdk \
+        /opt/nvidia/* \
+        /opt/conda/miniconda3
+    fi
+
+    # Process disk usage logs from installation period
+    rm -f /run/keep-running-df
+    sync
+    sleep 5.01s
+    # compute maximum size of disk during installation
+    # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+    df / | tee -a "/run/disk-usage.log"
+    # Samples are sorted numerically in descending order so that $siz[0] is the maximum and $siz[-1] the minimum
+    perl -e \
+          '@siz=( sort { $b <=> $a }
+                   map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } <STDIN> );
+$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+print( "    samples-taken: ", scalar @siz, $/,
+       "maximum-disk-used: $max", $/,
+       "minimum-disk-used: $min", $/,
+       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+
+
+    # zero free disk space
+    dd if=/dev/zero of=/zero
+    sync
+    sleep 3s
+    rm -f /zero
+  fi
+  echo "exit_handler has completed"
+}
+
+
+function set_support_matrix() {
+  # CUDA version and Driver version
+  # https://docs.nvidia.com/deploy/cuda-compatibility/
+  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+  # https://developer.nvidia.com/cuda-downloads
+
+  # Minimum supported version for open kernel driver is 515.43.04
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
+  # Rocky8: 12.0: 525.147.05
+  local latest
+  latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
+  readonly -A DRIVER_FOR_CUDA=(
+          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
+          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+  )
+  readonly -A DRIVER_SUBVER=(
+          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
+          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
+  )
+  # https://developer.nvidia.com/cudnn-downloads
+  if is_debuntu ; then
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
+          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+  )
+  elif is_rocky ; then
+  # rocky:
+  #   12.0: 8.8.1.3
+  #   12.1: 8.9.3.28
+  #   12.2: 8.9.7.29
+  #   12.3: 9.0.0.312
+  #   12.4: 9.1.1.17
+  #   12.5: 9.2.1.18
+  #   12.6: 9.5.1.17
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
+          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
+  )
+  fi
+  # https://developer.nvidia.com/nccl/nccl-download
+  # 12.2: 2.19.3, 12.5: 2.21.5
+  readonly -A NCCL_FOR_CUDA=(
+          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
+          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
+  )
+  readonly -A CUDA_SUBVER=(
+          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
+          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
+  )
+}
+
+set_support_matrix
+
+function set_cuda_version() {  # sets and freezes DEFAULT_CUDA_VERSION, CUDA_VERSION and CUDA_FULL_VERSION
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+  if [[ -n "${cuda_url}" ]] ; then
+    # if cuda-url was passed, take x.y.z from a .../cuda_<x.y.z>_<driver>_linux.run file name: default becomes x.y, full version x.y.z
+    local CUDA_URL_VERSION
+    CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
+      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
+      CUDA_FULL_VERSION="${CUDA_URL_VERSION}"
+    fi
+  fi
+
+  if ( ! test -v DEFAULT_CUDA_VERSION ) ; then
+    DEFAULT_CUDA_VERSION='12.4.1'
+  fi
+  # Per-image defaults. NOTE(review): this case assigns unconditionally, so it overrides any default derived from cuda-url above; confirm that is intended.
+  # Ubuntu 18 exception (applied after the case): Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18
+  case "${DATAPROC_IMAGE_VERSION}" in
+    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;;
+    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    *   )
+      echo "unrecognized Dataproc image version"
+      exit 1
+      ;;
+  esac
+
+  if le_ubuntu18 ; then
+    DEFAULT_CUDA_VERSION="12.1.1"
+    CUDA_VERSION_MAJOR="${DEFAULT_CUDA_VERSION%.*}"  #12.1
+  fi
+  readonly DEFAULT_CUDA_VERSION
+
+  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")  # the cuda-version metadata attribute wins over the default
+  if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then  # an x.y.z value is split into full (x.y.z) and short (x.y) forms
+    CUDA_FULL_VERSION="${CUDA_VERSION}"
+    CUDA_VERSION="${CUDA_VERSION%.*}"
+  fi
+  readonly CUDA_VERSION
+  if ( ! test -v CUDA_FULL_VERSION ) ; then  # no full version known yet: look it up in the CUDA_SUBVER table
+    CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
+  fi
+  readonly CUDA_FULL_VERSION
+
+}
 
 function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
 function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
@@ -187,110 +850,179 @@ function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
 function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
 function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
 
-DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}"
-if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then
-                                         DEFAULT_DRIVER="560.28.03"  ; fi
-if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03"  ; fi
-if ( is_rocky    && le_cuda11 )   ; then DEFAULT_DRIVER="525.147.05" ; fi
-if ( is_ubuntu20 && le_cuda11 )   ; then DEFAULT_DRIVER="535.183.06" ; fi
-if ( is_rocky9   && ge_cuda12 )   ; then DEFAULT_DRIVER="565.57.01"  ; fi
-DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
+function set_driver_version() {
+  local gpu_driver_url
+  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')
+
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+
+  local DEFAULT_DRIVER
+  # Take default from gpu-driver-url metadata value
+  if [[ -n "${gpu_driver_url}" ]] ; then
+    DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')"
+    if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
+  # Take default from cuda-url metadata value as a backup
+  elif [[ -n "${cuda_url}" ]] ; then
+    local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
+      major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
+      driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
+      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the version indicated by the cuda url as the default if it exists
+	DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
+      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the maximum sub-version available for the major version indicated in cuda url as the default
+	DEFAULT_DRIVER="${driver_max_maj_version}"
+      fi
+    fi
+  fi
 
-readonly DRIVER_VERSION
-readonly DRIVER=${DRIVER_VERSION%%.*}
+  if ( ! test -v DEFAULT_DRIVER ) ; then
+    # If a default driver version has not been extracted, use the default for this version of CUDA
+    DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
+  fi
 
-readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
-readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
 
-# Parameters for NVIDIA-provided cuDNN library
-readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
-CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
-function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
-# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-  CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
-elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
-  # cuDNN v8 is not distribution for ubuntu20+, debian12
-  CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
-elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
-  # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
-  CUDNN_VERSION="8.8.0.121"
-fi
-readonly CUDNN_VERSION
+  readonly DRIVER_VERSION
+  readonly DRIVER="${DRIVER_VERSION%%.*}"
 
-readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
-readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+  export DRIVER_VERSION DRIVER
 
-# Parameters for NVIDIA-provided Debian GPU driver
-readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
+    exit 1
+  fi
+}
 
-readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+function set_cudnn_version() {
+  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+  # Sets the readonly global CUDNN_VERSION; reads CUDA_VERSION, so that must already be set
+  # Default comes from CUDNN_FOR_CUDA and may be overridden by the 'cudnn-version' metadata value
+  readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}; raise lower requests to it
+  if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
+    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
+    # cuDNN v8 is not distributed for ubuntu20+, debian12 ; use the default v9 instead
+    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+  elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then
+    # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
+    CUDNN_VERSION="8.8.0.121"
+  fi
+  readonly CUDNN_VERSION
+}
 
-# Short name for urls
-if is_ubuntu22  ; then
-    # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
-    # https://developer.download.nvidia.com/compute/machine-learning/repos/
-    # use packages from previous release until such time as nvidia
-    # release ubuntu2204 builds
 
-    nccl_shortname="ubuntu2004"
-    shortname="$(os_id)$(os_vercat)"
-elif ge_rocky9 ; then
-    # use packages from previous release until such time as nvidia
-    # release rhel9 builds
+function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
+function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
 
-    nccl_shortname="rhel8"
-    shortname="rhel9"
-elif is_rocky ; then
+function set_cuda_repo_shortname() {
+# Short name for urls
+# https://developer.download.nvidia.com/compute/cuda/repos/${shortname}
+  if is_rocky ; then
     shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
-    nccl_shortname="${shortname}"
-else
+  else
     shortname="$(os_id)$(os_vercat)"
-    nccl_shortname="${shortname}"
-fi
+  fi
+}
 
-# Parameters for NVIDIA-provided package repositories
-readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
-readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
+function set_nv_urls() {
+  # Parameters for NVIDIA-provided package repositories
+  readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
+  readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
 
-# Parameters for NVIDIA-provided NCCL library
-readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
-NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
-readonly NCCL_REPO_URL
-readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
+  # Parameter for NVIDIA-provided Rocky Linux GPU driver
+  readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+}
 
 function set_cuda_runfile_url() {
-  local RUNFILE_DRIVER_VERSION="${DRIVER_VERSION}"
-  local RUNFILE_CUDA_VERSION="${CUDA_FULL_VERSION}"
-
-  if ge_cuda12 ; then
-    if ( le_debian11 || le_ubuntu18 ) ; then
-      RUNFILE_DRIVER_VERSION="525.60.13"
-      RUNFILE_CUDA_VERSION="12.0.0"
-    elif ( le_rocky8 && version_le "${DATAPROC_IMAGE_VERSION}" "2.0" ) ; then
-      RUNFILE_DRIVER_VERSION="525.147.05"
-      RUNFILE_CUDA_VERSION="12.0.0"
+  local MAX_DRIVER_VERSION
+  local MAX_CUDA_VERSION
+
+  local MIN_OPEN_DRIVER_VER="515.48.07"
+  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
+  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
+
+  if is_cuda12 ; then
+    if is_debian12 ; then
+      MIN_DRIVER_VERSION="545.23.06"
+      MIN_CUDA_VERSION="12.3.0"
+    elif is_debian10 ; then
+      MAX_DRIVER_VERSION="555.42.02"
+      MAX_CUDA_VERSION="12.5.0"
+    elif is_ubuntu18 ; then
+      MAX_DRIVER_VERSION="530.30.02"
+      MAX_CUDA_VERSION="12.1.1"
+    fi
+  elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    if le_debian10 ; then
+      # cuda 11 is not supported for <= debian10
+      MAX_CUDA_VERSION="0"
+      MAX_DRIVER_VERSION="0"
     fi
   else
-    RUNFILE_DRIVER_VERSION="520.61.05"
-    RUNFILE_CUDA_VERSION="11.8.0"
+    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  fi
+
+  if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
+    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  fi
+  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
+    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
+    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
   fi
 
-  readonly RUNFILE_FILENAME="cuda_${RUNFILE_CUDA_VERSION}_${RUNFILE_DRIVER_VERSION}_linux.run"
-  CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${RUNFILE_CUDA_VERSION}"
-  DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${RUNFILE_FILENAME}"
-  readonly DEFAULT_NVIDIA_CUDA_URL
+  # driver version named in cuda runfile filename
+  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
+  readonly -A drv_for_cuda=(
+          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
+          ["11.8.0"]="520.61.05"
+          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
+          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
+          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
+          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
+          ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
+          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
+          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+  )
+
+  # Verify that the file with the indicated combination exists
+  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
+  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
+  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
+  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
 
   NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
   readonly NVIDIA_CUDA_URL
-}
 
-set_cuda_runfile_url
+  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
+  readonly CUDA_RUNFILE
+
+  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
+    exit 1
+  fi
 
-# Parameter for NVIDIA-provided Rocky Linux GPU driver
-readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
+    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
+  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
+    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
+    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
+    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
+  fi
+}
 
+function set_cudnn_tarball_url() {
 CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
 CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
 if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
@@ -298,59 +1030,23 @@ if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
   CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
   if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
     # When cuDNN version is greater than or equal to 8.4.1.50 use this format
-    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
-  fi
-  # Use legacy url format with one of the tarball name formats depending on version as above
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
-fi
-if ( version_ge "${CUDA_VERSION}" "12.0" ); then
-  # Use modern url format When cuda version is greater than or equal to 12.0
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
-fi
-readonly CUDNN_TARBALL
-readonly CUDNN_TARBALL_URL
-
-# Whether to install NVIDIA-provided or OS-provided GPU driver
-GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
-readonly GPU_DRIVER_PROVIDER
-
-# Stackdriver GPU agent parameters
-readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
-# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
-INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
-readonly INSTALL_GPU_AGENT
-
-# Dataproc configurations
-readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-readonly HIVE_CONF_DIR='/etc/hive/conf'
-readonly SPARK_CONF_DIR='/etc/spark/conf'
-
-NVIDIA_SMI_PATH='/usr/bin'
-MIG_MAJOR_CAPS=0
-IS_MIG_ENABLED=0
-
-function execute_with_retries() (
-  set +x
-  local -r cmd="$*"
-
-  if [[ "$cmd" =~ "^apt-get install" ]] ; then
-    apt-get -y clean
-    apt-get -y autoremove
-  fi
-  for ((i = 0; i < 3; i++)); do
-    set -x
-    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
-    set +x
-    if [[ $retval == 0 ]] ; then return 0 ; fi
-    sleep 5
-  done
-  return 1
-)
+    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
+  fi
+  # Use legacy url format with one of the tarball name formats depending on version as above
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
+fi
+if ( version_ge "${CUDA_VERSION}" "12.0" ); then
+  # Use modern url format When cuda version is greater than or equal to 12.0
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
+fi
+readonly CUDNN_TARBALL
+readonly CUDNN_TARBALL_URL
+}
 
-CUDA_KEYRING_PKG_INSTALLED="0"
 function install_cuda_keyring_pkg() {
-  if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
+  if ( test -v CUDA_KEYRING_PKG_INSTALLED &&
+       [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi
   local kr_ver=1.1
   curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
@@ -365,8 +1061,9 @@ function uninstall_cuda_keyring_pkg() {
   CUDA_KEYRING_PKG_INSTALLED="0"
 }
 
-CUDA_LOCAL_REPO_INSTALLED="0"
 function install_local_cuda_repo() {
+  if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi
+
   if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
   CUDA_LOCAL_REPO_INSTALLED="1"
   pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
@@ -387,20 +1084,20 @@ function install_local_cuda_repo() {
       "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
       -o /etc/apt/preferences.d/cuda-repository-pin-600
   fi
+
+  touch "${workdir}/complete/install-local-cuda-repo"
 }
 function uninstall_local_cuda_repo(){
   apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  CUDA_LOCAL_REPO_INSTALLED="0"
+  rm -f "${workdir}/complete/install-local-cuda-repo"
 }
 
-CUDNN_LOCAL_REPO_INSTALLED="0"
-CUDNN_PKG_NAME=""
 function install_local_cudnn_repo() {
-  if [[ "${CUDNN_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  pkgname="cudnn-local-repo-${shortname}-${CUDNN}"
+  if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi
+  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
   CUDNN_PKG_NAME="${pkgname}"
   local_deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN}/local_installers/${local_deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
 
   # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
   curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
@@ -410,20 +1107,19 @@ function install_local_cudnn_repo() {
 
   rm -f "${tmpdir}/local-installer.deb"
 
-  cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
 
-  CUDNN_LOCAL_REPO_INSTALLED="1"
+  touch "${workdir}/complete/install-local-cudnn-repo"
 }
 
 function uninstall_local_cudnn_repo() {
   apt-get purge -yq "${CUDNN_PKG_NAME}"
-  CUDNN_LOCAL_REPO_INSTALLED="0"
+  rm -f "${workdir}/complete/install-local-cudnn-repo"
 }
 
-CUDNN8_LOCAL_REPO_INSTALLED="0"
-CUDNN8_PKG_NAME=""
 function install_local_cudnn8_repo() {
-  if [[ "${CUDNN8_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
+  if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi
+
   if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
   elif is_debian ; then cudnn8_shortname="debian11"
   else return 0 ; fi
@@ -437,61 +1133,136 @@ function install_local_cudnn8_repo() {
 
   deb_fn="${pkgname}_1.0-1_amd64.deb"
   local_deb_fn="${tmpdir}/${deb_fn}"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-      "${local_deb_url}" -o "${local_deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
+
+  # cache the cudnn package
+  cache_fetched_package "${local_deb_url}" \
+                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
+                        "${local_deb_fn}"
+
+  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
+  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
+  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
+    mkdir -p "${cudnn_path}"
+    mount -t tmpfs tmpfs "${cudnn_path}"
+  fi
 
   dpkg -i "${local_deb_fn}"
 
   rm -f "${local_deb_fn}"
 
-  cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  CUDNN8_LOCAL_REPO_INSTALLED="1"
+  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  touch "${workdir}/complete/install-local-cudnn8-repo"
 }
 
 function uninstall_local_cudnn8_repo() {
   apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  CUDNN8_LOCAL_REPO_INSTALLED="0"
+  rm -f "${workdir}/complete/install-local-cudnn8-repo"
 }
 
 function install_nvidia_nccl() {
+  readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+
+  if test -f "${workdir}/complete/nccl" ; then return ; fi
+
+  if is_cuda11 && is_debian12 ; then
+    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
+    return
+  fi
+
   local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
 
-  if is_rocky ; then
-    execute_with_retries \
-      dnf -y -q install \
-        "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}"
-    sync
-  elif is_ubuntu ; then
-    install_cuda_keyring_pkg
+  # https://github.com/NVIDIA/nccl/blob/master/README.md
+  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Fermi:     SM_20,             compute_30
+  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+  # The following architectures are supported by open kernel driver
+  # Volta:     SM_70,SM_72,       compute_70,compute_72
+  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+  # The following architectures are supported by CUDA v11.8+
+  # Ada:       SM_89,             compute_89
+  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
+  # Blackwell: SM_100,            compute_100
+                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
+  if version_ge "${CUDA_VERSION}" "11.8" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
+  fi
+  if version_ge "${CUDA_VERSION}" "12.0" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
+  fi
 
-    apt-get update -qq
+  mkdir -p "${workdir}"
+  pushd "${workdir}"
 
-    if is_ubuntu18 ; then
-      execute_with_retries \
-        apt-get install -q -y \
-          libnccl2 libnccl-dev
-      sync
+  test -d "${workdir}/nccl" || {
+    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "nccl-${NCCL_VERSION}-1" nccl
+  }
+
+  local build_path
+  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
+                       build_path="nccl/build/pkg/rpm/x86_64" ; fi
+
+  test -d "${workdir}/nccl/build" || {
+    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
+
+    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+    if echo "${output}" | grep -q "${gcs_tarball}" ; then
+      # cache hit - unpack from cache
+      echo "cache hit"
     else
-      execute_with_retries \
-        apt-get install -q -y \
-          "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}"
-      sync
+      # build and cache
+      pushd nccl
+      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
+      install_build_dependencies
+      if is_debuntu ; then
+        # These packages are required to build .deb packages from source
+        execute_with_retries \
+          apt-get install -y -qq build-essential devscripts debhelper fakeroot
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.debian.build
+      elif is_rocky ; then
+        # These packages are required to build .rpm packages from source
+        execute_with_retries \
+          dnf -y -q install rpm-build rpmdevtools
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.redhat.build
+      fi
+      tar czvf "/${local_tarball}" "../${build_path}"
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
     fi
-  else
-    echo "Unsupported OS: '${OS_NAME}'"
-    # NB: this tarball is 10GB in size, but can be used to install NCCL on non-ubuntu systems
-    # wget https://developer.download.nvidia.com/hpc-sdk/24.7/nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
-    # tar xpzf nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
-    # nvhpc_2024_247_Linux_x86_64_cuda_multi/install
-    return
+    gcloud storage cat "${gcs_tarball}" | tar xz
+  }
+
+  if is_debuntu ; then
+    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
+  elif is_rocky ; then
+    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
   fi
+
+  popd
+  touch "${workdir}/complete/nccl"
 }
 
 function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
 function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
 
 function install_nvidia_cudnn() {
+  if test -f "${workdir}/complete/cudnn" ; then return ; fi
   local major_version
   major_version="${CUDNN_VERSION%%.*}"
   local cudnn_pkg_version
@@ -515,7 +1286,6 @@ function install_nvidia_cudnn() {
     if ge_debian12 && is_src_os ; then
       apt-get -y install nvidia-cudnn
     else
-      local CUDNN="${CUDNN_VERSION%.*}"
       if is_cudnn8 ; then
         install_local_cudnn8_repo
 
@@ -525,6 +1295,8 @@ function install_nvidia_cudnn() {
           apt-get -y install --no-install-recommends \
             "libcudnn8=${cudnn_pkg_version}" \
             "libcudnn8-dev=${cudnn_pkg_version}"
+
+        uninstall_local_cudnn8_repo
 	sync
       elif is_cudnn9 ; then
 	install_cuda_keyring_pkg
@@ -541,118 +1313,15 @@ function install_nvidia_cudnn() {
         echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
       fi
     fi
-  elif is_ubuntu ; then
-    local -a packages
-    packages=(
-      "libcudnn${major_version}=${cudnn_pkg_version}"
-      "libcudnn${major_version}-dev=${cudnn_pkg_version}")
-    execute_with_retries \
-      apt-get install -q -y --no-install-recommends "${packages[*]}"
-    sync
   else
-    echo "Unsupported OS: '${OS_NAME}'"
+    echo "Unsupported OS: '${_shortname}'"
     exit 1
   fi
 
   ldconfig
 
-  echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
-}
-
-CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
-PSN="$(get_metadata_attribute private_secret_name)"
-readonly PSN
-function configure_dkms_certs() {
-  if [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping";
-      return 0
-  fi
-
-  mkdir -p "${CA_TMPDIR}"
-
-  # If the private key exists, verify it
-  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
-    echo "Private key material exists"
-
-    local expected_modulus_md5sum
-    expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum)
-    if [[ -n "${expected_modulus_md5sum}" ]]; then
-      modulus_md5sum="${expected_modulus_md5sum}"
-    else
-      modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"
-    fi
-
-    # Verify that cert md5sum matches expected md5sum
-    if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched rsa key modulus"
-    fi
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
-
-    # Verify that key md5sum matches expected md5sum
-    if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched x509 cert modulus"
-    fi
-
-    return
-  fi
-
-
-  # Retrieve cloud secrets keys
-  local sig_priv_secret_name
-  sig_priv_secret_name="${PSN}"
-  local sig_pub_secret_name
-  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
-  local sig_secret_project
-  sig_secret_project="$(get_metadata_attribute secret_project)"
-  local sig_secret_version
-  sig_secret_version="$(get_metadata_attribute secret_version)"
-
-  # If metadata values are not set, do not write mok keys
-  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
-
-  # Write private material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_priv_secret_name}" \
-      | dd status=none of="${CA_TMPDIR}/db.rsa"
-
-  # Write public material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_pub_secret_name}" \
-      | base64 --decode \
-      | dd status=none of="${CA_TMPDIR}/db.der"
-
-  # symlink private key and copy public cert from volatile storage for DKMS
-  if is_ubuntu ; then
-    mkdir -p /var/lib/shim-signed/mok
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv
-    cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der
-  else
-    mkdir -p /var/lib/dkms/
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
-    cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub
-  fi
-}
-
-function clear_dkms_key {
-  if [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping" >&2
-      return 0
-  fi
-  rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv
-}
-
-function add_contrib_component() {
-  if ge_debian12 ; then
-      # Include in sources file components on which nvidia-kernel-open-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib"
-
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
-  fi
+  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
+  touch "${workdir}/complete/cudnn"
 }
 
 function add_nonfree_components() {
@@ -668,76 +1337,93 @@ function add_nonfree_components() {
   fi
 }
 
+#
+# Install package signing key and add corresponding repository
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
 function add_repo_nvidia_container_toolkit() {
-  if is_debuntu ; then
-      local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
-      local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list
-      # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
-      test -f "${kr_path}" ||
-        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
-          | gpg --dearmor -o "${kr_path}"
+  local nvctk_root="https://nvidia.github.io/libnvidia-container"
+  local signing_key_url="${nvctk_root}/gpgkey"
+  local repo_data
 
-      test -f "${sources_list_path}" ||
-        curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
-          | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \
-          | tee "${sources_list_path}"
-  fi
+  if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
+                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
+
+  os_add_repo nvidia-container-toolkit \
+              "${signing_key_url}" \
+              "${repo_data}" \
+              "no"
 }
 
 function add_repo_cuda() {
   if is_debuntu ; then
-    local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
-    local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
-    echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
-    | sudo tee "${sources_list_path}"
-    curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
-      -o "${kr_path}"
+    install_cuda_keyring_pkg # 11.7+, 12.0+
   elif is_rocky ; then
     execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
-    execute_with_retries "dnf clean all"
   fi
 }
 
-readonly uname_r=$(uname -r)
 function build_driver_from_github() {
-  if is_ubuntu ; then
-    mok_key=/var/lib/shim-signed/mok/MOK.priv
-    mok_der=/var/lib/shim-signed/mok/MOK.der
-  else
-    mok_key=/var/lib/dkms/mok.key
-    mok_der=/var/lib/dkms/mok.pub
-  fi
-  workdir=/opt/install-nvidia-driver
-  mkdir -p "${workdir}"
+  # non-GPL driver will have been built on rocky8
+  if is_rocky8 ; then return 0 ; fi
   pushd "${workdir}"
+
   test -d "${workdir}/open-gpu-kernel-modules" || {
-    tarball_fn="${DRIVER_VERSION}.tar.gz"
+    local tarball_fn="${DRIVER_VERSION}.tar.gz"
     curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
       "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
       | tar xz
     mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
   }
-  cd open-gpu-kernel-modules
 
-  time make -j$(nproc) modules \
-    >  /var/log/open-gpu-kernel-modules-build.log \
-    2> /var/log/open-gpu-kernel-modules-build_error.log
-  sync
+  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local def_dir="${modulus_md5sum:-unsigned}"
+    local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
 
-  if [[ -n "${PSN}" ]]; then
-    #configure_dkms_certs
-    for module in $(find kernel-open -name '*.ko'); do
-      "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
-      "${mok_key}" \
-      "${mok_der}" \
-      "${module}"
-    done
-    #clear_dkms_key
-  fi
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+      echo "cache hit"
+    else
+      # build the kernel modules
+      pushd open-gpu-kernel-modules
+      install_build_dependencies
+      if ( is_cuda11 && is_ubuntu22 ) ; then
+        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
+        exit 1
+      fi
+      execute_with_retries make -j$(nproc) modules \
+        >  kernel-open/build.log \
+        2> kernel-open/build_error.log
+      # Sign kernel modules; cwd is already open-gpu-kernel-modules, so search kernel-open directly
+      if [[ -n "${PSN}" ]]; then
+        configure_dkms_certs
+        for module in $(find kernel-open -name '*.ko'); do
+          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
+          "${mok_key}" \
+          "${mok_der}" \
+          "${module}"
+        done
+        clear_dkms_key
+      fi
+      make modules_install \
+        >>  kernel-open/build.log \
+        2>> kernel-open/build_error.log
+      # Collect build logs and installed binaries
+      tar czvf "${local_tarball}" \
+        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
+    fi
+    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+    depmod -a
+  }
 
-  make modules_install \
-    >> /var/log/open-gpu-kernel-modules-build.log \
-    2>> /var/log/open-gpu-kernel-modules-build_error.log
   popd
 }
 
@@ -760,12 +1446,10 @@ function build_driver_from_packages() {
     add_contrib_component
     apt-get update -qq
     execute_with_retries apt-get install -y -qq --no-install-recommends dkms
-    #configure_dkms_certs
     execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
     sync
 
   elif is_rocky ; then
-    #configure_dkms_certs
     if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
       echo "nvidia-driver:${DRIVER}-dkms installed successfully"
     else
@@ -773,26 +1457,108 @@ function build_driver_from_packages() {
     fi
     sync
   fi
-  #clear_dkms_key
 }
 
 function install_nvidia_userspace_runfile() {
-  if test -f "${tmpdir}/userspace-complete" ; then return ; fi
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${USERSPACE_URL}" -o "${tmpdir}/userspace.run"
-  execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}"
-  rm -f "${tmpdir}/userspace.run"
-  touch "${tmpdir}/userspace-complete"
+  # Parameters for NVIDIA-provided Debian GPU driver
+  readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+
+  readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+
+  USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
+  readonly USERSPACE_FILENAME
+
+  # This .run file contains NVIDIA's OpenGL implementation as well as
+  # NVIDIA-optimized implementations of the gtk+ 2 and 3 stacks (not
+  # including glib, https://docs.gtk.org/glib/), and what appears to
+  # be a copy of the source from the kernel-open directory of the
+  # open-gpu-kernel-modules release, for example DRIVER_VERSION=560.35.03
+  #
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
+  #
+  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
+  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
+  if test -f "${workdir}/complete/userspace" ; then return ; fi
+  local local_fn="${tmpdir}/userspace.run"
+
+  cache_fetched_package "${USERSPACE_URL}" \
+                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
+                        "${local_fn}"
+
+  local runfile_args
+  runfile_args=""
+  local cache_hit="0"
+  local local_tarball
+
+  if is_rocky8 ; then
+    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+      local_tarball="${workdir}/${build_tarball}"
+      local def_dir="${modulus_md5sum:-unsigned}"
+      local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
+
+      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+        cache_hit="1"
+        runfile_args="--no-kernel-modules"
+        echo "cache hit"
+      else
+        install_build_dependencies
+        configure_dkms_certs
+        local signing_options
+        signing_options=""
+        if [[ -n "${PSN}" ]]; then
+          signing_options="--module-signing-hash sha256 \
+          --module-signing-x509-hash sha256 \
+          --module-signing-secret-key \"${mok_key}\" \
+          --module-signing-public-key \"${mok_der}\" \
+          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
+          "
+        fi
+        runfile_args="--no-dkms ${signing_options}"
+      fi
+    }
+  else
+    runfile_args="--no-kernel-modules"
+  fi
+
+  execute_with_retries bash "${local_fn}" -e -q \
+    ${runfile_args} \
+    --ui=none \
+    --install-libglvnd \
+    --tmpdir="${tmpdir}"
+
+  if is_rocky8 ; then
+    if [[ "${cache_hit}" == "1" ]] ; then
+      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+      depmod -a
+    else
+      clear_dkms_key
+      tar czvf "${local_tarball}" \
+        /var/log/nvidia-installer.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    fi
+  fi
+
+  rm -f "${local_fn}"
+  touch "${workdir}/complete/userspace"
   sync
 }
 
 function install_cuda_runfile() {
-  if test -f "${tmpdir}/cuda-complete" ; then return ; fi
-  time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run"
-  execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}"
-  rm -f "${tmpdir}/cuda.run"
-  touch "${tmpdir}/cuda-complete"
+  if test -f "${workdir}/complete/cuda" ; then return ; fi
+  local local_fn="${tmpdir}/cuda.run"
+
+  cache_fetched_package "${NVIDIA_CUDA_URL}" \
+			"${pkg_bucket}/${CUDA_RUNFILE}" \
+                        "${local_fn}"
+
+  execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
+  rm -f "${local_fn}"
+  touch "${workdir}/complete/cuda"
   sync
 }
 
@@ -808,12 +1574,11 @@ function install_cuda_toolkit() {
   if is_debuntu ; then
 #    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
     execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
-    sync
   elif is_rocky ; then
     # rocky9: cuda-11-[7,8], cuda-12-[1..6]
     execute_with_retries dnf -y -q install "${cudatk_package}"
-    sync
   fi
+  sync
 }
 
 function load_kernel_module() {
@@ -830,57 +1595,120 @@ function load_kernel_module() {
   # TODO: if peermem is available, also modprobe nvidia-peermem
 }
 
+function install_cuda(){
+  if test -f "${workdir}/complete/cuda-repo" ; then return ; fi
+
+  if ( ge_debian12 && is_src_os ) ; then
+    echo "installed with the driver on ${_shortname}"
+    return 0
+  fi
+
+  # The OS package distributions are unreliable
+  install_cuda_runfile
+
+  # Includes CUDA packages
+  add_repo_cuda
+
+  touch "${workdir}/complete/cuda-repo"
+}
+
+function install_nvidia_container_toolkit() {
+  local container_runtime_default
+    if command -v docker     ; then container_runtime_default='docker'
+  elif command -v containerd ; then container_runtime_default='containerd'
+  elif command -v crio       ; then container_runtime_default='crio'
+                               else container_runtime_default='' ; fi
+  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
+
+  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi
+
+  add_repo_nvidia_container_toolkit
+  if is_debuntu ; then
+    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
+    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
+  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
+  systemctl restart "${CONTAINER_RUNTIME}"
+}
+
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
+  if test -f "${workdir}/complete/gpu-driver" ; then return ; fi
+
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
-    add_repo_nvidia_container_toolkit
     apt-get update -qq
-    #configure_dkms_certs
     apt-get -yq install \
-          nvidia-container-toolkit \
-          dkms \
-          nvidia-open-kernel-dkms \
-          nvidia-open-kernel-support \
-          nvidia-smi \
-          libglvnd0 \
-          libcuda1
-    #clear_dkms_key
-  elif ( le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ) ; then
+        dkms \
+        nvidia-open-kernel-dkms \
+        nvidia-open-kernel-support \
+        nvidia-smi \
+        libglvnd0 \
+        libcuda1
+    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
+    return 0
+  fi
 
-    install_nvidia_userspace_runfile
+  # OS driver packages do not produce a reliable driver; use the runfile instead
+  install_nvidia_userspace_runfile
 
-    build_driver_from_github
+  build_driver_from_github
 
-    install_cuda_runfile
-  elif is_debuntu ; then
-    install_cuda_keyring_pkg
+  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+  touch "${workdir}/complete/gpu-driver"
+}
 
-    build_driver_from_packages
+function install_ops_agent(){
+  if test -f "${workdir}/complete/ops-agent" ; then return ; fi
 
-    install_cuda_toolkit
-  elif is_rocky ; then
-    add_repo_cuda
+  mkdir -p /opt/google
+  cd /opt/google
+  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
+  curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
+  execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
 
-    build_driver_from_packages
+  touch "${workdir}/complete/ops-agent"
+}
 
-    install_cuda_toolkit
-  else
-    echo "Unsupported OS: '${OS_NAME}'"
-    exit 1
-  fi
-  ldconfig
-  if is_src_os ; then
-    echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully"
+# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
+function install_gpu_monitoring_agent() {
+  download_gpu_monitoring_agent
+  install_gpu_monitoring_agent_dependency
+  start_gpu_monitoring_agent_service
+}
+
+function download_gpu_monitoring_agent(){
+  if is_rocky ; then
+    execute_with_retries "dnf -y -q install git"
   else
-    echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+    execute_with_retries "apt-get install git -y"
   fi
+  mkdir -p /opt/google
+  chmod 755 /opt/google # not world-writable: code under this directory is run by a systemd service
+  cd /opt/google
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+}
+
+function install_gpu_monitoring_agent_dependency(){
+  cd /opt/google/compute-gpu-monitoring/linux
+  python3 -m venv venv
+  venv/bin/pip install wheel
+  venv/bin/pip install -Ur requirements.txt
+}
+
+function start_gpu_monitoring_agent_service(){
+  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
+  systemctl daemon-reload
+  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service
 }
 
 # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
 function install_gpu_agent() {
-  if ! command -v pip; then
-    execute_with_retries "apt-get install -y -qq python-pip"
+  # Stackdriver GPU agent parameters
+#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
+  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
+  if ( ! command -v pip && is_debuntu ) ; then
+    execute_with_retries "apt-get install -y -qq python3-pip"
   fi
   local install_dir=/opt/gpu-utilization-agent
   mkdir -p "${install_dir}"
@@ -890,7 +1718,13 @@ function install_gpu_agent() {
     "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
     | sed -e 's/-u --format=/--format=/' \
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
-  execute_with_retries pip install -r "${install_dir}/requirements.txt"
+  local venv="${install_dir}/venv"
+  python3 -m venv "${venv}"
+(
+  source "${venv}/bin/activate"
+  python3 -m pip install --upgrade pip
+  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
+)
   sync
 
   # Generate GPU service.
@@ -901,7 +1735,7 @@ Description=GPU Utilization Metric Agent
 [Service]
 Type=simple
 PIDFile=/run/gpu_agent.pid
-ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"'
+ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
 User=root
 Group=root
 WorkingDirectory=/
@@ -916,75 +1750,50 @@ EOF
   systemctl --no-reload --now enable gpu-utilization-agent.service
 }
 
-function set_hadoop_property() {
-  local -r config_file=$1
-  local -r property=$2
-  local -r value=$3
-  "${bdcfg}" set_property \
-    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
-    --name "${property}" --value "${value}" \
-    --clobber
-}
-
-function configure_yarn() {
-  if [[ -d "${HADOOP_CONF_DIR}" && ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
-    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
-  fi
-  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
-
-  set_hadoop_property 'capacity-scheduler.xml' \
-    'yarn.scheduler.capacity.resource-calculator' \
-    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
-
-  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
-}
-
-# This configuration should be applied only if GPU is attached to the node
-function configure_yarn_nodemanager() {
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.container-executor.class' \
-    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
-
-  # Fix local dirs access permissions
-  local yarn_local_dirs=()
-
-  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
-    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
-    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
-
-  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
-    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
-  fi
-}
-
 function configure_gpu_exclusive_mode() {
-  # check if running spark 3, if not, enable GPU exclusive mode
-  local spark_version
-  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
-  if [[ ${spark_version} != 3.* ]]; then
-    # include exclusive mode on GPU
-    nvsmi -c EXCLUSIVE_PROCESS
-  fi
+  # only run this function when spark < 3.0
+  if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
+  # include exclusive mode on GPU
+  nvidia-smi -c EXCLUSIVE_PROCESS
+  clear_nvsmi_cache
 }
 
 function fetch_mig_scripts() {
   mkdir -p /usr/local/yarn-mig-scripts
-  sudo chmod 755 /usr/local/yarn-mig-scripts
+  chmod 755 /usr/local/yarn-mig-scripts
   wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
   wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
-  sudo chmod 755 /usr/local/yarn-mig-scripts/*
+  chmod 755 /usr/local/yarn-mig-scripts/*
+}
+
+function install_spark_rapids() {
+  # Update SPARK RAPIDS config
+  local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+  local DEFAULT_XGBOOST_VERSION="1.7.6"
+
+  # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
+  local -r scala_ver="2.12"
+
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.2" ]] ; then
+    DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  fi
+
+  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
+  readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
+
+  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
+  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
+  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
+
+  wget -nv --timeout=30 --tries=5 --retry-connrefused \
+    "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \
+    -P /usr/lib/spark/jars/
+  wget -nv --timeout=30 --tries=5 --retry-connrefused \
+    "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \
+    -P /usr/lib/spark/jars/
+  wget -nv --timeout=30 --tries=5 --retry-connrefused \
+    "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" \
+    -P /usr/lib/spark/jars/
 }
 
 function configure_gpu_script() {
@@ -1023,9 +1832,43 @@ EOF
   chmod a+rx "${gpus_resources_script}"
 
   local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
-  if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then
-    echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}"
-  fi
+
+  local executor_cores
+  executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
+  local executor_memory_gb
+  executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
+  local task_cpus=2
+  local gpu_amount
+  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+  if ( version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ) ; then gpu_amount="0.5" ; fi
+
+  cat >>"${spark_defaults_conf}" <<EOF
+###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
+# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
+# query explain output won't show GPU operator, if the user has doubts
+# they can uncomment the line before seeing the GPU plan explain;
+# having AQE enabled gives user the best performance.
+spark.executor.resource.gpu.amount=${gpu_count}
+spark.plugins=com.nvidia.spark.SQLPlugin
+spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
+spark.executor.cores=${executor_cores}
+spark.executor.memory=${executor_memory_gb}G
+spark.dynamicAllocation.enabled=false
+# please update this config according to your application
+spark.task.resource.gpu.amount=${gpu_amount}
+spark.task.cpus=${task_cpus}
+spark.yarn.unmanagedAM.enabled=false
+###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
+EOF
+}
+
+function configure_yarn_nodemanager_gpu() {
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
+  configure_yarn_nodemanager
 }
 
 function configure_gpu_isolation() {
@@ -1056,9 +1899,20 @@ EOF
   systemctl start dataproc-cgroup-device-permissions
 }
 
+function clear_nvsmi_cache() {
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then
+    rm "${nvsmi_query_xml}"
+  fi
+}
+
+function query_nvsmi() {
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
+  /usr/bin/nvidia-smi -q -x --dtd > "${nvsmi_query_xml}"
+}
+
 function nvsmi() {
   local nvsmi="/usr/bin/nvidia-smi"
-  if   [[ "${nvsmi_works}" == "1" ]] ; then echo "nvidia-smi is working" >&2
+  if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
   elif [[ ! -f "${nvsmi}" ]]         ; then echo "nvidia-smi not installed" >&2 ; return 0
   elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
   else nvsmi_works="1" ; fi
@@ -1074,14 +1928,23 @@ function nvsmi() {
   "${nvsmi}" $*
 }
 
-function install_dependencies() {
+function install_build_dependencies() {
+  if test -f "${workdir}/complete/build-dependencies" ; then return ; fi
+
   if is_debuntu ; then
-    execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen
+    if is_ubuntu22 && is_cuda12 ; then
+      # On ubuntu22, the default compiler does not build some kernel module versions
+      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
+      execute_with_retries apt-get install -y -qq gcc-12
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-12
+    fi
+
   elif is_rocky ; then
-    execute_with_retries dnf -y -q install pciutils gcc screen
+    execute_with_retries dnf -y -q install gcc
 
     local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
-    local install_log="${tmpdir}/install.log"
     set +e
     eval "${dnf_cmd}" > "${install_log}" 2>&1
     local retval="$?"
@@ -1104,364 +1967,247 @@ function install_dependencies() {
 
     execute_with_retries "${dnf_cmd}"
   fi
+  touch "${workdir}/complete/build-dependencies"
 }
 
-function main() {
-  # This configuration should be run on all nodes
-  # regardless if they have attached GPUs
-  configure_yarn
-
-  # Detect NVIDIA GPU
-  if (lspci | grep -q NVIDIA); then
-    # if this is called without the MIG script then the drivers are not installed
-    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
-    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
-    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-
-    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-        if (echo "${migquery_result}" | grep Enabled); then
-          IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
-        fi
-      fi
-    fi
-
-    # if mig is enabled drivers would have already been installed
-    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
-      install_nvidia_gpu_driver
+function prepare_gpu_env(){
+  set +e
+  gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
+  set -e
+  echo "gpu_count=[${gpu_count}]"
+  nvsmi_works="0"
+  nvsmi_query_xml="${tmpdir}/nvsmi.xml"
+  xmllint="/opt/conda/miniconda3/bin/xmllint"
+  NVIDIA_SMI_PATH='/usr/bin'
+  MIG_MAJOR_CAPS=0
+  IS_MIG_ENABLED=0
+  CUDNN_PKG_NAME=""
+  CUDNN8_PKG_NAME=""
+  CUDA_LOCAL_REPO_INSTALLED="0"
 
-      load_kernel_module
+  # Whether to install NVIDIA-provided or OS-provided GPU driver
+  GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
+  readonly GPU_DRIVER_PROVIDER
 
-      if [[ -n ${CUDNN_VERSION} ]]; then
-        install_nvidia_nccl
-        install_nvidia_cudnn
-      fi
-      #Install GPU metrics collection in Stackdriver if needed
-      if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-        install_gpu_agent
-        echo 'GPU metrics agent successfully deployed.'
-      else
-        echo 'GPU metrics agent will not be installed.'
-      fi
+  # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
+  INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
+  readonly INSTALL_GPU_AGENT
 
-      # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
-      for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
-        rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
-      done
+  # Verify SPARK compatibility
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+  readonly RAPIDS_RUNTIME
 
-      MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")"
-      if test -n "$(nvsmi -L)" ; then
-	# cache the result of the gpu query
-        ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
-        echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
-      fi
-      NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
-      if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-        # enable MIG on every GPU
-	for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do
-	  nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
-	done
+  set_cuda_version
+  set_driver_version
+  set_cuda_repo_shortname
+  set_nv_urls
+  set_cuda_runfile_url
+  set_cudnn_version
+  set_cudnn_tarball_url
 
-        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-        MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
-        fetch_mig_scripts
-      else
-        configure_gpu_exclusive_mode
-      fi
-    fi
+  if   is_cuda11 ; then gcc_ver="11"
+  elif is_cuda12 ; then gcc_ver="12" ; fi
+}
 
-    configure_yarn_nodemanager
-    configure_gpu_script
-    configure_gpu_isolation
-  elif [[ "${ROLE}" == "Master" ]]; then
-    configure_yarn_nodemanager
-    configure_gpu_script
+# Hold all NVIDIA-related packages so they are not upgraded unintentionally, e.g. by services like unattended-upgrades
+# Users should run apt-mark unhold before they wish to upgrade these packages
+function hold_nvidia_packages() {
+  apt-mark hold nvidia-*
+  apt-mark hold libnvidia-*
+  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
+    apt-mark hold xserver-xorg-video-nvidia*
   fi
+}
 
-  # Restart YARN services if they are running already
-  if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then
-    systemctl restart hadoop-yarn-resourcemanager.service
-  fi
-  if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then
-    systemctl restart hadoop-yarn-nodemanager.service
+function delete_mig_instances() (
+  # Runs in a subshell with errexit off: nvidia-smi exit codes are only reported.  First delete all compute instances (-dci)
+  set +e
+  nvidia-smi mig -dci
+
+  case "${?}" in
+    "0" ) echo "compute instances deleted"            ;;
+    "2" ) echo "invalid argument"                     ;;
+    "6" ) echo "No compute instances found to delete" ;;
+    *   ) echo "unrecognized return code"             ;;
+  esac
+  # ... then delete all GPU instances (-dgi)
+  nvidia-smi mig -dgi
+  case "${?}" in
+    "0" ) echo "GPU instances deleted"            ;;
+    "2" ) echo "invalid argument"                 ;;
+    "6" ) echo "No GPU instances found to delete" ;;
+    *   ) echo "unrecognized return code"         ;;
+  esac
+)
+
+# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles
+function configure_mig_cgi() {
+  delete_mig_instances
+  META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')"
+  if test -n "${META_MIG_CGI_VALUE}"; then
+    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
+  else
+    # https://pci-ids.ucw.cz/v2.2/pci.ids
+    local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)"
+    if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then
+      # run the following command to list placement profiles
+      # nvidia-smi mig -lgipp
+      #
+      # This is the result when using H100 instances on 20241220
+      # GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
+      # GPU  0 Profile ID 14 Placements: {0,2,4}:2
+      # GPU  0 Profile ID  9 Placements: {0,4}:4
+      # GPU  0 Profile ID  5 Placement : {0}:4
+      # GPU  0 Profile ID  0 Placement : {0}:8
+
+      # For H100 3D controllers, consider profile 19, 7x1G instances
+      nvidia-smi mig -cgi 9,9 -C
+    elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then
+      # Dataproc only supports H100s right now ; split in 2 if not specified (NOTE(review): 10DE:20xx looks like A100-class IDs, not H100 - confirm)
+      # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
+      nvidia-smi mig -cgi 9,9 -C
+    else
+      echo "unrecognized 3D controller"
+    fi
   fi
+  clear_nvsmi_cache
 }
 
-function clean_up_sources_lists() {
-  #
-  # bigtop (primary)
-  #
-  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
+function enable_mig() {
+  if test -f "${workdir}/complete/enable-mig" ; then return ; fi
 
-  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
-    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
+  # Start persistenced if it's not already running
+  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
+  for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
+    # Write an ascii zero to the numa node indicator
+    echo "0" | dd of="${f}" status=none
+  done
+  time nvidia-smi --gpu-reset # 30s
+  nvidia-smi -mig 1
+  clear_nvsmi_cache
 
-    local regional_bigtop_repo_uri
-    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
-      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
-      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
-      cut -d ' ' -f 2 |
-      head -1)
+  touch "${workdir}/complete/enable-mig"
+}
 
-    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
-    else
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
-    fi
+function enable_and_configure_mig() {
+  # default MIG to on when this script is used
+  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
 
-    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
-    rm -f "${bigtop_kr_path}"
-    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
+  if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
 
-    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-  fi
+  enable_mig
 
-  #
-  # adoptium
-  #
-  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
-  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
-  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
-  rm -f "${adoptium_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
-   | gpg --dearmor -o "${adoptium_kr_path}"
-  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
-   > /etc/apt/sources.list.d/adoptium.list
+  xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
+  query_nvsmi
+  mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")"
 
+  if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs.  Failing" ; exit 1 ; fi
+  if ! (echo "${mig_mode_current}" | grep Enabled)                ; then echo "MIG is configured but NOT enabled.  Failing" ; exit 1 ; fi
 
-  #
-  # docker
-  #
-  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
-  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
-  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
+  echo "MIG is fully enabled"
+  configure_mig_cgi
+}
 
-  rm -f "${docker_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
-    | gpg --dearmor -o "${docker_kr_path}"
-  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
-    > ${docker_repo_file}
+function setup_gpu_yarn() {
+  # This configuration should be run on all nodes
+  # regardless if they have attached GPUs
+  configure_yarn_resources
 
-  #
-  # google cloud + logging/monitoring
-  #
-  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
-    rm -f /usr/share/keyrings/cloud.google.gpg
-    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
-    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
-      list_file="/etc/apt/sources.list.d/${list}.list"
-      if [[ -f "${list_file}" ]]; then
-        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
-      fi
-    done
+  # When there is no GPU, but the installer is executing on a master node:
+  if [[ "${gpu_count}" == "0" ]] ; then
+    if [[ "${ROLE}" == "Master" ]]; then
+      configure_yarn_nodemanager
+    fi
+    return 0
   fi
 
-  #
-  # cran-r
-  #
-  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
-    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
-    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
-    rm -f /usr/share/keyrings/cran-r.gpg
-    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
-      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
-    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
-  fi
+  # if this is called without the MIG script then the drivers are not installed
+  query_nvsmi ; local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' # defined here: otherwise only set by enable_and_configure_mig
+  migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A' || echo -n '')" # an empty result must not abort under errexit
+  NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
 
-  #
-  # mysql
-  #
-  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
-    rm -f /usr/share/keyrings/mysql.gpg
-    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
-      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
-    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
+  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+    if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+      if (echo "${migquery_result}" | grep Enabled); then
+        IS_MIG_ENABLED=1
+        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+        MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
+        fetch_mig_scripts
+      fi
+    fi
   fi
 
-  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
+  # if mig is enabled drivers would have already been installed
+  if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+    install_nvidia_gpu_driver
+    install_cuda
+    load_kernel_module
+
+    #Install GPU metrics collection in Stackdriver if needed
+    if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+      install_gpu_agent
+#      install_gpu_monitoring_agent
+      echo 'GPU metrics agent successfully deployed.'
+    else
+      echo 'GPU metrics agent has not been installed.'
+    fi
+    configure_gpu_exclusive_mode
+  fi
 
+  install_nvidia_container_toolkit
+  configure_yarn_nodemanager_gpu
+  configure_gpu_script
+  configure_gpu_isolation
 }
 
-function exit_handler() {
-  set +ex
-  echo "Exit handler invoked"
-
-  # Purge private key material until next grant
-  clear_dkms_key
-
-  # Clear pip cache
-  pip cache purge || echo "unable to purge pip cache"
-
-  # If system memory was sufficient to mount memory-backed filesystems
+function gpu_exit_handler() {
   if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    # remove the tmpfs pip cache-dir
-    pip config unset global.cache-dir || echo "unable to unset global pip cache"
-
-    # Clean up shared memory mounts
-    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
-      if grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ; then
+    for shmdir in /var/cudnn-local ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
         umount -f ${shmdir}
       fi
     done
-
-    # restart services stopped during preparation stage
-    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
   fi
+}
 
-  if is_debuntu ; then
-    # Clean up OS package cache
-    apt-get -y -qq clean
-    apt-get -y -qq autoremove
-    # re-hold systemd package
-    if ge_debian12 ; then
-    apt-mark hold systemd libsystemd0 ; fi
-  else
-    dnf clean all
-  fi
 
-  # print disk usage statistics for large components
-  if is_ubuntu ; then
-    du -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3 | sort -h
-  elif is_debian ; then
-    du -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /usr/lib \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3 | sort -h
-  else
-    du -hs \
-      /var/lib/docker \
-      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
-      /usr/lib64/google-cloud-sdk \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3
-  fi
-
-  # Process disk usage logs from installation period
-  rm -f /run/keep-running-df
-  sync
-  sleep 5.01s
-  # compute maximum size of disk during installation
-  # Log file contains logs like the following (minus the preceeding #):
-#Filesystem     1K-blocks    Used Available Use% Mounted on
-#/dev/vda2        7096908 2611344   4182932  39% /
-  df / | tee -a "/run/disk-usage.log"
+function main() {
+  setup_gpu_yarn
 
-  perl -e '@siz=( sort { $a => $b }
-                   map { (split)[2] =~ /^(\d+)/ }
-                  grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
-print( "    samples-taken: ", scalar @siz, $/,
-       "maximum-disk-used: $max", $/,
-       "minimum-disk-used: $min", $/,
-       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+  echo "yarn setup complete"
 
-  echo "exit_handler has completed"
+  if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then
+    install_nvidia_nccl
+    install_nvidia_cudnn
+  fi
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    dd if=/dev/zero of=/zero
-    sync
-    sleep 3s
-    rm -f /zero
+  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
+    install_spark_rapids
+    configure_gpu_script
+    echo "RAPIDS initialized with Spark runtime"
+  elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
+    # we are not currently tooled for installing dask in this action.
+    echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh"
+  else
+    echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
   fi
 
+  echo "main complete"
   return 0
 }
 
-function set_proxy(){
-  export METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy)"
-  export http_proxy="${METADATA_HTTP_PROXY}"
-  export https_proxy="${METADATA_HTTP_PROXY}"
-  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
-  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
-  export no_proxy=metadata.google.internal,169.254.169.254
-  export NO_PROXY=metadata.google.internal,169.254.169.254
-}
-
-function mount_ramdisk(){
-  local free_mem
-  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
-
-  # Write to a ramdisk instead of churning the persistent disk
-
-  tmpdir="/mnt/shm"
-  mkdir -p "${tmpdir}"
-  mount -t tmpfs tmpfs "${tmpdir}"
-
-  # Clear pip cache
-  # TODO: make this conditional on which OSs have pip without cache purge
-  pip cache purge || echo "unable to purge pip cache"
-
-  # Download pip packages to tmpfs
-  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
-
-  # Download OS packages to tmpfs
-  if is_debuntu ; then
-    mount -t tmpfs tmpfs /var/cache/apt/archives
-  else
-    mount -t tmpfs tmpfs /var/cache/dnf
-  fi
+function exit_handler() {
+  gpu_exit_handler
+  common_exit_handler
+  return 0
 }
 
 function prepare_to_install(){
-  nvsmi_works="0"
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  tmpdir=/tmp/
-  if ! is_debuntu && ! is_rocky ; then
-    echo "Unsupported OS: '$(os_name)'"
-    exit 1
-  fi
-
-  repair_old_backports
-
-  export DEBIAN_FRONTEND=noninteractive
-
+  prepare_common_env
+  prepare_gpu_env
   trap exit_handler EXIT
-  mount_ramdisk
-  install_log="${tmpdir}/install.log"
-
-  set_proxy
-
-  if is_debuntu ; then
-    clean_up_sources_lists
-    apt-get update -qq
-    apt-get -y clean
-    sleep 5s
-    apt-get -y -qq autoremove
-    if ge_debian12 ; then
-    apt-mark unhold systemd libsystemd0 ; fi
-  else
-    dnf clean all
-  fi
-
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
-    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
-  ) fi
-
-  configure_dkms_certs
-
-  install_dependencies
-
-  # Monitor disk usage in a screen session
-  df / > "/run/disk-usage.log"
-  touch "/run/keep-running-df"
-  screen -d -m -US keep-running-df \
-    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
 }
 
 prepare_to_install
diff --git a/spark-rapids/mig.sh b/spark-rapids/mig.sh
index 473513438..85300348d 100644
--- a/spark-rapids/mig.sh
+++ b/spark-rapids/mig.sh
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 # This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures.
-#
 # This script should be specified in --metadata=startup-script-url= option and
 # --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it.
 # The script does a reboot to fully enable MIG and then configures the MIG device based on the
@@ -22,2030 +21,370 @@
 # It is assumed this script is used in conjuntion with install_gpu_driver.sh, which does the
 # YARN setup to fully utilize the MIG instances on YARN.
 #
-# This initialization action is generated from
-# initialization-actions/templates/spark-rapids/mig.sh.in
-#
-# Modifications made directly to the generated file will be lost when
-# the template is re-evaluated
-
+# Much of this code is copied from install_gpu_driver.sh to do the driver and CUDA installation.
+# It's copied in order to not affect the existing scripts when not using MIG.
 
 set -euxo pipefail
 
-function os_id()       ( set +x ;  grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; )
-function os_version()  ( set +x ;  grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; )
-function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; )
-
-function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
-function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
-function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
-function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
-
-function define_os_comparison_functions() {
-
-  readonly -A supported_os=(
-    ['debian']="10 11 12"
-    ['rocky']="8 9"
-    ['ubuntu']="18.04 20.04 22.04"
-  )
-
-  # dynamically define OS version test utility functions
-  if [[ "$(os_id)" == "rocky" ]];
-  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
-  else _os_version="$(os_version)"; fi
-  for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
-    eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
-
-    for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
-      eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
-      eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
-      eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
-    done
-  done
-}
-
-define_os_comparison_functions
-
-function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
-
-function os_vercat()   ( set +x
-  if   is_ubuntu ; then os_version | sed -e 's/[^0-9]//g'
-  elif is_rocky  ; then os_version | sed -e 's/[^0-9].*$//g'
-                   else os_version ; fi ; )
-
-function repair_old_backports {
-  if ! is_debuntu ; then return ; fi
-  # This script uses 'apt-get update' and is therefore potentially dependent on
-  # backports repositories which have been archived.  In order to mitigate this
-  # problem, we will use archive.debian.org for the oldoldstable repo
-
-  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
-  debdists="https://deb.debian.org/debian/dists"
-  oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
-  oldstable=$(   curl -s "${debdists}/oldstable/Release"    | awk '/^Codename/ {print $2}');
-  stable=$(      curl -s "${debdists}/stable/Release"       | awk '/^Codename/ {print $2}');
-
-  matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )
-
-  for filename in "${matched_files[@]}"; do
-    # Fetch from archive.debian.org for ${oldoldstable}-backports
-    perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
-                  {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}"
-  done
-}
-
-function print_metadata_value() {
-  local readonly tmpfile=$(mktemp)
-  http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
-    -s -o ${tmpfile} 2>/dev/null)
-  local readonly return_code=$?
-  # If the command completed successfully, print the metadata value to stdout.
-  if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then
-    cat ${tmpfile}
-  fi
-  rm -f ${tmpfile}
-  return ${return_code}
-}
-
-function print_metadata_value_if_exists() {
-  local return_code=1
-  local readonly url=$1
-  print_metadata_value ${url}
-  return_code=$?
-  return ${return_code}
-}
-
-# replicates /usr/share/google/get_metadata_value
-function get_metadata_value() (
-  set +x
-  local readonly varname=$1
-  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
-  # Print the instance metadata value.
-  print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname}
-  return_code=$?
-  # If the instance doesn't have the value, try the project.
-  if [[ ${return_code} != 0 ]]; then
-    print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname}
-    return_code=$?
-  fi
-
-  return ${return_code}
-)
-
-function get_metadata_attribute() (
-  set +x
-  local -r attribute_name="$1"
-  local -r default_value="${2:-}"
-  get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
-)
+function get_metadata_attribute() {
+  local -r attribute_name=$1
+  local -r default_value=$2
+  /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
+}
+
+# Fetch Linux Family distro and Dataproc Image version
+readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
+readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
+DATAPROC_IMAGE_VERSION=$(/usr/share/google/get_metadata_value image|grep -Eo 'dataproc-[0-9]-[0-9]'|grep -Eo '[0-9]-[0-9]'|sed -e 's/-/./g')
+echo "${DATAPROC_IMAGE_VERSION}" >> /usr/local/share/startup-mig-log
+
+# CUDA version and Driver version config
+CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.2.2')  #12.2.2
+NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '535.104.05') #535.104.05
+CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.2
+
+# Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
+if [[ "${OS_NAME}" == "ubuntu" ]]; then
+    UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04
+    UBUNTU_VERSION=${UBUNTU_VERSION%.*}
+    if [[ "${UBUNTU_VERSION}" == "18" ]]; then
+      CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1')  #12.1.1
+      NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02
+      CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.1
+    fi
+fi
 
-function execute_with_retries() (
-  set +x
-  local -r cmd="$*"
+SECURE_BOOT="disabled"
+SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
 
-  if [[ "$cmd" =~ "^apt-get install" ]] ; then
-    apt-get -y clean
-    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
-  fi
-  for ((i = 0; i < 3; i++)); do
-    set -x
-    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
-    set +x
-    if [[ $retval == 0 ]] ; then return 0 ; fi
+function execute_with_retries() {
+  local -r cmd=$1
+  for ((i = 0; i < 10; i++)); do
+    if eval "$cmd"; then
+      return 0
+    fi
     sleep 5
   done
   return 1
-)
-
-function cache_fetched_package() {
-  local src_url="$1"
-  local gcs_fn="$2"
-  local local_fn="$3"
-
-  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
-    time gcloud storage cp "${gcs_fn}" "${local_fn}"
-  else
-    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
-           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
-  fi
 }
 
-function add_contrib_component() {
-  if ! is_debuntu ; then return ; fi
-  if ge_debian12 ; then
-      # Include in sources file components on which nvidia-kernel-open-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib"
-
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
-  fi
-}
+# Enables a systemd service on bootup to install new headers.
+# This service recompiles kernel modules for Ubuntu and Debian, which are necessary for the functioning of nvidia-smi.
+function setup_systemd_update_headers() {
+  cat <<EOF >/lib/systemd/system/install-headers.service
+[Unit]
+Description=Install Linux headers for the current kernel
+After=network-online.target
 
-function set_hadoop_property() {
-  local -r config_file=$1
-  local -r property=$2
-  local -r value=$3
-  "${bdcfg}" set_property \
-    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
-    --name "${property}" --value "${value}" \
-    --clobber
-}
+[Service]
+ExecStart=/bin/bash -c 'count=0; while [ \$count -lt 3 ]; do /usr/bin/apt-get install -y -q linux-headers-\$(/bin/uname -r) && break; count=\$((count+1)); sleep 5; done'
+Type=oneshot
+RemainAfterExit=yes
 
-function configure_yarn_resources() {
-  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
-  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
-    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
-  fi
-  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
+[Install]
+WantedBy=multi-user.target
+EOF
 
-  set_hadoop_property 'capacity-scheduler.xml' \
-    'yarn.scheduler.capacity.resource-calculator' \
-    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+  # Reload systemd to recognize the new unit file
+  systemctl daemon-reload
 
-  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+  # Enable and start the service
+  systemctl enable --now install-headers.service
 }
 
-# This configuration should be applied only if GPU is attached to the node
-function configure_yarn_nodemanager() {
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.container-executor.class' \
-    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+# Install NVIDIA GPU driver provided by NVIDIA
+function install_nvidia_gpu_driver() {
 
-  # Fix local dirs access permissions
-  local yarn_local_dirs=()
+  ## common steps for all linux family distros
+  readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*}
 
-  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
-    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
-    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
+  ## installation steps based OS_NAME
+  if [[ ${OS_NAME} == "debian" ]]; then
 
-  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
-    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
-  fi
-}
+    DEBIAN_VERSION=$(lsb_release -r|awk '{print $2}') # 10 or 11
+    export DEBIAN_FRONTEND=noninteractive
 
-function clean_up_sources_lists() {
-  #
-  # bigtop (primary)
-  #
-  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
+    execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
 
-  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
-    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
+    readonly LOCAL_INSTALLER_DEB="cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb"
+    curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+      "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb
 
-    local regional_bigtop_repo_uri
-    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
-      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
-      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
-      cut -d ' ' -f 2 |
-      head -1)
+    dpkg -i /tmp/local-installer.deb
+    cp /var/cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/
+    add-apt-repository contrib
+    execute_with_retries "apt-get update"
 
-    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
-    else
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+    if [[ ${DEBIAN_VERSION} == 10 ]]; then
+      apt remove -y libglvnd0
     fi
 
-    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
-    rm -f "${bigtop_kr_path}"
-    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
-
-    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-  fi
-
-  #
-  # adoptium
-  #
-  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
-  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
-  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
-  rm -f "${adoptium_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
-   | gpg --dearmor -o "${adoptium_kr_path}"
-  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
-   > /etc/apt/sources.list.d/adoptium.list
-
-
-  #
-  # docker
-  #
-  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
-  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
-  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
-
-  rm -f "${docker_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
-    | gpg --dearmor -o "${docker_kr_path}"
-  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
-    > ${docker_repo_file}
-
-  #
-  # google cloud + logging/monitoring
-  #
-  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
-    rm -f /usr/share/keyrings/cloud.google.gpg
-    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
-    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
-      list_file="/etc/apt/sources.list.d/${list}.list"
-      if [[ -f "${list_file}" ]]; then
-        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
-      fi
-    done
-  fi
-
-  #
-  # cran-r
-  #
-  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
-    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
-    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
-    rm -f /usr/share/keyrings/cran-r.gpg
-    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
-      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
-    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
-  fi
-
-  #
-  # mysql
-  #
-  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
-    rm -f /usr/share/keyrings/mysql.gpg
-    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
-      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
-    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
-  fi
-
-  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
+    execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
+    execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"
 
-}
-
-function set_proxy(){
-  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
+    # enable a systemd service that updates kernel headers after reboot
+    setup_systemd_update_headers
+   
+  elif [[ ${OS_NAME} == "ubuntu" ]]; then
 
-  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
+    UBUNTU_VERSION=$(lsb_release -r|awk '{print $2}') # 20.04 or 22.04
+    UBUNTU_VERSION=${UBUNTU_VERSION%.*} # 20 or 22
 
-  export METADATA_HTTP_PROXY
-  export http_proxy="${METADATA_HTTP_PROXY}"
-  export https_proxy="${METADATA_HTTP_PROXY}"
-  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
-  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
-  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
-  local no_proxy_svc
-  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
-                      bigquery composer      pubsub bigquerydatatransfer dataflow \
-                      storage  datafusion    ; do
-    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
-  done
+    execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
 
-  export NO_PROXY="${no_proxy}"
-}
+    readonly UBUNTU_REPO_CUDA_PIN="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-ubuntu${UBUNTU_VERSION}04.pin"
+    curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+      "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600
 
-function mount_ramdisk(){
-  local free_mem
-  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+    readonly LOCAL_INSTALLER_DEB="cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb"
+    curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+      "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb
 
-  # Write to a ramdisk instead of churning the persistent disk
+    dpkg -i /tmp/local-installer.deb
+    cp /var/cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/
+    execute_with_retries "apt-get update"    
+    
+    execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
+    execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"
 
-  tmpdir="/mnt/shm"
-  mkdir -p "${tmpdir}"
-  mount -t tmpfs tmpfs "${tmpdir}"
+    # enable a systemd service that updates kernel headers after reboot
+    setup_systemd_update_headers
 
-  # Download conda packages to tmpfs
-  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+  elif [[ ${OS_NAME} == "rocky" ]]; then
 
-  # Clear pip cache
-  # TODO: make this conditional on which OSs have pip without cache purge
-  pip cache purge || echo "unable to purge pip cache"
+    ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8.8 or 9.1
+    ROCKY_VERSION=${ROCKY_VERSION%.*} # 8 or 9
 
-  # Download pip packages to tmpfs
-  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+    readonly NVIDIA_ROCKY_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/rhel${ROCKY_VERSION}/x86_64/cuda-rhel${ROCKY_VERSION}.repo"
+    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
+    execute_with_retries "dnf clean all"
+    execute_with_retries "dnf -y -q module install nvidia-driver:${NVIDIA_DRIVER_VERSION_PREFIX}"
+    execute_with_retries "dnf -y -q install cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"
+    modprobe nvidia
 
-  # Download OS packages to tmpfs
-  if is_debuntu ; then
-    mount -t tmpfs tmpfs /var/cache/apt/archives
   else
-    mount -t tmpfs tmpfs /var/cache/dnf
-  fi
-}
-
-function check_os() {
-  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
-      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
-      exit 1
-  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
-      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
-      exit 1
-  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
-      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
-      exit 1
-  fi
-
-  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
-  readonly SPARK_VERSION
-  if version_lt "${SPARK_VERSION}" "3.1" || \
-     version_ge "${SPARK_VERSION}" "4.0" ; then
-    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    echo "Unsupported OS: '${OS_NAME}'"
     exit 1
   fi
-
-  # Detect dataproc image version
-  if (! test -v DATAPROC_IMAGE_VERSION) ; then
-    if test -v DATAPROC_VERSION ; then
-      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
-    else
-      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
-      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
-      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
-      else echo "Unknown dataproc image version" ; exit 1 ; fi
-    fi
-  fi
-}
-
-#
-# Generate repo file under /etc/apt/sources.list.d/
-#
-function apt_add_repo() {
-  local -r repo_name="$1"
-  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
-  local -r include_src="${4:-yes}"
-  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
-  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
-
-  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
-  if [[ "${include_src}" == "yes" ]] ; then
-    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
-  fi
-
-  apt-get update -qq
-}
-
-#
-# Generate repo file under /etc/yum.repos.d/
-#
-function dnf_add_repo() {
-  local -r repo_name="$1"
-  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
-  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
-  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
-
-  curl -s -L "${repo_url}" \
-    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
-    | dd of="${repo_path}" status=progress
-}
-
-#
-# Install package signing key and add corresponding repository
-# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
-#
-# Keyrings default to
-# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
-# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
-#
-function os_add_repo() {
-  local -r repo_name="$1"
-  local -r signing_key_url="$2"
-  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
-  local kr_path
-  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
-                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
-
-  mkdir -p "$(dirname "${kr_path}")"
-
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
-    | gpg --import --no-default-keyring --keyring "${kr_path}"
-
-  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
-                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
-}
-
-
-readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
-
-# Dataproc configurations
-readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-readonly HIVE_CONF_DIR='/etc/hive/conf'
-readonly SPARK_CONF_DIR='/etc/spark/conf'
-
-
-function set_support_matrix() {
-  # CUDA version and Driver version
-  # https://docs.nvidia.com/deploy/cuda-compatibility/
-  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
-  # https://developer.nvidia.com/cuda-downloads
-
-  # Minimum supported version for open kernel driver is 515.43.04
-  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
-  # Rocky8: 12.0: 525.147.05
-  local latest
-  latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
-  readonly -A DRIVER_FOR_CUDA=(
-          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
-          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
-  )
-  readonly -A DRIVER_SUBVER=(
-          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
-          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
-  )
-  # https://developer.nvidia.com/cudnn-downloads
-  if is_debuntu ; then
-  readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
-          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
-  )
-  elif is_rocky ; then
-  # rocky:
-  #   12.0: 8.8.1.3
-  #   12.1: 8.9.3.28
-  #   12.2: 8.9.7.29
-  #   12.3: 9.0.0.312
-  #   12.4: 9.1.1.17
-  #   12.5: 9.2.1.18
-  #   12.6: 9.5.1.17
-  readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
-          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
-  )
-  fi
-  # https://developer.nvidia.com/nccl/nccl-download
-  # 12.2: 2.19.3, 12.5: 2.21.5
-  readonly -A NCCL_FOR_CUDA=(
-          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
-          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
-  )
-  readonly -A CUDA_SUBVER=(
-          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
-          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
-  )
-}
-
-set_support_matrix
-
-function set_cuda_version() {
-  local cuda_url
-  cuda_url=$(get_metadata_attribute 'cuda-url' '')
-  if [[ -n "${cuda_url}" ]] ; then
-    # if cuda-url metadata variable has been passed, extract default version from url
-    local CUDA_URL_VERSION
-    CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
-    if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
-      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
-      CUDA_FULL_VERSION="${CUDA_URL_VERSION}"
-    fi
-  fi
-
-  if ( ! test -v DEFAULT_CUDA_VERSION ) ; then
-    DEFAULT_CUDA_VERSION='12.4'
-  fi
-  readonly DEFAULT_CUDA_VERSION
-
-  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
-  if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then
-    CUDA_FULL_VERSION="${CUDA_VERSION}"
-    CUDA_VERSION="${CUDA_VERSION%.*}"
-  fi
-  readonly CUDA_VERSION
-  if ( ! test -v CUDA_FULL_VERSION ) ; then
-    CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
-  fi
-  readonly CUDA_FULL_VERSION
-
+  ldconfig
+  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
 }
 
-set_cuda_version
-
-function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
-function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
-function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )
-
-function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
-function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
-function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
-
-function set_driver_version() {
-  local gpu_driver_url
-  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')
-
-  local cuda_url
-  cuda_url=$(get_metadata_attribute 'cuda-url' '')
-
-  local DEFAULT_DRIVER
-  # Take default from gpu-driver-url metadata value
-  if [[ -n "${gpu_driver_url}" ]] ; then
-    DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')"
-    if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
-  # Take default from cuda-url metadata value as a backup
-  elif [[ -n "${cuda_url}" ]] ; then
-    local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
-    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
-      major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
-      driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
-      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
-        # use the version indicated by the cuda url as the default if it exists
-	DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
-      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
-        # use the maximum sub-version available for the major version indicated in cuda url as the default
-	DEFAULT_DRIVER="${driver_max_maj_version}"
-      fi
-    fi
-  fi
-
-  if ( ! test -v DEFAULT_DRIVER ) ; then
-    # If a default driver version has not been extracted, use the default for this version of CUDA
-    DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
-  fi
-
-  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
-
-  readonly DRIVER_VERSION
-  readonly DRIVER="${DRIVER_VERSION%%.*}"
-
-  export DRIVER_VERSION DRIVER
-
-  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
-    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
-    exit 1
-  fi
+function enable_mig() {
+  nvidia-smi -mig 1
 }
 
-set_driver_version
-
-readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
-readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
-
-# Parameters for NVIDIA-provided cuDNN library
-readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
-CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
-function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
-# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-  CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
-elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
-  # cuDNN v8 is not distribution for ubuntu20+, debian12
-  CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
-elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
-  # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
-  CUDNN_VERSION="8.8.0.121"
-fi
-readonly CUDNN_VERSION
-
-readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
-readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
-
-# Parameters for NVIDIA-provided Debian GPU driver
-readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-
-readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
-
-USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
-readonly USERSPACE_FILENAME
-
-# Short name for urls
-if is_ubuntu22  ; then
-    # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
-    # https://developer.download.nvidia.com/compute/machine-learning/repos/
-    # use packages from previous release until such time as nvidia
-    # release ubuntu2204 builds
-
-    shortname="$(os_id)$(os_vercat)"
-    nccl_shortname="ubuntu2004"
-elif ge_rocky9 ; then
-    # use packages from previous release until such time as nvidia
-    # release rhel9 builds
-
-    shortname="rhel9"
-    nccl_shortname="rhel8"
-elif is_rocky ; then
-    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
-    nccl_shortname="${shortname}"
-else
-    shortname="$(os_id)$(os_vercat)"
-    nccl_shortname="${shortname}"
-fi
-
-# Parameters for NVIDIA-provided package repositories
-readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
-readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
-
-# Parameters for NVIDIA-provided NCCL library
-readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
-NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
-readonly NCCL_REPO_URL
-readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
-
-function set_cuda_runfile_url() {
-  local MAX_DRIVER_VERSION
-  local MAX_CUDA_VERSION
-
-  local MIN_OPEN_DRIVER_VER="515.48.07"
-  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
-  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
-
-  if is_cuda12 ; then
-    if is_debian12 ; then
-      MIN_DRIVER_VERSION="545.23.06"
-      MIN_CUDA_VERSION="12.3.0"
-    elif is_debian10 ; then
-      MAX_DRIVER_VERSION="555.42.02"
-      MAX_CUDA_VERSION="12.5.0"
-    elif is_ubuntu18 ; then
-      MAX_DRIVER_VERSION="530.30.02"
-      MAX_CUDA_VERSION="12.1.1"
-    fi
-  elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
-    if le_debian10 ; then
-      # cuda 11 is not supported for <= debian10
-      MAX_CUDA_VERSION="0"
-      MAX_DRIVER_VERSION="0"
-    fi
+function configure_mig_cgi() {
+  if (/usr/share/google/get_metadata_value attributes/MIG_CGI); then
+    META_MIG_CGI_VALUE=$(/usr/share/google/get_metadata_value attributes/MIG_CGI)
+    nvidia-smi mig -cgi $META_MIG_CGI_VALUE -C
   else
-    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
-  fi
-
-  if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
-    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
-  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
-    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
-  fi
-  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
-    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
-  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
-    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
-  fi
-
-  # driver version named in cuda runfile filename
-  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
-  readonly -A drv_for_cuda=(
-          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
-          ["11.8.0"]="520.61.05"
-          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
-          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
-          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
-          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
-          ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
-          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not
-          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
-  )
-
-  # Verify that the file with the indicated combination exists
-  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
-  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
-  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
-  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
-
-  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
-  readonly NVIDIA_CUDA_URL
-
-  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
-  readonly CUDA_RUNFILE
-
-  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
-    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
-    exit 1
-  fi
-
-  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
-    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
-  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
-    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
-  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
-    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
-  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
-    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
-  fi
-}
-
-set_cuda_runfile_url
-
-# Parameter for NVIDIA-provided Rocky Linux GPU driver
-readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
-
-CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
-CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
-if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
-  # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
-  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
-    # When cuDNN version is greater than or equal to 8.4.1.50 use this format
-    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
-  fi
-  # Use legacy url format with one of the tarball name formats depending on version as above
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
-fi
-if ( version_ge "${CUDA_VERSION}" "12.0" ); then
-  # Use modern url format When cuda version is greater than or equal to 12.0
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
-fi
-readonly CUDNN_TARBALL
-readonly CUDNN_TARBALL_URL
-
-# Whether to install NVIDIA-provided or OS-provided GPU driver
-GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
-readonly GPU_DRIVER_PROVIDER
-
-# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
-INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
-readonly INSTALL_GPU_AGENT
-
-NVIDIA_SMI_PATH='/usr/bin'
-MIG_MAJOR_CAPS=0
-IS_MIG_ENABLED=0
-
-CUDA_KEYRING_PKG_INSTALLED="0"
-function install_cuda_keyring_pkg() {
-  if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
-  local kr_ver=1.1
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
-    -o "${tmpdir}/cuda-keyring.deb"
-  dpkg -i "${tmpdir}/cuda-keyring.deb"
-  rm -f "${tmpdir}/cuda-keyring.deb"
-  CUDA_KEYRING_PKG_INSTALLED="1"
-}
-
-function uninstall_cuda_keyring_pkg() {
-  apt-get purge -yq cuda-keyring
-  CUDA_KEYRING_PKG_INSTALLED="0"
-}
-
-function install_local_cuda_repo() {
-  if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi
-
-  if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  CUDA_LOCAL_REPO_INSTALLED="1"
-  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
-  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
-  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
-  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
-  readonly DIST_KEYRING_DIR="/var/${pkgname}"
-
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-
-  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-  cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
-
-  if is_ubuntu ; then
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
-      -o /etc/apt/preferences.d/cuda-repository-pin-600
-  fi
-
-  touch "${workdir}/install-local-cuda-repo-complete"
-}
-function uninstall_local_cuda_repo(){
-  apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  rm -f "${workdir}/install-local-cuda-repo-complete"
-}
-
-CUDNN_PKG_NAME=""
-function install_local_cudnn_repo() {
-  if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi
-  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
-  CUDNN_PKG_NAME="${pkgname}"
-  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
-
-  # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
-
-  dpkg -i "${tmpdir}/local-installer.deb"
-
-  rm -f "${tmpdir}/local-installer.deb"
-
-  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
-
-  touch "${workdir}/install-local-cudnn-repo-complete"
-}
-
-function uninstall_local_cudnn_repo() {
-  apt-get purge -yq "${CUDNN_PKG_NAME}"
-  rm -f "${workdir}/install-local-cudnn-repo-complete"
-}
-
-CUDNN8_LOCAL_REPO_INSTALLED="0"
-CUDNN8_PKG_NAME=""
-function install_local_cudnn8_repo() {
-  if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi
-
-  if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
-  elif is_debian ; then cudnn8_shortname="debian11"
-  else return 0 ; fi
-  if   is_cuda12 ; then CUDNN8_CUDA_VER=12.0
-  elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8
-  else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi
-  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"
-
-  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
-  CUDNN8_PKG_NAME="${pkgname}"
-
-  deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_fn="${tmpdir}/${deb_fn}"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
-
-  # cache the cudnn package
-  cache_fetched_package "${local_deb_url}" \
-                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
-                        "${local_deb_fn}"
-
-  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
-  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
-  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
-    mkdir -p "${cudnn_path}"
-    mount -t tmpfs tmpfs "${cudnn_path}"
-  fi
-
-  dpkg -i "${local_deb_fn}"
-
-  rm -f "${local_deb_fn}"
-
-  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  touch "${workdir}/install-local-cudnn8-repo-complete"
-}
-
-function uninstall_local_cudnn8_repo() {
-  apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  rm -f "${workdir}/install-local-cudnn8-repo-complete"
-}
-
-function install_nvidia_nccl() {
-  if test -f "${workdir}/nccl-complete" ; then return ; fi
-
-  if is_cuda11 && is_debian12 ; then
-    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
-    return
-  fi
-
-  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
-
-  # https://github.com/NVIDIA/nccl/blob/master/README.md
-  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-  # Fermi:     SM_20,             compute_30
-  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
-  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
-  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
-
-  # The following architectures are suppored by open kernel driver
-  # Volta:     SM_70,SM_72,       compute_70,compute_72
-  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
-
-  # The following architectures are supported by CUDA v11.8+
-  # Ada:       SM_89,             compute_89
-  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
-  # Blackwell: SM_100,            compute_100
-                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
-  if version_ge "${CUDA_VERSION}" "11.8" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
-  fi
-  if version_ge "${CUDA_VERSION}" "12.0" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
-  fi
-
-  mkdir -p "${workdir}"
-  pushd "${workdir}"
-
-  test -d "${workdir}/nccl" || {
-    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
-      | tar xz
-    mv "nccl-${NCCL_VERSION}-1" nccl
-  }
-
-  local build_path
-  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
-                       build_path="nccl/build/pkg/rpm/x86_64" ; fi
-
-  test -d "${workdir}/nccl/build" || {
-    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
-    local local_tarball="${workdir}/${build_tarball}"
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
-
-    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
-    if echo "${output}" | grep -q "${gcs_tarball}" ; then
-      # cache hit - unpack from cache
-      echo "cache hit"
-    else
-      # build and cache
-      pushd nccl
-      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
-      install_build_dependencies
-      if is_debuntu ; then
-        # These packages are required to build .deb packages from source
-        execute_with_retries \
-          apt-get install -y -qq build-essential devscripts debhelper fakeroot
-        export NVCC_GENCODE
-        execute_with_retries make -j$(nproc) pkg.debian.build
-      elif is_rocky ; then
-        # These packages are required to build .rpm packages from source
-        execute_with_retries \
-          dnf -y -q install rpm-build rpmdevtools
-        export NVCC_GENCODE
-        execute_with_retries make -j$(nproc) pkg.redhat.build
-      fi
-      tar czvf "/${local_tarball}" "../${build_path}"
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      rm "${local_tarball}"
-      make clean
-      popd
-    fi
-    gcloud storage cat "${gcs_tarball}" | tar xz
-  }
-
-  if is_debuntu ; then
-    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
-  elif is_rocky ; then
-    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
+    # Dataproc only supports A100's right now split in 2 if not specified
+    nvidia-smi mig -cgi 9,9  -C
   fi
-
-  popd
-  touch "${workdir}/nccl-complete"
 }
 
-function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
-function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
-
-function install_nvidia_cudnn() {
-  if test -f "${workdir}/cudnn-complete" ; then return ; fi
-  local major_version
-  major_version="${CUDNN_VERSION%%.*}"
-  local cudnn_pkg_version
-  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
-
-  if is_rocky ; then
-    if is_cudnn8 ; then
-      execute_with_retries dnf -y -q install \
-        "libcudnn${major_version}" \
-        "libcudnn${major_version}-devel"
-      sync
-    elif is_cudnn9 ; then
-      execute_with_retries dnf -y -q install \
-        "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \
-        "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}"
-      sync
-    else
-      echo "Unsupported cudnn version: '${major_version}'"
-    fi
-  elif is_debuntu; then
-    if ge_debian12 && is_src_os ; then
-      apt-get -y install nvidia-cudnn
-    else
-      if is_cudnn8 ; then
-        install_local_cudnn8_repo
-
-        apt-get update -qq
-
-        execute_with_retries \
-          apt-get -y install --no-install-recommends \
-            "libcudnn8=${cudnn_pkg_version}" \
-            "libcudnn8-dev=${cudnn_pkg_version}"
-
-        uninstall_local_cudnn8_repo
-	sync
-      elif is_cudnn9 ; then
-	install_cuda_keyring_pkg
-
-        apt-get update -qq
-
-        execute_with_retries \
-          apt-get -y install --no-install-recommends \
-          "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
-          "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
-          "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
-	sync
-      else
-        echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
-      fi
-    fi
+function upgrade_kernel() {
+  # Determine which kernel is installed
+  if [[ "${OS_NAME}" == "debian" ]]; then
+    CURRENT_KERNEL_VERSION=`cat /proc/version  | perl -ne 'print( / Debian (\S+) / )'`
+  elif [[ "${OS_NAME}" == "ubuntu" ]]; then
+    CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( /^Linux version (\S+) / )'`
+  elif [[ ${OS_NAME} == rocky ]]; then
+    KERN_VER=$(yum info --installed kernel | awk '/^Version/ {print $3}')
+    KERN_REL=$(yum info --installed kernel | awk '/^Release/ {print $3}')
+    CURRENT_KERNEL_VERSION="${KERN_VER}-${KERN_REL}"
   else
-    echo "Unsupported OS: '${_shortname}'"
-    exit 1
-  fi
-
-  ldconfig
-
-  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
-  touch "${workdir}/cudnn-complete"
-}
-
-function add_nonfree_components() {
-  if is_src_nvidia ; then return; fi
-  if ge_debian12 ; then
-      # Include in sources file components on which nvidia-open-kernel-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib non-free non-free-firmware"
-
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list
-  fi
-}
-
-function add_repo_nvidia_container_toolkit() {
-  local nvctk_root="https://nvidia.github.io/libnvidia-container"
-  local signing_key_url="${nvctk_root}/gpgkey"
-  local repo_data
-
-  if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
-                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
-
-  os_add_repo nvidia-container-toolkit \
-              "${signing_key_url}" \
-              "${repo_data}" \
-              "no"
-}
-
-function add_repo_cuda() {
-  if is_debuntu ; then
-    install_cuda_keyring_pkg # 11.7+, 12.0+
-  elif is_rocky ; then
-    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
-  fi
-}
-
-function build_driver_from_github() {
-  # non-GPL driver will have been built on rocky8
-  if is_rocky8 ; then return 0 ; fi
-  pushd "${workdir}"
-
-  test -d "${workdir}/open-gpu-kernel-modules" || {
-    local tarball_fn="${DRIVER_VERSION}.tar.gz"
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
-      | tar xz
-    mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
-  }
-
-  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
-  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
-    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
-    local local_tarball="${workdir}/${build_tarball}"
-    local build_dir
-    if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
-      then build_dir="${modulus_md5sum}"
-      else build_dir="unsigned" ; fi
-
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
-
-    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
-      echo "cache hit"
-    else
-      # build the kernel modules
-      pushd open-gpu-kernel-modules
-      install_build_dependencies
-      if is_cuda11 && is_ubuntu22 ; then
-        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
-        exit 1
-      fi
-      execute_with_retries make -j$(nproc) modules \
-        >  kernel-open/build.log \
-        2> kernel-open/build_error.log
-      # Sign kernel modules
-      if [[ -n "${PSN}" ]]; then
-        for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do
-          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
-          "${mok_key}" \
-          "${mok_der}" \
-          "${module}"
-        done
-      fi
-      make modules_install \
-        >>  kernel-open/build.log \
-        2>> kernel-open/build_error.log
-      # Collect build logs and installed binaries
-      tar czvf "${local_tarball}" \
-        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
-        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      rm "${local_tarball}"
-      make clean
-      popd
-    fi
-    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
-    depmod -a
-  }
-
-  popd
-}
-
-function build_driver_from_packages() {
-  if is_debuntu ; then
-    if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then
-      local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else
-      local pkglist=("nvidia-driver-${DRIVER}-open") ; fi
-    if is_debian ; then
-      pkglist=(
-        "firmware-nvidia-gsp=${DRIVER_VERSION}-1"
-        "nvidia-smi=${DRIVER_VERSION}-1"
-        "nvidia-alternative=${DRIVER_VERSION}-1"
-        "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1"
-        "nvidia-kernel-support=${DRIVER_VERSION}-1"
-        "nvidia-modprobe=${DRIVER_VERSION}-1"
-        "libnvidia-ml1=${DRIVER_VERSION}-1"
-      )
-    fi
-    add_contrib_component
-    apt-get update -qq
-    execute_with_retries apt-get install -y -qq --no-install-recommends dkms
-    #configure_dkms_certs
-    execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
-    sync
-
-  elif is_rocky ; then
-    #configure_dkms_certs
-    if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
-      echo "nvidia-driver:${DRIVER}-dkms installed successfully"
+    echo "unsupported OS: ${OS_NAME}!"
+    exit -1
+  fi
+
+  # Get latest version available in repos
+  if [[ "${OS_NAME}" == "debian" ]]; then
+    apt-get -qq update
+    TARGET_VERSION=$(apt-cache show --no-all-versions linux-image-amd64 | awk '/^Version/ {print $2}')
+  elif [[ "${OS_NAME}" == "ubuntu" ]]; then
+    apt-get -qq update
+    LATEST_VERSION=$(apt-cache show --no-all-versions linux-image-gcp | awk '/^Version/ {print $2}')
+    TARGET_VERSION=`echo ${LATEST_VERSION} | perl -ne 'printf(q{%s-%s-gcp},/(\d+\.\d+\.\d+)\.(\d+)/)'`
+  elif [[ "${OS_NAME}" == "rocky" ]]; then
+    if yum info --available kernel ; then
+      KERN_VER=$(yum info --available kernel | awk '/^Version/ {print $3}')
+      KERN_REL=$(yum info --available kernel | awk '/^Release/ {print $3}')
+      TARGET_VERSION="${KERN_VER}-${KERN_REL}"
     else
-      execute_with_retries dnf -y -q module install 'nvidia-driver:latest'
+      TARGET_VERSION="${CURRENT_KERNEL_VERSION}"
     fi
-    sync
-  fi
-  #clear_dkms_key
-}
-
-function install_nvidia_userspace_runfile() {
-
-  # This .run file contains NV's OpenGL implementation as well as
-  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
-  # including glib (https://docs.gtk.org/glib/), and what appears to
-  # be a copy of the source from the kernel-open directory of for
-  # example DRIVER_VERSION=560.35.03
-  #
-  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
-  #
-  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
-  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
-  if test -f "${workdir}/userspace-complete" ; then return ; fi
-  local local_fn="${tmpdir}/userspace.run"
-
-  cache_fetched_package "${USERSPACE_URL}" \
-                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
-                        "${local_fn}"
-
-  local runfile_args
-  runfile_args=""
-  local cache_hit="0"
-  local local_tarball
-
-  if is_rocky8 ; then
-    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
-    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
-      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
-      local_tarball="${workdir}/${build_tarball}"
-      local build_dir
-      if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
-        then build_dir="${modulus_md5sum}"
-        else build_dir="unsigned" ; fi
-
-      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
-
-      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
-        cache_hit="1"
-        runfile_args="--no-kernel-modules"
-        echo "cache hit"
-      else
-        install_build_dependencies
-
-        local signing_options
-        signing_options=""
-        if [[ -n "${PSN}" ]]; then
-          signing_options="--module-signing-hash sha256 \
-          --module-signing-x509-hash sha256 \
-          --module-signing-secret-key \"${mok_key}\" \
-          --module-signing-public-key \"${mok_der}\" \
-          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
-          "
-        fi
-
-        runfile_args="--no-dkms ${signing_options}"
-      fi
-    }
-  else
-    runfile_args="--no-kernel-modules"
   fi
 
-  execute_with_retries bash "${local_fn}" -e -q \
-    ${runfile_args} \
-    --ui=none \
-    --install-libglvnd \
-    --tmpdir="${tmpdir}"
+  # Skip this script if we are already on the target version
+  if [[ "${CURRENT_KERNEL_VERSION}" == "${TARGET_VERSION}" ]]; then
+    echo "target kernel version [${TARGET_VERSION}] is installed"
 
-  if is_rocky8 ; then
-    if [[ "${cache_hit}" == "1" ]] ; then
-      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
-      depmod -a
-    else
-      tar czvf "${local_tarball}" \
-        /var/log/nvidia-installer.log \
-        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    # Reboot may have interrupted dpkg.  Bring package system to a good state
+    if [[ "${OS_NAME}" == "debian" || "${OS_NAME}" == "ubuntu" ]]; then
+      dpkg --configure -a
     fi
-  fi
-
-  rm -f "${local_fn}"
-  touch "${workdir}/userspace-complete"
-  sync
-}
-
-function install_cuda_runfile() {
-  if test -f "${workdir}/cuda-complete" ; then return ; fi
-  local local_fn="${tmpdir}/cuda.run"
-
-  cache_fetched_package "${NVIDIA_CUDA_URL}" \
-			"${pkg_bucket}/${CUDA_RUNFILE}" \
-                        "${local_fn}"
-
-  execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
-  rm -f "${local_fn}"
-  touch "${workdir}/cuda-complete"
-  sync
-}
-
-function install_cuda_toolkit() {
-  local cudatk_package=cuda-toolkit
-  if ge_debian12 && is_src_os ; then
-    cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1"
-  elif [[ -n "${CUDA_VERSION}" ]]; then
-    cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}"
-  fi
-  cuda_package="cuda=${CUDA_FULL_VERSION}-1"
-  readonly cudatk_package
-  if is_debuntu ; then
-#    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
-    execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
-  elif is_rocky ; then
-    # rocky9: cuda-11-[7,8], cuda-12-[1..6]
-    execute_with_retries dnf -y -q install "${cudatk_package}"
-  fi
-  sync
-}
-
-function load_kernel_module() {
-  # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
-  for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
-    rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
-  done
-
-  depmod -a
-  modprobe nvidia
-  for suffix in uvm modeset drm; do
-    modprobe "nvidia-${suffix}"
-  done
-  # TODO: if peermem is available, also modprobe nvidia-peermem
-}
-
-function install_cuda(){
-  if test -f "${workdir}/cuda-repo-complete" ; then return ; fi
-
-  if ( ge_debian12 && is_src_os ) ; then
-    echo "installed with the driver on ${_shortname}"
-    return 0
-  fi
-
-  # The OS package distributions are unreliable
-  install_cuda_runfile
-
-  # Includes CUDA packages
-  add_repo_cuda
-
-  touch "${workdir}/cuda-repo-complete"
-}
-
-function install_nvidia_container_toolkit() {
-  local container_runtime_default
-    if command -v docker     ; then container_runtime_default='docker'
-  elif command -v containerd ; then container_runtime_default='containerd'
-  elif command -v crio       ; then container_runtime_default='crio'
-                               else container_runtime_default='' ; fi
-  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
-
-  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi
-
-  add_repo_nvidia_container_toolkit
-  if is_debuntu ; then
-    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
-    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
-  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
-  systemctl restart "${CONTAINER_RUNTIME}"
-}
-
-# Install NVIDIA GPU driver provided by NVIDIA
-function install_nvidia_gpu_driver() {
-  if test -f "${workdir}/gpu-driver-complete" ; then return ; fi
 
-  if ( ge_debian12 && is_src_os ) ; then
-    add_nonfree_components
-    apt-get update -qq
-    apt-get -yq install \
-        dkms \
-        nvidia-open-kernel-dkms \
-        nvidia-open-kernel-support \
-        nvidia-smi \
-        libglvnd0 \
-        libcuda1
-    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
     return 0
   fi
 
-  # OS driver packages do not produce reliable driver ; use runfile
-  install_nvidia_userspace_runfile
-
-  build_driver_from_github
-
-  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  touch "${workdir}/gpu-driver-complete"
-}
-
-function install_ops_agent(){
-  if test -f "${workdir}/ops-agent-complete" ; then return ; fi
-
-  mkdir -p /opt/google
-  cd /opt/google
-  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
-  curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
-  execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
-
-  touch "${workdir}/ops-agent-complete"
-}
-
-# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
-function install_gpu_agent() {
-  # Stackdriver GPU agent parameters
-#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
-  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
-  if ( ! command -v pip && is_debuntu ) ; then
-    execute_with_retries "apt-get install -y -qq python3-pip"
-  fi
-  local install_dir=/opt/gpu-utilization-agent
-  mkdir -p "${install_dir}"
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
-    | sed -e 's/-u --format=/--format=/' \
-    | dd status=none of="${install_dir}/report_gpu_metrics.py"
-  local venv="${install_dir}/venv"
-  python3 -m venv "${venv}"
-(
-  source "${venv}/bin/activate"
-  python3 -m pip install --upgrade pip
-  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
-)
-  sync
-
-  # Generate GPU service.
-  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
-[Unit]
-Description=GPU Utilization Metric Agent
-
-[Service]
-Type=simple
-PIDFile=/run/gpu_agent.pid
-ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
-User=root
-Group=root
-WorkingDirectory=/
-Restart=always
-
-[Install]
-WantedBy=multi-user.target
-EOF
-  # Reload systemd manager configuration
-  systemctl daemon-reload
-  # Enable gpu-utilization-agent service
-  systemctl --no-reload --now enable gpu-utilization-agent.service
-}
-
-function configure_gpu_exclusive_mode() {
-  # check if running spark 3, if not, enable GPU exclusive mode
-  local spark_version
-  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
-  if [[ ${spark_version} != 3.* ]]; then
-    # include exclusive mode on GPU
-    nvidia-smi -c EXCLUSIVE_PROCESS
-  fi
-}
-
-function fetch_mig_scripts() {
-  mkdir -p /usr/local/yarn-mig-scripts
-  sudo chmod 755 /usr/local/yarn-mig-scripts
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
-  sudo chmod 755 /usr/local/yarn-mig-scripts/*
-}
-
-function configure_gpu_script() {
-  # Download GPU discovery script
-  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
-  mkdir -p ${spark_gpu_script_dir}
-  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
-  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
-  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
-  local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh"
-  cat > "${gpus_resources_script}" <<'EOF'
-#!/usr/bin/env bash
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
-
-echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
-EOF
-
-  chmod a+rx "${gpus_resources_script}"
-
-  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
-  if version_ge "${SPARK_VERSION}" "3.0" ; then
-    local gpu_count
-    gpu_count="$(lspci | grep NVIDIA | wc -l)"
-    local executor_cores
-    executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
-    local executor_memory
-    executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
-    local task_cpus=2
-    local gpu_amount
-    gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
-
-    cat >>"${spark_defaults_conf}" <<EOF
-###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
-# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
-# query explain output won't show GPU operator, if the user has doubts
-# they can uncomment the line before seeing the GPU plan explain;
-# having AQE enabled gives user the best performance.
-spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
-spark.executor.resource.gpu.amount=${gpu_count}
-spark.executor.cores=${executor_cores}
-spark.executor.memory=${executor_memory_gb}G
-spark.dynamicAllocation.enabled=false
-# please update this config according to your application
-spark.task.resource.gpu.amount=${gpu_amount}
-spark.task.cpus=2
-spark.yarn.unmanagedAM.enabled=false
-###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
-EOF
-  fi
-}
-
-function configure_gpu_isolation() {
-  # enable GPU isolation
-  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
-  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
-    # configure the container-executor.cfg to have major caps
-    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
-    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
-    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
-  else
-    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+  # Install the latest kernel
+  if [[ ${OS_NAME} == debian ]]; then
+    apt-get install -y linux-image-amd64
+  elif [[ "${OS_NAME}" == "ubuntu" ]]; then
+    apt-get install -y linux-image-gcp
+  elif [[ "${OS_NAME}" == "rocky" ]]; then
+    dnf -y -q install kernel
   fi
 
-  # Configure a systemd unit to ensure that permissions are set on restart
-  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
-[Unit]
-Description=Set permissions to allow YARN to access device directories
+  # Make it possible to reboot before init actions are complete - #1033
+  DP_ROOT=/usr/local/share/google/dataproc
+  STARTUP_SCRIPT="${DP_ROOT}/startup-script.sh"
+  POST_HDFS_STARTUP_SCRIPT="${DP_ROOT}/post-hdfs-startup-script.sh"
 
-[Service]
-ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"
-
-[Install]
-WantedBy=multi-user.target
-EOF
-
-  systemctl enable dataproc-cgroup-device-permissions
-  systemctl start dataproc-cgroup-device-permissions
-}
-
-function nvsmi() {
-  local nvsmi="/usr/bin/nvidia-smi"
-  if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
-  elif [[ ! -f "${nvsmi}" ]]         ; then echo "nvidia-smi not installed" >&2 ; return 0
-  elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
-  else nvsmi_works="1" ; fi
-
-  if [[ "$1" == "-L" ]] ; then
-    local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
-    if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
-    else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi
+  for startup_script in ${STARTUP_SCRIPT} ${POST_HDFS_STARTUP_SCRIPT} ; do
+    sed -i -e 's:/usr/bin/env bash:/usr/bin/env bash\nexit 0:' ${startup_script}
+  done
 
-    return 0
-  fi
+  cp /var/log/dataproc-initialization-script-0.log /var/log/dataproc-initialization-script-0.log.0
 
-  "${nvsmi}" $*
+  systemctl reboot
 }
 
-function install_build_dependencies() {
-  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi
-
-  if is_debuntu ; then
-    if is_ubuntu22 && is_cuda12 ; then
-      # On ubuntu22, the default compiler does not build some kernel module versions
-      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
-      execute_with_retries apt-get install -y -qq gcc-12
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
-      update-alternatives --set gcc /usr/bin/gcc-12
+# Verify if compatible linux distros and secure boot options are used
+function check_os_and_secure_boot() {
+  if [[ "${OS_NAME}" == "debian" ]]; then
+    DEBIAN_VERSION=$(lsb_release -r | awk '{print $2}') # 10 or 11
+    if [[ "${DEBIAN_VERSION}" != "10" && "${DEBIAN_VERSION}" != "11" ]]; then
+      echo "Error: The Debian version (${DEBIAN_VERSION}) is not supported. Please use a compatible Debian version."
+      exit 1
     fi
-
-  elif is_rocky ; then
-    execute_with_retries dnf -y -q install gcc
-
-    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
-    set +e
-    eval "${dnf_cmd}" > "${install_log}" 2>&1
-    local retval="$?"
-    set -e
-
-    if [[ "${retval}" == "0" ]] ; then return ; fi
-
-    if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
-      # this kernel-devel may have been migrated to the vault
-      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
-      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
-      dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
-        "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
-       )"
+  elif [[ "${OS_NAME}" == "ubuntu" ]]; then
+    UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04
+    UBUNTU_VERSION=${UBUNTU_VERSION%.*}
+    if [[ "${UBUNTU_VERSION}" != "18" && "${UBUNTU_VERSION}" != "20" && "${UBUNTU_VERSION}" != "22" ]]; then
+      echo "Error: The Ubuntu version (${UBUNTU_VERSION}) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+    fi
+  elif [[ "${OS_NAME}" == "rocky" ]]; then
+    ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8 or 9
+    ROCKY_VERSION=${ROCKY_VERSION%.*}
+    if [[ "${ROCKY_VERSION}" != "8" && "${ROCKY_VERSION}" != "9" ]]; then
+      echo "Error: The Rocky Linux version (${ROCKY_VERSION}) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
     fi
-
-    execute_with_retries "${dnf_cmd}"
   fi
-  touch "${workdir}/build-dependencies-complete"
-}
-
-function install_dependencies() {
-  pkg_list="pciutils screen"
-  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
-  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
-}
 
-function prepare_gpu_env(){
-  # Verify SPARK compatability
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-
-  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
-  nvsmi_works="0"
-
-  if   is_cuda11 ; then gcc_ver="11"
-  elif is_cuda12 ; then gcc_ver="12" ; fi
-}
-
-# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
-# Users should run apt-mark unhold before they wish to upgrade these packages
-function hold_nvidia_packages() {
-  apt-mark hold nvidia-*
-  apt-mark hold libnvidia-*
-  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
-    apt-mark hold xserver-xorg-video-nvidia*
+  if [[ "${SECURE_BOOT}" == "enabled" ]]; then 
+    echo "Error: Secure Boot is enabled. Please disable Secure Boot while creating the cluster."
+    exit 1
   fi
 }
 
-function delete_mig_instances() (
-  # delete all instances
-  set +e
-  nvidia-smi mig -dci
-
-  case "${?}" in
-    "0" ) echo "compute instances deleted"            ;;
-    "2" ) echo "invalid argument"                     ;;
-    "6" ) echo "No compute instances found to delete" ;;
-    *   ) echo "unrecognized return code"             ;;
-  esac
-
-  nvidia-smi mig -dgi
-  case "${?}" in
-    "0" ) echo "compute instances deleted"        ;;
-    "2" ) echo "invalid argument"                 ;;
-    "6" ) echo "No GPU instances found to delete" ;;
-    *   ) echo "unrecognized return code"         ;;
-  esac
-)
+# Detect dataproc image version from its various names
+if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then
+  DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+fi
 
-# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles
-function configure_mig_cgi() {
-  delete_mig_instances
-  META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')"
-  if test -n "${META_MIG_CGI_VALUE}"; then
-    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
-  else
-    if lspci | grep -q H100 ; then
-      # run the following command to list placement profiles
-      # nvidia-smi mig -lgipp
-      #
-      # This is the result when using H100 instances on 20241220
-      # GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
-      # GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
-      # GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
-      # GPU  0 Profile ID 14 Placements: {0,2,4}:2
-      # GPU  0 Profile ID  9 Placements: {0,4}:4
-      # GPU  0 Profile ID  5 Placement : {0}:4
-      # GPU  0 Profile ID  0 Placement : {0}:8
+function remove_old_backports {
+  # This script uses 'apt-get update' and is therefore potentially dependent on
+  # backports repositories which have been archived.  In order to mitigate this
+  # problem, we will remove any reference to backports repos older than oldstable
 
-      # For H100 3D controllers, use profile 19, 7x1G instances
-      nvidia-smi mig -cgi 19 -C
-    elif lspci | grep -q A100 ; then
-      # Dataproc only supports A100s right now split in 2 if not specified
-      # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
-      nvidia-smi mig -cgi 9,9 -C
-    else
-      echo "unrecognized 3D controller"
-    fi
+  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
+  oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}');
+  stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}');
+  # grep -l prints one path per line; read them line by line so every file is processed on its own
+  matched_files="$(grep -rsil '\-backports' /etc/apt/sources.list*)"
+  if [[ -n "$matched_files" ]]; then
+    while IFS= read -r filename; do
+      grep -e "$oldstable-backports" -e "$stable-backports" "$filename" || \
+        sed -i -e 's/^.*-backports.*$//' "$filename"
+    done <<< "$matched_files"
   fi
 }
 
-function enable_mig() {
-  nvidia-smi -mig 1
-}
-
-
-function configure_dkms_certs() {
-  if test -v PSN && [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping";
-      return 0
+function main() {
+  if [[ ${OS_NAME} == debian ]] && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then
+    remove_old_backports
   fi
 
-  mkdir -p "${CA_TMPDIR}"
-
-  # If the private key exists, verify it
-  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
-    echo "Private key material exists"
-
-    local expected_modulus_md5sum
-    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
-    if [[ -n "${expected_modulus_md5sum}" ]]; then
-      modulus_md5sum="${expected_modulus_md5sum}"
-
-      # Verify that cert md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched rsa key"
-      fi
-
-      # Verify that key md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched x509 cert"
-      fi
+  check_os_and_secure_boot
+    
+  if [[ "${OS_NAME}" == "rocky" ]]; then
+    if dnf list kernel-devel-$(uname -r) && dnf list kernel-headers-$(uname -r); then
+      echo "kernel devel and headers packages are available.  Proceed without kernel upgrade."
     else
-      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
+      upgrade_kernel
     fi
-    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-
-    return
+  fi  
+  
+  if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then
+    export DEBIAN_FRONTEND=noninteractive
+    execute_with_retries "apt-get update"
+    execute_with_retries "apt-get install -y -q pciutils"
+  elif [[ ${OS_NAME} == rocky ]] ; then
+    execute_with_retries "dnf -y -q install pciutils"
   fi
 
-  # Retrieve cloud secrets keys
-  local sig_priv_secret_name
-  sig_priv_secret_name="${PSN}"
-  local sig_pub_secret_name
-  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
-  local sig_secret_project
-  sig_secret_project="$(get_metadata_attribute secret_project)"
-  local sig_secret_version
-  sig_secret_version="$(get_metadata_attribute secret_version)"
-
-  # If metadata values are not set, do not write mok keys
-  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
-
-  # Write private material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_priv_secret_name}" \
-      | dd status=none of="${CA_TMPDIR}/db.rsa"
-
-  # Write public material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_pub_secret_name}" \
-      | base64 --decode \
-      | dd status=none of="${CA_TMPDIR}/db.der"
-
-  local mok_directory="$(dirname "${mok_key}")"
-  mkdir -p "${mok_directory}"
-
-  # symlink private key and copy public cert from volatile storage to DKMS directory
-  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
-
-  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
-}
-
-function clear_dkms_key {
-  if [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping" >&2
-      return 0
-  fi
-  rm -rf "${CA_TMPDIR}" "${mok_key}"
-}
-
-function check_secure_boot() {
-  local SECURE_BOOT="disabled"
-  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
-
-  PSN="$(get_metadata_attribute private_secret_name)"
-  readonly PSN
-
-  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
-    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
-    exit 1
-  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
-    echo "Secure boot is enabled, but no signing material provided."
-    echo "Please either disable secure boot or provide signing material as per"
-    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
-    return 1
-  fi
-
-  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
-  readonly CA_TMPDIR
-
-  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
-                      mok_der=/var/lib/shim-signed/mok/MOK.der
-                 else mok_key=/var/lib/dkms/mok.key
-                      mok_der=/var/lib/dkms/mok.pub ; fi
-
-  configure_dkms_certs
-}
-
-
-function exit_handler() {
-  # Purge private key material until next grant
-  clear_dkms_key
-
-  set +ex
-  echo "Exit handler invoked"
-
-  # Clear pip cache
-  pip cache purge || echo "unable to purge pip cache"
-
-  # If system memory was sufficient to mount memory-backed filesystems
-  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    # remove the tmpfs pip cache-dir
-    pip config unset global.cache-dir || echo "unable to unset global pip cache"
-
-    # Clean up shared memory mounts
-    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
-      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
-        umount -f ${shmdir}
+  # default MIG to on when this script is used
+  META_MIG_VALUE=1
+  if (/usr/share/google/get_metadata_value attributes/ENABLE_MIG); then
+    META_MIG_VALUE=$(/usr/share/google/get_metadata_value attributes/ENABLE_MIG)
+  fi
+
+  if (lspci | grep -q NVIDIA); then
+    if [[ $META_MIG_VALUE -ne 0 ]]; then
+      # if the first invocation, the NVIDIA drivers and tools are not installed
+      if [[ -f "/usr/bin/nvidia-smi" ]]; then
+        # check to see if we already enabled mig mode and rebooted so we don't end
+        # up in infinite reboot loop
+        NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l`
+        if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
+          if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
+            echo "MIG is enabled on all GPUs, configuring instances"
+            configure_mig_cgi
+            exit 0
+          else
+            echo "GPUs present but MIG is not enabled"
+          fi
+        else
+          echo "More than 1 GPU with MIG configured differently between them"
+        fi
       fi
-    done
-
-    # restart services stopped during preparation stage
-    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
-  fi
-
-  if is_debuntu ; then
-    # Clean up OS package cache
-    apt-get -y -qq clean
-    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
-    # re-hold systemd package
-    if ge_debian12 ; then
-    apt-mark hold systemd libsystemd0 ; fi
-    hold_nvidia_packages
-  else
-    dnf clean all
-  fi
-
-  # print disk usage statistics for large components
-  if is_ubuntu ; then
-    du -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3 | sort -h
-  elif is_debian ; then
-    du -x -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /var/lib/{docker,mysql,} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
-      /usr/bin \
-      /usr \
-      /var \
-      / 2>/dev/null | sort -h
-  else
-    du -hs \
-      /var/lib/docker \
-      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
-      /usr/lib64/google-cloud-sdk \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3
-  fi
-
-  # Process disk usage logs from installation period
-  rm -f /run/keep-running-df
-  sync
-  sleep 5.01s
-  # compute maximum size of disk during installation
-  # Log file contains logs like the following (minus the preceeding #):
-#Filesystem     1K-blocks    Used Available Use% Mounted on
-#/dev/vda2        7096908 2611344   4182932  39% /
-  df / | tee -a "/run/disk-usage.log"
-
-  perl -e '@siz=( sort { $a => $b }
-                   map { (split)[2] =~ /^(\d+)/ }
-                  grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
-print( "    samples-taken: ", scalar @siz, $/,
-       "maximum-disk-used: $max", $/,
-       "minimum-disk-used: $min", $/,
-       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
-
-  echo "exit_handler has completed"
-
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    dd if=/dev/zero of=/zero
-    sync
-    sleep 3s
-    rm -f /zero
-  fi
-
-  return 0
-}
-
-function prepare_to_install(){
-  # Verify OS compatability and Secure boot state
-  check_os
-  check_secure_boot
-
-  prepare_gpu_env
-
-  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-  readonly OS_NAME
-
-  # node role
-  ROLE="$(get_metadata_attribute dataproc-role)"
-  readonly ROLE
-
-  workdir=/opt/install-dpgce
-  tmpdir=/tmp/
-  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
-  readonly temp_bucket
-  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
-  uname_r=$(uname -r)
-  readonly uname_r
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  export DEBIAN_FRONTEND=noninteractive
-
-  mkdir -p "${workdir}"
-  trap exit_handler EXIT
-  set_proxy
-  mount_ramdisk
-
-  readonly install_log="${tmpdir}/install.log"
-
-  if test -f "${workdir}/prepare-complete" ; then return ; fi
-
-  repair_old_backports
-
-  if is_debuntu ; then
-    clean_up_sources_lists
-    apt-get update -qq
-    apt-get -y clean
-    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
-    if ge_debian12 ; then
-    apt-mark unhold systemd libsystemd0 ; fi
-  else
-    dnf clean all
+    fi
   fi
+  
+  # Detect NVIDIA GPU
+  if (lspci | grep -q NVIDIA); then
+    if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then
+      execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
+    elif [[ ${OS_NAME} == rocky ]]; then
+      echo "kernel devel and headers not required on rocky.  installing from binary"
+    fi
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
-    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
-  ) fi
-
-  install_dependencies
-
-  # Monitor disk usage in a screen session
-  df / > "/run/disk-usage.log"
-  touch "/run/keep-running-df"
-  screen -d -m -LUS keep-running-df \
-    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
-
-  touch "${workdir}/prepare-complete"
-}
-
-function main() {
-  # default MIG to on when this script is used
-  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
+    install_nvidia_gpu_driver
 
-  if ! (lspci | grep -q NVIDIA) ; then return ; fi
-  if [[ $META_MIG_VALUE -ne 0 ]]; then
-    # if the first invocation, the NVIDIA drivers and tools are not installed
-    if [[ -f "/usr/bin/nvidia-smi" ]]; then
-      # check to see if we already enabled mig mode and rebooted so we don't end
-      # up in infinite reboot loop
-      mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)"
-      NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)"
-      if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
-        if (echo "${mig_mode_current}" | grep Enabled); then
-          echo "MIG is enabled on all GPUs, configuring instances"
+    if [[ ${META_MIG_VALUE} -ne 0 ]]; then
+      enable_mig
+      NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l`
+      if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then
+        if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
+          echo "MIG is fully enabled, we don't need to reboot"
           configure_mig_cgi
-          exit 0
         else
-          echo "GPUs present but MIG is not enabled"
+          echo "MIG is configured on but NOT enabled, we need to reboot"
+          reboot
         fi
       else
-        echo "More than 1 GPU with MIG configured differently between them"
+        echo "MIG is NOT enabled all on GPUs, we need to reboot"
+        reboot
       fi
+    else
+      echo "Not enabling MIG"
     fi
   fi
-
-  install_nvidia_gpu_driver
-
-  if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
-
-  enable_mig
-
-  mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)"
-
-  NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)"
-  if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -ne 1 ]]      ; then echo "MIG is NOT enabled all on GPUs.  Failing"       ; exit 1 ; fi
-  if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured on but NOT enabled.  Failing" ; exit 1 ; fi
-
-  echo "MIG is fully enabled"
-  configure_mig_cgi
 }
 
-prepare_to_install
-
 main
diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index 6fdfbb78c..0b4aabd57 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -232,10 +232,12 @@ CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.2
 
 # EXCEPTIONS
 # Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
-if is_ubuntu18 ; then
-  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1')  #12.1.1
-  NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02
-  CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.1
+if [[ "${OS_NAME}" == "ubuntu" ]]; then
+    if is_ubuntu18 ; then
+      CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1')  #12.1.1
+      NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02
+      CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.1
+    fi
 fi
 
 # Verify Secure boot
diff --git a/templates/common/util_functions b/templates/common/util_functions
index 9f7075f0b..80ce5c09f 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -622,6 +622,14 @@ function common_exit_handler() {
   # Clear pip cache
   pip cache purge || echo "unable to purge pip cache"
 
+  # Restart YARN services if they are running already
+  for svc in resourcemanager nodemanager; do
+    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
+      systemctl  stop "hadoop-yarn-${svc}.service"
+      systemctl start "hadoop-yarn-${svc}.service"
+    fi
+  done
+
   # If system memory was sufficient to mount memory-backed filesystems
   if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
     # remove the tmpfs pip cache-dir
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 26c4d02f9..7c8b47b32 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1011,6 +1011,7 @@ function configure_gpu_exclusive_mode() {
   if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
   # include exclusive mode on GPU
   nvidia-smi -c EXCLUSIVE_PROCESS
+  clear_nvsmi_cache
 }
 
 function fetch_mig_scripts() {
@@ -1154,6 +1155,17 @@ EOF
   systemctl start dataproc-cgroup-device-permissions
 }
 
+function clear_nvsmi_cache() {
+  # Drop the cached nvidia-smi XML; no-op when the path is unset or the file is absent
+  test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" || return 0
+  rm "${nvsmi_query_xml}"
+}
+
+function query_nvsmi() { # write `nvidia-smi -q -x` XML to ${nvsmi_query_xml} unless that cache file already exists
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
+  /usr/bin/nvidia-smi -q -x --dtd > "${nvsmi_query_xml}" || { local rc="$?" ; rm -f "${nvsmi_query_xml}" ; return "${rc}" ; } # never keep a failed query as the cache
+}
+
 function nvsmi() {
   local nvsmi="/usr/bin/nvidia-smi"
   if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
@@ -1220,6 +1232,8 @@ function prepare_gpu_env(){
   set -e
   echo "gpu_count=[${gpu_count}]"
   nvsmi_works="0"
+  nvsmi_query_xml="${tmpdir}/nvsmi.xml"
+  xmllint="/opt/conda/miniconda3/bin/xmllint"
   NVIDIA_SMI_PATH='/usr/bin'
   MIG_MAJOR_CAPS=0
   IS_MIG_ENABLED=0
@@ -1304,20 +1318,52 @@ function configure_mig_cgi() {
       # GPU  0 Profile ID  5 Placement : {0}:4
       # GPU  0 Profile ID  0 Placement : {0}:8
 
-      # For H100 3D controllers, use profile 19, 7x1G instances
-      nvidia-smi mig -cgi 19 -C
+      # For H100 3D controllers, consider profile 19, 7x1G instances
+      nvidia-smi mig -cgi 9,9 -C
     elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then
-      # Dataproc only supports A100s right now split in 2 if not specified
+      # Dataproc only supports H100s right now ; split in 2 if not specified
       # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
       nvidia-smi mig -cgi 9,9 -C
     else
       echo "unrecognized 3D controller"
     fi
   fi
+  clear_nvsmi_cache
 }
 
 function enable_mig() {
+  if test -f "${workdir}/complete/enable-mig" ; then return ; fi # sentinel exists: this step already ran
+  # Otherwise: ensure persistenced, zero the numa_node entries, reset the GPUs, enable MIG, write the sentinel
+  # Start persistenced if it's not already running
+  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
+  for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
+    # Write an ascii zero to the numa node indicator
+    echo "0" | dd of="${f}" status=none
+  done
+  time nvidia-smi --gpu-reset # 30s
   nvidia-smi -mig 1
+  clear_nvsmi_cache # MIG mode was just changed; remove the cached nvidia-smi XML so it is queried again
+  # Record completion so that later invocations return early
+  touch "${workdir}/complete/enable-mig"
+}
+
+function enable_and_configure_mig() {
+  # XPath selecting each GPU's current MIG mode.  Assigned before any early
+  # return because setup_gpu_yarn reuses ${xpath} for its own xmllint query.
+  xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
+
+  # default MIG to on when this script is used
+  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
+  if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
+
+  enable_mig
+  query_nvsmi
+  mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")"
+
+  if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all GPUs.  Failing" ; exit 1 ; fi
+  if ! (echo "${mig_mode_current}" | grep Enabled)                ; then echo "MIG is configured but NOT enabled.  Failing" ; exit 1 ; fi
+  echo "MIG is fully enabled"
+  configure_mig_cgi
 }
 
 function setup_gpu_yarn() {
@@ -1334,8 +1380,8 @@ function setup_gpu_yarn() {
   fi
 
   # if this is called without the MIG script then the drivers are not installed
-  migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
-  if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
+  query_nvsmi
+  migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
   NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
 
   if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then

From 0ac57a070220d56d58030d02643f954fb4822a85 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 12:02:47 -0800
Subject: [PATCH 058/130] return test suite to master

---
 spark-rapids/test_spark_rapids.py | 47 +++++++++++++++----------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 9b9481716..7af8e3154 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -20,10 +20,6 @@ class SparkRapidsTestCase(DataprocTestCase):
   def verify_spark_instance(self, name):
     self.assert_instance_command(name, "nvidia-smi")
 
-  def verify_pyspark(self, name):
-    # Verify that pyspark works
-    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
-
   def verify_mig_instance(self, name):
     self.assert_instance_command(name,
         "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
@@ -62,6 +58,12 @@ def verify_spark_job_sql(self):
                             ("STANDARD", ["w-0"], GPU_T4))
   def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
 
+    if self.getImageOs() == "rocky":
+      self.skipTest("Not supported for Rocky OS")
+
+    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
+      self.skipTest("Not supported in 2.0 and earlier images")
+
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -70,10 +72,10 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         self.INIT_ACTIONS,
         optional_components=optional_components,
         metadata=metadata,
-        machine_type="n1-standard-32",
+        machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="1024GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -86,6 +88,12 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
                             ("STANDARD", ["w-0"], GPU_T4))
   def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
 
+    if self.getImageOs() == "rocky":
+      self.skipTest("Not supported for Rocky OS")
+
+    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
+      self.skipTest("Not supported in 2.0 and earlier images")
+
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -94,10 +102,10 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
       self.INIT_ACTIONS,
       optional_components=optional_components,
       metadata=metadata,
-      machine_type="n1-standard-32",
+      machine_type="n1-standard-4",
       master_accelerator=accelerator if configuration == "SINGLE" else None,
       worker_accelerator=accelerator,
-      boot_disk_size="50GB",
+      boot_disk_size="1024GB",
       timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -106,24 +114,15 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
     # Only need to do this once
     self.verify_spark_job_sql()
 
-  @parameterized.parameters(
-    ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"),
-    ("STANDARD", ["w-0"], GPU_T4, "12.0.1", "525.147.05"),
-    ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"),
-    ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03")
-  )
+  @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"))
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
                                      accelerator, cuda_version, driver_version):
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.1.1") \
-    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
-          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.1.1 not supported on older debian/ubuntu releases")
+    if self.getImageOs() == "rocky":
+      self.skipTest("Not supported for Rocky OS")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
-    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
+      self.skipTest("Not supported in 2.0 and earlier images")
 
     metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
                 ",cuda-version={0},driver-version={1}".format(cuda_version, driver_version))
@@ -132,10 +131,10 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         configuration,
         self.INIT_ACTIONS,
         metadata=metadata,
-        machine_type="n1-standard-32",
+        machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="1024GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:

From b3e5618112c06caf699546161ec460357d7678f7 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 12:10:08 -0800
Subject: [PATCH 059/130] do not run all tests ; also do not retry failures

---
 cloudbuild/presubmit.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index eec7adb76..d9ae3c9bb 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,6 +70,7 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      continue
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0
@@ -104,7 +105,6 @@ run_tests() {
   bazel test \
     --jobs="${max_parallel_tests}" \
     --local_test_jobs="${max_parallel_tests}" \
-    --flaky_test_attempts=3 \
     --action_env="INTERNAL_IP_SSH=true" \
     --test_output="all" \
     --noshow_progress \

From b4e99ee90874f8d51f85b9fc63336b4e689e7958 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 25 Dec 2024 19:33:19 -0800
Subject: [PATCH 060/130] expanding non-default version tests ; adding utility
 function to verify pyspark ; disk size correction

---
 spark-rapids/test_spark_rapids.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 7af8e3154..3c9b2a2d6 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -20,6 +20,10 @@ class SparkRapidsTestCase(DataprocTestCase):
   def verify_spark_instance(self, name):
     self.assert_instance_command(name, "nvidia-smi")
 
+  def verify_pyspark(self, name):
+    # Verify that pyspark works
+    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
+
   def verify_mig_instance(self, name):
     self.assert_instance_command(name,
         "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
@@ -114,13 +118,22 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
     # Only need to do this once
     self.verify_spark_job_sql()
 
-  @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"))
+  @parameterized.parameters(
+    ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"),
+    ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"),
+    ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03")
+  )
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
                                      accelerator, cuda_version, driver_version):
 
     if self.getImageOs() == "rocky":
       self.skipTest("Not supported for Rocky OS")
 
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
+          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
+      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+
     if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
       self.skipTest("Not supported in 2.0 and earlier images")
 

From e4eab7b41ccd9b471cd0c51fc1a8143edeb3bf83 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 12:16:11 -0800
Subject: [PATCH 061/130] reverting to master

---
 spark-rapids/test_spark_rapids.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 3c9b2a2d6..7af8e3154 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -20,10 +20,6 @@ class SparkRapidsTestCase(DataprocTestCase):
   def verify_spark_instance(self, name):
     self.assert_instance_command(name, "nvidia-smi")
 
-  def verify_pyspark(self, name):
-    # Verify that pyspark works
-    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
-
   def verify_mig_instance(self, name):
     self.assert_instance_command(name,
         "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
@@ -118,22 +114,13 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
     # Only need to do this once
     self.verify_spark_job_sql()
 
-  @parameterized.parameters(
-    ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"),
-    ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"),
-    ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03")
-  )
+  @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"))
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
                                      accelerator, cuda_version, driver_version):
 
     if self.getImageOs() == "rocky":
       self.skipTest("Not supported for Rocky OS")
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
-    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
-          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
-
     if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
       self.skipTest("Not supported in 2.0 and earlier images")
 

From 95b17ac10d260de329f0876d8b44deeae4527381 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 12:25:54 -0800
Subject: [PATCH 062/130] reverting test_spark_rapids.py to master

---
 spark-rapids/test_spark_rapids.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 7af8e3154..6e03f2d62 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -75,7 +75,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -105,7 +105,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
       machine_type="n1-standard-4",
       master_accelerator=accelerator if configuration == "SINGLE" else None,
       worker_accelerator=accelerator,
-      boot_disk_size="1024GB",
+      boot_disk_size="50GB",
       timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -134,7 +134,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:

From 212b9af4d1c39dfbc8c6b8947d88b8796c37bbf9 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 12:31:30 -0800
Subject: [PATCH 063/130] do not consider templates as changed files

---
 cloudbuild/presubmit.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index d9ae3c9bb..2b2e978b0 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -49,7 +49,7 @@ initialize_git_repo() {
 determine_tests_to_run() {
   # Infer the files that changed
   mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD)
-  mapfile -t CHANGED_FILES < <(git diff origin/master --name-only)
+  mapfile -t CHANGED_FILES < <(git diff origin/master --name-only | grep -v template)
   echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}"
   echo "Changed files: ${CHANGED_FILES[*]}"
 

From e9b9e5de59966924ab2cb8110f86e35da24b143a Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 12:44:21 -0800
Subject: [PATCH 064/130] using nvsmi for some error protection

---
 gpu/install_gpu_driver.sh    | 22 +++++++++++-----------
 templates/gpu/util_functions | 22 +++++++++++-----------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 8d3d5aa84..da30fcfe8 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1899,17 +1899,6 @@ EOF
   systemctl start dataproc-cgroup-device-permissions
 }
 
-function clear_nvsmi_cache() {
-  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then
-    rm "${nvsmi_query_xml}"
-  fi
-}
-
-function query_nvsmi() {
-  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
-  /usr/bin/nvidia-smi -q -x --dtd > "${nvsmi_query_xml}"
-}
-
 function nvsmi() {
   local nvsmi="/usr/bin/nvidia-smi"
   if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
@@ -1928,6 +1917,17 @@ function nvsmi() {
   "${nvsmi}" $*
 }
 
+function clear_nvsmi_cache() {
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then
+    rm "${nvsmi_query_xml}"
+  fi
+}
+
+function query_nvsmi() {
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
+  nvsmi -q -x --dtd > "${nvsmi_query_xml}"
+}
+
 function install_build_dependencies() {
   if test -f "${workdir}/complete/build-dependencies" ; then return ; fi
 
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 7c8b47b32..328b89196 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1155,17 +1155,6 @@ EOF
   systemctl start dataproc-cgroup-device-permissions
 }
 
-function clear_nvsmi_cache() {
-  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then
-    rm "${nvsmi_query_xml}"
-  fi
-}
-
-function query_nvsmi() {
-  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
-  /usr/bin/nvidia-smi -q -x --dtd > "${nvsmi_query_xml}"
-}
-
 function nvsmi() {
   local nvsmi="/usr/bin/nvidia-smi"
   if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
@@ -1184,6 +1173,17 @@ function nvsmi() {
   "${nvsmi}" $*
 }
 
+function clear_nvsmi_cache() {
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then
+    rm "${nvsmi_query_xml}"
+  fi
+}
+
+function query_nvsmi() {
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
+  nvsmi -q -x --dtd > "${nvsmi_query_xml}"
+}
+
 function install_build_dependencies() {
   if test -f "${workdir}/complete/build-dependencies" ; then return ; fi
 

From adf4312102c8c44b6a59b5a52057d9019a8d8fdd Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 16:41:45 -0800
Subject: [PATCH 065/130] corrected comments

---
 templates/spark-rapids/mig.sh.in | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in
index 27da6ffd0..28a463602 100644
--- a/templates/spark-rapids/mig.sh.in
+++ b/templates/spark-rapids/mig.sh.in
@@ -1,15 +1,16 @@
 #!/bin/bash
 #
 [% INSERT legal/license_header %]
-# This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures.
 #
-# This script should be specified in --metadata=startup-script-url= option and
-# --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it.
-# The script does a reboot to fully enable MIG and then configures the MIG device based on the
-# user specified MIG_CGI profiles specified via: --metadata=^:^MIG_CGI='9,9'. If MIG_CGI
-# is not specified it assumes it's using an A100 and configures 2 instances with profile id 9.
-# It is assumed this script is used in conjuntion with install_gpu_driver.sh, which does the
-# YARN setup to fully utilize the MIG instances on YARN.
+# This script installs NVIDIA GPU drivers and enables MIG on Hopper
+# GPU architectures.
+#
+# This script should be specified in --initialization-actions= option
+# and --metadata=ENABLE_MIG can be used to enable or disable MIG. The
+# default is to enable it.  The script configures the MIG device based
+# on the user specified MIG_CGI profiles specified via:
+# --metadata=^:^MIG_CGI='9,9'. If MIG_CGI is not specified it assumes
+# it's using an H100 and configures 2 instances with profile id 9.
 #
 [% PROCESS common/template_disclaimer %]
 

From dfcd8b02eadc8a253d99fadc25020614ce314275 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 16:45:40 -0800
Subject: [PATCH 066/130] defining xpath variables as local

---
 gpu/install_gpu_driver.sh    | 3 ++-
 templates/gpu/util_functions | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index da30fcfe8..7cb4a1817 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -2099,8 +2099,8 @@ function enable_and_configure_mig() {
 
   enable_mig
 
-  xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
   query_nvsmi
+  local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
   mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")"
 
   if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs.  Failing" ; exit 1 ; fi
@@ -2125,6 +2125,7 @@ function setup_gpu_yarn() {
 
   # if this is called without the MIG script then the drivers are not installed
   query_nvsmi
+  local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
   migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
   NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
 
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 328b89196..1019a8f78 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1355,8 +1355,8 @@ function enable_and_configure_mig() {
 
   enable_mig
 
-  xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
   query_nvsmi
+  local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
   mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")"
 
   if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs.  Failing" ; exit 1 ; fi
@@ -1381,6 +1381,7 @@ function setup_gpu_yarn() {
 
   # if this is called without the MIG script then the drivers are not installed
   query_nvsmi
+  local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
   migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
   NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
 

From 9bb4d6664a8f5db95b8cd0e69496ab2a973cee98 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 18:16:42 -0800
Subject: [PATCH 067/130] tested on 2.1-ubuntu20

---
 gpu/install_gpu_driver.sh       | 74 ++++++++++++++++++++-------------
 templates/common/util_functions |  4 ++
 templates/gpu/util_functions    | 70 ++++++++++++++++++-------------
 3 files changed, 92 insertions(+), 56 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 7cb4a1817..76a6703ef 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -28,6 +28,10 @@ function os_id()       ( set +x ;  grep '^ID=' /etc/os-release | cut -d= -f2 | x
 function os_version()  ( set +x ;  grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; )
 function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; )
 
+# For version (or real number) comparison
+# if first argument is greater than or equal to, greater than, less than or equal to, or less than the second
+# ( version_ge 2.0 2.1 ) evaluates to false
+# ( version_ge 2.2 2.1 ) evaluates to true
 function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
 function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
@@ -1754,7 +1758,7 @@ function configure_gpu_exclusive_mode() {
   # only run this function when spark < 3.0
   if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
   # include exclusive mode on GPU
-  nvidia-smi -c EXCLUSIVE_PROCESS
+  nvsmi -c EXCLUSIVE_PROCESS
   clear_nvsmi_cache
 }
 
@@ -1769,7 +1773,7 @@ function fetch_mig_scripts() {
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
   local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
-  local DEFAULT_XGBOOST_VERSION="1.7.6"
+  local DEFAULT_XGBOOST_VERSION="2.0.2"
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
@@ -1785,15 +1789,22 @@ function install_spark_rapids() {
   local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
   local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
 
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
+  local jar_basename
+
+  jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+
+  jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+
+  jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
+  cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
 }
 
 function configure_gpu_script() {
@@ -1906,7 +1917,7 @@ function nvsmi() {
   elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
   else nvsmi_works="1" ; fi
 
-  if [[ "$1" == "-L" ]] ; then
+  if test -v 1 && [[ "$1" == "-L" ]] ; then
     local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
     if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
     else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi
@@ -1924,6 +1935,7 @@ function clear_nvsmi_cache() {
 }
 
 function query_nvsmi() {
+  if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi
   if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
   nvsmi -q -x --dtd > "${nvsmi_query_xml}"
 }
@@ -1997,6 +2009,9 @@ function prepare_gpu_env(){
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
   readonly RAPIDS_RUNTIME
 
+  # determine whether we have nvidia-smi installed and working
+  nvsmi
+
   set_cuda_version
   set_driver_version
   set_cuda_repo_shortname
@@ -2084,8 +2099,8 @@ function enable_mig() {
     # Write an ascii zero to the numa node indicator
     echo "0" | dd of="${f}" status=none
   done
-  time nvidia-smi --gpu-reset # 30s
-  nvidia-smi -mig 1
+  time nvsmi --gpu-reset # 30s
+  nvsmi -mig 1
   clear_nvsmi_cache
 
   touch "${workdir}/complete/enable-mig"
@@ -2098,7 +2113,6 @@ function enable_and_configure_mig() {
   if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
 
   enable_mig
-
   query_nvsmi
   local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
   mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")"
@@ -2123,19 +2137,23 @@ function setup_gpu_yarn() {
     return 0
   fi
 
-  # if this is called without the MIG script then the drivers are not installed
-  query_nvsmi
-  local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
-  migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
-  NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-
-  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-    if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-      if (echo "${migquery_result}" | grep Enabled); then
-        IS_MIG_ENABLED=1
-        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-        MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-        fetch_mig_scripts
+  if [[ "${nvsmi_works}" == "1" ]] ; then
+    # if this is called without the MIG script then the drivers are not installed
+    query_nvsmi
+    local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
+    set +e
+    migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
+    set -e
+    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
+
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
+          fetch_mig_scripts
+        fi
       fi
     fi
   fi
diff --git a/templates/common/util_functions b/templates/common/util_functions
index 80ce5c09f..93b276a68 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -2,6 +2,10 @@ function os_id()       ( set +x ;  grep '^ID=' /etc/os-release | cut -d= -f2 | x
 function os_version()  ( set +x ;  grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; )
 function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; )
 
+# For version (or real number) comparison
+# if first argument is greater than or equal to, greater than, less than or equal to, or less than the second
+# ( version_ge 2.0 2.1 ) evaluates to false
+# ( version_ge 2.2 2.1 ) evaluates to true
 function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
 function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 1019a8f78..dca97b316 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1010,7 +1010,7 @@ function configure_gpu_exclusive_mode() {
   # only run this function when spark < 3.0
   if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
   # include exclusive mode on GPU
-  nvidia-smi -c EXCLUSIVE_PROCESS
+  nvsmi -c EXCLUSIVE_PROCESS
   clear_nvsmi_cache
 }
 
@@ -1025,7 +1025,7 @@ function fetch_mig_scripts() {
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
   local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
-  local DEFAULT_XGBOOST_VERSION="1.7.6"
+  local DEFAULT_XGBOOST_VERSION="2.0.2"
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
@@ -1041,15 +1041,22 @@ function install_spark_rapids() {
   local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
   local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
 
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
+  local jar_basename
+
+  jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+
+  jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+
+  jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
+  cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
 }
 
 function configure_gpu_script() {
@@ -1162,7 +1169,7 @@ function nvsmi() {
   elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
   else nvsmi_works="1" ; fi
 
-  if [[ "$1" == "-L" ]] ; then
+  if test -v 1 && [[ "$1" == "-L" ]] ; then
     local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
     if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
     else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi
@@ -1180,6 +1187,7 @@ function clear_nvsmi_cache() {
 }
 
 function query_nvsmi() {
+  if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi
   if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
   nvsmi -q -x --dtd > "${nvsmi_query_xml}"
 }
@@ -1253,6 +1261,9 @@ function prepare_gpu_env(){
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
   readonly RAPIDS_RUNTIME
 
+  # determine whether we have nvidia-smi installed and working
+  nvsmi
+
   set_cuda_version
   set_driver_version
   set_cuda_repo_shortname
@@ -1340,8 +1351,8 @@ function enable_mig() {
     # Write an ascii zero to the numa node indicator
     echo "0" | dd of="${f}" status=none
   done
-  time nvidia-smi --gpu-reset # 30s
-  nvidia-smi -mig 1
+  time nvsmi --gpu-reset # 30s
+  nvsmi -mig 1
   clear_nvsmi_cache
 
   touch "${workdir}/complete/enable-mig"
@@ -1354,7 +1365,6 @@ function enable_and_configure_mig() {
   if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
 
   enable_mig
-
   query_nvsmi
   local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
   mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")"
@@ -1379,19 +1389,23 @@ function setup_gpu_yarn() {
     return 0
   fi
 
-  # if this is called without the MIG script then the drivers are not installed
-  query_nvsmi
-  local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
-  migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
-  NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-
-  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-    if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-      if (echo "${migquery_result}" | grep Enabled); then
-        IS_MIG_ENABLED=1
-        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-        MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-        fetch_mig_scripts
+  if [[ "${nvsmi_works}" == "1" ]] ; then
+    # if this is called without the MIG script then the drivers are not installed
+    query_nvsmi
+    local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
+    set +e
+    migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
+    set -e
+    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
+
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
+          fetch_mig_scripts
+        fi
       fi
     fi
   fi

From 17f0fe86ec97fb16e6e980fd379a8503a35ca1f4 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 18:33:10 -0800
Subject: [PATCH 068/130] using tests from
 https://github.com/GoogleCloudDataproc/initialization-actions/pull/1275

---
 gpu/test_gpu.py | 327 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 242 insertions(+), 85 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index f8438915f..f260d5927 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -4,26 +4,77 @@
 from absl.testing import absltest
 from absl.testing import parameterized
 
+import unittest
+
 from integration_tests.dataproc_test_case import DataprocTestCase
 
+DEFAULT_TIMEOUT = 15  # minutes
+DEFAULT_CUDA_VERSION = "12.4"
 
 class NvidiaGpuDriverTestCase(DataprocTestCase):
   COMPONENT = "gpu"
   INIT_ACTIONS = ["gpu/install_gpu_driver.sh"]
   GPU_L4   = "type=nvidia-l4"
   GPU_T4   = "type=nvidia-tesla-t4"
-  GPU_V100 = "type=nvidia-tesla-v100" # not available in us-central1-a
-  GPU_A100 = "type=nvidia-tesla-a100"
+  GPU_V100 = "type=nvidia-tesla-v100"
+  GPU_A100 = "type=nvidia-tesla-a100,count=2"
   GPU_H100 = "type=nvidia-h100-80gb,count=8"
 
+  # Tests for PyTorch
+  TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py"
+
+  # Tests for TensorFlow
+  TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py"
+
+  def assert_instance_command(self,
+                             instance,
+                             cmd,
+                             timeout_in_minutes=DEFAULT_TIMEOUT):
+
+    retry_count = 5
+
+    ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format(
+      instance, self.cluster_zone, cmd)
+
+    while retry_count > 0:
+      try:
+        ret_code, stdout, stderr = self.assert_command( ssh_cmd, timeout_in_minutes )
+        return ret_code, stdout, stderr
+      except Exception as e:
+        print("An error occurred: ", e)
+        retry_count -= 1
+        if retry_count > 0:
+          time.sleep(10)
+          continue
+        else:
+          raise
+
   def verify_instance(self, name):
     # Verify that nvidia-smi works
-    time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience
+    import random
+    # Many failed nvidia-smi attempts have been caused by impatience and temporal collisions
+    time.sleep( 3 + random.randint(1, 30) )
     self.assert_instance_command(name, "nvidia-smi", 1)
 
-  def verify_pyspark(self, name):
-    # Verify that pyspark works
-    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
+  def verify_pytorch(self, name):
+    # Upload the PyTorch check script for instance `name`, run it with the pytorch env's python, then remove it.
+    test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                               self.TORCH_TEST_SCRIPT_FILE_NAME)
+    self.upload_test_file(test_filename, name)
+    # The shell expansion ${f} is written ${{f}} so str.format() emits it literally instead of raising KeyError('f').
+    verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${{f}} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(self.TORCH_TEST_SCRIPT_FILE_NAME)
+    self.assert_instance_command(name, verify_cmd)
+    self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)
+
+  def verify_tensorflow(self, name):
+    # Upload the TensorFlow check script for instance `name`, run it remotely, then remove it.
+    # NOTE(review): the script is run with the 'pytorch' conda env's python - confirm that env provides TensorFlow.
+    test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), self.TF_TEST_SCRIPT_FILE_NAME)
+    self.upload_test_file(test_filename, name)
+    # The shell expansion ${f} is written ${{f}} so str.format() emits it literally instead of raising KeyError('f').
+    verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${{f}} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(self.TF_TEST_SCRIPT_FILE_NAME)
+    self.assert_instance_command(name, verify_cmd)
+    self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)
 
   def verify_mig_instance(self, name):
     self.assert_instance_command(name,
@@ -41,6 +92,18 @@ def verify_instance_nvcc(self, name, cuda_version):
     self.assert_instance_command(
         name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) )
 
+  def verify_instance_pyspark(self, name):
+    # Verify that pyspark works
+    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
+
+  def verify_instance_cuda_version(self, name, cuda_version):
+    self.assert_instance_command(
+        name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) )
+
+  def verify_instance_driver_version(self, name, driver_version):
+    self.assert_instance_command(
+        name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) )
+
   def verify_instance_spark(self):
     self.assert_dataproc_job(
       self.getClusterName(),
@@ -56,6 +119,22 @@ def verify_instance_spark(self):
       +   "spark.yarn.unmanagedAM.enabled=false"
     )
 
+  def verify_driver_signature(self, name):
+    """Assert that the nvidia module's sig_key equals the serial number of the signing certificate found on the instance."""
+    # Ubuntu images read the certificate from the shim-signed path; every other OS uses the dkms path.
+    cert_path = '/var/lib/shim-signed/mok/MOK.der' if self.getImageOs() == 'ubuntu' else '/var/lib/dkms/mok.pub'
+    # Raw string keeps the regex backslashes intact; the qx braces are doubled so format() fills only the bare {}.
+    cert_verification_cmd = r"""
+perl -Mv5.10 -e '
+my ($cert) = ( qx{{openssl x509 -inform DER -in {} -text}}
+             =~ /Serial Number:.*? +(.+?)\s*$/ms );
+my ($kmod) = ( qx{{modinfo nvidia}}
+             =~ /^sig_key:\s+(\S+)/ms );
+exit 1 unless defined $cert and defined $kmod and $cert eq lc $kmod
+'
+"""
+    self.assert_instance_command( name, cert_verification_cmd.format(cert_path) )
+
   @parameterized.parameters(
       ("SINGLE",   ["m"], GPU_T4, None, None),
 #      ("STANDARD", ["m"], GPU_T4, None, None),
@@ -64,8 +143,14 @@ def verify_instance_spark(self):
   def test_install_gpu_default_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+    self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
+
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = None
     if driver_provider is not None:
@@ -73,17 +158,18 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-highmem-8",
+        machine_type="n1-highmem-32",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=90,
-        boot_disk_size="50GB")
+        timeout_in_minutes=90, # This cluster is sized and timed correctly to build the driver and nccl
+        boot_disk_size="60GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
-      if ( self.getImageOs() != 'rocky' ) or ( configuration != 'SINGLE' ) or ( configuration == 'SINGLE' and self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1") ):
-        self.verify_pyspark(machine_name)
+      self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION)
+      self.verify_instance_pyspark(machine_name)
+      self.verify_instance_spark()
 
   @parameterized.parameters(
       ("SINGLE", ["m"], GPU_T4, None, None),
@@ -91,13 +177,16 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
   def test_install_gpu_without_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
-
     self.skipTest("No need to regularly test not installing the agent")
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
     metadata = "install-gpu-agent=false"
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
+
     if driver_provider is not None:
       metadata += ",gpu-driver-provider={}".format(driver_provider)
     self.createCluster(
@@ -107,22 +196,27 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=30,
+        timeout_in_minutes=90,
         boot_disk_size="50GB")
     for machine_suffix in machine_suffixes:
-      self.verify_instance("{}-{}".format(self.getClusterName(),
-                                          machine_suffix))
+      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+      self.verify_instance(machine_name)
 
   @parameterized.parameters(
-      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
+      ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
 #      ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"),
 #      ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"),
   )
   def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider):
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+    self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
+
+    if configuration == 'KERBEROS' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('KERBEROS fails with image version <= 2.1')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
@@ -134,40 +228,46 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=30,
+        timeout_in_minutes=90,
         boot_disk_size="50GB",
         scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
-      self.verify_instance("{}-{}".format(self.getClusterName(),
-                                          machine_suffix))
-      self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),
-                                                    machine_suffix))
+      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+      self.verify_instance(machine_name)
+      self.verify_instance_gpu_agent(machine_name)
 
   @parameterized.parameters(
-#       ("SINGLE", ["m"],               GPU_T4, None,   "12.0"),
-        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
+        ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
+#        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
       ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-#     ("STANDARD", ["w-0", "w-1"],      None,   GPU_T4, "11.8"),
+      ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"),
   )
   def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-      self.skipTest("CUDA == 12.0 not supported on debian 12")
+    if configuration == 'KERBEROS' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('KERBEROS fails with image version <= 2.1')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
+
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -177,12 +277,15 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=30,
+        timeout_in_minutes=90,
         boot_disk_size="50GB")
+
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
       self.verify_instance_nvcc(machine_name, cuda_version)
+      self.verify_instance_pyspark(machine_name)
+    self.verify_instance_spark()
 
   @parameterized.parameters(
       ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"),
@@ -192,25 +295,23 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
   def test_install_gpu_with_mig(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider, cuda_version):
-
-    self.skipTest("Test is known to fail.  Skipping so that we can exercise others")
-
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
-    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-      self.skipTest("CUDA == 12.0 not supported on debian 12")
-
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    # Operation [projects/.../regions/.../operations/...] failed:
+    # Invalid value for field 'resource.machineType': \
+    # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \
+    # 'machineTypes/a3-highgpu-8g'. \
+    # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature..
+    # ('This use case not thoroughly tested')
+    unittest.expectedFailure(self)
+    self.skipTest("known to fail")
+
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
 
     metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version)
 
@@ -222,7 +323,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=30,
+        timeout_in_minutes=90,
         boot_disk_size="50GB",
         startup_script="gpu/mig.sh")
 
@@ -236,12 +337,13 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
   )
   def test_gpu_allocation(self, configuration, master_accelerator,
                           worker_accelerator, driver_provider):
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
-    and configuration == 'SINGLE':
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = None
     if driver_provider is not None:
@@ -255,7 +357,7 @@ def test_gpu_allocation(self, configuration, master_accelerator,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         boot_disk_size="50GB",
-        timeout_in_minutes=30)
+        timeout_in_minutes=90)
 
     self.verify_instance_spark()
 
@@ -270,26 +372,21 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
-    and configuration == 'SINGLE':
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty")
-
-    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-      self.skipTest("CUDA == 12.0 not supported on debian 12")
-
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -299,14 +396,74 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
       master_accelerator=master_accelerator,
       worker_accelerator=worker_accelerator,
       metadata=metadata,
-      timeout_in_minutes=30,
+      timeout_in_minutes=90,
       boot_disk_size="50GB",
       scopes="https://www.googleapis.com/auth/monitoring.write")
+
     for machine_suffix in machine_suffixes:
-      self.verify_instance("{}-{}".format(self.getClusterName(),
-                                          machine_suffix))
-      self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),
-                                                    machine_suffix))
+      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+      self.verify_instance(machine_name)
+      self.verify_instance_gpu_agent(machine_name)
+    self.verify_instance_spark()
+
+  @parameterized.parameters(
+#    ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''),
+#    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
+#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'),
+    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'),
+#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'),
+#    ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'),
+#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
+#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
+  )
+  def tests_driver_signing(self, configuration, machine_suffixes,
+                           master_accelerator, worker_accelerator,
+                           cuda_version, image_os, image_version):
+    """Create a cluster whose metadata carries the signing-secret settings from the environment, then verify each instance."""
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
+    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+
+    if configuration == 'KERBEROS' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('KERBEROS fails with image version <= 2.1')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
+
+    kvp_array=[]
+    import os
+    # Each listed environment variable becomes a key=value metadata entry for the cluster.
+    if "private_secret_name" in os.environ:
+      for env_var in ['public_secret_name', 'private_secret_name', 'secret_project', 'secret_version', 'modulus_md5sum']:
+        kvp_array.append( "{}={}".format( env_var, os.environ[env_var] ) )
+
+      if kvp_array[0] == "public_secret_name=":
+        self.skipTest("This test only runs when signing environment has been configured in presubmit.sh")
+    else:
+      self.skipTest("This test only runs when signing environment has been configured in presubmit.sh")
+
+    metadata = ",".join( kvp_array )
+    # getImageVersion() is compared with parse_version() results throughout this file, so image_version is parsed before comparing.
+    if self.getImageOs() != image_os:
+      self.skipTest("This test is only run on os {}".format(image_os))
+    if self.getImageVersion() != pkg_resources.parse_version(image_version):
+      self.skipTest("This test is only run on Dataproc Image Version {}".format(image_version))
+
+    self.createCluster(
+      configuration,
+      self.INIT_ACTIONS,
+      machine_type="n1-highmem-8",
+      master_accelerator=master_accelerator,
+      worker_accelerator=worker_accelerator,
+      metadata=metadata,
+      timeout_in_minutes=90,
+      boot_disk_size="50GB",
+      scopes="https://www.googleapis.com/auth/monitoring.write")
+    for machine_suffix in machine_suffixes:
+      hostname="{}-{}".format(self.getClusterName(),machine_suffix)
+      self.verify_instance(hostname)
+      self.verify_instance_gpu_agent(hostname)
+#      self.verify_driver_signature(hostname)
 
     self.verify_instance_spark()
 

From f42a86d803f8469c93f07d8d4784e7a524f6ea46 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 2 Jan 2025 19:04:00 -0800
Subject: [PATCH 069/130] reducing resources for build cluster ; pause for
 gcloud

---
 gpu/install_gpu_driver.sh       | 2 ++
 gpu/test_gpu.py                 | 4 ++--
 templates/common/util_functions | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 76a6703ef..70242aad9 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -155,6 +155,8 @@ function cache_fetched_package() {
   local gcs_fn="$2"
   local local_fn="$3"
 
+  while ! command -v gcloud ; do sleep 5s ; done
+
   if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
     time gcloud storage cp "${gcs_fn}" "${local_fn}"
   else
diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index f260d5927..0f6550ad7 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -158,11 +158,11 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-highmem-32",
+        machine_type="n1-standard-32",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=90, # This cluster is sized and timed correctly to build the driver and nccl
+        timeout_in_minutes=90, # This cluster is sized and timed appropriately to build the kernel driver and nccl
         boot_disk_size="60GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
diff --git a/templates/common/util_functions b/templates/common/util_functions
index 93b276a68..6d58103a7 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -129,6 +129,8 @@ function cache_fetched_package() {
   local gcs_fn="$2"
   local local_fn="$3"
 
+  while ! command -v gcloud ; do sleep 5s ; done
+
   if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
     time gcloud storage cp "${gcs_fn}" "${local_fn}"
   else

From 811ad03c2b32e8873cc7691768c80931a448a03e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 18:25:59 -0800
Subject: [PATCH 070/130] exercising spark-rapids from this template

---
 templates/gpu/util_functions | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index dca97b316..0bc844e1f 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1024,14 +1024,14 @@ function fetch_mig_scripts() {
 
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
-  local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
-  local DEFAULT_XGBOOST_VERSION="2.0.2"
+  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
 
-  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.2" ]] ; then
-    DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
+    local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
   fi
 
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
@@ -1086,6 +1086,7 @@ function configure_gpu_script() {
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
 
 ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
 
@@ -1102,8 +1103,15 @@ EOF
   executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
   local task_cpus=2
   local gpu_amount
-  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
-  if ( version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ) ; then gpu_amount="0.5" ; fi
+
+  # The current setting of spark.task.resource.gpu.amount (0.333) is
+  # not ideal to get the best performance from the RAPIDS Accelerator
+  # plugin. It's recommended to be 1/{executor core count} unless you
+  # have a special use case.
+#  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+  gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
+
+# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
 
   cat >>"${spark_defaults_conf}" <<EOF
 ###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######

From 4378eddb1e43f5e9b2193a9deff587c275b9b254 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 18:47:57 -0800
Subject: [PATCH 071/130] improved header documentation

---
 templates/spark-rapids/spark-rapids.sh.in | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index ac8ec5c3f..004080690 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -4,13 +4,16 @@
 #
 [% PROCESS common/template_disclaimer %]
 #
-# This script installs NVIDIA GPU drivers (version 550.135) along with
-# CUDA 12.4.
+# This script installs NVIDIA GPU drivers.
+#
+# Dataproc 2.0:  Driver version 530.30.02, CUDA version 12.1.1, Rapids 23.08.2
+# Dataproc 2.1:  Driver version   550.135, CUDA version 12.4.1, Rapids 24.08.1
+# Dataproc 2.2:  Driver version 560.35.03, CUDA version 12.6.2, Rapids 24.08.1
 #
 # Additionally, it installs the RAPIDS Spark plugin, configures Spark
-# and YARN, installs an agent to collect GPU utilization metrics.  The
-# installer is compatible with Debian, Ubuntu, and Rocky Linux
-# distributions.
+# and YARN, and installs an agent to collect GPU utilization metrics.
+# The installer is regularly exercised with Debian, Ubuntu, and Rocky
+# Linux distributions.
 #
 # Note that the script is designed to work both when secure boot is
 # enabled with a custom image and when disabled during cluster

From 992bd146a76e11afc065b938e6d2d365337897ad Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 19:44:46 -0800
Subject: [PATCH 072/130] generated from templates in commit
 d5f7ffb7cf19852e48ce17c9ffae3640e7b19ca2

---
 gpu/install_gpu_driver.sh | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 70242aad9..59a592d30 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1774,14 +1774,14 @@ function fetch_mig_scripts() {
 
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
-  local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
-  local DEFAULT_XGBOOST_VERSION="2.0.2"
+  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
 
-  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.2" ]] ; then
-    DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
+    local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
   fi
 
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
@@ -1836,6 +1836,7 @@ function configure_gpu_script() {
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
 
 ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
 
@@ -1852,8 +1853,15 @@ EOF
   executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
   local task_cpus=2
   local gpu_amount
-  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
-  if ( version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ) ; then gpu_amount="0.5" ; fi
+
+  # The current setting of spark.task.resource.gpu.amount (0.333) is
+  # not ideal to get the best performance from the RAPIDS Accelerator
+  # plugin. It's recommended to be 1/{executor core count} unless you
+  # have a special use case.
+#  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+  gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
+
+# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
 
   cat >>"${spark_defaults_conf}" <<EOF
 ###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######

From 1d84952a1bbf57861c0a71e144161e9e01a446eb Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 21:17:07 -0800
Subject: [PATCH 073/130] replacing java spark tests with pyspark tests

---
 gpu/test_gpu.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 0f6550ad7..7eb1ac400 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -104,6 +104,15 @@ def verify_instance_driver_version(self, name, driver_version):
     self.assert_instance_command(
         name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) )
 
+  def verify_pyspark(self, name=None):
+    """Submit gs://<INIT_ACTIONS_REPO>/gpu/verify_pyspark.py as a pyspark job on this test's cluster."""
+    # `name` is not used; it defaults to None because callers invoke verify_pyspark() without arguments.
+    self.assert_dataproc_job(self.getClusterName(), "pyspark",
+      """--properties="spark:spark.executor.resource.gpu.amount=1" \
+         --properties="spark:spark.task.resource.gpu.amount=0.01" \
+         'gs://{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO)
+    )
+
   def verify_instance_spark(self):
     self.assert_dataproc_job(
       self.getClusterName(),
@@ -169,7 +178,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
       self.verify_instance(machine_name)
       self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION)
       self.verify_instance_pyspark(machine_name)
-      self.verify_instance_spark()
+    self.verify_pyspark()
 
   @parameterized.parameters(
       ("SINGLE", ["m"], GPU_T4, None, None),
@@ -285,7 +294,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
       self.verify_instance(machine_name)
       self.verify_instance_nvcc(machine_name, cuda_version)
       self.verify_instance_pyspark(machine_name)
-    self.verify_instance_spark()
+    self.verify_pyspark()
 
   @parameterized.parameters(
       ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"),
@@ -359,7 +368,7 @@ def test_gpu_allocation(self, configuration, master_accelerator,
         boot_disk_size="50GB",
         timeout_in_minutes=90)
 
-    self.verify_instance_spark()
+    self.verify_pyspark()
 
   @parameterized.parameters(
     ("SINGLE", ["m"], GPU_T4, None, "11.8"),
@@ -404,7 +413,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
       self.verify_instance_gpu_agent(machine_name)
-    self.verify_instance_spark()
+    self.verify_pyspark()
 
   @parameterized.parameters(
 #    ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''),
@@ -465,7 +474,7 @@ def tests_driver_signing(self, configuration, machine_suffixes,
       self.verify_instance_gpu_agent(hostname)
 #      self.verify_driver_signature(hostname)
 
-    self.verify_instance_spark()
+    self.verify_pyspark()
 
 if __name__ == "__main__":
   absltest.main()

From 88ccfec26939e3cfca9e1347abfb74f5988d1ea2 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 21:30:58 -0800
Subject: [PATCH 074/130] pyspark test code

---
 gpu/verify_pyspark.py | 46 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 gpu/verify_pyspark.py

diff --git a/gpu/verify_pyspark.py b/gpu/verify_pyspark.py
new file mode 100644
index 000000000..9cd0ca2c8
--- /dev/null
+++ b/gpu/verify_pyspark.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3 
+# 
+# Copyright 2025 Google LLC and contributors
+# 
+# Licensed under the Apache License, Version 2.0 (the "License"); 
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at 
+# 
+#      http://www.apache.org/licenses/LICENSE-2.0 
+# 
+# Unless required by applicable law or agreed to in writing, software 
+# distributed under the License is distributed on an "AS-IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and 
+# limitations under the License. 
+# 
+import matplotlib.pyplot as plt
+import numpy as np
+
+from pyspark import SparkContext
+from pyspark.sql import SparkSession
+from pyspark import SparkConf, StorageLevel
+from tqdm import tqdm
+from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
+import pyspark.sql.functions as f
+import nltk
+
+spark = SparkSession.builder.appName("spark-rapids").getOrCreate()
+
+#from utils import SimpleTimer, ResultsLogger, visualize_data  (helper import left disabled)
+
+conf = (SparkConf().setMaster("local[*]")  # master is set to local mode here, not taken from the cluster
+                   .setAppName("SparkVectorizer")
+                   .set('spark.driver.memory', '300G')
+                   .set('spark.driver.maxResultSize', '20G')
+                   .set('spark.network.timeout', '7200s')
+        )
+
+sc = SparkContext.getOrCreate(conf=conf)  # NOTE(review): a session was already created above, so this presumably returns that context and `conf` may be ignored - confirm
+sc.setLogLevel("FATAL")
+spark = SparkSession(sc)  # rebinds `spark` to a session wrapping `sc`
+print(sc._conf.getAll()) # dump the effective context settings to the job output
+
+x = np.linspace(0, 3*np.pi, 500)  # 500 evenly spaced samples over [0, 3*pi]
+plt.plot(x, np.sin(x**2))  # plot sin(x^2) against x
+plt.title('A simple chirp');  # NOTE(review): the figure is never saved or shown in this script

From 282ca0c86361b7f5a308a5ecb7314410f009b4d8 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 21:32:52 -0800
Subject: [PATCH 075/130] corrected function signature

---
 gpu/test_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 7eb1ac400..9f2f4c17e 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -104,7 +104,7 @@ def verify_instance_driver_version(self, name, driver_version):
     self.assert_instance_command(
         name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) )
 
-  def verify_pyspark(self, name):
+  def verify_pyspark(self):
     self.assert_dataproc_job(
       self.getClusterName(),
       "pyspark",

From d6e9809207aa5a8ed79316220a81c2ca6c054dc5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 21:55:33 -0800
Subject: [PATCH 076/130] fixing order of operations for setting default cuda
 version ; removed excess cuda default logic ; too many gs:// ; testing
 2.0-rocky8 instead of 2.1-rocky8

---
 gpu/test_gpu.py              | 16 ++++++++--------
 templates/gpu/util_functions | 31 +++++++++----------------------
 2 files changed, 17 insertions(+), 30 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 9f2f4c17e..0910c1942 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -110,7 +110,7 @@ def verify_pyspark(self):
       "pyspark",
       """--properties="spark:spark.executor.resource.gpu.amount=1" \
          --properties="spark:spark.task.resource.gpu.amount=0.01" \
-         'gs://{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO)
+         '{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO)
     )
 
   def verify_instance_spark(self):
@@ -175,9 +175,9 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
         boot_disk_size="60GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-      self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION)
-      self.verify_instance_pyspark(machine_name)
+#      self.verify_instance(machine_name)
+#      self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION)
+#      self.verify_instance_pyspark(machine_name)
     self.verify_pyspark()
 
   @parameterized.parameters(
@@ -418,8 +418,8 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
   @parameterized.parameters(
 #    ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''),
 #    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
-#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'),
-    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'),
+    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.1.1", 'rocky', '2.0'),
+#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'),
 #    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'),
 #    ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
@@ -470,8 +470,8 @@ def tests_driver_signing(self, configuration, machine_suffixes,
       scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
       hostname="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(hostname)
-      self.verify_instance_gpu_agent(hostname)
+#      self.verify_instance(hostname)
+#      self.verify_instance_gpu_agent(hostname)
 #      self.verify_driver_signature(hostname)
 
     self.verify_pyspark()
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 0bc844e1f..6409d3fb1 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -52,6 +52,15 @@ function set_support_matrix() {
 set_support_matrix
 
 function set_cuda_version() {
+  case "${DATAPROC_IMAGE_VERSION}" in
+    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
+    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    *   )
+      echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
+      exit 1
+      ;;
+  esac
   local cuda_url
   cuda_url=$(get_metadata_attribute 'cuda-url' '')
   if [[ -n "${cuda_url}" ]] ; then
@@ -60,29 +69,8 @@ function set_cuda_version() {
     CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
     if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
       DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
-      CUDA_FULL_VERSION="${CUDA_URL_VERSION}"
     fi
   fi
-
-  if ( ! test -v DEFAULT_CUDA_VERSION ) ; then
-    DEFAULT_CUDA_VERSION='12.4.1'
-  fi
-  # EXCEPTIONS
-  # Change default CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
-  case "${DATAPROC_IMAGE_VERSION}" in
-    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;;
-    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
-    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
-    *   )
-      echo "unrecognized Dataproc image version"
-      exit 1
-      ;;
-  esac
-
-  if le_ubuntu18 ; then
-    DEFAULT_CUDA_VERSION="12.1.1"
-    CUDA_VERSION_MAJOR="${DEFAULT_CUDA_VERSION%.*}"  #12.1
-  fi
   readonly DEFAULT_CUDA_VERSION
 
   CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
@@ -95,7 +83,6 @@ function set_cuda_version() {
     CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
   fi
   readonly CUDA_FULL_VERSION
-
 }
 
 function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )

From e3df6f21a19900a9df0a8cd1520d0b18b2c79948 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 22:09:39 -0800
Subject: [PATCH 077/130] including verify_pyspark.py in data list

---
 gpu/BUILD | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gpu/BUILD b/gpu/BUILD
index b481c5b33..bd5500ccb 100644
--- a/gpu/BUILD
+++ b/gpu/BUILD
@@ -6,7 +6,11 @@ py_test(
     name = "test_gpu",
     size = "enormous",
     srcs = ["test_gpu.py"],
-    data = ["install_gpu_driver.sh", "mig.sh"],
+    data = [
+      "install_gpu_driver.sh",
+      "verify_pyspark.py",
+      "mig.sh"
+    ],
     local = True,
     shard_count = 15,
     deps = [

From 89fe31b415f52971f1580a07aaf6ddb9b30ff036 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 23:24:34 -0800
Subject: [PATCH 078/130] verifying with gcloud dataproc jobs submit pyspark
 instead of spark ; skipping all tests that use ssh

---
 gpu/test_gpu.py       | 23 ++++++++++++-----------
 gpu/verify_pyspark.py |  1 -
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 0910c1942..19fb7fe81 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -108,8 +108,8 @@ def verify_pyspark(self):
     self.assert_dataproc_job(
       self.getClusterName(),
       "pyspark",
-      """--properties="spark:spark.executor.resource.gpu.amount=1" \
-         --properties="spark:spark.task.resource.gpu.amount=0.01" \
+      """--properties="spark.executor.resource.gpu.amount=1" \
+         --properties="spark.task.resource.gpu.amount=0.01" \
          '{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO)
     )
 
@@ -209,8 +209,8 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
         boot_disk_size="50GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-
+#      self.verify_instance(machine_name)
+    self.verify_pyspark()
   @parameterized.parameters(
       ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
 #      ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"),
@@ -242,8 +242,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
         scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-      self.verify_instance_gpu_agent(machine_name)
+#      self.verify_instance(machine_name)
+#      self.verify_instance_gpu_agent(machine_name)
+    self.verify_pyspark()
 
   @parameterized.parameters(
         ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
@@ -291,9 +292,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
 
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-      self.verify_instance_nvcc(machine_name, cuda_version)
-      self.verify_instance_pyspark(machine_name)
+      #self.verify_instance(machine_name)
+      #self.verify_instance_nvcc(machine_name, cuda_version)
+      #self.verify_instance_pyspark(machine_name)
     self.verify_pyspark()
 
   @parameterized.parameters(
@@ -411,8 +412,8 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
 
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-      self.verify_instance_gpu_agent(machine_name)
+#      self.verify_instance(machine_name)
+#      self.verify_instance_gpu_agent(machine_name)
     self.verify_pyspark()
 
   @parameterized.parameters(
diff --git a/gpu/verify_pyspark.py b/gpu/verify_pyspark.py
index 9cd0ca2c8..9f2b18683 100644
--- a/gpu/verify_pyspark.py
+++ b/gpu/verify_pyspark.py
@@ -23,7 +23,6 @@
 from tqdm import tqdm
 from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
 import pyspark.sql.functions as f
-import nltk
 
 spark = SparkSession.builder.appName("spark-rapids").getOrCreate()
 

From e221ede81e85c68f79672fa3fb0b9f480d405ff8 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 3 Jan 2025 23:39:33 -0800
Subject: [PATCH 079/130] re-enable ssh tests

---
 gpu/test_gpu.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 19fb7fe81..fda5785f3 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -175,9 +175,9 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
         boot_disk_size="60GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-#      self.verify_instance(machine_name)
-#      self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION)
-#      self.verify_instance_pyspark(machine_name)
+      self.verify_instance(machine_name)
+      self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION)
+      self.verify_instance_pyspark(machine_name)
     self.verify_pyspark()
 
   @parameterized.parameters(
@@ -209,7 +209,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
         boot_disk_size="50GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-#      self.verify_instance(machine_name)
+      self.verify_instance(machine_name)
     self.verify_pyspark()
   @parameterized.parameters(
       ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
@@ -242,8 +242,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
         scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-#      self.verify_instance(machine_name)
-#      self.verify_instance_gpu_agent(machine_name)
+      self.verify_instance(machine_name)
+      self.verify_instance_gpu_agent(machine_name)
     self.verify_pyspark()
 
   @parameterized.parameters(
@@ -292,9 +292,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
 
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      #self.verify_instance(machine_name)
-      #self.verify_instance_nvcc(machine_name, cuda_version)
-      #self.verify_instance_pyspark(machine_name)
+      self.verify_instance(machine_name)
+      self.verify_instance_nvcc(machine_name, cuda_version)
+      self.verify_instance_pyspark(machine_name)
     self.verify_pyspark()
 
   @parameterized.parameters(
@@ -412,8 +412,8 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
 
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-#      self.verify_instance(machine_name)
-#      self.verify_instance_gpu_agent(machine_name)
+      self.verify_instance(machine_name)
+      self.verify_instance_gpu_agent(machine_name)
     self.verify_pyspark()
 
   @parameterized.parameters(
@@ -471,9 +471,9 @@ def tests_driver_signing(self, configuration, machine_suffixes,
       scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
       hostname="{}-{}".format(self.getClusterName(),machine_suffix)
-#      self.verify_instance(hostname)
-#      self.verify_instance_gpu_agent(hostname)
-#      self.verify_driver_signature(hostname)
+      self.verify_instance(hostname)
+      self.verify_instance_gpu_agent(hostname)
+      self.verify_driver_signature(hostname)
 
     self.verify_pyspark()
 

From c9950a8e8071a4359a524016408e4ae8d517d6a8 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 4 Jan 2025 00:16:16 -0800
Subject: [PATCH 080/130] refactored ssh command retry code into the base class

---
 gpu/test_gpu.py                         | 51 -------------------------
 integration_tests/dataproc_test_case.py | 21 ++++++++--
 2 files changed, 17 insertions(+), 55 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index fda5785f3..ab2457ec2 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -16,39 +16,8 @@ class NvidiaGpuDriverTestCase(DataprocTestCase):
   INIT_ACTIONS = ["gpu/install_gpu_driver.sh"]
   GPU_L4   = "type=nvidia-l4"
   GPU_T4   = "type=nvidia-tesla-t4"
-  GPU_V100 = "type=nvidia-tesla-v100"
-  GPU_A100 = "type=nvidia-tesla-a100,count=2"
   GPU_H100 = "type=nvidia-h100-80gb,count=8"
 
-  # Tests for PyTorch
-  TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py"
-
-  # Tests for TensorFlow
-  TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py"
-
-  def assert_instance_command(self,
-                             instance,
-                             cmd,
-                             timeout_in_minutes=DEFAULT_TIMEOUT):
-
-    retry_count = 5
-
-    ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format(
-      instance, self.cluster_zone, cmd)
-
-    while retry_count > 0:
-      try:
-        ret_code, stdout, stderr = self.assert_command( ssh_cmd, timeout_in_minutes )
-        return ret_code, stdout, stderr
-      except Exception as e:
-        print("An error occurred: ", e)
-        retry_count -= 1
-        if retry_count > 0:
-          time.sleep(10)
-          continue
-        else:
-          raise
-
   def verify_instance(self, name):
     # Verify that nvidia-smi works
     import random
@@ -56,26 +25,6 @@ def verify_instance(self, name):
     time.sleep( 3 + random.randint(1, 30) )
     self.assert_instance_command(name, "nvidia-smi", 1)
 
-  def verify_pytorch(self, name):
-    test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                               self.TORCH_TEST_SCRIPT_FILE_NAME)
-    self.upload_test_file(test_filename, name)
-
-    verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
-        self.TORCH_TEST_SCRIPT_FILE_NAME)
-    self.assert_instance_command(name, verify_cmd)
-    self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)
-
-  def verify_tensorflow(self, name):
-    test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                               self.TF_TEST_SCRIPT_FILE_NAME)
-    self.upload_test_file(test_filename, name)
-
-    verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
-        self.TF_TEST_SCRIPT_FILE_NAME)
-    self.assert_instance_command(name, verify_cmd)
-    self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)
-
   def verify_mig_instance(self, name):
     self.assert_instance_command(name,
         "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index 936718498..aed5bd7e8 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -286,11 +286,24 @@ def assert_instance_command(self,
         Raises:
             AssertionError: if command returned non-0 exit code.
         """
+      retry_count = 5
 
-        ret_code, stdout, stderr = self.assert_command(
-            'gcloud compute ssh {} --zone={} --command="{}"'.format(
-                instance, self.cluster_zone, cmd), timeout_in_minutes)
-        return ret_code, stdout, stderr
+      ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format(
+        instance, self.cluster_zone, cmd)
+
+      while retry_count > 0:
+        try:
+          ret_code, stdout, stderr = self.assert_command(
+              ssh_cmd, timeout_in_minutes )
+          return ret_code, stdout, stderr
+        except Exception as e:
+          print("An error occurred: ", e)
+          retry_count -= 1
+          if retry_count > 0:
+            time.sleep(10)
+            continue
+          else:
+            raise
 
     def assert_dataproc_job(self,
                             cluster_name,

From 8143d4cef910b78ccb394b17c1663f0e4c1d95a1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 4 Jan 2025 00:23:17 -0800
Subject: [PATCH 081/130] remembered the imports ; sleep a random period

---
 integration_tests/dataproc_test_case.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index aed5bd7e8..e487dd8c5 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -7,6 +7,8 @@
 import string
 import subprocess
 import sys
+import time
+import random
 from threading import Timer
 
 import pkg_resources
@@ -300,7 +302,7 @@ def assert_instance_command(self,
           print("An error occurred: ", e)
           retry_count -= 1
           if retry_count > 0:
-            time.sleep(10)
+            time.sleep( 3 + random.randint(1, 10) )
             continue
           else:
             raise

From 834f7d5f128719b16762869e5ee396a9a4754193 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 4 Jan 2025 01:16:07 -0800
Subject: [PATCH 082/130] A100->H100

---
 gpu/test_gpu.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index ab2457ec2..1f3328eaa 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -247,9 +247,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     self.verify_pyspark()
 
   @parameterized.parameters(
-      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"),
-#      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"),
-      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"),
+      ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "11.8"),
+#      ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.0"),
+      ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.4"),
   )
   def test_install_gpu_with_mig(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
@@ -278,7 +278,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
         configuration,
         self.INIT_ACTIONS,
         master_machine_type="a3-highgpu-8g",
-        worker_machine_type="a2-highgpu-2g",
+        worker_machine_type="a3-highgpu-8g",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,

From 3d837955662b94899e793ec5cd069d50474affd6 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 4 Jan 2025 01:25:32 -0800
Subject: [PATCH 083/130] fixing whitespace for python

---
 integration_tests/dataproc_test_case.py | 35 +++++++++++++------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index e487dd8c5..c8a33b8bd 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -288,24 +288,25 @@ def assert_instance_command(self,
         Raises:
             AssertionError: if command returned non-0 exit code.
         """
-      retry_count = 5
 
-      ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format(
-        instance, self.cluster_zone, cmd)
-
-      while retry_count > 0:
-        try:
-          ret_code, stdout, stderr = self.assert_command(
-              ssh_cmd, timeout_in_minutes )
-          return ret_code, stdout, stderr
-        except Exception as e:
-          print("An error occurred: ", e)
-          retry_count -= 1
-          if retry_count > 0:
-            time.sleep( 3 + random.randint(1, 10) )
-            continue
-          else:
-            raise
+        retry_count = 5
+
+        ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format(
+          instance, self.cluster_zone, cmd)
+
+        while retry_count > 0:
+          try:
+            ret_code, stdout, stderr = self.assert_command(
+                ssh_cmd, timeout_in_minutes )
+            return ret_code, stdout, stderr
+          except Exception as e:
+            print("An error occurred: ", e)
+            retry_count -= 1
+            if retry_count > 0:
+              time.sleep( 3 + random.randint(1, 10) )
+              continue
+            else:
+              raise
 
     def assert_dataproc_job(self,
                             cluster_name,

From 7718e5abefc6689f0dde35afe44e24f3e55e7876 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 4 Jan 2025 17:08:37 -0800
Subject: [PATCH 084/130] moved knox variables to common env ; renamed
 ambiguous variable name

---
 templates/common/util_functions |  11 +
 templates/dask/dask.sh.in       |  75 +++++
 templates/dask/util_functions   | 502 ++++++++++++++++++++++++++++++++
 templates/rapids/rapids.sh.in   |  79 +++++
 4 files changed, 667 insertions(+)
 create mode 100644 templates/dask/dask.sh.in
 create mode 100644 templates/dask/util_functions
 create mode 100644 templates/rapids/rapids.sh.in

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 6d58103a7..ac4809796 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -539,6 +539,12 @@ function check_secure_boot() {
                       mok_der=/var/lib/dkms/mok.pub ; fi
 }
 
+function restart_knox() {  # Stop Knox, delete the (non-hidden) entries under ${KNOX_HOME}/data/deployments, start Knox again
+  systemctl stop knox
+  rm -rf "${KNOX_HOME:?}/data/deployments/"*  # glob sits outside the quotes so it expands; :? aborts if KNOX_HOME is unset or empty
+  systemctl start knox
+}
+
 function install_dependencies() {
   test -f "${workdir}/complete/install-dependencies" && return 0
   pkg_list="screen"
@@ -578,6 +584,11 @@ function prepare_common_env() {
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
+  # Knox config
+  readonly KNOX_HOME=/usr/lib/knox
+  readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0"
+  readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
+
   mkdir -p "${workdir}/complete"
   set_proxy
   mount_ramdisk
diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
new file mode 100644
index 000000000..84a279f0a
--- /dev/null
+++ b/templates/dask/dask.sh.in
@@ -0,0 +1,75 @@
+#!/bin/bash
+#
+[% INSERT legal/license_header %]
+#
+[% PROCESS common/template_disclaimer %]
+#
+# This initialization action script will install Dask and other relevant
+# libraries on a Dataproc cluster. This is supported for either "yarn" or
+# "standalone" runtimes. Please see dask.org and yarn.dask.org for more
+# information.
+
+set -euxo pipefail
+
+[% INSERT common/util_functions %]
+
+[% INSERT gpu/util_functions %]
+
+[% INSERT dask/util_functions %]
+
+function main() {  # Install Dask, then configure it for the runtime named by DASK_RUNTIME; exits 1 for any other value
+  # Install Dask before any runtime-specific configuration.
+  install_dask
+
+  # In "standalone" mode, Dask relies on a systemd unit to launch.
+  # In "yarn" mode, it relies on a config.yaml file.
+  if [[ "${DASK_RUNTIME}" == "yarn" ]]; then
+    # Create Dask YARN config file
+    configure_dask_yarn
+  elif [[ "${DASK_RUNTIME}" == "standalone" ]]; then
+    # Create the Dask systemd service(s), then start the ones that apply to this host.
+    install_systemd_dask_service
+
+    if [[ "$(hostname -s)" == "${MASTER}" ]]; then  # scheduler is started only where the short hostname equals MASTER
+      systemctl start "${DASK_SCHEDULER_SERVICE}"
+      systemctl status "${DASK_SCHEDULER_SERVICE}"  # with set -e in effect, a non-zero status aborts the script
+    fi
+
+    echo "Starting Dask 'standalone' cluster..."
+    if [[ "${enable_worker_service}" == "1" ]]; then  # enable_worker_service is set outside this function
+      systemctl start "${DASK_WORKER_SERVICE}"
+      systemctl status "${DASK_WORKER_SERVICE}"
+    fi
+
+    configure_knox_for_dask
+
+    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')"  # 'false' when the metadata lookup fails
+    if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then
+      configure_fluentd_for_dask
+    fi
+  else  # any DASK_RUNTIME other than "yarn" or "standalone" is rejected
+    echo "Unsupported Dask Runtime: ${DASK_RUNTIME}"
+    exit 1
+  fi
+
+  echo "Dask for ${DASK_RUNTIME} successfully initialized."
+}
+
+function exit_handler() {  # Registered as the EXIT trap by prepare_to_install
+  gpu_exit_handler  # GPU handler runs first,
+  common_exit_handler  # then the common handler
+  return 0
+}
+
+function prepare_to_install(){  # Prepare the common, GPU and Dask environments, pick the conda env name, register the EXIT trap
+  prepare_common_env
+  prepare_gpu_env
+  conda_env="$(get_metadata_attribute conda-env || echo 'dask')"  # metadata attribute conda-env; 'dask' when the lookup fails
+  readonly conda_env  # fixed before prepare_dask_env runs
+  prepare_dask_env
+  trap exit_handler EXIT  # installed last, so it is not active while the prepare_* steps above run
+}
+
+prepare_to_install  # set up the environments and register the EXIT trap before main does any work
+
+main  # install Dask and configure it for the selected runtime
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
new file mode 100644
index 000000000..47e10a7d3
--- /dev/null
+++ b/templates/dask/util_functions
@@ -0,0 +1,502 @@
+function configure_dask_yarn() {
+  # Write the Dask-Yarn config.yaml under /etc/dask/.
+  # Minimal custom configuration is required for this
+  # setup. Please see https://yarn.dask.org/en/latest/quickstart.html#usage
+  # for information on tuning Dask-Yarn environments.
+  readonly DASK_YARN_CONFIG_DIR=/etc/dask/
+  readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml
+  mkdir -p "${DASK_YARN_CONFIG_DIR}"
+  # CPU workers by default; CUDA workers with one GPU when nvidia-smi is on PATH
+  local yarn_gpus="0" yarn_worker_class="dask.distributed.Nanny"
+  if command -v nvidia-smi ; then
+    yarn_gpus="1"
+    yarn_worker_class="dask_cuda.CUDAWorker"
+  fi
+
+  cat <<EOF >"${DASK_YARN_CONFIG_FILE}"
+# Config file for Dask Yarn.
+#
+# These values are joined on top of the default config, found at
+# https://yarn.dask.org/en/latest/configuration.html#default-configuration
+
+yarn:
+  environment: python://${DASK_CONDA_ENV}/bin/python
+
+  worker:
+    count: 2
+    gpus: ${yarn_gpus}
+    worker_class: ${yarn_worker_class}
+EOF
+}
+
+function install_systemd_dask_worker() {
+  echo "Installing systemd Dask Worker service..."
+  # Scratch directory handed to the worker through --local-directory
+  local -r worker_scratch_dir="/tmp/${DASK_WORKER_SERVICE}"
+  mkdir -p "${worker_scratch_dir}"
+  # Use the CUDA worker binary when the conda env ships one
+  local worker_name="dask-worker"
+  if [[ -f "${DASK_CONDA_ENV}/bin/dask-cuda-worker" ]] ; then worker_name="dask-cuda-worker" ; fi
+  local worker="${DASK_CONDA_ENV}/bin/${worker_name}"
+  # When nvidia-smi is on PATH the launcher first sets the compute mode to DEFAULT
+  local compute_mode_cmd=""
+  if command -v nvidia-smi ; then compute_mode_cmd="nvidia-smi --compute-mode=DEFAULT" ; fi
+  local launcher_path="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh"
+  cat <<EOF >"${launcher_path}"
+#!/bin/bash
+LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log"
+${compute_mode_cmd}
+echo "${worker_name} starting, logging to \${LOGFILE}"
+${worker} "${MASTER}:8786" --local-directory="${worker_scratch_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1
+EOF
+
+  chmod 750 "${launcher_path}"
+
+  local -r unit_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service"
+  cat <<EOF >"${unit_file}"
+[Unit]
+Description=Dask Worker Service
+[Service]
+Type=simple
+Restart=on-failure
+ExecStart=/bin/bash -c 'exec ${launcher_path}'
+[Install]
+WantedBy=multi-user.target
+EOF
+  chmod a+r "${unit_file}"
+
+  systemctl daemon-reload
+
+  # Non-master nodes always run the worker; a master only on a single-node cluster
+  enable_systemd_dask_worker_service="0"
+  if [[ "${ROLE}" == "Master" ]]; then
+    # Single-node cluster (no workers): honor the *-worker-on-master attributes
+    local worker_count="$(get_metadata_attribute dataproc-worker-count)"
+    if [[ "${worker_count}" == "0" ]] &&
+       [[ "$(get_metadata_attribute dask-cuda-worker-on-master 'true')" == "true" ]] &&
+       [[ "$(get_metadata_attribute dask-worker-on-master 'true')" == "true" ]] ; then
+      enable_systemd_dask_worker_service="1"
+    fi
+  else
+    enable_systemd_dask_worker_service="1"
+  fi
+  readonly enable_systemd_dask_worker_service
+
+  if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
+    systemctl enable "${DASK_WORKER_SERVICE}"
+    systemctl restart "${DASK_WORKER_SERVICE}"
+  fi
+}
+
+function install_systemd_dask_scheduler() {
+  # The scheduler unit is installed on the primary master only
+  [[ "$(hostname -s)" == "${MASTER}" ]] || return 0
+  echo "Installing systemd Dask Scheduler service..."
+
+  local -r scheduler_scratch_dir="/tmp/${DASK_SCHEDULER_SERVICE}"
+  mkdir -p "${scheduler_scratch_dir}"
+
+  # Wrapper script that appends the scheduler's output to its log file
+  local launcher_path="/usr/local/bin/${DASK_SCHEDULER_SERVICE}-launcher.sh"
+  cat <<EOF >"${launcher_path}"
+#!/bin/bash
+LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log"
+echo "dask scheduler starting, logging to \${LOGFILE}"
+${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1
+EOF
+
+  chmod 750 "${launcher_path}"
+
+  local -r unit_file="/usr/lib/systemd/system/${DASK_SCHEDULER_SERVICE}.service"
+  cat <<EOF >"${unit_file}"
+[Unit]
+Description=Dask Scheduler Service
+[Service]
+Type=simple
+Restart=on-failure
+ExecStart=/bin/bash -c 'exec ${launcher_path}'
+[Install]
+WantedBy=multi-user.target
+EOF
+  chmod a+r "${unit_file}"
+
+  systemctl daemon-reload
+
+  # Enabled here; starting the unit is left to the caller
+  systemctl enable "${DASK_SCHEDULER_SERVICE}"
+}
+
+function install_systemd_dask_service() {
+  # Scheduler unit first, then the worker unit
+  install_systemd_dask_scheduler ; install_systemd_dask_worker
+}
+
+function configure_knox_for_dask() {
+  if [[ ! -d "${KNOX_HOME}" ]]; then
+    echo "Skip configuring Knox rules for Dask"
+    return 0
+  fi
+  # Insert DASK and DASKWS <service> entries just before </topology> in the default topology
+  local DASK_UI_PORT=8787
+  if [[ -f /etc/knox/conf/topologies/default.xml ]]; then
+    sed -i \
+      "/<\/topology>/i <service><role>DASK<\/role><url>http://localhost:${DASK_UI_PORT}<\/url><\/service> <service><role>DASKWS<\/role><url>ws:\/\/${MASTER}:${DASK_UI_PORT}<\/url><\/service>" \
+      /etc/knox/conf/topologies/default.xml
+  fi
+  # DASK role: service definition (routes and the rewrite rules they apply)
+  mkdir -p "${KNOX_DASK_DIR}"
+
+  cat >"${KNOX_DASK_DIR}/service.xml" <<'EOF'
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+
+<service role="DASK" name="dask" version="0.1.0">
+  <policies>
+    <policy role="webappsec"/>
+    <policy role="authentication" name="Anonymous"/>
+    <policy role="rewrite"/>
+    <policy role="authorization"/>
+  </policies>
+
+  <routes>
+    <!-- Javascript paths -->
+    <route path="/dask/**/*.js">
+      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
+    </route>
+    <route path="/dask/**/*.js?**">
+      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
+    </route>
+
+    <!-- CSS paths -->
+    <route path="/dask/**/*.css">
+      <rewrite apply="DASK/dask/inbound/css/dask" to="request.url"/>
+    </route>
+
+    <!-- General path routing -->
+    <route path="/dask">
+      <rewrite apply="DASK/dask/inbound/root" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
+    </route>
+    <route path="/dask/**">
+      <rewrite apply="DASK/dask/inbound/root/path" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
+      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
+    </route>
+    <route path="/dask/**?**">
+      <rewrite apply="DASK/dask/inbound/root/query" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
+      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
+    </route>
+  </routes>
+  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
+</service>
+EOF
+  # DASK role: the rewrite rules named by the routes in service.xml
+  cat >"${KNOX_DASK_DIR}/rewrite.xml" <<'EOF'
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+
+<rules>
+  <rule dir="IN" name="DASK/dask/inbound/js/dask" pattern="http://*:*/**/dask/{**}?{**}">
+    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
+  </rule>
+  <rule dir="IN" name="DASK/dask/inbound/root" pattern="http://*:*/**/dask">
+    <rewrite template="{$serviceUrl[DASK]}"/>
+  </rule>
+  <rule dir="IN" name="DASK/dask/inbound/root/path" pattern="http://*:*/**/dask/{**}">
+    <rewrite template="{$serviceUrl[DASK]}/{**}"/>
+  </rule>
+  <rule dir="IN" name="DASK/dask/inbound/root/query" pattern="http://*:*/**/dask/{**}?{**}">
+    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
+  </rule>
+  <rule dir="IN" name="DASK/dask/inbound/css/dask" pattern="http://*:*/**/dask/{**}?{**}">
+    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
+  </rule>
+  <!-- without the /gateway/default prefix -->
+  <rule dir="IN" name="DASK/dask/inbound/root/noprefix" pattern="http://*:*/dask">
+    <rewrite template="{$serviceUrl[DASK]}"/>
+  </rule>
+
+  <rule dir="OUT" name="DASK/dask/outbound/logs" pattern="/logs">
+    <rewrite template="{$frontend[path]}/dask/info/logs"/>
+  </rule>
+
+  <!-- Rewrite redirect responses Location header -->
+  <filter name="DASK/dask/outbound/headers">
+    <content type="application/x-http-headers">
+      <apply path="Location" rule="DASK/dask/outbound/headers/location"/>
+    </content>
+  </filter>
+
+  <rule dir="OUT" name="DASK/dask/outbound/headers/location" flow="OR">
+    <match pattern="*://*:*/">
+      <rewrite template="{$frontend[path]}/dask/"/>
+    </match>
+    <match pattern="*://*:*/{**}">
+      <rewrite template="{$frontend[path]}/dask/{**}"/>
+    </match>
+    <match pattern="*://*:*/{**}?{**}">
+      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
+    </match>
+    <match pattern="/{**}">
+      <rewrite template="{$frontend[path]}/dask/{**}"/>
+    </match>
+    <match pattern="/{**}?{**}">
+      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
+    </match>
+  </rule>
+</rules>
+EOF
+  # DASKWS role: service definition with a single /dask/**/ws route
+  mkdir -p "${KNOX_DASKWS_DIR}"
+
+  cat >"${KNOX_DASKWS_DIR}/service.xml" <<'EOF'
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+
+<service role="DASKWS" name="daskws" version="0.1.0">
+  <policies>
+    <policy role="webappsec"/>
+    <policy role="authentication" name="Anonymous"/>
+    <policy role="rewrite"/>
+    <policy role="authorization"/>
+  </policies>
+
+  <routes>
+
+    <route path="/dask/**/ws">
+      <rewrite apply="DASKWS/daskws/inbound/ws" to="request.url"/>
+    </route>
+
+  </routes>
+  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
+</service>
+EOF
+  # DASKWS role: the inbound ws:// rewrite rule named by the route above
+  cat >"${KNOX_DASKWS_DIR}/rewrite.xml" <<'EOF'
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+
+<rules>
+  <rule dir="IN" name="DASKWS/daskws/inbound/ws" pattern="ws://*:*/**/dask/{**}/ws">
+    <rewrite template="{$serviceUrl[DASKWS]}/{**}/ws"/>
+  </rule>
+</rules>
+EOF
+  # Hand both generated service directories to the knox user
+  chown -R knox:knox "${KNOX_DASK_DIR}" "${KNOX_DASKWS_DIR}"
+
+  # Do not restart knox during pre-init script run
+  if [[ -n "${ROLE}" ]]; then
+    restart_knox
+  fi
+}
+
+function configure_fluentd_for_dask() {
+  if [[ "$(hostname -s)" == "${MASTER}" ]]; then
+    cat >/etc/google-fluentd/config.d/dataproc-dask.conf <<EOF
+# Fluentd config for Dask logs
+
+# Dask scheduler
+<source>
+  @type tail
+  path /var/log/dask-scheduler.log
+  pos_file /var/tmp/fluentd.dataproc.dask.scheduler.pos
+  read_from_head true
+  tag google.dataproc.dask-scheduler
+  <parse>
+    @type none
+  </parse>
+</source>
+
+<filter google.dataproc.dask-scheduler>
+  @type record_transformer
+  <record>
+    filename dask-scheduler.log
+  </record>
+</filter>
+EOF
+  fi
+  # The worker section is appended (>>) so it follows the scheduler section written above
+  if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
+    cat >>/etc/google-fluentd/config.d/dataproc-dask.conf <<EOF
+# Dask worker
+<source>
+  @type tail
+  path /var/log/dask-worker.log
+  pos_file /var/tmp/fluentd.dataproc.dask.worker.pos
+  read_from_head true
+  tag google.dataproc.dask-worker
+  <parse>
+    @type none
+  </parse>
+</source>
+
+<filter google.dataproc.dask-worker>
+  @type record_transformer
+  <record>
+    filename dask-worker.log
+  </record>
+</filter>
+EOF
+  fi
+  # Restart google-fluentd after the config file has been written
+  systemctl restart google-fluentd
+}
+
+function install_dask() {
+  if is_cuda12 ; then
+    local python_spec="python>=3.11"
+    local cuda_spec="cuda-version>=12,<13"
+    local dask_spec="dask>=2024.7"
+  elif is_cuda11 ; then
+    local python_spec="python>=3.9"
+    local cuda_spec="cuda-version>=11,<12.0a0"
+    local dask_spec="dask"
+  fi
+
+  CONDA_PACKAGES=()
+  if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
+    # Pin `distributed` and `dask` package versions to old release
+    # because `dask-yarn` 0.9 uses skein in a way which
+    # is not compatible with `distributed` package 2022.2 and newer:
+    # https://github.com/dask/dask-yarn/issues/155
+
+    dask_spec="dask<2022.2"
+    python_spec="python>=3.7,<3.8.0a0"
+    if is_ubuntu18 ; then
+      # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
+      CONDA_PACKAGES+=("fiona<1.8.22")
+    fi
+    CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2")
+  fi
+
+  CONDA_PACKAGES+=(
+    "${cuda_spec}"
+    "${dask_spec}"
+    "dask-bigquery"
+    "dask-ml"
+    "dask-sql"
+  )
+
+  # Install dask: create the ${conda_env} env unless ${DASK_CONDA_ENV} already exists
+  mamba="/opt/conda/miniconda3/bin/mamba"
+  conda="/opt/conda/miniconda3/bin/conda"
+  # Try mamba first, then conda; the subshell keeps "set +e" from leaking to the caller
+  ( set +e
+  local is_installed=0
+  for installer in "${mamba}" "${conda}" ; do
+    test -d "${DASK_CONDA_ENV}" || \
+      time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \
+      -c 'conda-forge' -c 'nvidia'  \
+      "${CONDA_PACKAGES[@]}" \
+      "${python_spec}" \
+      > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
+    sync
+    if [[ "$retval" == "0" ]] ; then
+      is_installed="1"
+      break
+    fi
+    "${conda}" config --set channel_priority flexible
+  done
+  if [[ "${is_installed}" == "0" ]]; then
+    echo "failed to install dask"
+    return 1
+  fi
+  )
+}
+
+function install_dask_rapids() {
+  # Create the ${conda_env} conda env with RAPIDS, Dask and CUDA packages; returns 1 on failure
+  local python_spec cuda_spec dask_spec numba_spec="numba"
+  if is_cuda12 ; then
+    python_spec="python>=3.11"
+    cuda_spec="cuda-version>=12,<13"
+    dask_spec="dask>=2024.7"
+  elif is_cuda11 ; then
+    python_spec="python>=3.9"
+    cuda_spec="cuda-version>=11,<12.0a0"
+    dask_spec="dask"
+  else
+    echo "install_dask_rapids: CUDA version is neither 11 nor 12" >&2 ; return 1
+  fi
+  rapids_spec="rapids>=${RAPIDS_VERSION}"
+  CONDA_PACKAGES=()
+  if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
+    # Pin `distributed` and `dask` package versions to old release
+    # because `dask-yarn` 0.9 uses skein in a way which
+    # is not compatible with `distributed` package 2022.2 and newer:
+    # https://github.com/dask/dask-yarn/issues/155
+    dask_spec="dask<2022.2"
+    python_spec="python>=3.7,<3.8.0a0"
+    rapids_spec="rapids<=24.05"
+    if is_ubuntu18 ; then
+      # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
+      CONDA_PACKAGES+=("fiona<1.8.22")
+    fi
+    CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2")
+  fi
+
+  CONDA_PACKAGES+=(
+    "${cuda_spec}"
+    "${rapids_spec}"
+    "${dask_spec}"
+    "dask-bigquery"
+    "dask-ml"
+    "dask-sql"
+    "cudf"
+    "${numba_spec}"
+  )
+
+  # Install cuda, rapids, dask
+  mamba="/opt/conda/miniconda3/bin/mamba"
+  conda="/opt/conda/miniconda3/bin/conda"
+  # Try mamba first, then conda; the subshell keeps "set +e" from leaking to the caller
+  ( set +e
+  local is_installed="0"
+  for installer in "${mamba}" "${conda}" ; do
+    test -d "${DASK_CONDA_ENV}" || \
+      time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \
+      -c 'conda-forge' -c 'nvidia' -c 'rapidsai'  \
+      "${CONDA_PACKAGES[@]}" \
+      "${python_spec}" \
+      > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
+    sync
+    if [[ "$retval" == "0" ]] ; then
+      is_installed="1"
+      break
+    fi
+    "${conda}" config --set channel_priority flexible
+  done
+  if [[ "${is_installed}" == "0" ]]; then
+    echo "failed to install dask"
+    return 1
+  fi
+  )
+}
+
+function prepare_dask_env() {
+  # Runtime from the dask-runtime attribute; 'standalone' when the lookup fails
+  DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')"
+  DASK_SERVICE=dask-cluster
+  DASK_WORKER_SERVICE=dask-worker
+  DASK_SCHEDULER_SERVICE=dask-scheduler
+  DASK_CONDA_ENV="/opt/conda/miniconda3/envs/${conda_env}"
+  readonly DASK_RUNTIME DASK_SERVICE DASK_WORKER_SERVICE DASK_SCHEDULER_SERVICE DASK_CONDA_ENV
+}
+
+function prepare_dask_rapids_env(){
+  prepare_dask_env
+
+  # RAPIDS runtime, with 'DASK' passed as the default
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
+  readonly RAPIDS_RUNTIME
+
+  # Image 2.0 falls back to 23.08, the final release to support spark 3.1.3
+  local fallback_rapids_version="24.08"
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then fallback_rapids_version="23.08" ; fi
+  readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' "${fallback_rapids_version}")
+}
+
+# Placeholder exit hook for Dask: it only prints a message.
+function dask_exit_handler() {
+  echo "no exit handler for dask"
+}
diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
new file mode 100644
index 000000000..9c74f5f3f
--- /dev/null
+++ b/templates/rapids/rapids.sh.in
@@ -0,0 +1,79 @@
+#!/bin/bash
+#
+[% INSERT legal/license_header %]
+#
+[% PROCESS common/template_disclaimer %]
+#
+# This initialization action script will install rapids on a Dataproc
+# cluster.
+
+set -euxo pipefail
+
+[% INSERT common/util_functions %]
+
+[% INSERT gpu/util_functions %]
+
+[% INSERT dask/util_functions %]
+
+function main() {
+  # Install Dask with RAPIDS
+  install_dask_rapids
+
+  # In "standalone" mode, Dask relies on a systemd unit to launch.
+  # In "yarn" mode, it relies on a config.yaml file.
+  if [[ "${DASK_RUNTIME}" == "yarn" ]]; then
+    # Create cuda accelerated Dask YARN config file
+    configure_dask_yarn
+  else
+    # Create Dask service
+    install_systemd_dask_service
+    # The scheduler unit is only installed on the primary master
+    if [[ "$(hostname -s)" == "${MASTER}" ]]; then
+      systemctl start "${DASK_SCHEDULER_SERVICE}"
+      systemctl status "${DASK_SCHEDULER_SERVICE}"
+    fi
+
+    echo "Starting Dask 'standalone' cluster..."
+    # Worker start is gated on the flag set by install_systemd_dask_worker
+    if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
+      systemctl start "${DASK_WORKER_SERVICE}"
+      systemctl status "${DASK_WORKER_SERVICE}"
+    fi
+    configure_knox_for_dask
+
+    # Cloud logging is opt-in through the dask-cloud-logging metadata attribute
+    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')"
+    if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then
+      configure_fluentd_for_dask
+    fi
+  fi
+
+  echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized."
+  if [[ "${ROLE}" == "Master" ]]; then
+    systemctl restart hadoop-yarn-resourcemanager.service
+    # Restart NodeManager on Master as well if this is a single-node-cluster.
+    if systemctl list-units | grep hadoop-yarn-nodemanager; then
+      systemctl restart hadoop-yarn-nodemanager.service
+    fi
+  else
+    systemctl restart hadoop-yarn-nodemanager.service
+  fi
+}
+
+function exit_handler() {
+  # EXIT trap body: GPU cleanup hook, then the common cleanup hook
+  gpu_exit_handler ; common_exit_handler
+  return 0
+}
+
+function prepare_to_install(){
+  # Common and GPU setup first, then the conda env name, then Dask/RAPIDS settings
+  prepare_common_env ; prepare_gpu_env
+  conda_env="$(get_metadata_attribute conda-env || echo 'dask-rapids')"
+  readonly conda_env
+  prepare_dask_rapids_env
+  trap exit_handler EXIT
+}
+
+prepare_to_install  # set up readonly configuration and register the EXIT trap
+
+main  # install Dask with RAPIDS, configure the runtime, restart YARN services

From aded30b112fe39504f1a86b7896b6b893ed7b794 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 4 Jan 2025 17:20:17 -0800
Subject: [PATCH 085/130] remove gpu related code from dask action

---
 templates/dask/dask.sh.in              |   2 -
 templates/dask/util_functions          |  12 +-
 templates/rapids/util_functions        |   0
 templates/spark-rapids/spark-rapids.sh | 807 +++++++++++++++++++++++++
 4 files changed, 809 insertions(+), 12 deletions(-)
 create mode 100644 templates/rapids/util_functions
 create mode 100644 templates/spark-rapids/spark-rapids.sh

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index 84a279f0a..fd14be4a7 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -13,8 +13,6 @@ set -euxo pipefail
 
 [% INSERT common/util_functions %]
 
-[% INSERT gpu/util_functions %]
-
 [% INSERT dask/util_functions %]
 
 function main() {
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index 47e10a7d3..b9377b785 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -343,15 +343,8 @@ EOF
 }
 
 function install_dask() {
-  if is_cuda12 ; then
-    local python_spec="python>=3.11"
-    local cuda_spec="cuda-version>=12,<13"
-    local dask_spec="dask>=2024.7"
-  elif is_cuda11 ; then
-    local python_spec="python>=3.9"
-    local cuda_spec="cuda-version>=11,<12.0a0"
-    local dask_spec="dask"
-  fi
+  local python_spec="python>=3.11"
+  local dask_spec="dask>=2024.7"
 
   CONDA_PACKAGES=()
   if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
@@ -370,7 +363,6 @@ function install_dask() {
   fi
 
   CONDA_PACKAGES+=(
-    "${cuda_spec}"
     "${dask_spec}"
     "dask-bigquery"
     "dask-ml"
diff --git a/templates/rapids/util_functions b/templates/rapids/util_functions
new file mode 100644
index 000000000..e69de29bb
diff --git a/templates/spark-rapids/spark-rapids.sh b/templates/spark-rapids/spark-rapids.sh
new file mode 100644
index 000000000..c03bf80ef
--- /dev/null
+++ b/templates/spark-rapids/spark-rapids.sh
@@ -0,0 +1,807 @@
+#!/bin/bash
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script installs NVIDIA GPU drivers (version 550.54.15 by default) along with CUDA 12.4.1.
+# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only
+# Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions.
+# Note that the script is designed to work when secure boot is disabled during cluster creation.
+# It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu.
+
+set -euxo pipefail
+
+function os_id() {
+  grep '^ID=' /etc/os-release | cut -d '=' -f 2 | xargs echo
+}
+
+function os_version() {
+  grep '^VERSION_ID=' /etc/os-release | cut -d '=' -f 2 | xargs echo
+}
+
+function is_debian() {
+  test "$(os_id)" = 'debian'
+}
+
+function is_debian10() {
+  is_debian && [[ "$(os_version)" =~ ^10 ]]
+}
+
+function is_debian11() {
+  is_debian && [[ "$(os_version)" =~ ^11 ]]
+}
+
+function is_debian12() {
+  is_debian && [[ "$(os_version)" =~ ^12 ]]
+}
+
+function is_ubuntu() {
+  test "$(os_id)" = 'ubuntu'
+}
+
+function is_ubuntu18() {
+  is_ubuntu && [[ "$(os_version)" =~ ^18\.04 ]]
+}
+
+function is_ubuntu20() {
+  is_ubuntu && [[ "$(os_version)" =~ ^20\.04 ]]
+}
+
+function is_ubuntu22() {
+  is_ubuntu && [[ "$(os_version)" =~ ^22\.04 ]]
+}
+
+function is_rocky() {
+  test "$(os_id)" = 'rocky'
+}
+
+function is_rocky8() {
+  is_rocky && [[ "$(os_version)" =~ ^8 ]]
+}
+
+function is_rocky9() {
+  is_rocky && [[ "$(os_version)" =~ ^9 ]]
+}
+
+function os_vercat() {
+  # Ubuntu: keep digits only (22.04 -> 2204); Rocky: cut at the first
+  # non-digit (8.9 -> 8); anything else: VERSION_ID unchanged
+  case "$(os_id)" in
+    ubuntu) os_version | sed -e 's/[^0-9]//g' ;;
+    rocky)  os_version | sed -e 's/[^0-9].*$//g' ;;
+    *)      os_version ;;
+  esac
+}
+
+function get_metadata_attribute() {
+  # $1: attribute name; $2: optional fallback printed when the lookup fails
+  local -r key="$1" fallback="${2:-}"
+  /usr/share/google/get_metadata_value "attributes/${key}" || echo -n "${fallback}"
+}
+
+CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"  # -u only generates the name; configure_dkms_certs creates the directory
+PSN="$(get_metadata_attribute private_secret_name)"  # secret name for the private signing key; empty disables signing setup
+readonly PSN
+function configure_dkms_certs() {
+  # Fetch the module signing key/cert from Secret Manager and place them where DKMS looks for them
+  if [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping";
+      return 0
+  fi
+
+  mkdir -p "${CA_TMPDIR}"
+
+  # If the private key exists, verify it
+  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
+    echo "Private key material exists"
+
+    local expected_modulus_md5sum modulus_md5sum
+    expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum)
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
+      modulus_md5sum="${expected_modulus_md5sum}"
+    else
+      modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"
+    fi
+
+    # Verify that the private key modulus md5sum matches the expected md5sum
+    if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key modulus"
+    fi
+    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
+
+    # Verify that the certificate modulus md5sum matches the expected md5sum
+    if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert modulus"
+    fi
+
+    return
+  fi
+
+  # Retrieve cloud secrets keys
+  local sig_priv_secret_name
+  sig_priv_secret_name="${PSN}"
+  local sig_pub_secret_name
+  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
+  local sig_secret_project
+  sig_secret_project="$(get_metadata_attribute secret_project)"
+  local sig_secret_version
+  sig_secret_version="$(get_metadata_attribute secret_version)"
+
+  # If metadata values are not set, do not write mok keys
+  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
+
+  # Write private material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_priv_secret_name}" \
+      | dd status=none of="${CA_TMPDIR}/db.rsa"
+
+  # Write public material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_pub_secret_name}" \
+      | base64 --decode \
+      | dd status=none of="${CA_TMPDIR}/db.der"
+
+  # symlink private key and copy public cert from volatile storage for DKMS
+  if is_ubuntu ; then
+    mkdir -p /var/lib/shim-signed/mok
+    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv
+    cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der
+  else
+    mkdir -p /var/lib/dkms/
+    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
+    cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub
+  fi
+}
+
+function clear_dkms_key {  # remove the private signing material; all messages go to stderr
+  if [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping" >&2
+      return 0
+  fi
+  echo "WARN -- PURGING SIGNING MATERIAL -- WARN" >&2
+  echo "future dkms runs will not use correct signing key" >&2
+  rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv
+}
+
+function add_contrib_components() {
+  # Nothing to do outside Debian
+  is_debian || return 0
+
+  if ! is_debian12 ; then
+    # Other Debian releases: append contrib to sources.list lines ending in " main"
+    sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
+    return
+  fi
+  # Include in sources file components on which nvidia-open-kernel-dkms depends
+  local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+  local components="main contrib"
+  sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+}
+
+# Short name for nvidia urls: <distro id><os_vercat>, with "rocky" mapped to "rhel"
+if is_rocky ; then
+    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
+else
+    shortname="$(os_id)$(os_vercat)"
+fi
+readonly shortname
+
+# Detect dataproc image version from its various names
+if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then
+  DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+fi
+
+# Fetch the distributor ID reported by lsb_release, lower-cased
+readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
+
+# Fetch SPARK config: major.minor parsed from the first "version" line of spark-submit --version
+readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
+if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then
+  readonly DEFAULT_XGBOOST_VERSION="1.7.6"
+  readonly SPARK_VERSION="3.0"
+else
+  echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+  exit 1
+fi
+
+# Update SPARK RAPIDS config (metadata attributes override the defaults)
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0"
+readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
+readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
+
+# Fetch instance roles and runtime
+readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
+readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
+readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+
+# CUDA version and Driver version config
+CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.4.1')  # default 12.4.1
+NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '550.54.15') # default 550.54.15
+CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  # strips the last component, e.g. 12.4.1 -> 12.4
+
+# EXCEPTIONS
+# Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
+if [[ "${OS_NAME}" == "ubuntu" ]]; then
+    if is_ubuntu18 ; then
+      CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1')  # default 12.1.1
+      NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') # default 530.30.02
+      CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  # e.g. 12.1.1 -> 12.1
+    fi
+fi
+
+# Detect the Secure Boot state; keep "disabled" when mokutil cannot report it
+SECURE_BOOT="disabled"
+SECURE_BOOT=$(mokutil --sb-state | awk '{print $2}') || SECURE_BOOT="disabled"
+
+# Stackdriver GPU agent parameters
+# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
+INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
+readonly INSTALL_GPU_AGENT
+
+# Dataproc configurations
+readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+readonly HIVE_CONF_DIR='/etc/hive/conf'
+readonly SPARK_CONF_DIR='/etc/spark/conf'
+# NOTE(review): nvidia-smi directory and MIG defaults; their consumers are not visible here — confirm
+NVIDIA_SMI_PATH='/usr/bin'
+MIG_MAJOR_CAPS=0
+IS_MIG_ENABLED=0
+
+function execute_with_retries() {
+  # Run the command string $1 through eval up to 10 times, 5s apart; 0 on first success, else 1
+  local -r cmd=$1
+  local i  # keep the loop counter out of the caller's scope
+  for ((i = 0; i < 10; i++)); do
+    if time eval "$cmd"; then return 0 ; fi
+    sleep 5
+  done
+  return 1
+}
+
+function install_spark_rapids() {
+  # Download the two XGBoost GPU jars and the rapids-4-spark jar
+  # into /usr/lib/spark/jars/
+  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
+  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
+  local -ra jar_urls=(
+    "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar"
+    "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar"
+    "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar"
+  )
+  local jar_url
+  for jar_url in "${jar_urls[@]}"; do
+    wget -nv --timeout=30 --tries=5 --retry-connrefused "${jar_url}" -P /usr/lib/spark/jars/
+  done
+}
+
+function configure_spark() {  # append a RAPIDS properties block to spark-defaults.conf
+  if [[ "${SPARK_VERSION}" == "3"* ]]; then
+    cat >>${SPARK_CONF_DIR}/spark-defaults.conf <<EOF
+
+###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
+# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
+# query explain output won't show GPU operator, if user have doubt
+# they can uncomment the line before seeing the GPU plan explain, but AQE on gives user the best performance.
+spark.executor.resource.gpu.amount=1
+spark.plugins=com.nvidia.spark.SQLPlugin
+spark.executor.resource.gpu.discoveryScript=/usr/lib/spark/scripts/gpu/getGpusResources.sh
+spark.dynamicAllocation.enabled=false
+spark.sql.autoBroadcastJoinThreshold=10m
+spark.sql.files.maxPartitionBytes=512m
+# please update this config according to your application
+spark.task.resource.gpu.amount=0.25
+###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
+EOF
+  else  # NOTE(review): XGBOOST_GPU_SUB_VERSION is not assigned in the visible part of this script — confirm
+    cat >>${SPARK_CONF_DIR}/spark-defaults.conf <<EOF
+
+###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
+spark.submit.pyFiles=/usr/lib/spark/jars/xgboost4j-spark_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar
+###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
+EOF
+  fi
+}
+
+# Enables a systemd service on bootup that installs linux-headers for the running kernel (up to 3 attempts, 5s apart).
+# Matching headers are what allow kernel modules to be recompiled on Ubuntu and Debian, which nvidia-smi needs to function.
+function setup_systemd_update_headers() {
+  cat <<EOF >/lib/systemd/system/install-headers.service
+[Unit]
+Description=Install Linux headers for the current kernel
+After=network-online.target
+
+[Service]
+ExecStart=/bin/bash -c 'count=0; while [ \$count -lt 3 ]; do /usr/bin/apt-get install -y -q linux-headers-\$(/bin/uname -r) && break; count=\$((count+1)); sleep 5; done'
+Type=oneshot
+RemainAfterExit=yes
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+  # Reload systemd to recognize the new unit file
+  systemctl daemon-reload
+
+  # Enable and start the service
+  systemctl enable --now install-headers.service
+}
+
+readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'  # base for the local CUDA installer URLs
+readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"  # per-distro x86_64 CUDA repository
+
+# Hold all NVIDIA-related packages so that they are not upgraded unintentionally, e.g. by services
+# like unattended-upgrades.  Users should run apt-mark unhold before they wish to upgrade these packages
+function hold_nvidia_packages() {
+  apt-mark hold 'nvidia-*'  # quoted so the pattern reaches apt-mark instead of being expanded against files in the cwd
+  apt-mark hold 'libnvidia-*'
+  if dpkg -l | grep "xserver-xorg-video-nvidia" >/dev/null; then  # no -q: an early grep exit can SIGPIPE dpkg and fail the test under pipefail
+    apt-mark hold 'xserver-xorg-video-nvidia*'
+  fi
+}
+
+# Install the NVIDIA GPU driver and the CUDA toolkit from NVIDIA-provided packages/installers
+function install_nvidia_gpu_driver() {
+  # NB: declares readonly globals, so a second call in the same shell fails; exits 1 on an unsupported OS
+  ## common steps for all linux family distros
+  readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*}
+
+  ## For Debian & Ubuntu
+  readonly LOCAL_INSTALLER_DEB="cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb"
+  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
+  readonly DIST_KEYRING_DIR="/var/cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local"
+
+  ## installation steps based OS
+  if is_debian ; then
+
+    export DEBIAN_FRONTEND=noninteractive
+
+    execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
+
+    curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+      "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb
+
+    dpkg -i /tmp/local-installer.deb
+    rm /tmp/local-installer.deb
+    cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
+
+    add_contrib_components
+
+    execute_with_retries "apt-get update"
+
+    ## EXCEPTION
+    if is_debian10 ; then
+      apt-get remove -y libglvnd0
+      apt-get install -y ca-certificates-java
+    fi
+
+    configure_dkms_certs
+    execute_with_retries "apt-get install -y -q nvidia-kernel-open-dkms"
+    clear_dkms_key
+    execute_with_retries \
+      "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
+    execute_with_retries \
+      "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"
+
+    modprobe nvidia
+
+    # enable a systemd service that updates kernel headers after reboot
+    setup_systemd_update_headers
+    # prevent auto upgrading nvidia packages
+    hold_nvidia_packages
+
+  elif is_ubuntu ; then
+
+    execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
+
+    # Ubuntu 18.04 is not supported by new style NV debs; install from .run files + github
+    if is_ubuntu18 ; then
+
+      # fetch .run file; -f makes curl fail on an HTTP error instead of saving the error page for bash to run
+      curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 -o driver.run \
+        "https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run"
+      # Install all but kernel driver
+      bash driver.run --no-kernel-modules --silent --install-libglvnd
+      rm driver.run
+
+      WORKDIR=/opt/install-nvidia-driver
+      mkdir -p "${WORKDIR}"
+      pushd "${WORKDIR}"
+      # Fetch open source kernel module with corresponding tag
+      test -d open-gpu-kernel-modules || \
+        git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
+            --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
+      cd "${WORKDIR}/open-gpu-kernel-modules"
+      #
+      # build kernel modules
+      #
+      make -j$(nproc) modules \
+           > /var/log/open-gpu-kernel-modules-build.log \
+          2> /var/log/open-gpu-kernel-modules-build_error.log
+      configure_dkms_certs
+      # sign each built module with db.rsa / db.der from ${CA_TMPDIR}
+      for module in $(find kernel-open -name '*.ko'); do
+        /lib/modules/$(uname -r)/build/scripts/sign-file sha256 \
+          "${CA_TMPDIR}/db.rsa" \
+          "${CA_TMPDIR}/db.der" \
+          "${module}"
+      done
+      clear_dkms_key
+      # install
+      make modules_install \
+           >> /var/log/open-gpu-kernel-modules-build.log \
+          2>> /var/log/open-gpu-kernel-modules-build_error.log
+      depmod -a
+      modprobe nvidia
+      popd
+
+      #
+      # Install CUDA
+      #
+      cuda_runfile="cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run"
+      curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+       "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \
+       -o cuda.run
+      time bash cuda.run --silent --toolkit --no-opengl-libs
+      rm cuda.run
+    else
+      # Install from repo provided by NV
+      readonly UBUNTU_REPO_CUDA_PIN="${NVIDIA_REPO_URL}/cuda-${shortname}.pin"
+
+      curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+        "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600
+
+      curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+        "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb
+
+      dpkg -i /tmp/local-installer.deb
+      rm /tmp/local-installer.deb
+      cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
+      execute_with_retries "apt-get update"
+
+      execute_with_retries "apt-get install -y -q --no-install-recommends dkms"
+      configure_dkms_certs
+      for pkg in "nvidia-driver-${NVIDIA_DRIVER_VERSION_PREFIX}-open" \
+                 "cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" \
+                 "cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" ; do
+        execute_with_retries "apt-get install -y -q --no-install-recommends ${pkg}"
+      done
+      clear_dkms_key
+
+      modprobe nvidia
+    fi
+
+
+    # enable a systemd service that updates kernel headers after reboot
+    setup_systemd_update_headers
+    # prevent auto upgrading nvidia packages
+    hold_nvidia_packages
+
+  elif is_rocky ; then
+
+    # Ensure the Correct Kernel Development Packages are Installed
+    execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*"
+    execute_with_retries "dnf -y -q install pciutils kernel-devel gcc"
+
+    readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
+    execute_with_retries "dnf clean all"
+    configure_dkms_certs
+    execute_with_retries "dnf -y -q module install nvidia-driver:latest-dkms"
+    clear_dkms_key
+    execute_with_retries "dnf -y -q install cuda-toolkit"
+    modprobe nvidia
+
+  else
+    echo "Unsupported OS: '${OS_NAME}'"
+    exit 1
+  fi
+  ldconfig
+  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+}
+
+# Installs and starts the agent that collects the 'gpu_utilization' and 'gpu_memory_utilization' metrics
+function install_gpu_agent() {
+  download_agent            # fetch the agent sources
+  install_agent_dependency  # set up the agent's Python environment
+  start_agent_service       # install and start the agent's systemd unit
+}
+
+function download_agent(){  # make sure git is installed, then clone the monitoring agent under /opt/google
+  if is_rocky ; then  # same OS predicate as the rest of the script, instead of comparing ${OS_NAME}
+    execute_with_retries "dnf -y -q install git"
+  else
+    execute_with_retries "apt-get install git -y"
+  fi
+  mkdir -p /opt/google
+  chmod 777 /opt/google  # NOTE(review): world-writable; confirm the agent really needs this
+  cd /opt/google         # NB: changes the working directory of the calling shell
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+}
+
+function install_agent_dependency(){  # create the agent's virtualenv and install its Python requirements into it
+  cd /opt/google/compute-gpu-monitoring/linux  # the relative paths below are resolved from the agent checkout
+  python3 -m venv venv
+  venv/bin/pip install wheel
+  venv/bin/pip install -Ur requirements.txt  # -U upgrades, -r reads the list from requirements.txt
+}
+
+function start_agent_service(){  # install the agent's systemd unit from the checkout, then enable and start it
+  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
+  systemctl daemon-reload  # make systemd pick up the unit file copied above
+  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service  # --now also starts it
+}
+
+function set_hadoop_property() {  # usage: set_hadoop_property <file under HADOOP_CONF_DIR> <name> <value>
+  local -r conf_name=$1
+  local -r prop_name=$2
+  local -r prop_value=$3
+  /usr/local/bin/bdconfig set_property \
+    --configuration_file "${HADOOP_CONF_DIR}/${conf_name}" \
+    --name "${prop_name}" --value "${prop_value}" \
+    --clobber
+}
+
+function configure_yarn() {
+  # Declare yarn.io/gpu as a YARN resource type and switch the capacity scheduler's resource calculator
+  local -r resource_types_xml="${HADOOP_CONF_DIR}/resource-types.xml"
+  # seed an empty configuration document when the file does not exist yet
+  [[ -f ${resource_types_xml} ]] || printf '<?xml version="1.0" ?>\n<configuration/>' >"${resource_types_xml}"
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+}
+
+# This configuration should be applied only if GPU is attached to the node
+function configure_yarn_nodemanager() {
+  # Writes the NodeManager GPU plugin, cgroups and LinuxContainerExecutor properties into yarn-site.xml
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.container-executor.class' \
+    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+}
+
+function configure_gpu_exclusive_mode() {
+  # Put the GPU into EXCLUSIVE_PROCESS compute mode unless the installed Spark reports a 3.x version
+  local detected_spark
+  detected_spark=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
+
+  # Spark 3.x: leave the compute mode untouched
+  if [[ ${detected_spark} == 3.* ]]; then return 0 ; fi
+  nvidia-smi -c EXCLUSIVE_PROCESS
+}
+
+function fetch_mig_scripts() {  # download the MIG helper scripts into /usr/local/yarn-mig-scripts and make them executable
+  local -r mig_dir='/usr/local/yarn-mig-scripts'
+  local script_name mig_url='https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts'
+  mkdir -p "${mig_dir}" ; chmod 755 "${mig_dir}"
+  for script_name in nvidia-smi mig2gpu.sh ; do wget -P "${mig_dir}/" "${mig_url}/${script_name}" ; done
+  chmod 755 "${mig_dir}"/*
+}
+
+function configure_gpu_script() {
+  # Generate (rather than download) the GPU discovery script that spark.executor.resource.gpu.discoveryScript points at
+  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
+  mkdir -p "${spark_gpu_script_dir}"
+  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
+  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
+  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
+  # NB: the shebang must be the very first line of the generated file, so it directly follows the opening quote
+  echo '#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+NUM_MIG_DEVICES=$(nvidia-smi -L | grep MIG | wc -l)
+ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | sed -e '\'':a'\'' -e '\''N'\'' -e'\''$!ba'\'' -e '\''s/\n/","/g'\'')
+if [ $NUM_MIG_DEVICES -gt 0 ]; then
+  MIG_INDEX=$(( $NUM_MIG_DEVICES - 1 ))
+  ADDRS=$(seq -s '\''","'\'' 0 $MIG_INDEX)
+fi
+echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
+' > "${spark_gpu_script_dir}/getGpusResources.sh"
+
+  chmod a+rwx -R "${spark_gpu_script_dir}"
+}
+
+function configure_gpu_isolation() {  # add [gpu]/[cgroups] sections to container-executor.cfg and install a cgroup permissions unit
+  # enable GPU isolation
+  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
+  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
+    # configure the container-executor.cfg to have major caps
+    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
+    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
+  else
+    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+  fi
+
+  # Configure a systemd unit to ensure that permissions are set on restart
+  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
+[Unit]
+Description=Set permissions to allow YARN to access device directories
+
+[Service]
+ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+  systemctl enable dataproc-cgroup-device-permissions
+  systemctl start dataproc-cgroup-device-permissions
+}
+
+function setup_gpu_yarn() {  # configure YARN for GPUs; on nodes with an NVIDIA device also install and configure the driver stack
+  # This configuration should be run on all nodes
+  # regardless of whether they have attached GPUs
+  configure_yarn
+
+  # Detect NVIDIA GPU
+  if (lspci | grep -q NVIDIA); then
+    # nvidia-smi is only present here if a driver was installed before this script ran (presumably by the MIG script -- confirm)
+    nv_smi="/usr/bin/nvidia-smi"
+    if (test -f "${nv_smi}" && "${nv_smi}" --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l); then  # NB: also prints the count
+      NUM_MIG_GPUS="$($nv_smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l)"  # number of runs of identical MIG-mode lines
+      if [[ $NUM_MIG_GPUS -eq 1 ]]; then  # i.e. every GPU reports the same MIG mode
+        if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'  # directory that fetch_mig_scripts populates
+          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`  # first field of the nvidia-caps line(s) in /proc/devices
+          fetch_mig_scripts
+        fi
+      fi
+    fi
+
+    if is_debian || is_ubuntu ; then
+      execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
+    elif is_rocky ; then
+      echo "kernel devel and headers not required on rocky.  installing from binary"
+    fi
+
+    # if mig is enabled drivers would have already been installed
+    if [[ $IS_MIG_ENABLED -eq 0 ]]; then  # NOTE(review): IS_MIG_ENABLED is only set to 1 above; its initial value comes from outside this function
+      install_nvidia_gpu_driver
+
+      # Install GPU metrics collection in Stackdriver if needed
+      if [[ ${INSTALL_GPU_AGENT} == true ]]; then
+        install_gpu_agent
+        echo 'GPU metrics agent successfully deployed.'
+      else
+        echo 'GPU metrics agent will not be installed.'
+      fi
+      configure_gpu_exclusive_mode
+    fi
+
+    configure_yarn_nodemanager
+    configure_gpu_script
+    configure_gpu_isolation
+  elif [[ "${ROLE}" == "Master" ]]; then  # no NVIDIA device found: a Master still gets the nodemanager properties and discovery script
+    configure_yarn_nodemanager
+    configure_gpu_script
+  fi
+
+  # Restart YARN services if they are running already
+  for svc in resourcemanager nodemanager; do
+    if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
+      systemctl restart hadoop-yarn-${svc}.service
+    fi
+  done
+}
+
+# Stops the run unless a supported distro release and a usable secure boot configuration are in place
+function check_os_and_secure_boot() {
+  if is_debian ; then
+    if ! { is_debian10 || is_debian11 || is_debian12 ; } ; then
+      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+      exit 1
+    fi
+  elif is_ubuntu ; then
+    if ! { is_ubuntu18 || is_ubuntu20 || is_ubuntu22 ; } ; then
+      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+    fi
+  elif is_rocky ; then
+    if ! { is_rocky8 || is_rocky9 ; } ; then
+      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
+    fi
+  fi
+
+  if [[ "${SECURE_BOOT}" != "enabled" ]]; then return 0 ; fi  # the remaining checks only apply with secure boot on
+  if [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then
+    echo "Error: Secure Boot is not supported before image 2.2. Please disable Secure Boot while creating the cluster."
+    exit 1
+  elif [[ -z "${PSN}" ]]; then
+    echo "Secure boot is enabled, but no signing material provided."
+    echo "Please either disable secure boot or provide signing material as per"
+    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+    return 1
+  fi
+}
+
+function remove_old_backports {
+  # This script uses 'apt-get update' and is therefore potentially dependent on
+  # backports repositories which have been archived.  To mitigate this, apt source
+  # entries for ${oldoldstable}-backports are repointed at archive.debian.org
+
+  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
+  oldoldstable=$(curl -s https://deb.debian.org/debian/dists/oldoldstable/Release | awk '/^Codename/ {print $2}');
+  oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}'); # NOTE(review): not referenced again in this function
+  stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}'); # NOTE(review): not referenced again in this function
+
+  matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) # files mentioning "-backports" (case-insensitive)
+
+  if [[ -n "$matched_files" ]]; then # "$matched_files" is element 0 of the array, so this is true when at least one file matched
+    for filename in "${matched_files[@]}"; do
+      # Fetch from archive.debian.org for ${oldoldstable}-backports
+      perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
+                     {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}"
+    done
+  fi
+}
+
+
+function main() {
+  # Entry point: fix up apt sources if needed, validate the platform, set up GPU + YARN, then the Spark RAPIDS runtime
+  if is_debian && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then remove_old_backports ; fi
+  check_os_and_secure_boot
+  setup_gpu_yarn
+
+  # SPARK is the only runtime handled here; anything else aborts the run
+  if [[ "${RUNTIME}" != "SPARK" ]]; then
+    echo "Unsupported RAPIDS Runtime: ${RUNTIME}"
+    exit 1
+  fi
+  install_spark_rapids
+  configure_spark
+  echo "RAPIDS initialized with Spark runtime"
+
+  # Restart whichever YARN daemons are currently running
+  for svc in resourcemanager nodemanager; do
+    [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]] || continue
+    systemctl restart hadoop-yarn-${svc}.service
+  done
+
+  # apt-based images only
+  if is_debian || is_ubuntu ; then apt-get clean ; fi
+}
+
+main

From f553371dc64c8542674394385fa617c498f51277 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 4 Jan 2025 18:02:23 -0800
Subject: [PATCH 086/130] changing failure to warning

---
 templates/common/util_functions | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index ac4809796..b4ef14440 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -521,13 +521,13 @@ function check_secure_boot() {
   readonly PSN
 
   if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
-    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
-    exit 1
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Consider disabling Secure Boot while creating the cluster."
+    return
   elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
     echo "Secure boot is enabled, but no signing material provided."
-    echo "Please either disable secure boot or provide signing material as per"
+    echo "Consider either disabling secure boot or provide signing material as per"
     echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
-    return 1
+    return
   fi
 
   CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"

From 0439a8df0ad203579923dcbd485a31f2bb05752b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 4 Jan 2025 18:11:09 -0800
Subject: [PATCH 087/130] removing more gpu stuff from dask

---
 templates/dask/dask.sh.in | 2 --
 1 file changed, 2 deletions(-)

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index fd14be4a7..20fd39619 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -54,14 +54,12 @@ function main() {
 }
 
 function exit_handler() {
-  gpu_exit_handler
   common_exit_handler
   return 0
 }
 
 function prepare_to_install(){
   prepare_common_env
-  prepare_gpu_env
   conda_env="$(get_metadata_attribute conda-env || echo 'dask')"
   readonly conda_env
   prepare_dask_env

From 0251358290ff06d0654f7b8090c5a9346b6b2ef5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 14:47:13 -0800
Subject: [PATCH 088/130] moved MASTER global variable to common/util_functions

---
 templates/common/util_functions           | 4 ++++
 templates/spark-rapids/spark-rapids.sh.in | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index b4ef14440..accd93d3b 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -574,6 +574,10 @@ function prepare_common_env() {
   ROLE="$(get_metadata_attribute dataproc-role)"
   readonly ROLE
 
+  # master node
+  MASTER="$(get_metadata_attribute dataproc-master)"
+  readonly MASTER
+
   workdir=/opt/install-dpgce
   tmpdir=/tmp/
   temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 004080690..dc3ce3b36 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -60,9 +60,6 @@ function prepare_to_install(){
   prepare_common_env
   prepare_gpu_env
   trap exit_handler EXIT
-
-  # Fetch instance roles and runtime
-  readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
 }
 
 prepare_to_install

From a643c9ab56c13d6613ca997f974cb35f9353a0f1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 15:01:59 -0800
Subject: [PATCH 089/130] correct variable name

---
 templates/dask/dask.sh.in     | 2 +-
 templates/rapids/rapids.sh.in | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index 20fd39619..d006ef388 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -34,7 +34,7 @@ function main() {
     fi
 
     echo "Starting Dask 'standalone' cluster..."
-    if [[ "${enable_worker_service}" == "1" ]]; then
+    if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
       systemctl start "${DASK_WORKER_SERVICE}"
       systemctl status "${DASK_WORKER_SERVICE}"
     fi
diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
index 9c74f5f3f..8c7d85529 100644
--- a/templates/rapids/rapids.sh.in
+++ b/templates/rapids/rapids.sh.in
@@ -34,7 +34,7 @@ function main() {
     fi
 
     echo "Starting Dask 'standalone' cluster..."
-    if [[ "${enable_worker_service}" == "1" ]]; then
+    if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
       systemctl start "${DASK_WORKER_SERVICE}"
       systemctl status "${DASK_WORKER_SERVICE}"
     fi

From d6867d99369e0a0e0afd48a67fd15df9b1858237 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 15:36:19 -0800
Subject: [PATCH 090/130] moved hold_nvidia_packages out of the common exit
 handler into the gpu exit handler ; removed accidental inclusion of original
 spark-rapids.sh

---
 templates/common/util_functions        |   1 -
 templates/gpu/util_functions           |   3 +
 templates/spark-rapids/spark-rapids.sh | 807 -------------------------
 3 files changed, 3 insertions(+), 808 deletions(-)
 delete mode 100644 templates/spark-rapids/spark-rapids.sh

diff --git a/templates/common/util_functions b/templates/common/util_functions
index accd93d3b..ba66d2d55 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -674,7 +674,6 @@ function common_exit_handler() {
     # re-hold systemd package
     if ge_debian12 ; then
     apt-mark hold systemd libsystemd0 ; fi
-    hold_nvidia_packages
   else
     dnf clean all
   fi
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 6409d3fb1..46b49ef36 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1274,6 +1274,8 @@ function prepare_gpu_env(){
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
 # Users should run apt-mark unhold before they wish to upgrade these packages
 function hold_nvidia_packages() {
+  if ! is_debuntu ; then return ; fi
+
   apt-mark hold nvidia-*
   apt-mark hold libnvidia-*
   if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
@@ -1436,4 +1438,5 @@ function gpu_exit_handler() {
       fi
     done
   fi
+  hold_nvidia_packages
 }
diff --git a/templates/spark-rapids/spark-rapids.sh b/templates/spark-rapids/spark-rapids.sh
deleted file mode 100644
index c03bf80ef..000000000
--- a/templates/spark-rapids/spark-rapids.sh
+++ /dev/null
@@ -1,807 +0,0 @@
-#!/bin/bash
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS-IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script installs NVIDIA GPU drivers (version 535.104.05) along with CUDA 12.2.
-# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only
-# Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions.
-# Note that the script is designed to work when secure boot is disabled during cluster creation.
-# It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu.
-
-set -euxo pipefail
-
-function os_id() {
-  grep '^ID=' /etc/os-release | cut -d= -f2 | xargs
-}
-
-function os_version() {
-  grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs
-}
-
-function is_debian() {
-  [[ "$(os_id)" == 'debian' ]]
-}
-
-function is_debian10() {
-  is_debian && [[ "$(os_version)" == '10'* ]]
-}
-
-function is_debian11() {
-  is_debian && [[ "$(os_version)" == '11'* ]]
-}
-
-function is_debian12() {
-  is_debian && [[ "$(os_version)" == '12'* ]]
-}
-
-function is_ubuntu() {
-  [[ "$(os_id)" == 'ubuntu' ]]
-}
-
-function is_ubuntu18() {
-  is_ubuntu && [[ "$(os_version)" == '18.04'* ]]
-}
-
-function is_ubuntu20() {
-  is_ubuntu && [[ "$(os_version)" == '20.04'* ]]
-}
-
-function is_ubuntu22() {
-  is_ubuntu && [[ "$(os_version)" == '22.04'* ]]
-}
-
-function is_rocky() {
-  [[ "$(os_id)" == 'rocky' ]]
-}
-
-function is_rocky8() {
-  is_rocky && [[ "$(os_version)" == '8'* ]]
-}
-
-function is_rocky9() {
-  is_rocky && [[ "$(os_version)" == '9'* ]]
-}
-
-function os_vercat() {
-  if is_ubuntu ; then
-      os_version | sed -e 's/[^0-9]//g'
-  elif is_rocky ; then
-      os_version | sed -e 's/[^0-9].*$//g'
-  else
-      os_version
-  fi
-}
-
-function get_metadata_attribute() {
-  local -r attribute_name=$1
-  local -r default_value="${2:-}"
-  /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
-}
-
-CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
-PSN="$(get_metadata_attribute private_secret_name)"
-readonly PSN
-function configure_dkms_certs() {
-  if [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping";
-      return 0
-  fi
-
-  mkdir -p "${CA_TMPDIR}"
-
-  # If the private key exists, verify it
-  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
-    echo "Private key material exists"
-
-    local expected_modulus_md5sum
-    expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum)
-    if [[ -n "${expected_modulus_md5sum}" ]]; then
-      modulus_md5sum="${expected_modulus_md5sum}"
-    else
-      modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"
-    fi
-
-    # Verify that cert md5sum matches expected md5sum
-    if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched rsa key modulus"
-    fi
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
-
-    # Verify that key md5sum matches expected md5sum
-    if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched x509 cert modulus"
-    fi
-
-    return
-  fi
-
-
-  # Retrieve cloud secrets keys
-  local sig_priv_secret_name
-  sig_priv_secret_name="${PSN}"
-  local sig_pub_secret_name
-  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
-  local sig_secret_project
-  sig_secret_project="$(get_metadata_attribute secret_project)"
-  local sig_secret_version
-  sig_secret_version="$(get_metadata_attribute secret_version)"
-
-  # If metadata values are not set, do not write mok keys
-  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
-
-  # Write private material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_priv_secret_name}" \
-      | dd status=none of="${CA_TMPDIR}/db.rsa"
-
-  # Write public material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_pub_secret_name}" \
-      | base64 --decode \
-      | dd status=none of="${CA_TMPDIR}/db.der"
-
-  # symlink private key and copy public cert from volatile storage for DKMS
-  if is_ubuntu ; then
-    mkdir -p /var/lib/shim-signed/mok
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv
-    cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der
-  else
-    mkdir -p /var/lib/dkms/
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
-    cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub
-  fi
-}
-
-function clear_dkms_key {
-  if [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping" >2
-      return 0
-  fi
-  echo "WARN -- PURGING SIGNING MATERIAL -- WARN" >2
-  echo "future dkms runs will not use correct signing key" >2
-  rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv
-}
-
-function add_contrib_components() {
-  if ! is_debian ; then
-    return
-  fi
-  if is_debian12 ; then
-      # Include in sources file components on which nvidia-open-kernel-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib"
-
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
-  fi
-}
-
-# Short name for nvidia urls
-if is_rocky ; then
-    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
-else
-    shortname="$(os_id)$(os_vercat)"
-fi
-readonly shortname
-
-# Detect dataproc image version from its various names
-if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then
-  DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
-fi
-
-# Fetch Linux Family distro and Dataproc Image version
-readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
-
-# Fetch SPARK config
-readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
-if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then
-  readonly DEFAULT_XGBOOST_VERSION="1.7.6"
-  readonly SPARK_VERSION="3.0"
-else
-  echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
-  exit 1
-fi
-
-# Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0"
-readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
-readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
-
-# Fetch instance roles and runtime
-readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
-readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
-readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-
-# CUDA version and Driver version config
-CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.4.1')  #12.2.2
-NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '550.54.15') #535.104.05
-CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.2
-
-# EXCEPTIONS
-# Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
-if [[ "${OS_NAME}" == "ubuntu" ]]; then
-    if is_ubuntu18 ; then
-      CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1')  #12.1.1
-      NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02
-      CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}"  #12.1
-    fi
-fi
-
-# Verify Secure boot
-SECURE_BOOT="disabled"
-SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
-
-# Stackdriver GPU agent parameters
-# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
-INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
-readonly INSTALL_GPU_AGENT
-
-# Dataproc configurations
-readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-readonly HIVE_CONF_DIR='/etc/hive/conf'
-readonly SPARK_CONF_DIR='/etc/spark/conf'
-
-NVIDIA_SMI_PATH='/usr/bin'
-MIG_MAJOR_CAPS=0
-IS_MIG_ENABLED=0
-
-function execute_with_retries() {
-  local -r cmd=$1
-  for ((i = 0; i < 10; i++)); do
-    if time eval "$cmd"; then
-      return 0
-    fi
-    sleep 5
-  done
-  return 1
-}
-
-function install_spark_rapids() {
-  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
-  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
-  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
-
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-  wget -nv --timeout=30 --tries=5 --retry-connrefused \
-    "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \
-    -P /usr/lib/spark/jars/
-}
-
-function configure_spark() {
-  if [[ "${SPARK_VERSION}" == "3"* ]]; then
-    cat >>${SPARK_CONF_DIR}/spark-defaults.conf <<EOF
-
-###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
-# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
-# query explain output won't show GPU operator, if user have doubt
-# they can uncomment the line before seeing the GPU plan explain, but AQE on gives user the best performance.
-spark.executor.resource.gpu.amount=1
-spark.plugins=com.nvidia.spark.SQLPlugin
-spark.executor.resource.gpu.discoveryScript=/usr/lib/spark/scripts/gpu/getGpusResources.sh
-spark.dynamicAllocation.enabled=false
-spark.sql.autoBroadcastJoinThreshold=10m
-spark.sql.files.maxPartitionBytes=512m
-# please update this config according to your application
-spark.task.resource.gpu.amount=0.25
-###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
-EOF
-  else
-    cat >>${SPARK_CONF_DIR}/spark-defaults.conf <<EOF
-
-###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
-spark.submit.pyFiles=/usr/lib/spark/jars/xgboost4j-spark_${SPARK_VERSION}-${XGBOOST_VERSION}-${XGBOOST_GPU_SUB_VERSION}.jar
-###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
-EOF
-  fi
-}
-
-# Enables a systemd service on bootup to install new headers.
-# This service recompiles kernel modules for Ubuntu and Debian, which are necessary for the functioning of nvidia-smi.
-function setup_systemd_update_headers() {
-  cat <<EOF >/lib/systemd/system/install-headers.service
-[Unit]
-Description=Install Linux headers for the current kernel
-After=network-online.target
-
-[Service]
-ExecStart=/bin/bash -c 'count=0; while [ \$count -lt 3 ]; do /usr/bin/apt-get install -y -q linux-headers-\$(/bin/uname -r) && break; count=\$((count+1)); sleep 5; done'
-Type=oneshot
-RemainAfterExit=yes
-
-[Install]
-WantedBy=multi-user.target
-EOF
-
-  # Reload systemd to recognize the new unit file
-  systemctl daemon-reload
-
-  # Enable and start the service
-  systemctl enable --now install-headers.service
-}
-
-readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
-readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
-
-# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
-# Users should run apt-mark unhold before they wish to upgrade these packages
-function hold_nvidia_packages() {
-  apt-mark hold nvidia-*
-  apt-mark hold libnvidia-*
-  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
-    apt-mark hold xserver-xorg-video-nvidia*
-  fi
-}
-
-# Install NVIDIA GPU driver provided by NVIDIA
-function install_nvidia_gpu_driver() {
-
-  ## common steps for all linux family distros
-  readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*}
-
-  ## For Debian & Ubuntu
-  readonly LOCAL_INSTALLER_DEB="cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb"
-  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
-  readonly DIST_KEYRING_DIR="/var/cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local"
-
-  ## installation steps based OS
-  if is_debian ; then
-
-    export DEBIAN_FRONTEND=noninteractive
-
-    execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
-
-    curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-      "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb
-
-    dpkg -i /tmp/local-installer.deb
-    rm /tmp/local-installer.deb
-    cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
-
-    add_contrib_components
-
-    execute_with_retries "apt-get update"
-
-    ## EXCEPTION
-    if is_debian10 ; then
-      apt-get remove -y libglvnd0
-      apt-get install -y ca-certificates-java
-    fi
-
-    configure_dkms_certs
-    execute_with_retries "apt-get install -y -q nvidia-kernel-open-dkms"
-    clear_dkms_key
-    execute_with_retries \
-	"apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
-    execute_with_retries \
-	"apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"
-
-    modprobe nvidia
-
-    # enable a systemd service that updates kernel headers after reboot
-    setup_systemd_update_headers
-    # prevent auto upgrading nvidia packages
-    hold_nvidia_packages
-
-  elif is_ubuntu ; then
-
-    execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
-
-    # Ubuntu 18.04 is not supported by new style NV debs; install from .run files + github
-    if is_ubuntu18 ; then
-
-      # fetch .run file
-      curl -o driver.run \
-        "https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run"
-      # Install all but kernel driver
-      bash driver.run --no-kernel-modules --silent --install-libglvnd
-      rm driver.run
-
-      WORKDIR=/opt/install-nvidia-driver
-      mkdir -p "${WORKDIR}"
-      pushd $_
-      # Fetch open souce kernel module with corresponding tag
-      test -d open-gpu-kernel-modules || \
-	 git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
-            --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
-      cd ${WORKDIR}/open-gpu-kernel-modules
-      #
-      # build kernel modules
-      #
-      make -j$(nproc) modules \
-	   > /var/log/open-gpu-kernel-modules-build.log \
-	  2> /var/log/open-gpu-kernel-modules-build_error.log
-      configure_dkms_certs
-      # sign
-      for module in $(find kernel-open -name '*.ko'); do
-        /lib/modules/$(uname -r)/build/scripts/sign-file sha256 \
-          "${CA_TMPDIR}/db.rsa" \
-	  "${CA_TMPDIR}/db.der" \
-	  "${module}"
-      done
-      clear_dkms_key
-      # install
-      make modules_install \
-	   >> /var/log/open-gpu-kernel-modules-build.log \
-	  2>> /var/log/open-gpu-kernel-modules-build_error.log
-      depmod -a
-      modprobe nvidia
-      popd
-
-      #
-      # Install CUDA
-      #
-      cuda_runfile="cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run"
-      curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-       "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \
-       -o cuda.run
-      time bash cuda.run --silent --toolkit --no-opengl-libs
-      rm cuda.run
-    else
-      # Install from repo provided by NV
-      readonly UBUNTU_REPO_CUDA_PIN="${NVIDIA_REPO_URL}/cuda-${shortname}.pin"
-
-      curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-        "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600
-
-      curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-        "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb
-
-      dpkg -i /tmp/local-installer.deb
-      rm /tmp/local-installer.deb
-      cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
-      execute_with_retries "apt-get update"
-
-      execute_with_retries "apt-get install -y -q --no-install-recommends dkms"
-      configure_dkms_certs
-      for pkg in "nvidia-driver-${NVIDIA_DRIVER_VERSION_PREFIX}-open" \
-                 "cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" \
-                 "cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" ; do
-        execute_with_retries "apt-get install -y -q --no-install-recommends ${pkg}"
-      done
-      clear_dkms_key
-
-      modprobe nvidia
-    fi
-
-
-    # enable a systemd service that updates kernel headers after reboot
-    setup_systemd_update_headers
-    # prevent auto upgrading nvidia packages
-    hold_nvidia_packages
-
-  elif is_rocky ; then
-
-    # Ensure the Correct Kernel Development Packages are Installed
-    execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*"
-    execute_with_retries "dnf -y -q install pciutils kernel-devel gcc"
-
-    readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
-    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
-    execute_with_retries "dnf clean all"
-    configure_dkms_certs
-    execute_with_retries "dnf -y -q module install nvidia-driver:latest-dkms"
-    clear_dkms_key
-    execute_with_retries "dnf -y -q install cuda-toolkit"
-    modprobe nvidia
-
-  else
-    echo "Unsupported OS: '${OS_NAME}'"
-    exit 1
-  fi
-  ldconfig
-  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-}
-
-# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
-function install_gpu_agent() {
-  download_agent
-  install_agent_dependency
-  start_agent_service
-}
-
-function download_agent(){
-  if [[ ${OS_NAME} == rocky ]]; then
-    execute_with_retries "dnf -y -q install git"
-  else
-    execute_with_retries "apt-get install git -y"
-  fi
-  mkdir -p /opt/google
-  chmod 777 /opt/google
-  cd /opt/google
-  test -d compute-gpu-monitoring || \
-    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
-}
-
-function install_agent_dependency(){
-  cd /opt/google/compute-gpu-monitoring/linux
-  python3 -m venv venv
-  venv/bin/pip install wheel
-  venv/bin/pip install -Ur requirements.txt
-}
-
-function start_agent_service(){
-  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
-  systemctl daemon-reload
-  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service
-}
-
-function set_hadoop_property() {
-  local -r config_file=$1
-  local -r property=$2
-  local -r value=$3
-  /usr/local/bin/bdconfig set_property \
-    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
-    --name "${property}" --value "${value}" \
-    --clobber
-}
-
-function configure_yarn() {
-  if [[ ! -f ${HADOOP_CONF_DIR}/resource-types.xml ]]; then
-    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
-  fi
-  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
-
-  set_hadoop_property 'capacity-scheduler.xml' \
-    'yarn.scheduler.capacity.resource-calculator' \
-    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
-
-  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
-}
-
-# This configuration should be applied only if GPU is attached to the node
-function configure_yarn_nodemanager() {
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.container-executor.class' \
-    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
-
-}
-
-function configure_gpu_exclusive_mode() {
-  # check if running spark 3, if not, enable GPU exclusive mode
-  local spark_version
-  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
-  if [[ ${spark_version} != 3.* ]]; then
-    # include exclusive mode on GPU
-    nvidia-smi -c EXCLUSIVE_PROCESS
-  fi
-}
-
-function fetch_mig_scripts() {
-  mkdir -p /usr/local/yarn-mig-scripts
-  chmod 755 /usr/local/yarn-mig-scripts
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
-  chmod 755 /usr/local/yarn-mig-scripts/*
-}
-
-function configure_gpu_script() {
-  # Download GPU discovery script
-  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
-  mkdir -p ${spark_gpu_script_dir}
-  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
-  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
-  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
-  echo '
-#!/usr/bin/env bash
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-NUM_MIG_DEVICES=$(nvidia-smi -L | grep MIG | wc -l)
-ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | sed -e '\'':a'\'' -e '\''N'\'' -e'\''$!ba'\'' -e '\''s/\n/","/g'\'')
-if [ $NUM_MIG_DEVICES -gt 0 ]; then
-  MIG_INDEX=$(( $NUM_MIG_DEVICES - 1 ))
-  ADDRS=$(seq -s '\''","'\'' 0 $MIG_INDEX)
-fi
-echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
-' > ${spark_gpu_script_dir}/getGpusResources.sh
-
-  chmod a+rwx -R ${spark_gpu_script_dir}
-}
-
-function configure_gpu_isolation() {
-  # enable GPU isolation
-  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
-  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
-    # configure the container-executor.cfg to have major caps
-    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
-    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
-    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
-  else
-    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HDOOP_CONF_DIR}/container-executor.cfg"
-  fi
-
-  # Configure a systemd unit to ensure that permissions are set on restart
-  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
-[Unit]
-Description=Set permissions to allow YARN to access device directories
-
-[Service]
-ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"
-
-[Install]
-WantedBy=multi-user.target
-EOF
-
-  systemctl enable dataproc-cgroup-device-permissions
-  systemctl start dataproc-cgroup-device-permissions
-}
-
-function setup_gpu_yarn() {
-  # This configuration should be run on all nodes
-  # regardless if they have attached GPUs
-  configure_yarn
-
-  # Detect NVIDIA GPU
-  if (lspci | grep -q NVIDIA); then
-    # if this is called without the MIG script then the drivers are not installed
-    nv_smi="/usr/bin/nvidia-smi"
-    if (test -f "${nv_smi}" && "${nv_smi}" --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l); then
-      NUM_MIG_GPUS="$($nv_smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l)"
-      if [[ $NUM_MIG_GPUS -eq 1 ]]; then
-        if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then
-          IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
-        fi
-      fi
-    fi
-
-    if is_debian || is_ubuntu ; then
-      execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'"
-    elif is_rocky ; then
-      echo "kernel devel and headers not required on rocky.  installing from binary"
-    fi
-
-    # if mig is enabled drivers would have already been installed
-    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
-      install_nvidia_gpu_driver
-
-      #Install GPU metrics collection in Stackdriver if needed
-      if [[ ${INSTALL_GPU_AGENT} == true ]]; then
-        install_gpu_agent
-        echo 'GPU metrics agent successfully deployed.'
-      else
-        echo 'GPU metrics agent will not be installed.'
-      fi
-      configure_gpu_exclusive_mode
-    fi
-
-    configure_yarn_nodemanager
-    configure_gpu_script
-    configure_gpu_isolation
-  elif [[ "${ROLE}" == "Master" ]]; then
-    configure_yarn_nodemanager
-    configure_gpu_script
-  fi
-
-  # Restart YARN services if they are running already
-  for svc in resourcemanager nodemanager; do
-    if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
-      systemctl restart hadoop-yarn-${svc}.service
-    fi
-  done
-}
-
-# Verify if compatible linux distros and secure boot options are used
-function check_os_and_secure_boot() {
-  if is_debian ; then
-    if ! is_debian10 && ! is_debian11 && ! is_debian12 ; then
-      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
-      exit 1
-    fi
-  elif is_ubuntu ; then
-    if ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ; then
-      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
-      exit 1
-    fi
-  elif is_rocky ; then
-    if ! is_rocky8 && ! is_rocky9 ; then
-      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
-      exit 1
-    fi
-  fi
-
-  if [[ "${SECURE_BOOT}" == "enabled" && $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then
-    echo "Error: Secure Boot is not supported before image 2.2. Please disable Secure Boot while creating the cluster."
-    exit 1
-  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
-      echo "Secure boot is enabled, but no signing material provided."
-      echo "Please either disable secure boot or provide signing material as per"
-      echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
-      return 1
-  fi
-}
-
-function remove_old_backports {
-  # This script uses 'apt-get update' and is therefore potentially dependent on
-  # backports repositories which have been archived.  In order to mitigate this
-  # problem, we will remove any reference to backports repos older than oldstable
-
-  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
-  oldoldstable=$(curl -s https://deb.debian.org/debian/dists/oldoldstable/Release | awk '/^Codename/ {print $2}');
-  oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}');
-  stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}');
-
-  matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )
-
-  if [[ -n "$matched_files" ]]; then
-    for filename in "${matched_files[@]}"; do
-      # Fetch from archive.debian.org for ${oldoldstable}-backports
-      perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
-                     {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}"
-    done
-  fi
-}
-
-
-function main() {
-  if is_debian && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then
-    remove_old_backports
-  fi
-  check_os_and_secure_boot
-  setup_gpu_yarn
-  if [[ "${RUNTIME}" == "SPARK" ]]; then
-    install_spark_rapids
-    configure_spark
-    echo "RAPIDS initialized with Spark runtime"
-  else
-    echo "Unsupported RAPIDS Runtime: ${RUNTIME}"
-    exit 1
-  fi
-
-  for svc in resourcemanager nodemanager; do
-    if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
-      systemctl restart hadoop-yarn-${svc}.service
-    fi
-  done
-  if is_debian || is_ubuntu ; then
-    apt-get clean
-  fi
-}
-
-main

From 6a7d10db899790eb6271bc881f8cb690bd460c0f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 16:29:55 -0800
Subject: [PATCH 091/130] added comments and timing collection

---
 templates/dask/dask.sh.in | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index d006ef388..2e23c9d6d 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -28,17 +28,21 @@ function main() {
     # Create Dask service
     install_systemd_dask_service
 
+    # only run scheduler on primary master
     if [[ "$(hostname -s)" == "${MASTER}" ]]; then
-      systemctl start "${DASK_SCHEDULER_SERVICE}"
+      date
+      time systemctl start "${DASK_SCHEDULER_SERVICE}"
       systemctl status "${DASK_SCHEDULER_SERVICE}"
     fi
 
     echo "Starting Dask 'standalone' cluster..."
     if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
-      systemctl start "${DASK_WORKER_SERVICE}"
+      date
+      time systemctl start "${DASK_WORKER_SERVICE}"
       systemctl status "${DASK_WORKER_SERVICE}"
     fi
 
+    date
     configure_knox_for_dask
 
     local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')"

From 1ab3f8d5f860cb7daa451831207bb8b18eca1887 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 16:30:38 -0800
Subject: [PATCH 092/130] no need to consider unsupported dataproc < 2.0 image
 versions ; reducing instance type a little

---
 dask/test_dask.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dask/test_dask.py b/dask/test_dask.py
index 440493511..1126d7d80 100644
--- a/dask/test_dask.py
+++ b/dask/test_dask.py
@@ -56,16 +56,13 @@ def _run_dask_test_script(self, name, script):
     )
     def test_dask(self, configuration, instances, runtime):
 
-        if self.getImageVersion() < pkg_resources.parse_version("2.0"):
-            self.skipTest("Not supported in pre-2.0 images")
-
         metadata = None
         if runtime:
             metadata = "dask-runtime={}".format(runtime)
 
         self.createCluster(configuration,
                            self.INIT_ACTIONS,
-                           machine_type='n1-standard-16',
+                           machine_type='n1-highmem-8',
                            metadata=metadata,
                            timeout_in_minutes=20)
 

From 598b6907b4d622212c6bb432eb33aee9d3812c0b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 16:31:18 -0800
Subject: [PATCH 093/130] using "dask-scheduler" instead of "dask scheduler"

---
 templates/dask/util_functions | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index b9377b785..1b459a546 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -102,7 +102,7 @@ function install_systemd_dask_scheduler() {
 #!/bin/bash
 LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log"
 echo "dask scheduler starting, logging to \${LOGFILE}"
-${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1
+${DASK_CONDA_ENV}/bin/dask-scheduler >> "\${LOGFILE}" 2>&1
 EOF
 
   chmod 750 "${DASK_SCHEDULER_LAUNCHER}"

From 81c7d28b06330d321558b42aac734fc29fbc9b3f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 16:51:25 -0800
Subject: [PATCH 094/130] wait for dask scheduler before starting worker

---
 templates/dask/dask.sh.in     | 7 +++++++
 templates/dask/util_functions | 6 +++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index 2e23c9d6d..d619f9f88 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -38,6 +38,13 @@ function main() {
     echo "Starting Dask 'standalone' cluster..."
     if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
       date
+      # Pause while scheduler comes online
+      retries=30
+      while ! nc -vz cluster-1718310842-m 8786 ; do
+        sleep 3s
+        ((retries--)
+        if [[ "${retries}" == "0" ]]; then echo "dask scheduler unreachable" ; exit 1 ; fi
+      fi
       time systemctl start "${DASK_WORKER_SERVICE}"
       systemctl status "${DASK_WORKER_SERVICE}"
     fi
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index 1b459a546..f7fe507d1 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -39,15 +39,15 @@ function install_systemd_dask_worker() {
 
   local compute_mode_cmd=""
   if command -v nvidia-smi ; then compute_mode_cmd="nvidia-smi --compute-mode=DEFAULT" ; fi
-  local worker_name="dask-worker"
-  if test -f "${DASK_CONDA_ENV}/bin/dask-cuda-worker" ; then worker_name="dask-cuda-worker" ; fi
+  local worker_name="dask worker"
+  if test -f "${DASK_CONDA_ENV}/bin/dask-cuda-worker" ; then worker_name="dask-cuda worker" ; fi
   local worker="${DASK_CONDA_ENV}/bin/${worker_name}"
   cat <<EOF >"${DASK_WORKER_LAUNCHER}"
 #!/bin/bash
 LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log"
 ${compute_mode_cmd}
 echo "${worker_name} starting, logging to \${LOGFILE}"
-${worker} "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1
+${worker} --local-directory="${dask_worker_local_dir}" --memory-limit=auto "${MASTER}:8786" >> "\${LOGFILE}" 2>&1
 EOF
 
   chmod 750 "${DASK_WORKER_LAUNCHER}"

From 48906e1862105a36bd803292f1748189b47f5d26 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 16:54:00 -0800
Subject: [PATCH 095/130] using variable instead of my own cluster master name

---
 templates/dask/dask.sh.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index d619f9f88..9d82c5063 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -40,7 +40,7 @@ function main() {
       date
       # Pause while scheduler comes online
       retries=30
-      while ! nc -vz cluster-1718310842-m 8786 ; do
+      while ! nc -vz "${MASTER}" 8786 ; do
         sleep 3s
         ((retries--)
         if [[ "${retries}" == "0" ]]; then echo "dask scheduler unreachable" ; exit 1 ; fi

From 510e5202972b11ee43cdece00925abc1513c9734 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 17:25:21 -0800
Subject: [PATCH 096/130] corrected syntax errors ; dump log on service failure

---
 templates/dask/dask.sh.in     | 14 +++++++++++---
 templates/dask/util_functions |  2 +-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index 9d82c5063..b64482aeb 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -32,6 +32,10 @@ function main() {
     if [[ "$(hostname -s)" == "${MASTER}" ]]; then
       date
       time systemctl start "${DASK_SCHEDULER_SERVICE}"
+      local substate_val="$(systemctl show ${DASK_SCHEDULER_SERVICE} -p SubState --value)"
+      if [[ "${substate_val}" != 'running' ]] ; then
+        cat "/var/log/${DASK_SCHEDULER_SERVICE}.log"
+      fi
       systemctl status "${DASK_SCHEDULER_SERVICE}"
     fi
 
@@ -42,10 +46,14 @@ function main() {
       retries=30
       while ! nc -vz "${MASTER}" 8786 ; do
         sleep 3s
-        ((retries--)
-        if [[ "${retries}" == "0" ]]; then echo "dask scheduler unreachable" ; exit 1 ; fi
-      fi
+        ((retries--))
+        if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi
+      done
       time systemctl start "${DASK_WORKER_SERVICE}"
+      local substate_val="$(systemctl show ${DASK_WORKER_SERVICE} -p SubState --value)"
+      if [[ "${substate_val}" != 'running' ]] ; then
+        cat "/var/log/${DASK_WORKER_SERVICE}.log"
+      fi
       systemctl status "${DASK_WORKER_SERVICE}"
     fi
 
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index f7fe507d1..fca23a74b 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -102,7 +102,7 @@ function install_systemd_dask_scheduler() {
 #!/bin/bash
 LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log"
 echo "dask scheduler starting, logging to \${LOGFILE}"
-${DASK_CONDA_ENV}/bin/dask-scheduler >> "\${LOGFILE}" 2>&1
+${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1
 EOF
 
   chmod 750 "${DASK_SCHEDULER_LAUNCHER}"

From 9e9f87266f95d9eb8e2da23fcf67f8c1b6eb5a8e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 18:19:38 -0800
Subject: [PATCH 097/130] refactored some common code ; setting default value
 for metadata attribute correctly

---
 templates/dask/dask.sh.in     | 33 ++-------------------------------
 templates/dask/util_functions | 35 ++++++++++++++++++++++++++++++++++-
 templates/rapids/rapids.sh.in | 16 +++-------------
 3 files changed, 39 insertions(+), 45 deletions(-)

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index b64482aeb..dd2932042 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -27,37 +27,8 @@ function main() {
   elif [[ "${DASK_RUNTIME}" == "standalone" ]]; then
     # Create Dask service
     install_systemd_dask_service
+    start_systemd_dask_service
 
-    # only run scheduler on primary master
-    if [[ "$(hostname -s)" == "${MASTER}" ]]; then
-      date
-      time systemctl start "${DASK_SCHEDULER_SERVICE}"
-      local substate_val="$(systemctl show ${DASK_SCHEDULER_SERVICE} -p SubState --value)"
-      if [[ "${substate_val}" != 'running' ]] ; then
-        cat "/var/log/${DASK_SCHEDULER_SERVICE}.log"
-      fi
-      systemctl status "${DASK_SCHEDULER_SERVICE}"
-    fi
-
-    echo "Starting Dask 'standalone' cluster..."
-    if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
-      date
-      # Pause while scheduler comes online
-      retries=30
-      while ! nc -vz "${MASTER}" 8786 ; do
-        sleep 3s
-        ((retries--))
-        if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi
-      done
-      time systemctl start "${DASK_WORKER_SERVICE}"
-      local substate_val="$(systemctl show ${DASK_WORKER_SERVICE} -p SubState --value)"
-      if [[ "${substate_val}" != 'running' ]] ; then
-        cat "/var/log/${DASK_WORKER_SERVICE}.log"
-      fi
-      systemctl status "${DASK_WORKER_SERVICE}"
-    fi
-
-    date
     configure_knox_for_dask
 
     local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')"
@@ -79,7 +50,7 @@ function exit_handler() {
 
 function prepare_to_install(){
   prepare_common_env
-  conda_env="$(get_metadata_attribute conda-env || echo 'dask')"
+  conda_env="$(get_metadata_attribute conda-env 'dask')"
   readonly conda_env
   prepare_dask_env
   trap exit_handler EXIT
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index fca23a74b..54066b984 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -40,7 +40,7 @@ function install_systemd_dask_worker() {
   local compute_mode_cmd=""
   if command -v nvidia-smi ; then compute_mode_cmd="nvidia-smi --compute-mode=DEFAULT" ; fi
   local worker_name="dask worker"
-  if test -f "${DASK_CONDA_ENV}/bin/dask-cuda-worker" ; then worker_name="dask-cuda worker" ; fi
+  if test -f "${DASK_CONDA_ENV}/bin/dask-cuda" ; then worker_name="dask-cuda worker" ; fi
   local worker="${DASK_CONDA_ENV}/bin/${worker_name}"
   cat <<EOF >"${DASK_WORKER_LAUNCHER}"
 #!/bin/bash
@@ -131,6 +131,39 @@ function install_systemd_dask_service() {
   install_systemd_dask_worker
 }
 
+# Start the dask scheduler (primary master only) and, when enabled, the
+# worker; a unit's log is dumped when it is not 'running' after start.
+function start_systemd_dask_service() {
+  local substate_val retries=30 # counter stays local instead of leaking globally
+  if [[ "$(hostname -s)" == "${MASTER}" ]]; then # only run scheduler on primary master
+    date
+    time systemctl start "${DASK_SCHEDULER_SERVICE}"
+    substate_val="$(systemctl show "${DASK_SCHEDULER_SERVICE}" -p SubState --value || true)"
+    if [[ "${substate_val}" != 'running' ]] ; then
+      cat "/var/log/${DASK_SCHEDULER_SERVICE}.log"
+    fi
+    systemctl status "${DASK_SCHEDULER_SERVICE}"
+  fi
+
+  echo "Starting Dask 'standalone' cluster..."
+  if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
+    date
+    # Pause while scheduler comes online (up to 30 probes, 3s apart)
+    while ! nc -vz "${MASTER}" 8786 ; do
+      sleep 3s
+      ((retries--))
+      if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi
+    done
+    time systemctl start "${DASK_WORKER_SERVICE}"
+    substate_val="$(systemctl show "${DASK_WORKER_SERVICE}" -p SubState --value || true)"
+    if [[ "${substate_val}" != 'running' ]] ; then
+      cat "/var/log/${DASK_WORKER_SERVICE}.log"
+    fi
+    systemctl status "${DASK_WORKER_SERVICE}"
+  fi
+  date
+}
+
 function configure_knox_for_dask() {
   if [[ ! -d "${KNOX_HOME}" ]]; then
     echo "Skip configuring Knox rules for Dask"
diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
index 8c7d85529..a63f44b3b 100644
--- a/templates/rapids/rapids.sh.in
+++ b/templates/rapids/rapids.sh.in
@@ -27,21 +27,11 @@ function main() {
   else
     # Create Dask service
     install_systemd_dask_service
-
-    if [[ "$(hostname -s)" == "${MASTER}" ]]; then
-      systemctl start "${DASK_SCHEDULER_SERVICE}"
-      systemctl status "${DASK_SCHEDULER_SERVICE}"
-    fi
-
-    echo "Starting Dask 'standalone' cluster..."
-    if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
-      systemctl start "${DASK_WORKER_SERVICE}"
-      systemctl status "${DASK_WORKER_SERVICE}"
-    fi
+    start_systemd_dask_service
 
     configure_knox_for_dask
 
-    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')"
+    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging 'false')"
     if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then
       configure_fluentd_for_dask
     fi
@@ -68,7 +58,7 @@ function exit_handler() {
 function prepare_to_install(){
   prepare_common_env
   prepare_gpu_env
-  conda_env="$(get_metadata_attribute conda-env || echo 'dask-rapids')"
+  conda_env="$(get_metadata_attribute conda-env 'dask-rapids')"
   readonly conda_env
   prepare_dask_rapids_env
   trap exit_handler EXIT

From 7480a23b0ac4a72303163466b5f501dab0043b61 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 22:26:38 -0800
Subject: [PATCH 098/130] added new function is_ramdisk ; keeping conda cache
 in its own directory ; same for pip cache ; refactored pip setup and teardown

---
 templates/common/util_functions | 66 ++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index ba66d2d55..a52001db3 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -319,6 +319,20 @@ function set_proxy(){
   export NO_PROXY="${no_proxy}"
 }
 
+# Succeed (0) when /mnt/shm is a mounted ramdisk. The answer is cached in
+# IS_RAMDISK; pass -f to discard the cache and probe /proc/mounts again.
+function is_ramdisk() {
+  if [[ "${1:-}" == "-f" ]] ; then unset IS_RAMDISK ; fi
+  # compare the cached value as a string rather than executing it as a command
+  if   [[ "${IS_RAMDISK:-}" == "true"  ]] ; then return 0
+  elif [[ "${IS_RAMDISK:-}" == "false" ]] ; then return 1 ; fi
+  if ( test -d /mnt/shm && grep -q /mnt/shm /proc/mounts ) ; then
+    IS_RAMDISK="true"  ; return 0
+  else
+    IS_RAMDISK="false" ; return 1
+  fi
+}
+
 function mount_ramdisk(){
   local free_mem
   free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
@@ -327,18 +341,11 @@ function mount_ramdisk(){
   # Write to a ramdisk instead of churning the persistent disk
 
   tmpdir="/mnt/shm"
-  mkdir -p "${tmpdir}"
+  mkdir -p "${tmpdir}/pkgs_dirs"
   mount -t tmpfs tmpfs "${tmpdir}"
 
   # Download conda packages to tmpfs
-  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
-
-  # Clear pip cache
-  # TODO: make this conditional on which OSs have pip without cache purge
-  pip cache purge || echo "unable to purge pip cache"
-
-  # Download pip packages to tmpfs
-  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}/pkgs_dirs"
 
   # Download OS packages to tmpfs
   if is_debuntu ; then
@@ -346,6 +353,7 @@ function mount_ramdisk(){
   else
     mount -t tmpfs tmpfs /var/cache/dnf
   fi
+  is_ramdisk -f
 }
 
 function check_os() {
@@ -553,6 +561,21 @@ function install_dependencies() {
   touch "${workdir}/complete/install-dependencies"
 }
 
+function prepare_pip_env() {
+  # Create the python venv under ${tmpdir} if it does not exist yet, then
+  # activate it before the pip calls below
+  test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv"
+  source "${tmpdir}/python-venv/bin/activate"
+  # Clear pip cache; TODO: make this conditional on which OSs have pip without cache purge
+  pip cache purge || echo "unable to purge pip cache"
+  if is_ramdisk ; then
+    # Download pip packages to tmpfs
+    mkdir -p "${tmpdir}/cache-dir"
+    pip config set global.cache-dir "${tmpdir}/cache-dir" || echo "unable to set global.cache-dir"
+  fi
+}
+
+
 function prepare_common_env() {
   define_os_comparison_functions
 
@@ -590,8 +613,6 @@ function prepare_common_env() {
 
   # Knox config
   readonly KNOX_HOME=/usr/lib/knox
-  readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0"
-  readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
 
   mkdir -p "${workdir}/complete"
   set_proxy
@@ -636,13 +657,17 @@ function prepare_common_env() {
   touch "${workdir}/complete/prepare.common"
 }
 
+function pip_exit_handler() { # exit-time counterpart of prepare_pip_env
+  if is_ramdisk ; then # the cache-dir override is only set when the ramdisk is mounted
+    # remove the tmpfs pip cache-dir; best-effort, a failure is only reported
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+  fi
+}
+
 function common_exit_handler() {
   set +ex
   echo "Exit handler invoked"
 
-  # Clear pip cache
-  pip cache purge || echo "unable to purge pip cache"
-
   # Restart YARN services if they are running already
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
@@ -653,9 +678,6 @@ function common_exit_handler() {
 
   # If system memory was sufficient to mount memory-backed filesystems
   if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    # remove the tmpfs pip cache-dir
-    pip config unset global.cache-dir || echo "unable to unset global pip cache"
-
     # Clean up shared memory mounts
     for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
       if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
@@ -678,6 +700,7 @@ function common_exit_handler() {
     dnf clean all
   fi
 
+  # When creating image, print disk usage statistics, zero unused disk space
   if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
     # print disk usage statistics for large components
     if is_ubuntu ; then
@@ -719,11 +742,12 @@ function common_exit_handler() {
           '@siz=( sort { $a => $b }
                    map { (split)[2] =~ /^(\d+)/ }
                   grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting};
 print( "    samples-taken: ", scalar @siz, $/,
-       "maximum-disk-used: $max", $/,
-       "minimum-disk-used: $min", $/,
-       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+       "starting-disk-used: $starting", $/,
+       "maximum-disk-used:  $max", $/,
+       "minimum-disk-used:  $min", $/,
+       "     increased-by:  $inc", $/ )' < "/run/disk-usage.log"
 
 
     # zero free disk space

From 6b73d22f1fc64b7cf9e80c6f115d1964d9283b59 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 22:28:13 -0800
Subject: [PATCH 099/130] calling functions from refactored pip setup/teardown

---
 templates/dask/dask.sh.in                 | 2 ++
 templates/gpu/install_gpu_driver.sh.in    | 2 ++
 templates/spark-rapids/spark-rapids.sh.in | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index dd2932042..b0279160f 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -44,12 +44,14 @@ function main() {
 }
 
 function exit_handler() {
+  pip_exit_handler
   common_exit_handler
   return 0
 }
 
 function prepare_to_install(){
   prepare_common_env
+  prepare_pip_env
   conda_env="$(get_metadata_attribute conda-env 'dask')"
   readonly conda_env
   prepare_dask_env
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index ffdda45e4..0e27f1086 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -39,12 +39,14 @@ function main() {
 
 function exit_handler() {
   gpu_exit_handler
+  pip_exit_handler
   common_exit_handler
   return 0
 }
 
 function prepare_to_install(){
   prepare_common_env
+  prepare_pip_env
   prepare_gpu_env
   trap exit_handler EXIT
 }
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index dc3ce3b36..1467fedf9 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -52,12 +52,14 @@ function main() {
 
 function exit_handler() {
   gpu_exit_handler
+  pip_exit_handler
   common_exit_handler
   return 0
 }
 
 function prepare_to_install(){
   prepare_common_env
+  prepare_pip_env
   prepare_gpu_env
   trap exit_handler EXIT
 }

From 2e45a7553fe45029ec8816995f1f04a9ca0d0a1a Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 22:29:21 -0800
Subject: [PATCH 100/130] moved knox dask config to
 templates/dask/util_functions

---
 templates/dask/util_functions | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index 54066b984..a2863ec8b 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -506,6 +506,9 @@ function prepare_dask_env() {
   readonly DASK_WORKER_SERVICE=dask-worker
   readonly DASK_SCHEDULER_SERVICE=dask-scheduler
   readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/${conda_env}"
+  # Knox dask config
+  readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0"
+  readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
 }
 
 function prepare_dask_rapids_env(){

From 33fdd38029f0379ced2ad2a53d1b3ff94a809ace Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 22:29:51 -0800
Subject: [PATCH 101/130] added copyright to templates/legal/license_header

---
 templates/legal/license_header | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/templates/legal/license_header b/templates/legal/license_header
index 4c05ecc74..0230ca951 100644
--- a/templates/legal/license_header
+++ b/templates/legal/license_header
@@ -1,3 +1,5 @@
+# Copyright 2015 Google LLC and contributors
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at

From 4f974c5aaada2809bf80fb86fcbc440428e41900 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 22:30:53 -0800
Subject: [PATCH 102/130] latest generated action

---
 gpu/install_gpu_driver.sh | 124 ++++++++++++++++++++++++--------------
 1 file changed, 78 insertions(+), 46 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 59a592d30..91ad4ede0 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# Copyright 2015 Google LLC and contributors
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -345,6 +347,20 @@ function set_proxy(){
   export NO_PROXY="${no_proxy}"
 }
 
+# Succeed (0) when /mnt/shm is a mounted ramdisk. The answer is cached in
+# IS_RAMDISK; pass -f to discard the cache and probe /proc/mounts again.
+function is_ramdisk() {
+  if [[ "${1:-}" == "-f" ]] ; then unset IS_RAMDISK ; fi
+  # compare the cached value as a string rather than executing it as a command
+  if   [[ "${IS_RAMDISK:-}" == "true"  ]] ; then return 0
+  elif [[ "${IS_RAMDISK:-}" == "false" ]] ; then return 1 ; fi
+  if ( test -d /mnt/shm && grep -q /mnt/shm /proc/mounts ) ; then
+    IS_RAMDISK="true"  ; return 0
+  else
+    IS_RAMDISK="false" ; return 1
+  fi
+}
+
 function mount_ramdisk(){
   local free_mem
   free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
@@ -353,18 +369,11 @@ function mount_ramdisk(){
   # Write to a ramdisk instead of churning the persistent disk
 
   tmpdir="/mnt/shm"
-  mkdir -p "${tmpdir}"
+  mkdir -p "${tmpdir}/pkgs_dirs"
   mount -t tmpfs tmpfs "${tmpdir}"
 
   # Download conda packages to tmpfs
-  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
-
-  # Clear pip cache
-  # TODO: make this conditional on which OSs have pip without cache purge
-  pip cache purge || echo "unable to purge pip cache"
-
-  # Download pip packages to tmpfs
-  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}/pkgs_dirs"
 
   # Download OS packages to tmpfs
   if is_debuntu ; then
@@ -372,6 +381,7 @@ function mount_ramdisk(){
   else
     mount -t tmpfs tmpfs /var/cache/dnf
   fi
+  is_ramdisk -f
 }
 
 function check_os() {
@@ -547,13 +557,13 @@ function check_secure_boot() {
   readonly PSN
 
   if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
-    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
-    exit 1
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Consider disabling Secure Boot while creating the cluster."
+    return
   elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
     echo "Secure boot is enabled, but no signing material provided."
-    echo "Please either disable secure boot or provide signing material as per"
+    echo "Consider either disabling secure boot or provide signing material as per"
     echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
-    return 1
+    return
   fi
 
   CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
@@ -565,6 +575,12 @@ function check_secure_boot() {
                       mok_der=/var/lib/dkms/mok.pub ; fi
 }
 
+function restart_knox() { # restart knox, clearing its deployments while it is stopped
+  systemctl stop knox
+  rm -rf "${KNOX_HOME:?}/data/deployments/"* # glob sits outside the quotes so it expands
+  systemctl start knox
+}
+
 function install_dependencies() {
   test -f "${workdir}/complete/install-dependencies" && return 0
   pkg_list="screen"
@@ -573,6 +589,21 @@ function install_dependencies() {
   touch "${workdir}/complete/install-dependencies"
 }
 
+function prepare_pip_env() {
+  # Create the python venv under ${tmpdir} if it does not exist yet, then
+  # activate it before the pip calls below
+  test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv"
+  source "${tmpdir}/python-venv/bin/activate"
+  # Clear pip cache; TODO: make this conditional on which OSs have pip without cache purge
+  pip cache purge || echo "unable to purge pip cache"
+  if is_ramdisk ; then
+    # Download pip packages to tmpfs
+    mkdir -p "${tmpdir}/cache-dir"
+    pip config set global.cache-dir "${tmpdir}/cache-dir" || echo "unable to set global.cache-dir"
+  fi
+}
+
+
 function prepare_common_env() {
   define_os_comparison_functions
 
@@ -594,6 +625,10 @@ function prepare_common_env() {
   ROLE="$(get_metadata_attribute dataproc-role)"
   readonly ROLE
 
+  # master node
+  MASTER="$(get_metadata_attribute dataproc-master)"
+  readonly MASTER
+
   workdir=/opt/install-dpgce
   tmpdir=/tmp/
   temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
@@ -604,6 +639,9 @@ function prepare_common_env() {
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
+  # Knox config
+  readonly KNOX_HOME=/usr/lib/knox
+
   mkdir -p "${workdir}/complete"
   set_proxy
   mount_ramdisk
@@ -647,13 +685,17 @@ function prepare_common_env() {
   touch "${workdir}/complete/prepare.common"
 }
 
+function pip_exit_handler() { # exit-time counterpart of prepare_pip_env
+  if is_ramdisk ; then # the cache-dir override is only set when the ramdisk is mounted
+    # remove the tmpfs pip cache-dir; best-effort, a failure is only reported
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+  fi
+}
+
 function common_exit_handler() {
   set +ex
   echo "Exit handler invoked"
 
-  # Clear pip cache
-  pip cache purge || echo "unable to purge pip cache"
-
   # Restart YARN services if they are running already
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
@@ -664,9 +706,6 @@ function common_exit_handler() {
 
   # If system memory was sufficient to mount memory-backed filesystems
   if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    # remove the tmpfs pip cache-dir
-    pip config unset global.cache-dir || echo "unable to unset global pip cache"
-
     # Clean up shared memory mounts
     for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
       if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
@@ -685,11 +724,11 @@ function common_exit_handler() {
     # re-hold systemd package
     if ge_debian12 ; then
     apt-mark hold systemd libsystemd0 ; fi
-    hold_nvidia_packages
   else
     dnf clean all
   fi
 
+  # When creating image, print disk usage statistics, zero unused disk space
   if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
     # print disk usage statistics for large components
     if is_ubuntu ; then
@@ -731,11 +770,12 @@ function common_exit_handler() {
           '@siz=( sort { $a => $b }
                    map { (split)[2] =~ /^(\d+)/ }
                   grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting};
 print( "    samples-taken: ", scalar @siz, $/,
-       "maximum-disk-used: $max", $/,
-       "minimum-disk-used: $min", $/,
-       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+       "starting-disk-used: $starting", $/,
+       "maximum-disk-used:  $max", $/,
+       "minimum-disk-used:  $min", $/,
+       "     increased-by:  $inc", $/ )' < "/run/disk-usage.log"
 
 
     # zero free disk space
@@ -802,6 +842,15 @@ function set_support_matrix() {
 set_support_matrix
 
 function set_cuda_version() {
+  case "${DATAPROC_IMAGE_VERSION}" in
+    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
+    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    *   )
+      echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
+      exit 1
+      ;;
+  esac
   local cuda_url
   cuda_url=$(get_metadata_attribute 'cuda-url' '')
   if [[ -n "${cuda_url}" ]] ; then
@@ -810,29 +859,8 @@ function set_cuda_version() {
     CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
     if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
       DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
-      CUDA_FULL_VERSION="${CUDA_URL_VERSION}"
     fi
   fi
-
-  if ( ! test -v DEFAULT_CUDA_VERSION ) ; then
-    DEFAULT_CUDA_VERSION='12.4.1'
-  fi
-  # EXCEPTIONS
-  # Change default CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
-  case "${DATAPROC_IMAGE_VERSION}" in
-    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;;
-    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
-    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
-    *   )
-      echo "unrecognized Dataproc image version"
-      exit 1
-      ;;
-  esac
-
-  if le_ubuntu18 ; then
-    DEFAULT_CUDA_VERSION="12.1.1"
-    CUDA_VERSION_MAJOR="${DEFAULT_CUDA_VERSION%.*}"  #12.1
-  fi
   readonly DEFAULT_CUDA_VERSION
 
   CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
@@ -845,7 +873,6 @@ function set_cuda_version() {
     CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
   fi
   readonly CUDA_FULL_VERSION
-
 }
 
 function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
@@ -2037,6 +2064,8 @@ function prepare_gpu_env(){
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
 # Users should run apt-mark unhold before they wish to upgrade these packages
 function hold_nvidia_packages() {
+  if ! is_debuntu ; then return ; fi
+
   apt-mark hold nvidia-*
   apt-mark hold libnvidia-*
   if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
@@ -2199,6 +2228,7 @@ function gpu_exit_handler() {
       fi
     done
   fi
+  hold_nvidia_packages
 }
 
 
@@ -2229,12 +2259,14 @@ function main() {
 
 function exit_handler() {
   gpu_exit_handler
+  pip_exit_handler
   common_exit_handler
   return 0
 }
 
 function prepare_to_install(){
   prepare_common_env
+  prepare_pip_env
   prepare_gpu_env
   trap exit_handler EXIT
 }

From 75d8e321bda73d723616fc531138121f9573425d Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 5 Jan 2025 23:53:47 -0800
Subject: [PATCH 103/130] removed redundant template disclaimer

---
 templates/spark-rapids/spark-rapids.sh.in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 1467fedf9..56603252b 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -22,7 +22,6 @@
 # For details see
 # github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot
 #
-[% PROCESS common/template_disclaimer %]
 
 set -euxo pipefail
 

From 34fce25def2b95b8542ac15fd79b7df0c93ca62b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 6 Jan 2025 02:01:40 -0800
Subject: [PATCH 104/130] setup and tear-down for actions which work with conda

---
 templates/common/util_functions | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index a52001db3..e27a1f9d5 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -575,6 +575,16 @@ function prepare_pip_env() {
   fi
 }
 
+function prepare_conda_env() { # snapshot ~/.condarc; use a tmpfs package cache when a ramdisk is mounted
+  CONDA=/opt/conda/miniconda3/bin/conda # global (not local): path to the conda binary
+  touch ~/.condarc # make sure the file exists so the copy below has a source
+  cp ~/.condarc ~/.condarc.default # snapshot; conda_exit_handler moves it back
+  if is_ramdisk ; then
+    # Download conda packages to tmpfs
+    mkdir -p "${tmpdir}/conda_cache"
+    ${CONDA} config --add pkgs_dirs "${tmpdir}/conda_cache"
+  fi
+}
 
 function prepare_common_env() {
   define_os_comparison_functions
@@ -664,6 +674,10 @@ function pip_exit_handler() {
   fi
 }
 
+function conda_exit_handler() { # restore the ~/.condarc snapshot taken by prepare_conda_env
+  if [[ -f ~/.condarc.default ]] ; then mv ~/.condarc.default ~/.condarc ; fi # no snapshot: nothing to restore
+}
+
 function common_exit_handler() {
   set +ex
   echo "Exit handler invoked"

From bbe062e8328941acff8f3209a839fc3fedf39016 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 6 Jan 2025 02:08:11 -0800
Subject: [PATCH 105/130] * refactored common conda installer functionality
 from dask.sh.in and   rapids.sh.in into a function install_conda_packages *
 removed redundant yarn service restarts in rapids.sh.in * added conda prep
 and exit handlers

---
 templates/dask/util_functions | 80 ++++++++++++++++++++---------------
 templates/gpu/util_functions  |  2 +-
 templates/rapids/rapids.sh.in | 13 +-----
 3 files changed, 50 insertions(+), 45 deletions(-)

diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index a2863ec8b..5705c4a78 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -377,10 +377,14 @@ EOF
 
 function install_dask() {
   local python_spec="python>=3.11"
-  local dask_spec="dask>=2024.7"
+  local dask_version="2024.12.1"
+  local dask_spec="dask>=${dask_version}"
+  local cache_key_name="dask-${dask_version}"
 
   CONDA_PACKAGES=()
   if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
+    dask_yarn_version="0.9"
+    cache_key_name="dask-yarn-${dask_yarn_version}"
     # Pin `distributed` and `dask` package versions to old release
     # because `dask-yarn` 0.9 uses skein in a way which
     # is not compatible with `distributed` package 2022.2 and newer:
@@ -392,7 +396,7 @@ function install_dask() {
       # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
       CONDA_PACKAGES+=("fiona<1.8.22")
     fi
-    CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2")
+    CONDA_PACKAGES+=("dask-yarn=${dask_yarn_version}" "distributed<2022.2")
   fi
 
   CONDA_PACKAGES+=(
@@ -402,49 +406,30 @@ function install_dask() {
     "dask-sql"
   )
 
-  # Install dask
-  mamba="/opt/conda/miniconda3/bin/mamba"
-  conda="/opt/conda/miniconda3/bin/conda"
-
-  ( set +e
-  local is_installed=0
-  for installer in "${mamba}" "${conda}" ; do
-    test -d "${DASK_CONDA_ENV}" || \
-      time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \
-      -c 'conda-forge' -c 'nvidia'  \
-      ${CONDA_PACKAGES[*]} \
-      "${python_spec}" \
-      > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
-    sync
-    if [[ "$retval" == "0" ]] ; then
-      is_installed="1"
-      break
-    fi
-    "${conda}" config --set channel_priority flexible
-  done
-  if [[ "${is_installed}" == "0" ]]; then
-    echo "failed to install dask"
-    return 1
-  fi
-  )
+  unset CONDA_CHANNEL_ARGS
+  local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}"
+  install_conda_packages "${cache_key}"
 }
 
 function install_dask_rapids() {
+  local numba_spec="numba"
+  local dask_version="2024.12.1"
+  local dask_spec="dask>=${dask_version}"
+
   if is_cuda12 ; then
     local python_spec="python>=3.11"
     local cuda_spec="cuda-version>=12,<13"
-    local dask_spec="dask>=2024.7"
-    local numba_spec="numba"
   elif is_cuda11 ; then
     local python_spec="python>=3.9"
     local cuda_spec="cuda-version>=11,<12.0a0"
-    local dask_spec="dask"
-    local numba_spec="numba"
   fi
 
   rapids_spec="rapids>=${RAPIDS_VERSION}"
   CONDA_PACKAGES=()
+  local cache_key_name="dask-rapids-${RAPIDS_VERSION}"
   if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
+    local rapids_version="24.05"
+    cache_key_name="dask-rapids-yarn-${rapids_version}"
     # Pin `distributed` and `dask` package versions to old release
     # because `dask-yarn` 0.9 uses skein in a way which
     # is not compatible with `distributed` package 2022.2 and newer:
@@ -452,7 +437,7 @@ function install_dask_rapids() {
 
     dask_spec="dask<2022.2"
     python_spec="python>=3.7,<3.8.0a0"
-    rapids_spec="rapids<=24.05"
+    rapids_spec="rapids<=${rapids_version}"
     if is_ubuntu18 ; then
       # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
       CONDA_PACKAGES+=("fiona<1.8.22")
@@ -471,6 +456,31 @@ function install_dask_rapids() {
     "${numba_spec}"
   )
 
+  CONDA_CHANNEL_ARGS="-c 'conda-forge' -c 'nvidia' -c 'rapidsai'"
+
+  local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}"
+  install_conda_packages "${cache_key}"
+}
+
+# The bash array CONDA_PACKAGES must contain a set of package
+# specifications before calling this function
+
+# The bash string CONDA_CHANNEL_ARGS may contain arguments to specify
+# conda channels. Default is "-c 'conda-forge'"
+
+function install_conda_packages() {
+  local cache_key="${1}"
+
+  local build_tarball="${cache_key}.tar.gz"
+  local gcs_tarball="${pkg_bucket}/conda/${cache_key%%_*}/${build_tarball}"
+  local local_tarball="${tmpdir}/${build_tarball}"
+
+  if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+    echo "cache hit"
+    gcloud storage cat "${gcs_tarball}" | tar -C / -xz
+    return 0
+  fi
+
   # Install cuda, rapids, dask
   mamba="/opt/conda/miniconda3/bin/mamba"
   conda="/opt/conda/miniconda3/bin/conda"
@@ -480,17 +490,21 @@ function install_dask_rapids() {
   for installer in "${mamba}" "${conda}" ; do
     test -d "${DASK_CONDA_ENV}" || \
       time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \
-      -c 'conda-forge' -c 'nvidia' -c 'rapidsai'  \
+      "${CONDA_CHANNEL_ARGS:- -c 'conda-forge'}"  \
       ${CONDA_PACKAGES[*]} \
       "${python_spec}" \
       > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
     sync
     if [[ "$retval" == "0" ]] ; then
       is_installed="1"
+      tar czf "${local_tarball}" "${DASK_CONDA_ENV}"
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
       break
     fi
     "${conda}" config --set channel_priority flexible
   done
+
   if [[ "${is_installed}" == "0" ]]; then
     echo "failed to install dask"
     return 1
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 46b49ef36..e86a2ff66 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -779,7 +779,7 @@ function install_nvidia_userspace_runfile() {
       depmod -a
     else
       clear_dkms_key
-      tar czvf "${local_tarball}" \
+      tar czf "${local_tarball}" \
         /var/log/nvidia-installer.log \
         $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
index a63f44b3b..7ca6c410c 100644
--- a/templates/rapids/rapids.sh.in
+++ b/templates/rapids/rapids.sh.in
@@ -36,27 +36,18 @@ function main() {
       configure_fluentd_for_dask
     fi
   fi
-
-  echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized."
-  if [[ "${ROLE}" == "Master" ]]; then
-    systemctl restart hadoop-yarn-resourcemanager.service
-    # Restart NodeManager on Master as well if this is a single-node-cluster.
-    if systemctl list-units | grep hadoop-yarn-nodemanager; then
-      systemctl restart hadoop-yarn-nodemanager.service
-    fi
-  else
-    systemctl restart hadoop-yarn-nodemanager.service
-  fi
 }
 
 function exit_handler() {
   gpu_exit_handler
+  conda_exit_handler
   common_exit_handler
   return 0
 }
 
 function prepare_to_install(){
   prepare_common_env
+  prepare_conda_env
   prepare_gpu_env
   conda_env="$(get_metadata_attribute conda-env 'dask-rapids')"
   readonly conda_env

From 10f16983a721bcaf5df7ce934a2d74b83da0e0ad Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 6 Jan 2025 10:50:11 -0800
Subject: [PATCH 106/130] tested rapids.sh init action with dataproc-repro

---
 templates/dask/dask.sh.in                 |  4 +++-
 templates/dask/util_functions             | 13 +++++--------
 templates/gpu/install_gpu_driver.sh.in    |  2 ++
 templates/gpu/util_functions              |  7 ++-----
 templates/rapids/rapids.sh.in             |  7 +++++--
 templates/spark-rapids/spark-rapids.sh.in |  4 ++++
 6 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index b0279160f..8e6d2d7d4 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -51,7 +51,9 @@ function exit_handler() {
 
 function prepare_to_install(){
   prepare_common_env
-  prepare_pip_env
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
+  readonly RAPIDS_RUNTIME
+  prepare_conda_env
   conda_env="$(get_metadata_attribute conda-env 'dask')"
   readonly conda_env
   prepare_dask_env
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index 5705c4a78..c9dc71b96 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -407,13 +407,13 @@ function install_dask() {
   )
 
   unset CONDA_CHANNEL_ARGS
-  local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}}"
+  local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}"
   install_conda_packages "${cache_key}"
 }
 
 function install_dask_rapids() {
   local numba_spec="numba"
-  local dask_version="2024.12.1"
+  local dask_version="2024.7"
   local dask_spec="dask>=${dask_version}"
 
   if is_cuda12 ; then
@@ -456,9 +456,9 @@ function install_dask_rapids() {
     "${numba_spec}"
   )
 
-  CONDA_CHANNEL_ARGS="-c 'conda-forge' -c 'nvidia' -c 'rapidsai'"
+  CONDA_CHANNEL_ARGS="-c conda-forge -c nvidia -c rapidsai"
 
-  local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}}"
+  local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}"
   install_conda_packages "${cache_key}"
 }
 
@@ -490,7 +490,7 @@ function install_conda_packages() {
   for installer in "${mamba}" "${conda}" ; do
     test -d "${DASK_CONDA_ENV}" || \
       time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \
-      "${CONDA_CHANNEL_ARGS:- -c 'conda-forge'}"  \
+      ${CONDA_CHANNEL_ARGS:- -c 'conda-forge'}  \
       ${CONDA_PACKAGES[*]} \
       "${python_spec}" \
       > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
@@ -527,9 +527,6 @@ function prepare_dask_env() {
 
 function prepare_dask_rapids_env(){
   prepare_dask_env
-  # RAPIDS config
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
-  readonly RAPIDS_RUNTIME
 
   local DEFAULT_DASK_RAPIDS_VERSION="24.08"
   if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 0e27f1086..57f4e640c 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -46,6 +46,8 @@ function exit_handler() {
 
 function prepare_to_install(){
   prepare_common_env
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+  readonly RAPIDS_RUNTIME
   prepare_pip_env
   prepare_gpu_env
   trap exit_handler EXIT
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index e86a2ff66..fb3e8fa4b 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -145,7 +145,8 @@ function set_cudnn_version() {
   readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
 
   # Parameters for NVIDIA-provided cuDNN library
-  readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+  DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly DEFAULT_CUDNN_VERSION
   CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
   # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
   if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
@@ -1252,10 +1253,6 @@ function prepare_gpu_env(){
   INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
   readonly INSTALL_GPU_AGENT
 
-  # Verify SPARK compatability
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-  readonly RAPIDS_RUNTIME
-
   # determine whether we have nvidia-smi installed and working
   nvsmi
 
diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
index 7ca6c410c..e6b973b45 100644
--- a/templates/rapids/rapids.sh.in
+++ b/templates/rapids/rapids.sh.in
@@ -47,11 +47,14 @@ function exit_handler() {
 
 function prepare_to_install(){
   prepare_common_env
-  prepare_conda_env
-  prepare_gpu_env
+  # Verify SPARK compatability
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
+  readonly RAPIDS_RUNTIME
   conda_env="$(get_metadata_attribute conda-env 'dask-rapids')"
   readonly conda_env
   prepare_dask_rapids_env
+  prepare_conda_env
+  prepare_gpu_env
   trap exit_handler EXIT
 }
 
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 56603252b..29bc83824 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -58,6 +58,10 @@ function exit_handler() {
 
 function prepare_to_install(){
   prepare_common_env
+  # Verify SPARK compatability
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+  readonly RAPIDS_RUNTIME
+
   prepare_pip_env
   prepare_gpu_env
   trap exit_handler EXIT

From 8a4cbd94d71ab20151b522d150d6a3a137101185 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 6 Jan 2025 15:00:55 -0800
Subject: [PATCH 107/130] templates/dask/dask.sh.in,
 templates/dask/util_functions, templates/gpu/install_gpu_driver.sh.in,
 templates/gpu/util_functions, templates/rapids/rapids.sh.in,
 templates/spark-rapids/spark-rapids.sh.in:

* cleaned up definition of RAPIDS_RUNTIME ; default to SPARK and use
  DASK only for dask-rapids

templates/dask/util_functions,
templates/gpu/util_functions,
templates/common/util_functions:

* added utility functions to check whether a phase is complete,
  mark a phase as complete, and mark a phase as incomplete

templates/dask/util_functions:

* conda environment is now archived from the environment directory
  rather than from /

templates/rapids/rapids.sh.in:

* Now executing gpu installer logic before installing dask-rapids
* now exiting if rapids runtime is not DASK
---
 templates/common/util_functions           | 24 +++++++--
 templates/dask/dask.sh.in                 |  2 -
 templates/dask/util_functions             | 24 +++++++--
 templates/gpu/install_gpu_driver.sh.in    |  2 -
 templates/gpu/util_functions              | 66 ++++++++++++++---------
 templates/rapids/rapids.sh.in             | 17 ++++--
 templates/spark-rapids/spark-rapids.sh.in |  4 --
 7 files changed, 95 insertions(+), 44 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index e27a1f9d5..351e20fad 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -553,12 +553,28 @@ function restart_knox() {
   systemctl start knox
 }
 
+function is_complete() {
+  local phase="$1"  # phase name; kept local so a caller's "phase" is not overwritten
+  test -f "${workdir}/complete/${phase}"  # status 0 only when the phase marker file exists
+}
+
+function mark_complete() {
+  local phase="$1"  # phase name; kept local so a caller's "phase" is not overwritten
+  touch "${workdir}/complete/${phase}"  # create the marker file that is_complete tests for
+}
+
+function mark_incomplete() {
+  local phase="$1"  # phase name; kept local so a caller's "phase" is not overwritten
+  rm -f "${workdir}/complete/${phase}"  # -f: no error when the marker file is already absent
+}
+
 function install_dependencies() {
-  test -f "${workdir}/complete/install-dependencies" && return 0
+  is_complete install-dependencies && return 0
+
   pkg_list="screen"
   if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
   elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
-  touch "${workdir}/complete/install-dependencies"
+  mark_complete install-dependencies
 }
 
 function prepare_pip_env() {
@@ -630,7 +646,7 @@ function prepare_common_env() {
 
   readonly install_log="${tmpdir}/install.log"
 
-  if test -f "${workdir}/complete/prepare.common" ; then return ; fi
+  is_complete prepare.common && return
 
   repair_old_backports
 
@@ -664,7 +680,7 @@ function prepare_common_env() {
       bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
  fi
 
-  touch "${workdir}/complete/prepare.common"
+  mark_complete prepare.common
 }
 
 function pip_exit_handler() {
diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index 8e6d2d7d4..cafc2df89 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -51,8 +51,6 @@ function exit_handler() {
 
 function prepare_to_install(){
   prepare_common_env
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
-  readonly RAPIDS_RUNTIME
   prepare_conda_env
   conda_env="$(get_metadata_attribute conda-env 'dask')"
   readonly conda_env
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index c9dc71b96..d1aee00b4 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -376,6 +376,8 @@ EOF
 }
 
 function install_dask() {
+  is_complete install.dask && return
+
   local python_spec="python>=3.11"
   local dask_version="2024.12.1"
   local dask_spec="dask>=${dask_version}"
@@ -409,9 +411,13 @@ function install_dask() {
   unset CONDA_CHANNEL_ARGS
   local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}"
   install_conda_packages "${cache_key}"
+
+  mark_complete install.dask
 }
 
 function install_dask_rapids() {
+  if ( is_complete install.dask-rapids && test -d "${DASK_CONDA_ENV}" ) ; then return ; fi
+
   local numba_spec="numba"
   local dask_version="2024.7"
   local dask_spec="dask>=${dask_version}"
@@ -460,6 +466,8 @@ function install_dask_rapids() {
 
   local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}"
   install_conda_packages "${cache_key}"
+
+  mark_complete install.dask-rapids
 }
 
 # The bash array CONDA_PACKAGES must contain a set of package
@@ -477,7 +485,8 @@ function install_conda_packages() {
 
   if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
     echo "cache hit"
-    gcloud storage cat "${gcs_tarball}" | tar -C / -xz
+    mkdir -p "${DASK_CONDA_ENV}"
+    time ( gcloud storage cat "${gcs_tarball}" | tar -C "${DASK_CONDA_ENV}" -xz )
     return 0
   fi
 
@@ -497,9 +506,13 @@ function install_conda_packages() {
     sync
     if [[ "$retval" == "0" ]] ; then
       is_installed="1"
-      tar czf "${local_tarball}" "${DASK_CONDA_ENV}"
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      rm "${local_tarball}"
+      pushd "${DASK_CONDA_ENV}"
+      time (
+        tar czf "${local_tarball}" .
+        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+        rm "${local_tarball}"
+      )
+      popd
       break
     fi
     "${conda}" config --set channel_priority flexible
@@ -528,6 +541,9 @@ function prepare_dask_env() {
 function prepare_dask_rapids_env(){
   prepare_dask_env
 
+  # Default rapids runtime
+  readonly DEFAULT_RAPIDS_RUNTIME='DASK'
+
   local DEFAULT_DASK_RAPIDS_VERSION="24.08"
   if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
     DEFAULT_DASK_RAPIDS_VERSION="23.08" # Final release to support spark 3.1.3
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 57f4e640c..0e27f1086 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -46,8 +46,6 @@ function exit_handler() {
 
 function prepare_to_install(){
   prepare_common_env
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-  readonly RAPIDS_RUNTIME
   prepare_pip_env
   prepare_gpu_env
   trap exit_handler EXIT
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index fb3e8fa4b..61d6bf478 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -306,7 +306,7 @@ function uninstall_cuda_keyring_pkg() {
 }
 
 function install_local_cuda_repo() {
-  if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi
+  is_complete install-local-cuda-repo && return
 
   if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
   CUDA_LOCAL_REPO_INSTALLED="1"
@@ -329,7 +329,7 @@ function install_local_cuda_repo() {
       -o /etc/apt/preferences.d/cuda-repository-pin-600
   fi
 
-  touch "${workdir}/complete/install-local-cuda-repo"
+  mark_complete install-local-cuda-repo
 }
 function uninstall_local_cuda_repo(){
   apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
@@ -337,7 +337,8 @@ function uninstall_local_cuda_repo(){
 }
 
 function install_local_cudnn_repo() {
-  if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi
+  is_complete install-local-cudnn-repo && return
+
   pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
   CUDNN_PKG_NAME="${pkgname}"
   local_deb_fn="${pkgname}_1.0-1_amd64.deb"
@@ -353,7 +354,7 @@ function install_local_cudnn_repo() {
 
   cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
 
-  touch "${workdir}/complete/install-local-cudnn-repo"
+  mark_complete install-local-cudnn-repo
 }
 
 function uninstall_local_cudnn_repo() {
@@ -362,7 +363,7 @@ function uninstall_local_cudnn_repo() {
 }
 
 function install_local_cudnn8_repo() {
-  if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi
+  is_complete install-local-cudnn8-repo && return
 
   if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
   elif is_debian ; then cudnn8_shortname="debian11"
@@ -396,19 +397,19 @@ function install_local_cudnn8_repo() {
   rm -f "${local_deb_fn}"
 
   cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  touch "${workdir}/complete/install-local-cudnn8-repo"
+  mark_complete install-local-cudnn8-repo
 }
 
 function uninstall_local_cudnn8_repo() {
   apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cudnn8-repo"
+  mark_incomplete install-local-cudnn8-repo
 }
 
 function install_nvidia_nccl() {
   readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
   readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
 
-  if test -f "${workdir}/complete/nccl" ; then return ; fi
+  is_complete nccl && return
 
   if is_cuda11 && is_debian12 ; then
     echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
@@ -499,14 +500,15 @@ function install_nvidia_nccl() {
   fi
 
   popd
-  touch "${workdir}/complete/nccl"
+  mark_complete nccl
 }
 
 function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
 function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
 
 function install_nvidia_cudnn() {
-  if test -f "${workdir}/complete/cudnn" ; then return ; fi
+  is_complete cudnn && return
+
   local major_version
   major_version="${CUDNN_VERSION%%.*}"
   local cudnn_pkg_version
@@ -565,7 +567,7 @@ function install_nvidia_cudnn() {
   ldconfig
 
   echo "NVIDIA cuDNN successfully installed for ${_shortname}."
-  touch "${workdir}/complete/cudnn"
+  mark_complete cudnn
 }
 
 function add_nonfree_components() {
@@ -722,7 +724,8 @@ function install_nvidia_userspace_runfile() {
   #
   # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
   # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
-  if test -f "${workdir}/complete/userspace" ; then return ; fi
+  is_complete userspace && return
+
   local local_fn="${tmpdir}/userspace.run"
 
   cache_fetched_package "${USERSPACE_URL}" \
@@ -788,12 +791,13 @@ function install_nvidia_userspace_runfile() {
   fi
 
   rm -f "${local_fn}"
-  touch "${workdir}/complete/userspace"
+  mark_complete userspace
   sync
 }
 
 function install_cuda_runfile() {
-  if test -f "${workdir}/complete/cuda" ; then return ; fi
+  is_complete cuda && return
+
   local local_fn="${tmpdir}/cuda.run"
 
   cache_fetched_package "${NVIDIA_CUDA_URL}" \
@@ -802,7 +806,7 @@ function install_cuda_runfile() {
 
   execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
   rm -f "${local_fn}"
-  touch "${workdir}/complete/cuda"
+  mark_complete cuda
   sync
 }
 
@@ -840,7 +844,7 @@ function load_kernel_module() {
 }
 
 function install_cuda(){
-  if test -f "${workdir}/complete/cuda-repo" ; then return ; fi
+  is_complete cuda-repo && return
 
   if ( ge_debian12 && is_src_os ) ; then
     echo "installed with the driver on ${_shortname}"
@@ -853,10 +857,12 @@ function install_cuda(){
   # Includes CUDA packages
   add_repo_cuda
 
-  touch "${workdir}/complete/cuda-repo"
+  mark_complete cuda-repo
 }
 
 function install_nvidia_container_toolkit() {
+  is_complete install-nvtk && return
+
   local container_runtime_default
     if command -v docker     ; then container_runtime_default='docker'
   elif command -v containerd ; then container_runtime_default='containerd'
@@ -872,11 +878,13 @@ function install_nvidia_container_toolkit() {
     execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
   nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
   systemctl restart "${CONTAINER_RUNTIME}"
+
+  mark_complete install-nvtk
 }
 
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
-  if test -f "${workdir}/complete/gpu-driver" ; then return ; fi
+  is_complete gpu-driver && return
 
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
@@ -898,11 +906,11 @@ function install_nvidia_gpu_driver() {
   build_driver_from_github
 
   echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  touch "${workdir}/complete/gpu-driver"
+  mark_complete gpu-driver
 }
 
 function install_ops_agent(){
-  if test -f "${workdir}/complete/ops-agent" ; then return ; fi
+  is_complete ops-agent && return
 
   mkdir -p /opt/google
   cd /opt/google
@@ -910,7 +918,7 @@ function install_ops_agent(){
   curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
   execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
 
-  touch "${workdir}/complete/ops-agent"
+  mark_complete ops-agent
 }
 
 # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
@@ -1189,7 +1197,7 @@ function query_nvsmi() {
 }
 
 function install_build_dependencies() {
-  if test -f "${workdir}/complete/build-dependencies" ; then return ; fi
+  is_complete build-dependencies && return
 
   if is_debuntu ; then
     if is_ubuntu22 && is_cuda12 ; then
@@ -1227,7 +1235,7 @@ function install_build_dependencies() {
 
     execute_with_retries "${dnf_cmd}"
   fi
-  touch "${workdir}/complete/build-dependencies"
+  mark_complete build-dependencies
 }
 
 function prepare_gpu_env(){
@@ -1245,6 +1253,14 @@ function prepare_gpu_env(){
   CUDNN8_PKG_NAME=""
   CUDA_LOCAL_REPO_INSTALLED="0"
 
+  if ! test -v DEFAULT_RAPIDS_RUNTIME ; then
+    readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
+  fi
+
+  # Verify SPARK compatability
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
+  readonly RAPIDS_RUNTIME
+
   # Whether to install NVIDIA-provided or OS-provided GPU driver
   GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
   readonly GPU_DRIVER_PROVIDER
@@ -1337,7 +1353,7 @@ function configure_mig_cgi() {
 }
 
 function enable_mig() {
-  if test -f "${workdir}/complete/enable-mig" ; then return ; fi
+  is_complete enable-mig && return
 
   # Start persistenced if it's not already running
   if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
@@ -1349,7 +1365,7 @@ function enable_mig() {
   nvsmi -mig 1
   clear_nvsmi_cache
 
-  touch "${workdir}/complete/enable-mig"
+  mark_complete enable-mig
 }
 
 function enable_and_configure_mig() {
diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
index e6b973b45..75f4c7605 100644
--- a/templates/rapids/rapids.sh.in
+++ b/templates/rapids/rapids.sh.in
@@ -16,6 +16,20 @@ set -euxo pipefail
 [% INSERT dask/util_functions %]
 
 function main() {
+  setup_gpu_yarn
+
+  echo "yarn setup complete"
+
+  if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then
+    install_nvidia_nccl
+    install_nvidia_cudnn
+  fi
+
+  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
+    echo "RAPIDS recognizes SPARK runtime - currently supported using gpu/install_gpu_driver.sh or spark-rapids/spark-rapids.sh"
+    exit 1
+  fi
+
   # Install Dask with RAPIDS
   install_dask_rapids
 
@@ -47,9 +61,6 @@ function exit_handler() {
 
 function prepare_to_install(){
   prepare_common_env
-  # Verify SPARK compatability
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
-  readonly RAPIDS_RUNTIME
   conda_env="$(get_metadata_attribute conda-env 'dask-rapids')"
   readonly conda_env
   prepare_dask_rapids_env
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 29bc83824..56603252b 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -58,10 +58,6 @@ function exit_handler() {
 
 function prepare_to_install(){
   prepare_common_env
-  # Verify SPARK compatability
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-  readonly RAPIDS_RUNTIME
-
   prepare_pip_env
   prepare_gpu_env
   trap exit_handler EXIT

From b01b8675f06d76aee5ce72cba8766a225c147fcc Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 6 Jan 2025 17:48:39 -0800
Subject: [PATCH 108/130] refactor yarn functions into their own template

---
 templates/common/util_functions           |  4 +-
 templates/common/yarn_functions           | 69 ++++++++++++++++
 templates/gpu/install_gpu_driver.sh.in    | 17 ++++
 templates/gpu/mig_functions               | 97 +++++++++++++++++++++++
 templates/gpu/util_functions              | 11 ++-
 templates/rapids/rapids.sh.in             |  2 +
 templates/spark-rapids/mig.sh.in          | 49 +++++++++++-
 templates/spark-rapids/spark-rapids.sh.in | 22 ++++-
 8 files changed, 261 insertions(+), 10 deletions(-)
 create mode 100644 templates/common/yarn_functions
 create mode 100644 templates/gpu/mig_functions

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 351e20fad..dfd2cfdf1 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -580,8 +580,8 @@ function install_dependencies() {
 function prepare_pip_env() {
   # Clear pip cache
   # TODO: make this conditional on which OSs have pip without cache purge
-  test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv"
-  source "${tmpdir}/python-venv/bin/activate"
+  test -d "${workdir}/python-venv" || python3 -m venv "${workdir}/python-venv"
+  source "${workdir}/python-venv/bin/activate"
 
   pip cache purge || echo "unable to purge pip cache"
   if is_ramdisk ; then
diff --git a/templates/common/yarn_functions b/templates/common/yarn_functions
new file mode 100644
index 000000000..8e38c7b0a
--- /dev/null
+++ b/templates/common/yarn_functions
@@ -0,0 +1,69 @@
+function configure_yarn_resources() {
+  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
+  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
+    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
+  fi
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
+
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+}
+
+# This configuration should be applied only if GPU is attached to the node
+function configure_yarn_nodemanager() {
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.container-executor.class' \
+    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+
+  # Fix local dirs access permissions
+  local yarn_local_dirs=()
+
+  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
+    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
+    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
+
+  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
+    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
+  fi
+}
+
+function setup_gpu_yarn() {
+  # This configuration should be run on all nodes
+  # regardless if they have attached GPUs
+  configure_yarn_resources
+
+  # When there is no GPU, but the installer is executing on a master node:
+  if [[ "${gpu_count}" == "0" ]] ; then
+    if [[ "${ROLE}" == "Master" ]]; then
+      configure_yarn_nodemanager
+    fi
+    return 0
+  fi
+
+  install_nvidia_container_toolkit
+  configure_yarn_nodemanager_gpu
+  configure_gpu_script
+  configure_gpu_isolation
+}
+
+function yarn_exit_handler() {
+  # Restart YARN services if they are running already
+  for svc in resourcemanager nodemanager; do
+    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
+      systemctl  stop "hadoop-yarn-${svc}.service"
+      systemctl start "hadoop-yarn-${svc}.service"
+    fi
+  done
+  # restart services stopped during preparation stage
+  # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+}
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 0e27f1086..dcbd8c15e 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -10,9 +10,25 @@ set -euxo pipefail
 
 [% INSERT common/util_functions %]
 
+[% INSERT common/yarn_functions %]
+
 [% INSERT gpu/util_functions %]
 
 function main() {
+  install_nvidia_gpu_driver
+  install_cuda
+  load_kernel_module
+
+  #Install GPU metrics collection in Stackdriver if needed
+  if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+    install_gpu_agent
+#    install_gpu_monitoring_agent
+    echo 'GPU metrics agent successfully deployed.'
+  else
+    echo 'GPU metrics agent has not been installed.'
+  fi
+  configure_gpu_exclusive_mode
+
   setup_gpu_yarn
 
   echo "yarn setup complete"
@@ -40,6 +56,7 @@ function main() {
 function exit_handler() {
   gpu_exit_handler
   pip_exit_handler
+  yarn_exit_handler
   common_exit_handler
   return 0
 }
diff --git a/templates/gpu/mig_functions b/templates/gpu/mig_functions
new file mode 100644
index 000000000..233b2d02c
--- /dev/null
+++ b/templates/gpu/mig_functions
@@ -0,0 +1,97 @@
+function fetch_mig_scripts() {
+  # Download the MIG-Support scripts nvidia-smi and mig2gpu.sh into a mode-755 directory
+  local script dest='/usr/local/yarn-mig-scripts' src='https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts'
+  mkdir -p "${dest}" ; chmod 755 "${dest}"
+  for script in nvidia-smi mig2gpu.sh ; do wget -P "${dest}/" "${src}/${script}" ; done
+  chmod 755 "${dest}"/*
+}
+
+function delete_mig_instances() (
+  # Delete all MIG compute instances, then all MIG GPU instances, echoing what
+  # each exit code meant.  Subshell body: `set +e` does not leak to the caller.
+  set +e
+  nvidia-smi mig -dci
+  case "${?}" in
+    "0" ) echo "compute instances deleted"            ;;
+    "2" ) echo "invalid argument"                     ;;
+    "6" ) echo "No compute instances found to delete" ;;
+    *   ) echo "unrecognized return code"             ;;
+  esac
+
+  nvidia-smi mig -dgi
+  case "${?}" in
+    "0" ) echo "GPU instances deleted"            ;;
+    "2" ) echo "invalid argument"                 ;;
+    "6" ) echo "No GPU instances found to delete" ;;
+    *   ) echo "unrecognized return code"         ;;
+  esac
+)
+
+# Drop existing MIG instances, then create new ones from the MIG_CGI metadata attribute or a PCI-ID based default ; see https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles
+function configure_mig_cgi() {
+  delete_mig_instances
+  META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')"
+  if test -n "${META_MIG_CGI_VALUE}"; then
+    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
+  else
+    # https://pci-ids.ucw.cz/v2.2/pci.ids
+    local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)"
+    if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then
+      # run the following command to list placement profiles
+      # nvidia-smi mig -lgipp
+      #
+      # This is the result when using H100 instances on 20241220
+      # GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
+      # GPU  0 Profile ID 14 Placements: {0,2,4}:2
+      # GPU  0 Profile ID  9 Placements: {0,4}:4
+      # GPU  0 Profile ID  5 Placement : {0}:4
+      # GPU  0 Profile ID  0 Placement : {0}:8
+
+      # H100 3D controllers: currently split in 2 with profile 9 ; consider profile 19 for 7x1G instances
+      nvidia-smi mig -cgi 9,9 -C
+    elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then
+      # NOTE(review): 10DE:20xx looks like the GA100 (A100-class) range rather than H100 - confirm ; split in 2 if not specified
+      # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
+      nvidia-smi mig -cgi 9,9 -C
+    else
+      echo "unrecognized 3D controller"
+    fi
+  fi
+  clear_nvsmi_cache
+}
+
+function enable_mig() {
+  is_complete enable-mig && return
+  # Start persistenced if it's not already running.  NOTE(review): the escaped
+  # dash presumably keeps grep from matching its own entry in the ps listing.
+  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
+  for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
+    # Write an ascii zero to the numa node indicator
+    echo "0" | dd of="${f}" status=none
+  done
+  time nvsmi --gpu-reset # 30s
+  nvsmi -mig 1
+  clear_nvsmi_cache
+
+  mark_complete enable-mig
+}
+
+function enable_and_configure_mig() {
+  # Enable MIG mode unless the ENABLE_MIG metadata attribute is 0, check that
+  # every GPU reports it Enabled (exit 1 otherwise), then create MIG instances.
+  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") # default MIG to on
+  if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
+
+  enable_mig
+  query_nvsmi
+  local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
+  mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")"
+
+  # Anything other than exactly one distinct current_mig value is a failure
+  if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all GPUs.  Failing" ; exit 1 ; fi
+  if ! (echo "${mig_mode_current}" | grep Enabled)                ; then echo "MIG is configured but NOT enabled.  Failing" ; exit 1 ; fi
+  echo "MIG is fully enabled"
+  configure_mig_cgi
+}
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 61d6bf478..e8aa1a8d5 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -943,9 +943,12 @@ function download_gpu_monitoring_agent(){
 
 function install_gpu_monitoring_agent_dependency(){
   cd /opt/google/compute-gpu-monitoring/linux
-  python3 -m venv venv
-  venv/bin/pip install wheel
-  venv/bin/pip install -Ur requirements.txt
+  /usr/bin/python3 -m venv venv
+  (
+    source venv/bin/activate
+    pip install wheel
+    pip install -Ur requirements.txt
+  )
 }
 
 function start_gpu_monitoring_agent_service(){
@@ -971,7 +974,7 @@ function install_gpu_agent() {
     | sed -e 's/-u --format=/--format=/' \
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
   local venv="${install_dir}/venv"
-  python3 -m venv "${venv}"
+  /usr/bin/python3 -m venv "${venv}"
 (
   source "${venv}/bin/activate"
   python3 -m pip install --upgrade pip
diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
index 75f4c7605..4e46ab1d3 100644
--- a/templates/rapids/rapids.sh.in
+++ b/templates/rapids/rapids.sh.in
@@ -54,6 +54,7 @@ function main() {
 
 function exit_handler() {
   gpu_exit_handler
+  pip_exit_handler
   conda_exit_handler
   common_exit_handler
   return 0
@@ -65,6 +66,7 @@ function prepare_to_install(){
   readonly conda_env
   prepare_dask_rapids_env
   prepare_conda_env
+  prepare_pip_env
   prepare_gpu_env
   trap exit_handler EXIT
 }
diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in
index 28a463602..99b494c4f 100644
--- a/templates/spark-rapids/mig.sh.in
+++ b/templates/spark-rapids/mig.sh.in
@@ -14,13 +14,55 @@
 #
 [% PROCESS common/template_disclaimer %]
 
-set -euxo pipefail
-
 [% INSERT common/util_functions %]
 
+[% INSERT common/yarn_functions %]
+
+[% INSERT gpu/mig_functions %]
+
 [% INSERT gpu/util_functions %]
 
+set -euxo pipefail
+
 function main() {
+  if [[ "${nvsmi_works}" == "1" ]] ; then
+    # if this is called without the MIG script then the drivers are not installed
+    query_nvsmi
+    local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
+    set +e
+    migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
+    set -e
+    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
+
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
+          fetch_mig_scripts
+        fi
+      fi
+    fi
+  fi
+
+  # if mig is enabled drivers would have already been installed
+  if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+    install_nvidia_gpu_driver
+    install_cuda
+    load_kernel_module
+
+    #Install GPU metrics collection in Stackdriver if needed
+    if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+      install_gpu_agent
+#      install_gpu_monitoring_agent
+      echo 'GPU metrics agent successfully deployed.'
+    else
+      echo 'GPU metrics agent has not been installed.'
+    fi
+    configure_gpu_exclusive_mode
+  fi
+
   setup_gpu_yarn
 
   echo "yarn setup complete"
@@ -33,12 +75,15 @@ function main() {
 
 function exit_handler() {
   gpu_exit_handler
+  pip_exit_handler
+  yarn_exit_handler
   common_exit_handler
   return 0
 }
 
 function prepare_to_install(){
   prepare_common_env
+  prepare_pip_env
   prepare_gpu_env
   trap exit_handler EXIT
 }
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 56603252b..0bfc0b331 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -27,9 +27,25 @@ set -euxo pipefail
 
 [% INSERT common/util_functions %]
 
+[% INSERT common/yarn_functions %]
+
 [% INSERT gpu/util_functions %]
 
 function main() {
+  install_nvidia_gpu_driver
+  install_cuda
+  load_kernel_module
+
+  #Install GPU metrics collection in Stackdriver if needed
+  if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+#    install_gpu_agent
+    install_gpu_monitoring_agent
+    echo 'GPU metrics agent successfully deployed.'
+  else
+    echo 'GPU metrics agent has not been installed.'
+  fi
+  configure_gpu_exclusive_mode
+
   setup_gpu_yarn
 
   echo "yarn setup complete"
@@ -39,10 +55,11 @@ function main() {
     configure_gpu_script
     echo "RAPIDS initialized with Spark runtime"
   elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
-    # we are not currently tooled for installing dask in this action.
-    echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh"
+    echo "This action only installs spark-rapids"
+    exit 1
   else
     echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
+    exit 1
   fi
 
   echo "main complete"
@@ -52,6 +69,7 @@ function main() {
 function exit_handler() {
   gpu_exit_handler
   pip_exit_handler
+  yarn_exit_handler
   common_exit_handler
   return 0
 }

From c6c09db27d71a0affe97665fabf9786d7b215ed8 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 6 Jan 2025 17:49:24 -0800
Subject: [PATCH 109/130] refactor mig functions into their own template

---
 templates/common/util_functions |  61 ++-----------
 templates/dask/util_functions   |  12 +--
 templates/gpu/util_functions    | 155 --------------------------------
 3 files changed, 10 insertions(+), 218 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index dfd2cfdf1..4d9f983a4 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -162,45 +162,6 @@ function set_hadoop_property() {
     --clobber
 }
 
-function configure_yarn_resources() {
-  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
-  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
-    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
-  fi
-  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
-
-  set_hadoop_property 'capacity-scheduler.xml' \
-    'yarn.scheduler.capacity.resource-calculator' \
-    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
-
-  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
-}
-
-# This configuration should be applied only if GPU is attached to the node
-function configure_yarn_nodemanager() {
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.container-executor.class' \
-    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
-
-  # Fix local dirs access permissions
-  local yarn_local_dirs=()
-
-  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
-    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
-    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
-
-  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
-    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
-  fi
-}
-
 function clean_up_sources_lists() {
   #
   # bigtop (primary)
@@ -664,17 +625,18 @@ function prepare_common_env() {
     dnf clean all
   fi
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+  # When creating a disk image:
+  if [[ -n "$(get_metadata_attribute creating-image "")" ]]; then
+    df / > "/run/disk-usage.log"
 
- ( set +e
+  # zero free disk space
+  ( set +e
     time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
   )
 
     install_dependencies
 
     # Monitor disk usage in a screen session
-    df / > "/run/disk-usage.log"
     touch "/run/keep-running-df"
     screen -d -m -LUS keep-running-df \
       bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
@@ -698,25 +660,14 @@ function common_exit_handler() {
   set +ex
   echo "Exit handler invoked"
 
-  # Restart YARN services if they are running already
-  for svc in resourcemanager nodemanager; do
-    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
-      systemctl  stop "hadoop-yarn-${svc}.service"
-      systemctl start "hadoop-yarn-${svc}.service"
-    fi
-  done
-
   # If system memory was sufficient to mount memory-backed filesystems
-  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
+  if is_ramdisk ; then
     # Clean up shared memory mounts
     for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
       if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
         umount -f ${shmdir}
       fi
     done
-
-    # restart services stopped during preparation stage
-    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
   fi
 
   if is_debuntu ; then
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index d1aee00b4..5a1f7e201 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -74,9 +74,10 @@ EOF
   else
     # Enable service on single-node cluster (no workers)
     local worker_count="$(get_metadata_attribute dataproc-worker-count)"
-    if [[ "${worker_count}" == "0" ]] &&
-       [[ "$(get_metadata_attribute dask-cuda-worker-on-master 'true')" == "true" ]] &&
-       [[ "$(get_metadata_attribute dask-worker-on-master 'true')" == "true" ]] ; then
+    if ( [[ "${worker_count}" == "0" ]] ||
+         ( [[ "$(get_metadata_attribute dask-cuda-worker-on-master 'true')" == "true" ]] &&
+           [[ "$(get_metadata_attribute dask-worker-on-master 'true')"      == "true" ]] )
+       ) ; then
       enable_systemd_dask_worker_service="1"
     fi
   fi
@@ -550,8 +551,3 @@ function prepare_dask_rapids_env(){
   fi
   readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION})
 }
-
-
-function dask_exit_handler() {
-  echo "no exit handler for dask"
-}
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index e8aa1a8d5..4834adb33 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1013,14 +1013,6 @@ function configure_gpu_exclusive_mode() {
   clear_nvsmi_cache
 }
 
-function fetch_mig_scripts() {
-  mkdir -p /usr/local/yarn-mig-scripts
-  chmod 755 /usr/local/yarn-mig-scripts
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
-  chmod 755 /usr/local/yarn-mig-scripts/*
-}
-
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
   local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
@@ -1299,153 +1291,6 @@ function hold_nvidia_packages() {
   fi
 }
 
-function delete_mig_instances() (
-  # delete all instances
-  set +e
-  nvidia-smi mig -dci
-
-  case "${?}" in
-    "0" ) echo "compute instances deleted"            ;;
-    "2" ) echo "invalid argument"                     ;;
-    "6" ) echo "No compute instances found to delete" ;;
-    *   ) echo "unrecognized return code"             ;;
-  esac
-
-  nvidia-smi mig -dgi
-  case "${?}" in
-    "0" ) echo "compute instances deleted"        ;;
-    "2" ) echo "invalid argument"                 ;;
-    "6" ) echo "No GPU instances found to delete" ;;
-    *   ) echo "unrecognized return code"         ;;
-  esac
-)
-
-# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles
-function configure_mig_cgi() {
-  delete_mig_instances
-  META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')"
-  if test -n "${META_MIG_CGI_VALUE}"; then
-    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
-  else
-    # https://pci-ids.ucw.cz/v2.2/pci.ids
-    local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)"
-    if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then
-      # run the following command to list placement profiles
-      # nvidia-smi mig -lgipp
-      #
-      # This is the result when using H100 instances on 20241220
-      # GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
-      # GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
-      # GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
-      # GPU  0 Profile ID 14 Placements: {0,2,4}:2
-      # GPU  0 Profile ID  9 Placements: {0,4}:4
-      # GPU  0 Profile ID  5 Placement : {0}:4
-      # GPU  0 Profile ID  0 Placement : {0}:8
-
-      # For H100 3D controllers, consider profile 19, 7x1G instances
-      nvidia-smi mig -cgi 9,9 -C
-    elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then
-      # Dataproc only supports H100s right now ; split in 2 if not specified
-      # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
-      nvidia-smi mig -cgi 9,9 -C
-    else
-      echo "unrecognized 3D controller"
-    fi
-  fi
-  clear_nvsmi_cache
-}
-
-function enable_mig() {
-  is_complete enable-mig && return
-
-  # Start persistenced if it's not already running
-  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
-  for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
-    # Write an ascii zero to the numa node indicator
-    echo "0" | dd of="${f}" status=none
-  done
-  time nvsmi --gpu-reset # 30s
-  nvsmi -mig 1
-  clear_nvsmi_cache
-
-  mark_complete enable-mig
-}
-
-function enable_and_configure_mig() {
-  # default MIG to on when this script is used
-  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
-
-  if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
-
-  enable_mig
-  query_nvsmi
-  local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
-  mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")"
-
-  if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs.  Failing" ; exit 1 ; fi
-  if ! (echo "${mig_mode_current}" | grep Enabled)                ; then echo "MIG is configured but NOT enabled.  Failing" ; exit 1 ; fi
-
-  echo "MIG is fully enabled"
-  configure_mig_cgi
-}
-
-function setup_gpu_yarn() {
-  # This configuration should be run on all nodes
-  # regardless if they have attached GPUs
-  configure_yarn_resources
-
-  # When there is no GPU, but the installer is executing on a master node:
-  if [[ "${gpu_count}" == "0" ]] ; then
-    if [[ "${ROLE}" == "Master" ]]; then
-      configure_yarn_nodemanager
-    fi
-    return 0
-  fi
-
-  if [[ "${nvsmi_works}" == "1" ]] ; then
-    # if this is called without the MIG script then the drivers are not installed
-    query_nvsmi
-    local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
-    set +e
-    migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
-    set -e
-    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-
-    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-        if (echo "${migquery_result}" | grep Enabled); then
-          IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
-        fi
-      fi
-    fi
-  fi
-
-  # if mig is enabled drivers would have already been installed
-  if [[ $IS_MIG_ENABLED -eq 0 ]]; then
-    install_nvidia_gpu_driver
-    install_cuda
-    load_kernel_module
-
-    #Install GPU metrics collection in Stackdriver if needed
-    if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-      install_gpu_agent
-#      install_gpu_monitoring_agent
-      echo 'GPU metrics agent successfully deployed.'
-    else
-      echo 'GPU metrics agent has not been installed.'
-    fi
-    configure_gpu_exclusive_mode
-  fi
-
-  install_nvidia_container_toolkit
-  configure_yarn_nodemanager_gpu
-  configure_gpu_script
-  configure_gpu_isolation
-}
-
 function gpu_exit_handler() {
   if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
     for shmdir in /var/cudnn-local ; do

From 88f9f7f70370697555fc413e1cbc49e7a4f99507 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 15:16:47 -0800
Subject: [PATCH 110/130] state before gpu rebranch

---
 cloudbuild/presubmit.sh                   |    5 +
 templates/common/install_functions        |   53 +
 templates/common/yarn_functions           |   33 -
 templates/dask/dask.sh.in                 |    2 +-
 templates/dask/util_functions             |   18 +-
 templates/gpu/install_functions           |  947 ++++++++++++++++++
 templates/gpu/install_gpu_driver.sh.in    |   14 +-
 templates/gpu/spark_functions             |   36 +
 templates/gpu/util_functions              | 1093 +--------------------
 templates/gpu/yarn_functions              |  145 +++
 templates/rapids/rapids.sh.in             |   15 +-
 templates/spark-rapids/spark-rapids.sh.in |   13 +-
 12 files changed, 1220 insertions(+), 1154 deletions(-)
 create mode 100644 templates/common/install_functions
 create mode 100644 templates/gpu/install_functions
 create mode 100644 templates/gpu/spark_functions
 create mode 100644 templates/gpu/yarn_functions

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index 2b2e978b0..1ec0d5756 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -50,6 +50,11 @@ determine_tests_to_run() {
   # Infer the files that changed
   mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD)
   mapfile -t CHANGED_FILES < <(git diff origin/master --name-only | grep -v template)
+  for tt in $(git diff origin/master --name-only | grep 'templates/.*/.*\.sh\.in'); do
+    local genfile=`perl -e "print( q{${tt}} =~ m:templates/(.*?.sh).in: )"`
+    perl templates/generate-action.pl "${genfile}" > "${genfile}"
+    CHANGED_FILES+=("${genfile}")
+  done
   echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}"
   echo "Changed files: ${CHANGED_FILES[*]}"
 
diff --git a/templates/common/install_functions b/templates/common/install_functions
new file mode 100644
index 000000000..f731feed6
--- /dev/null
+++ b/templates/common/install_functions
@@ -0,0 +1,53 @@
+# Generate repo file under /etc/apt/sources.list.d/ and run `apt-get update`.
+# Args: $1 repo name ; $2 unused here ; $3 repo data ; $4 "yes" (default) to
+# also write a deb-src entry ; $5 keyring path ; $6 path of the .list file
+function apt_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local -r include_src="${4:-yes}"
+  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
+
+  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
+  if [[ "${include_src}" == "yes" ]] ; then
+    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
+  fi
+
+  apt-get update -qq
+}
+
+# Generate repo file under /etc/yum.repos.d/ by downloading a .repo file.
+# Args: $1 repo name ; $3 URL of the .repo file ; $5 keyring path (only used
+# by the disabled gpgkey rewrite) ; $6 output path.  $2 and $4 are ignored.
+function dnf_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
+  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
+  # -f: fail on HTTP errors rather than saving the error page as the .repo file
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${repo_url}" \
+    | dd of="${repo_path}" status=progress
+#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
+}
+
+# Import a signing key, then register a repo via apt_add_repo or dnf_add_repo.
+# Args: $1 repo name ; $2 signing key URL ; $3 repo data ; $4 and $6 are passed
+# on ($4 defaults to "yes", $6 to empty).  $5 is the keyring ; it defaults to
+# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
+# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
+function os_add_repo() {
+  local -r repo_name="$1"
+  local -r signing_key_url="$2"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local kr_path
+  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
+
+  mkdir -p "$(dirname "${kr_path}")"
+
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
+    | gpg --import --no-default-keyring --keyring "${kr_path}"
+
+  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
+                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
+}
diff --git a/templates/common/yarn_functions b/templates/common/yarn_functions
index 8e38c7b0a..6e556f975 100644
--- a/templates/common/yarn_functions
+++ b/templates/common/yarn_functions
@@ -1,17 +1,3 @@
-function configure_yarn_resources() {
-  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
-  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
-    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
-  fi
-  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
-
-  set_hadoop_property 'capacity-scheduler.xml' \
-    'yarn.scheduler.capacity.resource-calculator' \
-    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
-
-  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
-}
-
 # This configuration should be applied only if GPU is attached to the node
 function configure_yarn_nodemanager() {
   set_hadoop_property 'yarn-site.xml' \
@@ -37,25 +23,6 @@ function configure_yarn_nodemanager() {
   fi
 }
 
-function setup_gpu_yarn() {
-  # This configuration should be run on all nodes
-  # regardless if they have attached GPUs
-  configure_yarn_resources
-
-  # When there is no GPU, but the installer is executing on a master node:
-  if [[ "${gpu_count}" == "0" ]] ; then
-    if [[ "${ROLE}" == "Master" ]]; then
-      configure_yarn_nodemanager
-    fi
-    return 0
-  fi
-
-  install_nvidia_container_toolkit
-  configure_yarn_nodemanager_gpu
-  configure_gpu_script
-  configure_gpu_isolation
-}
-
 function yarn_exit_handler() {
   # Restart YARN services if they are running already
   for svc in resourcemanager nodemanager; do
diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
index cafc2df89..2f8450dd6 100644
--- a/templates/dask/dask.sh.in
+++ b/templates/dask/dask.sh.in
@@ -31,7 +31,7 @@ function main() {
 
     configure_knox_for_dask
 
-    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')"
+    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging 'false')"
     if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then
       configure_fluentd_for_dask
     fi
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index 5a1f7e201..d67da1fc1 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -423,15 +423,16 @@ function install_dask_rapids() {
   local dask_version="2024.7"
   local dask_spec="dask>=${dask_version}"
 
-  if is_cuda12 ; then
-    local python_spec="python>=3.11"
-    local cuda_spec="cuda-version>=12,<13"
-  elif is_cuda11 ; then
-    local python_spec="python>=3.9"
-    local cuda_spec="cuda-version>=11,<12.0a0"
+  local python_spec="python>=3.11"
+  local cuda_spec="cuda-version>=12,<13"
+  local cudart_spec="cuda-cudart"
+  if is_cuda11 ; then
+    python_spec="python>=3.9"
+    cuda_spec="cuda-version>=11,<12.0a0"
+    cudart_spec="cudatoolkit"
   fi
 
-  rapids_spec="rapids>=${RAPIDS_VERSION}"
+  local rapids_spec="rapids>=${RAPIDS_VERSION}"
   CONDA_PACKAGES=()
   local cache_key_name="dask-rapids-${RAPIDS_VERSION}"
   if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
@@ -443,7 +444,7 @@ function install_dask_rapids() {
     # https://github.com/dask/dask-yarn/issues/155
 
     dask_spec="dask<2022.2"
-    python_spec="python>=3.7,<3.8.0a0"
+    python_spec="python>=3.9"
     rapids_spec="rapids<=${rapids_version}"
     if is_ubuntu18 ; then
       # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
@@ -454,6 +455,7 @@ function install_dask_rapids() {
 
   CONDA_PACKAGES+=(
     "${cuda_spec}"
+    "${cudart_spec}"
     "${rapids_spec}"
     "${dask_spec}"
     "dask-bigquery"
diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions
new file mode 100644
index 000000000..2ea8ca4d2
--- /dev/null
+++ b/templates/gpu/install_functions
@@ -0,0 +1,947 @@
+function set_cudnn_version() {
+  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+  # Parameters for NVIDIA-provided cuDNN library
+  DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly DEFAULT_CUDNN_VERSION
+  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
+  if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
+    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
+    # cuDNN v8 is not distributed for ubuntu20+, debian12 ; move up to ${DEFAULT_CUDNN9_VERSION}
+    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+  elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then
+    # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
+    CUDNN_VERSION="8.8.0.121"
+  fi
+  readonly CUDNN_VERSION
+}
+
+
+function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
+function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
+
+function set_cuda_repo_shortname() {
+# Short name for urls
+# https://developer.download.nvidia.com/compute/cuda/repos/${shortname}
+  if is_rocky ; then
+    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
+  else
+    shortname="$(os_id)$(os_vercat)"
+  fi
+}
+
+function set_nv_urls() {
+  # Parameters for NVIDIA-provided package repositories
+  readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
+  readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
+
+  # Parameter for NVIDIA-provided Rocky Linux GPU driver
+  readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+}
+
+function set_cuda_runfile_url() {
+  local MAX_DRIVER_VERSION
+  local MAX_CUDA_VERSION
+
+  local MIN_OPEN_DRIVER_VER="515.48.07"
+  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
+  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
+
+  if is_cuda12 ; then
+    if is_debian12 ; then
+      MIN_DRIVER_VERSION="545.23.06"
+      MIN_CUDA_VERSION="12.3.0"
+    elif is_debian10 ; then
+      MAX_DRIVER_VERSION="555.42.02"
+      MAX_CUDA_VERSION="12.5.0"
+    elif is_ubuntu18 ; then
+      MAX_DRIVER_VERSION="530.30.02"
+      MAX_CUDA_VERSION="12.1.1"
+    fi
+  elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    if le_debian10 ; then
+      # cuda 11 is not supported for <= debian10
+      MAX_CUDA_VERSION="0"
+      MAX_DRIVER_VERSION="0"
+    fi
+  else
+    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  fi
+
+  if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
+    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  fi
+  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
+    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
+    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  fi
+
+  # driver version named in cuda runfile filename
+  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
+  readonly -A drv_for_cuda=(
+          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
+          ["11.8.0"]="520.61.05"
+          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
+          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
+          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
+          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
+          ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
+          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
+          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+  )
+
+  # Verify that the file with the indicated combination exists
+  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
+  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
+  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
+  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
+
+  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
+  readonly NVIDIA_CUDA_URL
+
+  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
+  readonly CUDA_RUNFILE
+
+  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
+    exit 1
+  fi
+
+  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
+    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
+  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
+    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
+    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
+    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
+  fi
+}
+
+function set_cudnn_tarball_url() {
+CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
+CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
+if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
+  # Default name for cuDNN >= 8.3.1.22: CUDA_VERSION with its last dotted component stripped
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
+  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
+    # When version_le holds against 8.4.1.50 (i.e. 8.3.1.22 through 8.4.1.50) the name carries the full CUDA_VERSION
+    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
+  fi
+  # Use legacy url format with one of the tarball name formats depending on version as above
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
+fi
+if ( version_ge "${CUDA_VERSION}" "12.0" ); then
+  # Use modern url format When cuda version is greater than or equal to 12.0
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
+fi
+readonly CUDNN_TARBALL
+readonly CUDNN_TARBALL_URL
+}
+
+function install_cuda_keyring_pkg() {
+  if ( test -v CUDA_KEYRING_PKG_INSTALLED &&
+       [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi
+  local kr_ver=1.1
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
+    -o "${tmpdir}/cuda-keyring.deb"
+  dpkg -i "${tmpdir}/cuda-keyring.deb"
+  rm -f "${tmpdir}/cuda-keyring.deb"
+  CUDA_KEYRING_PKG_INSTALLED="1"
+}
+
+function uninstall_cuda_keyring_pkg() {
+  apt-get purge -yq cuda-keyring
+  CUDA_KEYRING_PKG_INSTALLED="0"
+}
+
+function install_local_cuda_repo() { # fetch and dpkg-install NVIDIA's local CUDA repo .deb, then copy its keyring into /usr/share/keyrings
+  is_complete install-local-cuda-repo && return
+
+  if [[ "${CUDA_LOCAL_REPO_INSTALLED:-}" == "1" ]]; then return ; fi # flag may be unset on first call; ':-' keeps 'set -u' from aborting
+  CUDA_LOCAL_REPO_INSTALLED="1"
+  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
+  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
+  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
+  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
+  readonly DIST_KEYRING_DIR="/var/${pkgname}"
+
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+
+  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
+
+  if is_ubuntu ; then
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
+      -o /etc/apt/preferences.d/cuda-repository-pin-600
+  fi
+
+  mark_complete install-local-cuda-repo
+}
+function uninstall_local_cuda_repo(){
+  apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
+  rm -f "${workdir}/complete/install-local-cuda-repo"
+}
+
+function install_local_cudnn_repo() {
+  is_complete install-local-cudnn-repo && return
+
+  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
+  CUDNN_PKG_NAME="${pkgname}"
+  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
+
+  # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
+
+  dpkg -i "${tmpdir}/local-installer.deb"
+
+  rm -f "${tmpdir}/local-installer.deb"
+
+  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+
+  mark_complete install-local-cudnn-repo
+}
+
+function uninstall_local_cudnn_repo() {
+  apt-get purge -yq "${CUDNN_PKG_NAME}"
+  rm -f "${workdir}/complete/install-local-cudnn-repo"
+}
+
+function install_local_cudnn8_repo() {
+  is_complete install-local-cudnn8-repo && return
+
+  if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
+  elif is_debian ; then cudnn8_shortname="debian11"
+  else return 0 ; fi
+  if   is_cuda12 ; then CUDNN8_CUDA_VER=12.0
+  elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8
+  else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi
+  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"
+
+  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
+  CUDNN8_PKG_NAME="${pkgname}"
+
+  deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local_deb_fn="${tmpdir}/${deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
+
+  # cache the cudnn package
+  cache_fetched_package "${local_deb_url}" \
+                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
+                        "${local_deb_fn}"
+
+  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
+  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
+  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
+    mkdir -p "${cudnn_path}"
+    mount -t tmpfs tmpfs "${cudnn_path}"
+  fi
+
+  dpkg -i "${local_deb_fn}"
+
+  rm -f "${local_deb_fn}"
+
+  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  mark_complete install-local-cudnn8-repo
+}
+
+function uninstall_local_cudnn8_repo() {
+  apt-get purge -yq "${CUDNN8_PKG_NAME}"
+  mark_incomplete install-local-cudnn8-repo
+}
+
+function install_nvidia_nccl() {
+  readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+
+  is_complete nccl && return
+
+  if is_cuda11 && is_debian12 ; then
+    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
+    return
+  fi
+
+  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
+
+  # https://github.com/NVIDIA/nccl/blob/master/README.md
+  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Fermi:     SM_20,             compute_30
+  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+  # The following architectures are suppored by open kernel driver
+  # Volta:     SM_70,SM_72,       compute_70,compute_72
+  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+  # The following architectures are supported by CUDA v11.8+
+  # Ada:       SM_89,             compute_89
+  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
+  # Blackwell: SM_100,            compute_100
+                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
+  if version_ge "${CUDA_VERSION}" "11.8" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
+  fi
+  if version_ge "${CUDA_VERSION}" "12.0" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
+  fi
+
+  mkdir -p "${workdir}"
+  pushd "${workdir}"
+
+  test -d "${workdir}/nccl" || {
+    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "nccl-${NCCL_VERSION}-1" nccl
+  }
+
+  local build_path
+  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
+                       build_path="nccl/build/pkg/rpm/x86_64" ; fi
+
+  test -d "${workdir}/nccl/build" || {
+    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
+
+    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+    if echo "${output}" | grep -q "${gcs_tarball}" ; then
+      # cache hit - unpack from cache
+      echo "cache hit"
+    else
+      # build and cache
+      pushd nccl
+      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
+      install_build_dependencies
+      if is_debuntu ; then
+        # These packages are required to build .deb packages from source
+        execute_with_retries \
+          apt-get install -y -qq build-essential devscripts debhelper fakeroot
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.debian.build
+      elif is_rocky ; then
+        # These packages are required to build .rpm packages from source
+        execute_with_retries \
+          dnf -y -q install rpm-build rpmdevtools
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.redhat.build
+      fi
+      tar czvf "/${local_tarball}" "../${build_path}"
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
+    fi
+    gcloud storage cat "${gcs_tarball}" | tar xz
+  }
+
+  if is_debuntu ; then
+    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
+  elif is_rocky ; then
+    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
+  fi
+
+  popd
+  mark_complete nccl
+}
+
+function install_nvidia_cudnn() {
+  is_complete cudnn && return
+
+  local major_version
+  major_version="${CUDNN_VERSION%%.*}"
+  local cudnn_pkg_version
+  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
+
+  if is_rocky ; then
+    if is_cudnn8 ; then
+      execute_with_retries dnf -y -q install \
+        "libcudnn${major_version}" \
+        "libcudnn${major_version}-devel"
+      sync
+    elif is_cudnn9 ; then
+      execute_with_retries dnf -y -q install \
+        "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \
+        "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}"
+      sync
+    else
+      echo "Unsupported cudnn version: '${major_version}'"
+    fi
+  elif is_debuntu; then
+    if ge_debian12 && is_src_os ; then
+      apt-get -y install nvidia-cudnn
+    else
+      if is_cudnn8 ; then
+        install_local_cudnn8_repo
+
+        apt-get update -qq
+
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+            "libcudnn8=${cudnn_pkg_version}" \
+            "libcudnn8-dev=${cudnn_pkg_version}"
+
+        uninstall_local_cudnn8_repo
+	sync
+      elif is_cudnn9 ; then
+	install_cuda_keyring_pkg
+
+        apt-get update -qq
+
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+          "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
+	sync
+      else
+        echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
+      fi
+    fi
+  else
+    echo "Unsupported OS: '${_shortname}'"
+    exit 1
+  fi
+
+  ldconfig
+
+  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
+  mark_complete cudnn
+}
+
+function add_nonfree_components() {
+  if is_src_nvidia ; then return; fi
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-open-kernel-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib non-free non-free-firmware"
+
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list
+  fi
+}
+
+#
+# Install package signing key and add corresponding repository
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+function add_repo_nvidia_container_toolkit() {
+  local nvctk_root="https://nvidia.github.io/libnvidia-container"
+  local signing_key_url="${nvctk_root}/gpgkey"
+  local repo_data
+
+  if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
+                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
+
+  os_add_repo nvidia-container-toolkit \
+              "${signing_key_url}" \
+              "${repo_data}" \
+              "no"
+}
+
+function add_repo_cuda() {
+  if is_debuntu ; then
+    install_cuda_keyring_pkg # 11.7+, 12.0+
+  elif is_rocky ; then
+    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
+  fi
+}
+
+function build_driver_from_github() { # build (or restore from the GCS cache) the open NVIDIA kernel modules for the running kernel
+  # non-GPL driver will have been built on rocky8
+  if is_rocky8 ; then return 0 ; fi
+  pushd "${workdir}"
+
+  test -d "${workdir}/open-gpu-kernel-modules" || {
+    local tarball_fn="${DRIVER_VERSION}.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
+  }
+
+  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local def_dir="${modulus_md5sum:-unsigned}"
+    local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
+
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+      echo "cache hit"
+    else
+      # build the kernel modules
+      pushd open-gpu-kernel-modules
+      install_build_dependencies
+      if ( is_cuda11 && is_ubuntu22 ) ; then
+        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
+        exit 1
+      fi
+      execute_with_retries make -j$(nproc) modules \
+        >  kernel-open/build.log \
+        2> kernel-open/build_error.log
+      # Sign kernel modules (cwd is already the source tree after the pushd above, so locate them by absolute path)
+      if [[ -n "${PSN}" ]]; then
+        configure_dkms_certs
+        for module in $(find "${workdir}/open-gpu-kernel-modules/kernel-open" -name '*.ko'); do
+          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
+          "${mok_key}" \
+          "${mok_der}" \
+          "${module}"
+        done
+	clear_dkms_key
+      fi
+      make modules_install \
+        >>  kernel-open/build.log \
+        2>> kernel-open/build_error.log
+      # Collect build logs and installed binaries
+      tar czvf "${local_tarball}" \
+        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
+    fi
+    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+    depmod -a
+  }
+
+  popd
+}
+
+function build_driver_from_packages() {
+  if is_debuntu ; then
+    if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then
+      local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else
+      local pkglist=("nvidia-driver-${DRIVER}-open") ; fi
+    if is_debian ; then
+      pkglist=(
+        "firmware-nvidia-gsp=${DRIVER_VERSION}-1"
+        "nvidia-smi=${DRIVER_VERSION}-1"
+        "nvidia-alternative=${DRIVER_VERSION}-1"
+        "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1"
+        "nvidia-kernel-support=${DRIVER_VERSION}-1"
+        "nvidia-modprobe=${DRIVER_VERSION}-1"
+        "libnvidia-ml1=${DRIVER_VERSION}-1"
+      )
+    fi
+    add_contrib_component
+    apt-get update -qq
+    execute_with_retries apt-get install -y -qq --no-install-recommends dkms
+    execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
+    sync
+
+  elif is_rocky ; then
+    if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
+      echo "nvidia-driver:${DRIVER}-dkms installed successfully"
+    else
+      execute_with_retries dnf -y -q module install 'nvidia-driver:latest'
+    fi
+    sync
+  fi
+}
+
+function install_nvidia_userspace_runfile() {
+  # Parameters for NVIDIA-provided Debian GPU driver
+  readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+
+  readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+
+  USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
+  readonly USERSPACE_FILENAME
+
+  # This .run file contains NV's OpenGL implementation as well as
+  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
+  # including glib (https://docs.gtk.org/glib/), and what appears to
+  # be a copy of the source from the kernel-open directory of for
+  # example DRIVER_VERSION=560.35.03
+  #
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
+  #
+  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
+  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
+  is_complete userspace && return
+
+  local local_fn="${tmpdir}/userspace.run"
+
+  cache_fetched_package "${USERSPACE_URL}" \
+                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
+                        "${local_fn}"
+
+  local runfile_args
+  runfile_args=""
+  local cache_hit="0"
+  local local_tarball
+
+  if is_rocky8 ; then
+    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+      local_tarball="${workdir}/${build_tarball}"
+      local def_dir="${modulus_md5sum:-unsigned}"
+      local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
+
+      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+        cache_hit="1"
+        runfile_args="--no-kernel-modules"
+        echo "cache hit"
+      else
+        install_build_dependencies
+        configure_dkms_certs
+        local signing_options
+        signing_options=""
+        if [[ -n "${PSN}" ]]; then
+          signing_options="--module-signing-hash sha256 \
+          --module-signing-x509-hash sha256 \
+          --module-signing-secret-key \"${mok_key}\" \
+          --module-signing-public-key \"${mok_der}\" \
+          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
+          "
+        fi
+        runfile_args="--no-dkms ${signing_options}"
+      fi
+    }
+  else
+    runfile_args="--no-kernel-modules"
+  fi
+
+  execute_with_retries bash "${local_fn}" -e -q \
+    ${runfile_args} \
+    --ui=none \
+    --install-libglvnd \
+    --tmpdir="${tmpdir}"
+
+  if is_rocky8 ; then
+    if [[ "${cache_hit}" == "1" ]] ; then
+      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+      depmod -a
+    else
+      clear_dkms_key
+      tar czf "${local_tarball}" \
+        /var/log/nvidia-installer.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    fi
+  fi
+
+  rm -f "${local_fn}"
+  mark_complete userspace
+  sync
+}
+
+function install_cuda_runfile() {
+  is_complete cuda && return
+
+  local local_fn="${tmpdir}/cuda.run"
+
+  cache_fetched_package "${NVIDIA_CUDA_URL}" \
+			"${pkg_bucket}/${CUDA_RUNFILE}" \
+                        "${local_fn}"
+
+  execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
+  rm -f "${local_fn}"
+  mark_complete cuda
+  sync
+}
+
+function install_cuda_toolkit() {
+  local cudatk_package=cuda-toolkit
+  if ge_debian12 && is_src_os ; then
+    cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1"
+  elif [[ -n "${CUDA_VERSION}" ]]; then
+    cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}"
+  fi
+  cuda_package="cuda=${CUDA_FULL_VERSION}-1"
+  readonly cudatk_package
+  if is_debuntu ; then
+#    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
+    execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
+  elif is_rocky ; then
+    # rocky9: cuda-11-[7,8], cuda-12-[1..6]
+    execute_with_retries dnf -y -q install "${cudatk_package}"
+  fi
+  sync
+}
+
+function load_kernel_module() {
+  # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
+  for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
+    rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+  done
+
+  depmod -a
+  modprobe nvidia
+  for suffix in uvm modeset drm; do
+    modprobe "nvidia-${suffix}"
+  done
+  # TODO: if peermem is available, also modprobe nvidia-peermem
+}
+
+function install_cuda(){
+  is_complete cuda-repo && return
+
+  if ( ge_debian12 && is_src_os ) ; then
+    echo "installed with the driver on ${_shortname}"
+    return 0
+  fi
+
+  # The OS package distributions are unreliable
+  install_cuda_runfile
+
+  # Includes CUDA packages
+  add_repo_cuda
+
+  mark_complete cuda-repo
+}
+
+function install_nvidia_container_toolkit() {
+  is_complete install-nvtk && return
+
+  local container_runtime_default
+    if command -v docker     ; then container_runtime_default='docker'
+  elif command -v containerd ; then container_runtime_default='containerd'
+  elif command -v crio       ; then container_runtime_default='crio'
+                               else container_runtime_default='' ; fi
+  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
+
+  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi
+
+  add_repo_nvidia_container_toolkit
+  if is_debuntu ; then
+    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
+    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
+  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
+  systemctl restart "${CONTAINER_RUNTIME}"
+
+  mark_complete install-nvtk
+}
+
+# Install NVIDIA GPU driver provided by NVIDIA
+function install_nvidia_gpu_driver() {
+  is_complete gpu-driver && return
+
+  if ( ge_debian12 && is_src_os ) ; then
+    add_nonfree_components
+    apt-get update -qq
+    apt-get -yq install \
+        dkms \
+        nvidia-open-kernel-dkms \
+        nvidia-open-kernel-support \
+        nvidia-smi \
+        libglvnd0 \
+        libcuda1
+    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
+    return 0
+  fi
+
+  # OS driver packages do not produce reliable driver ; use runfile
+  install_nvidia_userspace_runfile
+
+  build_driver_from_github
+
+  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+  mark_complete gpu-driver
+}
+
+function install_ops_agent(){ # install the Google Cloud Ops Agent using Google's repo bootstrap script; skipped once marked complete
+  is_complete ops-agent && return
+
+  mkdir -p /opt/google
+  cd /opt/google
+  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
+  curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
+  execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
+
+  mark_complete ops-agent # was 'is_complete', which only queries the marker and never recorded completion
+}
+
+# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
+function install_gpu_monitoring_agent() {
+  download_gpu_monitoring_agent
+  install_gpu_monitoring_agent_dependency
+  start_gpu_monitoring_agent_service
+}
+
+function download_gpu_monitoring_agent(){
+  if is_rocky ; then
+    execute_with_retries "dnf -y -q install git"
+  else
+    execute_with_retries "apt-get install git -y"
+  fi
+  mkdir -p /opt/google
+  chmod 777 /opt/google
+  cd /opt/google
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+}
+
+function install_gpu_monitoring_agent_dependency(){
+  cd /opt/google/compute-gpu-monitoring/linux
+  /usr/bin/python3 -m venv venv
+  (
+    source venv/bin/activate
+    pip install wheel
+    pip install -Ur requirements.txt
+  )
+}
+
+function start_gpu_monitoring_agent_service(){
+  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
+  systemctl daemon-reload
+  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service
+}
+
+# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
+function install_gpu_agent() {
+  # Stackdriver GPU agent parameters
+#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
+  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
+  if ( ! command -v pip && is_debuntu ) ; then
+    execute_with_retries "apt-get install -y -qq python3-pip"
+  fi
+  local install_dir=/opt/gpu-utilization-agent
+  mkdir -p "${install_dir}"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
+    | sed -e 's/-u --format=/--format=/' \
+    | dd status=none of="${install_dir}/report_gpu_metrics.py"
+  local venv="${install_dir}/venv"
+  /usr/bin/python3 -m venv "${venv}"
+(
+  source "${venv}/bin/activate"
+  python3 -m pip install --upgrade pip
+  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
+)
+  sync
+
+  # Generate GPU service.
+  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
+[Unit]
+Description=GPU Utilization Metric Agent
+
+[Service]
+Type=simple
+PIDFile=/run/gpu_agent.pid
+ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
+User=root
+Group=root
+WorkingDirectory=/
+Restart=always
+
+[Install]
+WantedBy=multi-user.target
+EOF
+  # Reload systemd manager configuration
+  systemctl daemon-reload
+  # Enable gpu-utilization-agent service
+  systemctl --no-reload --now enable gpu-utilization-agent.service
+}
+
+function configure_gpu_exclusive_mode() {
+  # only run this function when spark < 3.0
+  if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
+  # include exclusive mode on GPU
+  nvsmi -c EXCLUSIVE_PROCESS
+  clear_nvsmi_cache
+}
+
+function install_build_dependencies() { # install the compiler / kernel headers needed to build kernel modules; skipped once marked complete
+  is_complete build-dependencies && return
+
+  if is_debuntu ; then
+    if is_ubuntu22 && is_cuda12 ; then
+      # On ubuntu22, the default compiler does not build some kernel module versions
+      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
+      execute_with_retries apt-get install -y -qq gcc-12
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-12
+    fi
+
+  elif is_rocky ; then
+    execute_with_retries dnf -y -q install gcc
+
+    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
+    set +e
+    eval "${dnf_cmd}" > "${install_log}" 2>&1
+    local retval="$?"
+    set -e
+
+    if [[ "${retval}" == "0" ]] ; then mark_complete build-dependencies ; return ; fi # record success before the early return
+
+    if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
+      # this kernel-devel may have been migrated to the vault
+      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
+      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
+      dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
+        "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
+       )"
+    fi
+
+    execute_with_retries "${dnf_cmd}"
+  fi
+  mark_complete build-dependencies
+}
+
+function install_gpu_driver_and_cuda() {
+  install_nvidia_gpu_driver
+  install_cuda
+  load_kernel_module
+}
+
+function prepare_gpu_install_env() {
+  # Whether to install NVIDIA-provided or OS-provided GPU driver
+  GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
+  readonly GPU_DRIVER_PROVIDER
+
+  # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
+  INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
+  readonly INSTALL_GPU_AGENT
+
+  set_cuda_repo_shortname
+  set_nv_urls
+  set_cuda_runfile_url
+  set_cudnn_version
+  set_cudnn_tarball_url
+
+  if   is_cuda11 ; then gcc_ver="11"
+  elif is_cuda12 ; then gcc_ver="12" ; fi
+}
+
+function gpu_install_exit_handler() {
+  if is_ramdisk ; then
+    for shmdir in /var/cudnn-local ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+        umount -f ${shmdir}
+      fi
+    done
+  fi
+  hold_nvidia_packages
+}
\ No newline at end of file
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index dcbd8c15e..001ef7acc 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -10,14 +10,18 @@ set -euxo pipefail
 
 [% INSERT common/util_functions %]
 
-[% INSERT common/yarn_functions %]
+[% INSERT common/install_functions %]
 
 [% INSERT gpu/util_functions %]
 
+[% INSERT gpu/install_functions %]
+
+[% INCLUDE gpu/yarn_functions %]
+
+[% INSERT gpu/spark_functions %]
+
 function main() {
-  install_nvidia_gpu_driver
-  install_cuda
-  load_kernel_module
+  install_gpu_driver_and_cuda
 
   #Install GPU metrics collection in Stackdriver if needed
   if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
@@ -54,6 +58,7 @@ function main() {
 }
 
 function exit_handler() {
+  gpu_install_exit_handler
   gpu_exit_handler
   pip_exit_handler
   yarn_exit_handler
@@ -65,6 +70,7 @@ function prepare_to_install(){
   prepare_common_env
   prepare_pip_env
   prepare_gpu_env
+  prepare_gpu_install_env
   trap exit_handler EXIT
 }
 
diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions
new file mode 100644
index 000000000..5da2530d4
--- /dev/null
+++ b/templates/gpu/spark_functions
@@ -0,0 +1,36 @@
+function install_spark_rapids() {
+  # Places the xgboost4j and rapids-4-spark jars in /usr/lib/spark/jars via cache_fetched_package
+  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
+
+  # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
+  local -r scala_ver="2.12"
+
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
+    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+  fi
+
+  # Assign first, then mark readonly, so a failed metadata lookup is not masked by the readonly builtin
+  SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' "${DEFAULT_SPARK_RAPIDS_VERSION}")
+  XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' "${DEFAULT_XGBOOST_VERSION}")
+  readonly SPARK_RAPIDS_VERSION XGBOOST_VERSION
+
+  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
+  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
+  local jar_basename
+
+  jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+
+  jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+
+  jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
+  cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+}
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 4834adb33..48473d13b 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -49,8 +49,6 @@ function set_support_matrix() {
   )
 }
 
-set_support_matrix
-
 function set_cuda_version() {
   case "${DATAPROC_IMAGE_VERSION}" in
     "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
@@ -140,1027 +138,9 @@ function set_driver_version() {
   fi
 }
 
-function set_cudnn_version() {
-  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
-  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
-
-  # Parameters for NVIDIA-provided cuDNN library
-  DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
-  readonly DEFAULT_CUDNN_VERSION
-  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-  if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
-  elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
-    # cuDNN v8 is not distribution for ubuntu20+, debian12
-    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
-  elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then
-    # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
-    CUDNN_VERSION="8.8.0.121"
-  fi
-  readonly CUDNN_VERSION
-}
-
-
-function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
-function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
-
-function set_cuda_repo_shortname() {
-# Short name for urls
-# https://developer.download.nvidia.com/compute/cuda/repos/${shortname}
-  if is_rocky ; then
-    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
-  else
-    shortname="$(os_id)$(os_vercat)"
-  fi
-}
-
-function set_nv_urls() {
-  # Parameters for NVIDIA-provided package repositories
-  readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
-  readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
-
-  # Parameter for NVIDIA-provided Rocky Linux GPU driver
-  readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
-}
-
-function set_cuda_runfile_url() {
-  local MAX_DRIVER_VERSION
-  local MAX_CUDA_VERSION
-
-  local MIN_OPEN_DRIVER_VER="515.48.07"
-  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
-  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
-
-  if is_cuda12 ; then
-    if is_debian12 ; then
-      MIN_DRIVER_VERSION="545.23.06"
-      MIN_CUDA_VERSION="12.3.0"
-    elif is_debian10 ; then
-      MAX_DRIVER_VERSION="555.42.02"
-      MAX_CUDA_VERSION="12.5.0"
-    elif is_ubuntu18 ; then
-      MAX_DRIVER_VERSION="530.30.02"
-      MAX_CUDA_VERSION="12.1.1"
-    fi
-  elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
-    if le_debian10 ; then
-      # cuda 11 is not supported for <= debian10
-      MAX_CUDA_VERSION="0"
-      MAX_DRIVER_VERSION="0"
-    fi
-  else
-    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
-  fi
-
-  if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
-    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
-  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
-    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
-  fi
-  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
-    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
-  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
-    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
-  fi
-
-  # driver version named in cuda runfile filename
-  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
-  readonly -A drv_for_cuda=(
-          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
-          ["11.8.0"]="520.61.05"
-          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
-          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
-          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
-          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
-          ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
-          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
-          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
-  )
-
-  # Verify that the file with the indicated combination exists
-  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
-  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
-  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
-  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
-
-  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
-  readonly NVIDIA_CUDA_URL
-
-  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
-  readonly CUDA_RUNFILE
-
-  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
-    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
-    exit 1
-  fi
-
-  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
-    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
-  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
-    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
-  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
-    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
-  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
-    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
-  fi
-}
-
-function set_cudnn_tarball_url() {
-CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
-CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
-if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
-  # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
-  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
-    # When cuDNN version is greater than or equal to 8.4.1.50 use this format
-    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
-  fi
-  # Use legacy url format with one of the tarball name formats depending on version as above
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
-fi
-if ( version_ge "${CUDA_VERSION}" "12.0" ); then
-  # Use modern url format When cuda version is greater than or equal to 12.0
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
-fi
-readonly CUDNN_TARBALL
-readonly CUDNN_TARBALL_URL
-}
-
-function install_cuda_keyring_pkg() {
-  if ( test -v CUDA_KEYRING_PKG_INSTALLED &&
-       [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi
-  local kr_ver=1.1
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
-    -o "${tmpdir}/cuda-keyring.deb"
-  dpkg -i "${tmpdir}/cuda-keyring.deb"
-  rm -f "${tmpdir}/cuda-keyring.deb"
-  CUDA_KEYRING_PKG_INSTALLED="1"
-}
-
-function uninstall_cuda_keyring_pkg() {
-  apt-get purge -yq cuda-keyring
-  CUDA_KEYRING_PKG_INSTALLED="0"
-}
-
-function install_local_cuda_repo() {
-  is_complete install-local-cuda-repo && return
-
-  if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  CUDA_LOCAL_REPO_INSTALLED="1"
-  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
-  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
-  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
-  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
-  readonly DIST_KEYRING_DIR="/var/${pkgname}"
-
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-
-  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-  cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
-
-  if is_ubuntu ; then
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
-      -o /etc/apt/preferences.d/cuda-repository-pin-600
-  fi
-
-  mark_complete install-local-cuda-repo
-}
-function uninstall_local_cuda_repo(){
-  apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cuda-repo"
-}
-
-function install_local_cudnn_repo() {
-  is_complete install-local-cudnn-repo && return
-
-  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
-  CUDNN_PKG_NAME="${pkgname}"
-  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
-
-  # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
-
-  dpkg -i "${tmpdir}/local-installer.deb"
-
-  rm -f "${tmpdir}/local-installer.deb"
-
-  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
-
-  mark_complete install-local-cudnn-repo
-}
-
-function uninstall_local_cudnn_repo() {
-  apt-get purge -yq "${CUDNN_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cudnn-repo"
-}
-
-function install_local_cudnn8_repo() {
-  is_complete install-local-cudnn8-repo && return
-
-  if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
-  elif is_debian ; then cudnn8_shortname="debian11"
-  else return 0 ; fi
-  if   is_cuda12 ; then CUDNN8_CUDA_VER=12.0
-  elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8
-  else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi
-  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"
-
-  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
-  CUDNN8_PKG_NAME="${pkgname}"
-
-  deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_fn="${tmpdir}/${deb_fn}"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
-
-  # cache the cudnn package
-  cache_fetched_package "${local_deb_url}" \
-                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
-                        "${local_deb_fn}"
-
-  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
-  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
-  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
-    mkdir -p "${cudnn_path}"
-    mount -t tmpfs tmpfs "${cudnn_path}"
-  fi
-
-  dpkg -i "${local_deb_fn}"
-
-  rm -f "${local_deb_fn}"
-
-  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  mark_complete install-local-cudnn8-repo
-}
-
-function uninstall_local_cudnn8_repo() {
-  apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  mark_incomplete install-local-cudnn8-repo
-}
-
-function install_nvidia_nccl() {
-  readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
-  readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
-
-  is_complete nccl && return
-
-  if is_cuda11 && is_debian12 ; then
-    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
-    return
-  fi
-
-  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
-
-  # https://github.com/NVIDIA/nccl/blob/master/README.md
-  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-  # Fermi:     SM_20,             compute_30
-  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
-  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
-  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
-
-  # The following architectures are suppored by open kernel driver
-  # Volta:     SM_70,SM_72,       compute_70,compute_72
-  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
-
-  # The following architectures are supported by CUDA v11.8+
-  # Ada:       SM_89,             compute_89
-  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
-  # Blackwell: SM_100,            compute_100
-                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
-  if version_ge "${CUDA_VERSION}" "11.8" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
-  fi
-  if version_ge "${CUDA_VERSION}" "12.0" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
-  fi
-
-  mkdir -p "${workdir}"
-  pushd "${workdir}"
-
-  test -d "${workdir}/nccl" || {
-    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
-      | tar xz
-    mv "nccl-${NCCL_VERSION}-1" nccl
-  }
-
-  local build_path
-  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
-                       build_path="nccl/build/pkg/rpm/x86_64" ; fi
-
-  test -d "${workdir}/nccl/build" || {
-    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
-    local local_tarball="${workdir}/${build_tarball}"
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
-
-    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
-    if echo "${output}" | grep -q "${gcs_tarball}" ; then
-      # cache hit - unpack from cache
-      echo "cache hit"
-    else
-      # build and cache
-      pushd nccl
-      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
-      install_build_dependencies
-      if is_debuntu ; then
-        # These packages are required to build .deb packages from source
-        execute_with_retries \
-          apt-get install -y -qq build-essential devscripts debhelper fakeroot
-        export NVCC_GENCODE
-        execute_with_retries make -j$(nproc) pkg.debian.build
-      elif is_rocky ; then
-        # These packages are required to build .rpm packages from source
-        execute_with_retries \
-          dnf -y -q install rpm-build rpmdevtools
-        export NVCC_GENCODE
-        execute_with_retries make -j$(nproc) pkg.redhat.build
-      fi
-      tar czvf "/${local_tarball}" "../${build_path}"
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      rm "${local_tarball}"
-      make clean
-      popd
-    fi
-    gcloud storage cat "${gcs_tarball}" | tar xz
-  }
-
-  if is_debuntu ; then
-    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
-  elif is_rocky ; then
-    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
-  fi
-
-  popd
-  mark_complete nccl
-}
-
 function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
 function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
 
-function install_nvidia_cudnn() {
-  is_complete cudnn && return
-
-  local major_version
-  major_version="${CUDNN_VERSION%%.*}"
-  local cudnn_pkg_version
-  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
-
-  if is_rocky ; then
-    if is_cudnn8 ; then
-      execute_with_retries dnf -y -q install \
-        "libcudnn${major_version}" \
-        "libcudnn${major_version}-devel"
-      sync
-    elif is_cudnn9 ; then
-      execute_with_retries dnf -y -q install \
-        "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \
-        "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}"
-      sync
-    else
-      echo "Unsupported cudnn version: '${major_version}'"
-    fi
-  elif is_debuntu; then
-    if ge_debian12 && is_src_os ; then
-      apt-get -y install nvidia-cudnn
-    else
-      if is_cudnn8 ; then
-        install_local_cudnn8_repo
-
-        apt-get update -qq
-
-        execute_with_retries \
-          apt-get -y install --no-install-recommends \
-            "libcudnn8=${cudnn_pkg_version}" \
-            "libcudnn8-dev=${cudnn_pkg_version}"
-
-        uninstall_local_cudnn8_repo
-	sync
-      elif is_cudnn9 ; then
-	install_cuda_keyring_pkg
-
-        apt-get update -qq
-
-        execute_with_retries \
-          apt-get -y install --no-install-recommends \
-          "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
-          "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
-          "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
-	sync
-      else
-        echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
-      fi
-    fi
-  else
-    echo "Unsupported OS: '${_shortname}'"
-    exit 1
-  fi
-
-  ldconfig
-
-  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
-  mark_complete cudnn
-}
-
-function add_nonfree_components() {
-  if is_src_nvidia ; then return; fi
-  if ge_debian12 ; then
-      # Include in sources file components on which nvidia-open-kernel-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib non-free non-free-firmware"
-
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list
-  fi
-}
-
-#
-# Install package signing key and add corresponding repository
-# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
-function add_repo_nvidia_container_toolkit() {
-  local nvctk_root="https://nvidia.github.io/libnvidia-container"
-  local signing_key_url="${nvctk_root}/gpgkey"
-  local repo_data
-
-  if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
-                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
-
-  os_add_repo nvidia-container-toolkit \
-              "${signing_key_url}" \
-              "${repo_data}" \
-              "no"
-}
-
-function add_repo_cuda() {
-  if is_debuntu ; then
-    install_cuda_keyring_pkg # 11.7+, 12.0+
-  elif is_rocky ; then
-    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
-  fi
-}
-
-function build_driver_from_github() {
-  # non-GPL driver will have been built on rocky8
-  if is_rocky8 ; then return 0 ; fi
-  pushd "${workdir}"
-
-  test -d "${workdir}/open-gpu-kernel-modules" || {
-    local tarball_fn="${DRIVER_VERSION}.tar.gz"
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
-      | tar xz
-    mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
-  }
-
-  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
-  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
-    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
-    local local_tarball="${workdir}/${build_tarball}"
-    local def_dir="${modulus_md5sum:-unsigned}"
-    local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
-
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
-
-    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
-      echo "cache hit"
-    else
-      # build the kernel modules
-      pushd open-gpu-kernel-modules
-      install_build_dependencies
-      if ( is_cuda11 && is_ubuntu22 ) ; then
-        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
-        exit 1
-      fi
-      execute_with_retries make -j$(nproc) modules \
-        >  kernel-open/build.log \
-        2> kernel-open/build_error.log
-      # Sign kernel modules
-      if [[ -n "${PSN}" ]]; then
-        configure_dkms_certs
-        for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do
-          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
-          "${mok_key}" \
-          "${mok_der}" \
-          "${module}"
-        done
-	clear_dkms_key
-      fi
-      make modules_install \
-        >>  kernel-open/build.log \
-        2>> kernel-open/build_error.log
-      # Collect build logs and installed binaries
-      tar czvf "${local_tarball}" \
-        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
-        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      rm "${local_tarball}"
-      make clean
-      popd
-    fi
-    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
-    depmod -a
-  }
-
-  popd
-}
-
-function build_driver_from_packages() {
-  if is_debuntu ; then
-    if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then
-      local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else
-      local pkglist=("nvidia-driver-${DRIVER}-open") ; fi
-    if is_debian ; then
-      pkglist=(
-        "firmware-nvidia-gsp=${DRIVER_VERSION}-1"
-        "nvidia-smi=${DRIVER_VERSION}-1"
-        "nvidia-alternative=${DRIVER_VERSION}-1"
-        "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1"
-        "nvidia-kernel-support=${DRIVER_VERSION}-1"
-        "nvidia-modprobe=${DRIVER_VERSION}-1"
-        "libnvidia-ml1=${DRIVER_VERSION}-1"
-      )
-    fi
-    add_contrib_component
-    apt-get update -qq
-    execute_with_retries apt-get install -y -qq --no-install-recommends dkms
-    execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
-    sync
-
-  elif is_rocky ; then
-    if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
-      echo "nvidia-driver:${DRIVER}-dkms installed successfully"
-    else
-      execute_with_retries dnf -y -q module install 'nvidia-driver:latest'
-    fi
-    sync
-  fi
-}
-
-function install_nvidia_userspace_runfile() {
-  # Parameters for NVIDIA-provided Debian GPU driver
-  readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-
-  readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
-
-  USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
-  readonly USERSPACE_FILENAME
-
-  # This .run file contains NV's OpenGL implementation as well as
-  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
-  # including glib (https://docs.gtk.org/glib/), and what appears to
-  # be a copy of the source from the kernel-open directory of for
-  # example DRIVER_VERSION=560.35.03
-  #
-  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
-  #
-  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
-  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
-  is_complete userspace && return
-
-  local local_fn="${tmpdir}/userspace.run"
-
-  cache_fetched_package "${USERSPACE_URL}" \
-                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
-                        "${local_fn}"
-
-  local runfile_args
-  runfile_args=""
-  local cache_hit="0"
-  local local_tarball
-
-  if is_rocky8 ; then
-    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
-    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
-      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
-      local_tarball="${workdir}/${build_tarball}"
-      local def_dir="${modulus_md5sum:-unsigned}"
-      local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
-
-      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
-
-      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
-        cache_hit="1"
-        runfile_args="--no-kernel-modules"
-        echo "cache hit"
-      else
-        install_build_dependencies
-        configure_dkms_certs
-        local signing_options
-        signing_options=""
-        if [[ -n "${PSN}" ]]; then
-          signing_options="--module-signing-hash sha256 \
-          --module-signing-x509-hash sha256 \
-          --module-signing-secret-key \"${mok_key}\" \
-          --module-signing-public-key \"${mok_der}\" \
-          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
-          "
-        fi
-        runfile_args="--no-dkms ${signing_options}"
-      fi
-    }
-  else
-    runfile_args="--no-kernel-modules"
-  fi
-
-  execute_with_retries bash "${local_fn}" -e -q \
-    ${runfile_args} \
-    --ui=none \
-    --install-libglvnd \
-    --tmpdir="${tmpdir}"
-
-  if is_rocky8 ; then
-    if [[ "${cache_hit}" == "1" ]] ; then
-      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
-      depmod -a
-    else
-      clear_dkms_key
-      tar czf "${local_tarball}" \
-        /var/log/nvidia-installer.log \
-        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-    fi
-  fi
-
-  rm -f "${local_fn}"
-  mark_complete userspace
-  sync
-}
-
-function install_cuda_runfile() {
-  is_complete cuda && return
-
-  local local_fn="${tmpdir}/cuda.run"
-
-  cache_fetched_package "${NVIDIA_CUDA_URL}" \
-			"${pkg_bucket}/${CUDA_RUNFILE}" \
-                        "${local_fn}"
-
-  execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
-  rm -f "${local_fn}"
-  mark_complete cuda
-  sync
-}
-
-function install_cuda_toolkit() {
-  local cudatk_package=cuda-toolkit
-  if ge_debian12 && is_src_os ; then
-    cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1"
-  elif [[ -n "${CUDA_VERSION}" ]]; then
-    cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}"
-  fi
-  cuda_package="cuda=${CUDA_FULL_VERSION}-1"
-  readonly cudatk_package
-  if is_debuntu ; then
-#    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
-    execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
-  elif is_rocky ; then
-    # rocky9: cuda-11-[7,8], cuda-12-[1..6]
-    execute_with_retries dnf -y -q install "${cudatk_package}"
-  fi
-  sync
-}
-
-function load_kernel_module() {
-  # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
-  for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
-    rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
-  done
-
-  depmod -a
-  modprobe nvidia
-  for suffix in uvm modeset drm; do
-    modprobe "nvidia-${suffix}"
-  done
-  # TODO: if peermem is available, also modprobe nvidia-peermem
-}
-
-function install_cuda(){
-  is_complete cuda-repo && return
-
-  if ( ge_debian12 && is_src_os ) ; then
-    echo "installed with the driver on ${_shortname}"
-    return 0
-  fi
-
-  # The OS package distributions are unreliable
-  install_cuda_runfile
-
-  # Includes CUDA packages
-  add_repo_cuda
-
-  mark_complete cuda-repo
-}
-
-function install_nvidia_container_toolkit() {
-  is_complete install-nvtk && return
-
-  local container_runtime_default
-    if command -v docker     ; then container_runtime_default='docker'
-  elif command -v containerd ; then container_runtime_default='containerd'
-  elif command -v crio       ; then container_runtime_default='crio'
-                               else container_runtime_default='' ; fi
-  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
-
-  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi
-
-  add_repo_nvidia_container_toolkit
-  if is_debuntu ; then
-    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
-    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
-  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
-  systemctl restart "${CONTAINER_RUNTIME}"
-
-  mark_complete install-nvtk
-}
-
-# Install NVIDIA GPU driver provided by NVIDIA
-function install_nvidia_gpu_driver() {
-  is_complete gpu-driver && return
-
-  if ( ge_debian12 && is_src_os ) ; then
-    add_nonfree_components
-    apt-get update -qq
-    apt-get -yq install \
-        dkms \
-        nvidia-open-kernel-dkms \
-        nvidia-open-kernel-support \
-        nvidia-smi \
-        libglvnd0 \
-        libcuda1
-    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
-    return 0
-  fi
-
-  # OS driver packages do not produce reliable driver ; use runfile
-  install_nvidia_userspace_runfile
-
-  build_driver_from_github
-
-  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  mark_complete gpu-driver
-}
-
-function install_ops_agent(){
-  is_complete ops-agent && return
-
-  mkdir -p /opt/google
-  cd /opt/google
-  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
-  curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
-  execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
-
-  is_complete ops-agent
-}
-
-# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
-function install_gpu_monitoring_agent() {
-  download_gpu_monitoring_agent
-  install_gpu_monitoring_agent_dependency
-  start_gpu_monitoring_agent_service
-}
-
-function download_gpu_monitoring_agent(){
-  if is_rocky ; then
-    execute_with_retries "dnf -y -q install git"
-  else
-    execute_with_retries "apt-get install git -y"
-  fi
-  mkdir -p /opt/google
-  chmod 777 /opt/google
-  cd /opt/google
-  test -d compute-gpu-monitoring || \
-    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
-}
-
-function install_gpu_monitoring_agent_dependency(){
-  cd /opt/google/compute-gpu-monitoring/linux
-  /usr/bin/python3 -m venv venv
-  (
-    source venv/bin/activate
-    pip install wheel
-    pip install -Ur requirements.txt
-  )
-}
-
-function start_gpu_monitoring_agent_service(){
-  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
-  systemctl daemon-reload
-  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service
-}
-
-# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
-function install_gpu_agent() {
-  # Stackdriver GPU agent parameters
-#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
-  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
-  if ( ! command -v pip && is_debuntu ) ; then
-    execute_with_retries "apt-get install -y -qq python3-pip"
-  fi
-  local install_dir=/opt/gpu-utilization-agent
-  mkdir -p "${install_dir}"
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
-    | sed -e 's/-u --format=/--format=/' \
-    | dd status=none of="${install_dir}/report_gpu_metrics.py"
-  local venv="${install_dir}/venv"
-  /usr/bin/python3 -m venv "${venv}"
-(
-  source "${venv}/bin/activate"
-  python3 -m pip install --upgrade pip
-  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
-)
-  sync
-
-  # Generate GPU service.
-  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
-[Unit]
-Description=GPU Utilization Metric Agent
-
-[Service]
-Type=simple
-PIDFile=/run/gpu_agent.pid
-ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
-User=root
-Group=root
-WorkingDirectory=/
-Restart=always
-
-[Install]
-WantedBy=multi-user.target
-EOF
-  # Reload systemd manager configuration
-  systemctl daemon-reload
-  # Enable gpu-utilization-agent service
-  systemctl --no-reload --now enable gpu-utilization-agent.service
-}
-
-function configure_gpu_exclusive_mode() {
-  # only run this function when spark < 3.0
-  if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
-  # include exclusive mode on GPU
-  nvsmi -c EXCLUSIVE_PROCESS
-  clear_nvsmi_cache
-}
-
-function install_spark_rapids() {
-  # Update SPARK RAPIDS config
-  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
-  local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
-
-  # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
-  local -r scala_ver="2.12"
-
-  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
-    local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
-  fi
-
-  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
-  readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
-
-  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
-  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
-  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
-
-  local jar_basename
-
-  jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
-  cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-                        "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-                        "/usr/lib/spark/jars/${jar_basename}"
-
-  jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
-  cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-                        "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-                        "/usr/lib/spark/jars/${jar_basename}"
-
-  jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
-  cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
-                        "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
-                        "/usr/lib/spark/jars/${jar_basename}"
-}
-
-function configure_gpu_script() {
-  # Download GPU discovery script
-  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
-  mkdir -p ${spark_gpu_script_dir}
-  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
-  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
-  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
-  local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh"
-  cat > "${gpus_resources_script}" <<'EOF'
-#!/usr/bin/env bash
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
-
-ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
-
-echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
-EOF
-
-  chmod a+rx "${gpus_resources_script}"
-
-  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
-
-  local executor_cores
-  executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
-  local executor_memory
-  executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
-  local task_cpus=2
-  local gpu_amount
-
-  # The current setting of spark.task.resource.gpu.amount (0.333) is
-  # not ideal to get the best performance from the RAPIDS Accelerator
-  # plugin. It's recommended to be 1/{executor core count} unless you
-  # have a special use case.
-#  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
-  gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
-
-# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
-
-  cat >>"${spark_defaults_conf}" <<EOF
-###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
-# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
-# query explain output won't show GPU operator, if the user has doubts
-# they can uncomment the line before seeing the GPU plan explain;
-# having AQE enabled gives user the best performance.
-spark.executor.resource.gpu.amount=${gpu_count}
-spark.plugins=com.nvidia.spark.SQLPlugin
-spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
-spark.executor.cores=${executor_cores}
-spark.executor.memory=${executor_memory_gb}G
-spark.dynamicAllocation.enabled=false
-# please update this config according to your application
-spark.task.resource.gpu.amount=${gpu_amount}
-spark.task.cpus=2
-spark.yarn.unmanagedAM.enabled=false
-###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
-EOF
-}
-
-function configure_yarn_nodemanager_gpu() {
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
-  configure_yarn_nodemanager
-}
-
-function configure_gpu_isolation() {
-  # enable GPU isolation
-  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
-  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
-    # configure the container-executor.cfg to have major caps
-    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
-    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
-    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
-  else
-    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg"
-  fi
-
-  # Configure a systemd unit to ensure that permissions are set on restart
-  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
-[Unit]
-Description=Set permissions to allow YARN to access device directories
-
-[Service]
-ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"
-
-[Install]
-WantedBy=multi-user.target
-EOF
-
-  systemctl enable dataproc-cgroup-device-permissions
-  systemctl start dataproc-cgroup-device-permissions
-}
-
 function nvsmi() {
   local nvsmi="/usr/bin/nvidia-smi"
   if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
@@ -1191,49 +171,12 @@ function query_nvsmi() {
   nvsmi -q -x --dtd > "${nvsmi_query_xml}"
 }
 
-function install_build_dependencies() {
-  is_complete build-dependencies && return
-
-  if is_debuntu ; then
-    if is_ubuntu22 && is_cuda12 ; then
-      # On ubuntu22, the default compiler does not build some kernel module versions
-      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
-      execute_with_retries apt-get install -y -qq gcc-12
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
-      update-alternatives --set gcc /usr/bin/gcc-12
-    fi
-
-  elif is_rocky ; then
-    execute_with_retries dnf -y -q install gcc
-
-    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
-    set +e
-    eval "${dnf_cmd}" > "${install_log}" 2>&1
-    local retval="$?"
-    set -e
-
-    if [[ "${retval}" == "0" ]] ; then return ; fi
-
-    if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
-      # this kernel-devel may have been migrated to the vault
-      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
-      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
-      dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
-        "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
-       )"
-    fi
+function prepare_gpu_env(){
+  set_support_matrix
 
-    execute_with_retries "${dnf_cmd}"
-  fi
-  mark_complete build-dependencies
-}
+  set_cuda_version
+  set_driver_version
 
-function prepare_gpu_env(){
   set +e
   gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
   set -e
@@ -1256,27 +199,8 @@ function prepare_gpu_env(){
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
   readonly RAPIDS_RUNTIME
 
-  # Whether to install NVIDIA-provided or OS-provided GPU driver
-  GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
-  readonly GPU_DRIVER_PROVIDER
-
-  # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
-  INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
-  readonly INSTALL_GPU_AGENT
-
   # determine whether we have nvidia-smi installed and working
   nvsmi
-
-  set_cuda_version
-  set_driver_version
-  set_cuda_repo_shortname
-  set_nv_urls
-  set_cuda_runfile_url
-  set_cudnn_version
-  set_cudnn_tarball_url
-
-  if   is_cuda11 ; then gcc_ver="11"
-  elif is_cuda12 ; then gcc_ver="12" ; fi
 }
 
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
@@ -1292,12 +216,5 @@ function hold_nvidia_packages() {
 }
 
 function gpu_exit_handler() {
-  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    for shmdir in /var/cudnn-local ; do
-      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
-        umount -f ${shmdir}
-      fi
-    done
-  fi
-  hold_nvidia_packages
+  echo "no operations in gpu exit handler"
 }
diff --git a/templates/gpu/yarn_functions b/templates/gpu/yarn_functions
new file mode 100644
index 000000000..5b8455c19
--- /dev/null
+++ b/templates/gpu/yarn_functions
@@ -0,0 +1,145 @@
+[% INSERT common/yarn_functions %]
+
+function configure_yarn_gpu_resources() {
+  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # no Hadoop conf dir (e.g. pre-init scripts): nothing to configure
+  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then # seed an empty <configuration/> document when the file is missing
+    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
+  fi
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' # declare yarn.io/gpu as a resource type
+
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' # same resource type, repeated in yarn-site.xml
+}
+
+function configure_gpu_script() {
+  # Write the GPU discovery script and append RAPIDS defaults to spark-defaults.conf
+  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
+  mkdir -p "${spark_gpu_script_dir}"
+  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
+  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
+  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
+  local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh"
+  cat > "${gpus_resources_script}" <<'EOF'
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
+
+ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
+
+echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
+EOF
+
+  chmod a+rx "${gpus_resources_script}"
+
+  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
+
+  local -r task_cpus=2
+  local executor_cores executor_memory_gb gpu_amount
+  # 75% of the vCPUs rounded down to an even number, never below 2 (= task_cpus) so that 1/executor_cores cannot divide by zero
+  executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2; $_ = 2 if $_ < 2')"
+  # 75% of MemFree (kB in /proc/meminfo), truncated to whole GiB
+  executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
+
+  # The current setting of spark.task.resource.gpu.amount (0.333) is
+  # not ideal to get the best performance from the RAPIDS Accelerator
+  # plugin. It's recommended to be 1/{executor core count} unless you
+  # have a special use case.
+#  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+  gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
+
+# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
+
+  cat >>"${spark_defaults_conf}" <<EOF
+###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
+# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
+# query explain output won't show GPU operator, if the user has doubts
+# they can uncomment the line before seeing the GPU plan explain;
+# having AQE enabled gives user the best performance.
+spark.executor.resource.gpu.amount=${gpu_count}
+spark.plugins=com.nvidia.spark.SQLPlugin
+spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
+spark.executor.cores=${executor_cores}
+spark.executor.memory=${executor_memory_gb}G
+spark.dynamicAllocation.enabled=false
+# please update this config according to your application
+spark.task.resource.gpu.amount=${gpu_amount}
+spark.task.cpus=${task_cpus}
+spark.yarn.unmanagedAM.enabled=false
+###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
+EOF
+}
+
+function configure_yarn_nodemanager_gpu() {
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' # turn on the NodeManager GPU resource plugin
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
+  configure_yarn_nodemanager # not defined in this template; presumably from common/yarn_functions (TODO confirm)
+}
+
+function configure_gpu_isolation() {
+  # enable GPU isolation: force the container-executor group to "yarn", then append [gpu] and [cgroups] sections
+  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
+  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
+    # MIG enabled: also write gpu.major-device-number from MIG_MAJOR_CAPS and export the MIG flags in yarn-env.sh
+    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
+    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
+  else
+    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+  fi
+
+  # Install a systemd unit (wanted by multi-user.target) that recursively opens the cpu,cpuacct and devices cgroup trees (a+rwx)
+  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
+[Unit]
+Description=Set permissions to allow YARN to access device directories
+
+[Service]
+ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+  systemctl enable dataproc-cgroup-device-permissions
+  systemctl start dataproc-cgroup-device-permissions
+}
+
+function setup_gpu_yarn() {
+  # This configuration should be run on all nodes,
+  # regardless of whether they have attached GPUs
+  configure_yarn_gpu_resources
+
+  # No GPU attached: a Master node still gets the plain nodemanager configuration; every node stops here
+  if [[ "${gpu_count}" == "0" ]] ; then
+    if [[ "${ROLE}" == "Master" ]]; then
+      configure_yarn_nodemanager
+    fi
+    return 0
+  fi
+
+  install_nvidia_container_toolkit
+  configure_yarn_nodemanager_gpu
+  configure_gpu_script
+  configure_gpu_isolation
+}
diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
index 4e46ab1d3..61b7247c0 100644
--- a/templates/rapids/rapids.sh.in
+++ b/templates/rapids/rapids.sh.in
@@ -16,20 +16,6 @@ set -euxo pipefail
 [% INSERT dask/util_functions %]
 
 function main() {
-  setup_gpu_yarn
-
-  echo "yarn setup complete"
-
-  if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then
-    install_nvidia_nccl
-    install_nvidia_cudnn
-  fi
-
-  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
-    echo "RAPIDS recognizes SPARK runtime - currently supported using gpu/install_gpu_driver.sh or spark-rapids/spark-rapids.sh"
-    exit 1
-  fi
-
   # Install Dask with RAPIDS
   install_dask_rapids
 
@@ -38,6 +24,7 @@ function main() {
   if [[ "${DASK_RUNTIME}" == "yarn" ]]; then
     # Create cuda accelerated Dask YARN config file
     configure_dask_yarn
+    echo "yarn setup complete"
   else
     # Create Dask service
     install_systemd_dask_service
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 0bfc0b331..2435bb732 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -26,15 +26,14 @@
 set -euxo pipefail
 
 [% INSERT common/util_functions %]
-
-[% INSERT common/yarn_functions %]
-
+[% INSERT common/install_functions %]
 [% INSERT gpu/util_functions %]
+[% INSERT gpu/install_functions %]
+[% INCLUDE gpu/yarn_functions %]
+[% INSERT gpu/spark_functions %]
 
 function main() {
-  install_nvidia_gpu_driver
-  install_cuda
-  load_kernel_module
+  install_gpu_driver_and_cuda
 
   #Install GPU metrics collection in Stackdriver if needed
   if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
@@ -67,6 +66,7 @@ function main() {
 }
 
 function exit_handler() {
+  gpu_install_exit_handler
   gpu_exit_handler
   pip_exit_handler
   yarn_exit_handler
@@ -78,6 +78,7 @@ function prepare_to_install(){
   prepare_common_env
   prepare_pip_env
   prepare_gpu_env
+  prepare_gpu_install_env
   trap exit_handler EXIT
 }
 

From 119f1b1c7855a174f8331da4ef028d4dfbd3e318 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 18:50:33 -0800
Subject: [PATCH 111/130] templates/common/util_functions: * increased minimum
 memory threshold for ram disk * moved apt_add_repo and friends to
 common/install_functions

templates/dask/util_functions:
* validating conda tarball before caching to gcs

templates/generate-action.pl:
* improved usage documentation a little

templates/gpu/install_functions
* using /opt/conda/miniconda3/bin/python3 instead of /usr/bin/ for
  venv pre-install
---
 templates/common/util_functions | 56 +--------------------------------
 templates/dask/util_functions   |  4 +--
 templates/generate-action.pl    | 14 ++++++++-
 templates/gpu/install_functions |  4 +--
 4 files changed, 18 insertions(+), 60 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 4d9f983a4..336af37f8 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -297,7 +297,7 @@ function is_ramdisk() {
 function mount_ramdisk(){
   local free_mem
   free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+  if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi
 
   # Write to a ramdisk instead of churning the persistent disk
 
@@ -350,60 +350,6 @@ function check_os() {
   fi
 }
 
-#
-# Generate repo file under /etc/apt/sources.list.d/
-#
-function apt_add_repo() {
-  local -r repo_name="$1"
-  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
-  local -r include_src="${4:-yes}"
-  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
-  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
-
-  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
-  if [[ "${include_src}" == "yes" ]] ; then
-    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
-  fi
-
-  apt-get update -qq
-}
-
-#
-# Generate repo file under /etc/yum.repos.d/
-#
-function dnf_add_repo() {
-  local -r repo_name="$1"
-  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
-  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
-  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
-
-  curl -s -L "${repo_url}" \
-    | dd of="${repo_path}" status=progress
-#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
-}
-
-#
-# Keyrings default to
-# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
-# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
-#
-function os_add_repo() {
-  local -r repo_name="$1"
-  local -r signing_key_url="$2"
-  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
-  local kr_path
-  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
-                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
-
-  mkdir -p "$(dirname "${kr_path}")"
-
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
-    | gpg --import --no-default-keyring --keyring "${kr_path}"
-
-  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
-                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
-}
-
 function configure_dkms_certs() {
   if test -v PSN && [[ -z "${PSN}" ]]; then
       echo "No signing secret provided.  skipping";
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index d67da1fc1..ce6964e94 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -510,8 +510,8 @@ function install_conda_packages() {
     if [[ "$retval" == "0" ]] ; then
       is_installed="1"
       pushd "${DASK_CONDA_ENV}"
-      time (
-        tar czf "${local_tarball}" .
+      time ( set -e
+        tar czf "${local_tarball}" . && tar tzf "${local_tarball}"
         gcloud storage cp "${local_tarball}" "${gcs_tarball}"
         rm "${local_tarball}"
       )
diff --git a/templates/generate-action.pl b/templates/generate-action.pl
index 7cc954a67..950bd15fe 100644
--- a/templates/generate-action.pl
+++ b/templates/generate-action.pl
@@ -10,7 +10,19 @@
 my $action = $ARGV[0];
 my $v = { template_path => "${action}.in" };
 
-sub usage{ die "Usage: $0 <action>" }
+sub usage{
+  # TODO: use File::Find to list the available actions for the user
+  my $message = <<EOF;
+This script evaluates a template to generate an initialization action.
+The output is printed to STDOUT.
+
+Action templates reside under templates/$action and end in .sh.in
+
+The <action> argument is the destination action name, not the source.
+EOF
+  print STDERR $message;
+  die "Usage:$/$0 <action>" # $/ is the input record separator (newline by default), so the command goes on its own line
+}
 
 usage unless( $action && -f "$ENV{PWD}/templates/$v->{template_path}" );
 
diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions
index 2ea8ca4d2..f6aa9fcf9 100644
--- a/templates/gpu/install_functions
+++ b/templates/gpu/install_functions
@@ -798,7 +798,7 @@ function download_gpu_monitoring_agent(){
 
 function install_gpu_monitoring_agent_dependency(){
   cd /opt/google/compute-gpu-monitoring/linux
-  /usr/bin/python3 -m venv venv
+  /opt/conda/miniconda3/bin/python3 -m venv venv
   (
     source venv/bin/activate
     pip install wheel
@@ -829,7 +829,7 @@ function install_gpu_agent() {
     | sed -e 's/-u --format=/--format=/' \
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
   local venv="${install_dir}/venv"
-  /usr/bin/python3 -m venv "${venv}"
+  /opt/conda/miniconda3/bin/python3 -m venv "${venv}"
 (
   source "${venv}/bin/activate"
   python3 -m pip install --upgrade pip

From d45e16bc8474f5c745985dd65f76c31ac11046a5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 8 Jan 2025 22:10:42 -0800
Subject: [PATCH 112/130] templates/dask/util_functions: * increase wait time
 for scheduler to come online * reduce noise from tar -t

templates/gpu/yarn_functions,
templates/gpu/install_functions:
* protect many functions from running without attached accelerator

templates/gpu/install_gpu_driver.sh.in
* set +e in exit handler

templates/gpu/spark_functions:
* re-factor new function into this template

templates/spark-rapids/spark-rapids.sh.in
* removed redundant call to configure_gpu_script
* set +e in exit handler
---
 templates/dask/util_functions             |  7 ++++---
 templates/gpu/install_functions           | 11 +++++++++--
 templates/gpu/install_gpu_driver.sh.in    |  1 +
 templates/gpu/spark_functions             |  7 +++++++
 templates/gpu/yarn_functions              |  7 ++++++-
 templates/spark-rapids/spark-rapids.sh.in |  2 +-
 6 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index ce6964e94..afcaadb58 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -150,7 +150,8 @@ function start_systemd_dask_service() {
     # Pause while scheduler comes online
     retries=30
     while ! nc -vz "${MASTER}" 8786 ; do
-      sleep 3s
+      date
+      sleep 7s
       ((retries--))
       if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi
     done
@@ -399,7 +400,7 @@ function install_dask() {
       # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
       CONDA_PACKAGES+=("fiona<1.8.22")
     fi
-    CONDA_PACKAGES+=('dask-yarn=${dask_yarn_version}' "distributed<2022.2")
+    CONDA_PACKAGES+=("dask-yarn=${dask_yarn_version}" "distributed<2022.2")
   fi
 
   CONDA_PACKAGES+=(
@@ -511,7 +512,7 @@ function install_conda_packages() {
       is_installed="1"
       pushd "${DASK_CONDA_ENV}"
       time ( set -e
-        tar czf "${local_tarball}" . && tar tzf "${local_tarball}"
+        tar czf "${local_tarball}" . && tar tzf "${local_tarball}" > /dev/null
         gcloud storage cp "${local_tarball}" "${gcs_tarball}"
         rm "${local_tarball}"
       )
diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions
index f6aa9fcf9..746eb79bb 100644
--- a/templates/gpu/install_functions
+++ b/templates/gpu/install_functions
@@ -685,9 +685,12 @@ function install_cuda_toolkit() {
 }
 
 function load_kernel_module() {
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
   # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
   for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
-    rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+    ( set +e
+      rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+    )
   done
 
   depmod -a
@@ -700,6 +703,7 @@ function load_kernel_module() {
 
 function install_cuda(){
   is_complete cuda-repo && return
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     echo "installed with the driver on ${_shortname}"
@@ -740,6 +744,7 @@ function install_nvidia_container_toolkit() {
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
   is_complete gpu-driver && return
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
@@ -778,6 +783,7 @@ function install_ops_agent(){
 
 # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
 function install_gpu_monitoring_agent() {
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
   download_gpu_monitoring_agent
   install_gpu_monitoring_agent_dependency
   start_gpu_monitoring_agent_service
@@ -861,6 +867,7 @@ EOF
 }
 
 function configure_gpu_exclusive_mode() {
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
   # only run this function when spark < 3.0
   if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
   # include exclusive mode on GPU
@@ -944,4 +951,4 @@ function gpu_install_exit_handler() {
     done
   fi
   hold_nvidia_packages
-}
\ No newline at end of file
+}
diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
index 001ef7acc..a7c4d353f 100644
--- a/templates/gpu/install_gpu_driver.sh.in
+++ b/templates/gpu/install_gpu_driver.sh.in
@@ -58,6 +58,7 @@ function main() {
 }
 
 function exit_handler() {
+  set +e
   gpu_install_exit_handler
   gpu_exit_handler
   pip_exit_handler
diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions
index 5da2530d4..fa29330de 100644
--- a/templates/gpu/spark_functions
+++ b/templates/gpu/spark_functions
@@ -1,3 +1,10 @@
+function download_spark_jar() {
+  local -r url=$1
+  local -r jar_name=${url##*/}
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${url}" -o "${SPARK_JARS_DIR}/${jar_name}"
+}
+
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
   local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
diff --git a/templates/gpu/yarn_functions b/templates/gpu/yarn_functions
index 5b8455c19..d9040b1d6 100644
--- a/templates/gpu/yarn_functions
+++ b/templates/gpu/yarn_functions
@@ -15,6 +15,7 @@ function configure_yarn_gpu_resources() {
 }
 
 function configure_gpu_script() {
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
   # Download GPU discovery script
   local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
   mkdir -p ${spark_gpu_script_dir}
@@ -89,6 +90,7 @@ EOF
 }
 
 function configure_yarn_nodemanager_gpu() {
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
   set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
   set_hadoop_property 'yarn-site.xml' \
     'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
@@ -98,6 +100,7 @@ function configure_yarn_nodemanager_gpu() {
 }
 
 function configure_gpu_isolation() {
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
   # enable GPU isolation
   sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
   if [[ $IS_MIG_ENABLED -ne 0 ]]; then
@@ -140,6 +143,8 @@ function setup_gpu_yarn() {
 
   install_nvidia_container_toolkit
   configure_yarn_nodemanager_gpu
-  configure_gpu_script
+  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
+    configure_gpu_script
+  fi
   configure_gpu_isolation
 }
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
index 2435bb732..16e67aba1 100644
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ b/templates/spark-rapids/spark-rapids.sh.in
@@ -51,7 +51,6 @@ function main() {
 
   if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
     install_spark_rapids
-    configure_gpu_script
     echo "RAPIDS initialized with Spark runtime"
   elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
     echo "This action only installs spark-rapids"
@@ -66,6 +65,7 @@ function main() {
 }
 
 function exit_handler() {
+  set +e
   gpu_install_exit_handler
   gpu_exit_handler
   pip_exit_handler

From a7b47071d55780060449c9cb65e168bf230ab9a5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 8 Jan 2025 23:11:50 -0800
Subject: [PATCH 113/130] refactored spark variable definition and reduced
 excess lines by bulking the readonly operations

---
 templates/common/util_functions | 59 +++++++++------------------------
 templates/gpu/spark_functions   | 29 ++++++++++++++++
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 336af37f8..9a6407a7b 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -329,25 +329,6 @@ function check_os() {
       exit 1
   fi
 
-  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
-  readonly SPARK_VERSION
-  if version_lt "${SPARK_VERSION}" "3.1" || \
-     version_ge "${SPARK_VERSION}" "4.0" ; then
-    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
-    exit 1
-  fi
-
-  # Detect dataproc image version
-  if (! test -v DATAPROC_IMAGE_VERSION) ; then
-    if test -v DATAPROC_VERSION ; then
-      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
-    else
-      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
-      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
-      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
-      else echo "Unknown dataproc image version" ; exit 1 ; fi
-    fi
-  fi
 }
 
 function configure_dkms_certs() {
@@ -510,42 +491,30 @@ function prepare_conda_env() {
 }
 
 function prepare_common_env() {
-  define_os_comparison_functions
-
   # Verify OS compatability and Secure boot state
   check_os
   check_secure_boot
 
-  readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
-
-  # Dataproc configurations
-  readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-  readonly HIVE_CONF_DIR='/etc/hive/conf'
-  readonly SPARK_CONF_DIR='/etc/spark/conf'
-
+  # read-only configuration variables
+  _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+  HADOOP_CONF_DIR='/etc/hadoop/conf'
+  HIVE_CONF_DIR='/etc/hive/conf'
   OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-  readonly OS_NAME
-
-  # node role
   ROLE="$(get_metadata_attribute dataproc-role)"
-  readonly ROLE
-
-  # master node
   MASTER="$(get_metadata_attribute dataproc-master)"
-  readonly MASTER
-
   workdir=/opt/install-dpgce
-  tmpdir=/tmp/
   temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
-  readonly temp_bucket
-  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  pkg_bucket="gs://${temp_bucket}/dpgce-packages"
   uname_r=$(uname -r)
-  readonly uname_r
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  export DEBIAN_FRONTEND=noninteractive
+  bdcfg="/usr/local/bin/bdconfig"
+  KNOX_HOME=/usr/lib/knox
 
-  # Knox config
-  readonly KNOX_HOME=/usr/lib/knox
+  readonly HADOOP_CONF_DIR HIVE_CONF_DIR OS_NAME ROLE MASTER workdir
+  readonly temp_bucket pkg_bucket uname_r bdcfg KNOX_HOME
+
+  tmpdir=/tmp/
+
+  export DEBIAN_FRONTEND=noninteractive
 
   mkdir -p "${workdir}/complete"
   set_proxy
@@ -685,3 +654,5 @@ print( "    samples-taken: ", scalar @siz, $/,
   fi
   echo "exit_handler has completed"
 }
+
+define_os_comparison_functions
diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions
index fa29330de..25a99221e 100644
--- a/templates/gpu/spark_functions
+++ b/templates/gpu/spark_functions
@@ -41,3 +41,32 @@ function install_spark_rapids() {
                         "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
                         "/usr/lib/spark/jars/${jar_basename}"
 }
+
+function prepare_spark_env() {
+  SPARK_NLP_VERSION="3.2.1" # Must include subminor version here
+  SPARK_JARS_DIR=/usr/lib/spark/jars
+  SPARK_CONF_DIR='/etc/spark/conf'
+  SPARK_BIGQUERY_VERSION="$(get_metadata_attribute spark-bigquery-connector-version "${DEFAULT_SPARK_BIGQUERY_VERSION:-0.22.0}")"
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+
+  readonly SPARK_VERSION SPARK_BIGQUERY_VERSION SPARK_CONF_DIR SPARK_JARS_DIR SPARK_NLP_VERSION
+
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    exit 1
+  fi
+
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
+
+}

From 35ca7043d7205a63cf618988770ee58d8f2dd3c4 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 8 Jan 2025 23:26:22 -0800
Subject: [PATCH 114/130] development on these scripts will happen in the
 spark-rapids-template-20241225 branch

---
 templates/spark-rapids/mig.sh.in          | 93 -----------------------
 templates/spark-rapids/spark-rapids.sh.in | 87 ---------------------
 2 files changed, 180 deletions(-)
 delete mode 100644 templates/spark-rapids/mig.sh.in
 delete mode 100644 templates/spark-rapids/spark-rapids.sh.in

diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in
deleted file mode 100644
index 99b494c4f..000000000
--- a/templates/spark-rapids/mig.sh.in
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/bin/bash
-#
-[% INSERT legal/license_header %]
-#
-# This script installs NVIDIA GPU drivers and enables MIG on Hopper
-# GPU architectures.
-#
-# This script should be specified in --initialization-actions= option
-# and --metadata=ENABLE_MIG can be used to enable or disable MIG. The
-# default is to enable it.  The script configures the MIG device based
-# on the user specified MIG_CGI profiles specified via:
-# --metadata=^:^MIG_CGI='9,9'. If MIG_CGI is not specified it assumes
-# it's using an H100 and configures 2 instances with profile id 9.
-#
-[% PROCESS common/template_disclaimer %]
-
-[% INSERT common/util_functions %]
-
-[% INSERT common/yarn_functions %]
-
-[% INSERT gpu/mig_functions %]
-
-[% INSERT gpu/util_functions %]
-
-set -euxo pipefail
-
-function main() {
-  if [[ "${nvsmi_works}" == "1" ]] ; then
-    # if this is called without the MIG script then the drivers are not installed
-    query_nvsmi
-    local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
-    set +e
-    migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
-    set -e
-    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-
-    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-        if (echo "${migquery_result}" | grep Enabled); then
-          IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
-        fi
-      fi
-    fi
-  fi
-
-  # if mig is enabled drivers would have already been installed
-  if [[ $IS_MIG_ENABLED -eq 0 ]]; then
-    install_nvidia_gpu_driver
-    install_cuda
-    load_kernel_module
-
-    #Install GPU metrics collection in Stackdriver if needed
-    if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-      install_gpu_agent
-#      install_gpu_monitoring_agent
-      echo 'GPU metrics agent successfully deployed.'
-    else
-      echo 'GPU metrics agent has not been installed.'
-    fi
-    configure_gpu_exclusive_mode
-  fi
-
-  setup_gpu_yarn
-
-  echo "yarn setup complete"
-
-  enable_and_configure_mig
-
-  echo "main complete"
-  return 0
-}
-
-function exit_handler() {
-  gpu_exit_handler
-  pip_exit_handler
-  yarn_exit_handler
-  common_exit_handler
-  return 0
-}
-
-function prepare_to_install(){
-  prepare_common_env
-  prepare_pip_env
-  prepare_gpu_env
-  trap exit_handler EXIT
-}
-
-prepare_to_install
-
-main
diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in
deleted file mode 100644
index 16e67aba1..000000000
--- a/templates/spark-rapids/spark-rapids.sh.in
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/bin/bash
-#
-[% INSERT legal/license_header %]
-#
-[% PROCESS common/template_disclaimer %]
-#
-# This script installs NVIDIA GPU drivers.
-#
-# Dataproc 2.0:  Driver version 530.30.02, CUDA version 12.1.1, Rapids 23.08.2
-# Dataproc 2.1:  Driver version   550.135, CUDA version 12.4.1, Rapids 24.08.1
-# Dataproc 2.2:  Driver version 560.35.03, CUDA version 12.6.2, Rapids 24.08.1
-#
-# Additionally, it installs the RAPIDS Spark plugin, configures Spark
-# and YARN, and installs an agent to collect GPU utilization metrics.
-# The installer is regularly exercised with Debian, Ubuntu, and Rocky
-# Linux distributions.
-#
-# Note that the script is designed to work both when secure boot is
-# enabled with a custom image and when disabled during cluster
-# creation.
-#
-# For details see
-# github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot
-#
-
-set -euxo pipefail
-
-[% INSERT common/util_functions %]
-[% INSERT common/install_functions %]
-[% INSERT gpu/util_functions %]
-[% INSERT gpu/install_functions %]
-[% INCLUDE gpu/yarn_functions %]
-[% INSERT gpu/spark_functions %]
-
-function main() {
-  install_gpu_driver_and_cuda
-
-  #Install GPU metrics collection in Stackdriver if needed
-  if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-#    install_gpu_agent
-    install_gpu_monitoring_agent
-    echo 'GPU metrics agent successfully deployed.'
-  else
-    echo 'GPU metrics agent has not been installed.'
-  fi
-  configure_gpu_exclusive_mode
-
-  setup_gpu_yarn
-
-  echo "yarn setup complete"
-
-  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
-    install_spark_rapids
-    echo "RAPIDS initialized with Spark runtime"
-  elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
-    echo "This action only installs spark-rapids"
-    exit 1
-  else
-    echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
-    exit 1
-  fi
-
-  echo "main complete"
-  return 0
-}
-
-function exit_handler() {
-  set +e
-  gpu_install_exit_handler
-  gpu_exit_handler
-  pip_exit_handler
-  yarn_exit_handler
-  common_exit_handler
-  return 0
-}
-
-function prepare_to_install(){
-  prepare_common_env
-  prepare_pip_env
-  prepare_gpu_env
-  prepare_gpu_install_env
-  trap exit_handler EXIT
-}
-
-prepare_to_install
-
-main

From 43232b25d25b99dcfbb3a3e5e2933c02a239fd2d Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 9 Jan 2025 12:00:42 -0800
Subject: [PATCH 115/130] revert dask/ to master

---
 dask/test_dask.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dask/test_dask.py b/dask/test_dask.py
index 1126d7d80..440493511 100644
--- a/dask/test_dask.py
+++ b/dask/test_dask.py
@@ -56,13 +56,16 @@ def _run_dask_test_script(self, name, script):
     )
     def test_dask(self, configuration, instances, runtime):
 
+        if self.getImageVersion() < pkg_resources.parse_version("2.0"):
+            self.skipTest("Not supported in pre-2.0 images")
+
         metadata = None
         if runtime:
             metadata = "dask-runtime={}".format(runtime)
 
         self.createCluster(configuration,
                            self.INIT_ACTIONS,
-                           machine_type='n1-highmem-8',
+                           machine_type='n1-standard-16',
                            metadata=metadata,
                            timeout_in_minutes=20)
 

From 4b6e520812ac77597e3c5833175fc7a80da04062 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 8 Jan 2025 00:46:00 -0800
Subject: [PATCH 116/130] moving that .in suffix to the correct variable

---
 templates/generate-action.pl | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/templates/generate-action.pl b/templates/generate-action.pl
index 950bd15fe..690acb409 100644
--- a/templates/generate-action.pl
+++ b/templates/generate-action.pl
@@ -8,7 +8,10 @@
 use strict;
 
 my $action = $ARGV[0];
-my $v = { template_path => "${action}.in" };
+my $v = {
+  template_path => "${action}",
+  IA_VERSION    => "${IA_VERSION}",
+};
 
 sub usage{
   # TODO: use File::Find to list the available actions for the user
@@ -24,7 +27,7 @@ sub usage{
   die "Usage:$/$0 <action>"
 }
 
-usage unless( $action && -f "$ENV{PWD}/templates/$v->{template_path}" );
+usage unless( $action && -f "$ENV{PWD}/templates/$v->{template_path}.in" );
 
 my $tt = Template->new( {
   INCLUDE_PATH => "$ENV{PWD}/templates",
@@ -33,4 +36,4 @@ sub usage{
 }) || die "$Template::ERROR$/";
 
 
-$tt->process($v->{template_path}) or die( $tt->error(), "\n" );
+$tt->process("$v->{template_path}.in") or die( $tt->error(), "\n" );

From 4a024e0548d48e126bb65bf53ca0835e26d94f37 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 9 Jan 2025 12:27:52 -0800
Subject: [PATCH 117/130] reverted to master ; changes ended up in
 gpu-template-20250107

---
 gpu/BUILD                 |    6 +-
 gpu/install_gpu_driver.sh | 2685 +++++++++++++------------------------
 gpu/test_gpu.py           |  302 ++---
 gpu/verify_pyspark.py     |   45 -
 4 files changed, 1033 insertions(+), 2005 deletions(-)
 delete mode 100644 gpu/verify_pyspark.py

diff --git a/gpu/BUILD b/gpu/BUILD
index bd5500ccb..b481c5b33 100644
--- a/gpu/BUILD
+++ b/gpu/BUILD
@@ -6,11 +6,7 @@ py_test(
     name = "test_gpu",
     size = "enormous",
     srcs = ["test_gpu.py"],
-    data = [
-      "install_gpu_driver.sh",
-      "verify_pyspark.py",
-      "mig.sh"
-    ],
+    data = ["install_gpu_driver.sh", "mig.sh"],
     local = True,
     shard_count = 15,
     deps = [
diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 91ad4ede0..25efb2a49 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 #
-# Copyright 2015 Google LLC and contributors
-#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,14 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-#
-# This initialization action is generated from
-# initialization-actions/templates/gpu/install_gpu_driver.sh.in
-#
-# Modifications made directly to the generated file will be lost when
-# the template is re-evaluated
-
 #
 # This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
 
@@ -30,38 +20,32 @@ function os_id()       ( set +x ;  grep '^ID=' /etc/os-release | cut -d= -f2 | x
 function os_version()  ( set +x ;  grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; )
 function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; )
 
-# For version (or real number) comparison
-# if first argument is greater than or equal to, greater than, less than or equal to, or less than the second
-# ( version_ge 2.0 2.1 ) evaluates to false
-# ( version_ge 2.2 2.1 ) evaluates to true
 function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
 function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
 function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
 
-function define_os_comparison_functions() {
-
-  readonly -A supported_os=(
-    ['debian']="10 11 12"
-    ['rocky']="8 9"
-    ['ubuntu']="18.04 20.04 22.04"
-  )
-
-  # dynamically define OS version test utility functions
-  if [[ "$(os_id)" == "rocky" ]];
-  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
-  else _os_version="$(os_version)"; fi
-  for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
-    eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
-
-    for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
-      eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
-      eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
-      eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
-    done
+readonly -A supported_os=(
+  ['debian']="10 11 12"
+  ['rocky']="8 9"
+  ['ubuntu']="18.04 20.04 22.04"
+)
+
+# dynamically define OS version test utility functions
+if [[ "$(os_id)" == "rocky" ]];
+then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
+else _os_version="$(os_version)"; fi
+for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
+  eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
+
+  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
+    eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
+    eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
+    eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
   done
-  eval "function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )"
-}
+done
+
+function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
 
 function os_vercat()   ( set +x
   if   is_ubuntu ; then os_version | sed -e 's/[^0-9]//g'
@@ -69,7 +53,7 @@ function os_vercat()   ( set +x
                    else os_version ; fi ; )
 
 function repair_old_backports {
-  if ! is_debuntu ; then return ; fi
+  if ge_debian12 || ! is_debuntu ; then return ; fi
   # This script uses 'apt-get update' and is therefore potentially dependent on
   # backports repositories which have been archived.  In order to mitigate this
   # problem, we will use archive.debian.org for the oldoldstable repo
@@ -110,7 +94,6 @@ function print_metadata_value_if_exists() {
   return ${return_code}
 }
 
-# replicates /usr/share/google/get_metadata_value
 function get_metadata_value() (
   set +x
   local readonly varname=$1
@@ -134,13 +117,226 @@ function get_metadata_attribute() (
   get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
 )
 
+OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+readonly OS_NAME
+
+# node role
+ROLE="$(get_metadata_attribute dataproc-role)"
+readonly ROLE
+
+# CUDA version and Driver version
+# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+# https://developer.nvidia.com/cuda-downloads
+# Rocky8: 12.0: 525.147.05
+readonly -A DRIVER_FOR_CUDA=(
+          ["11.8"]="560.35.03"
+          ["12.0"]="525.60.13"  ["12.4"]="560.35.03"  ["12.6"]="560.35.03"
+)
+# https://developer.nvidia.com/cudnn-downloads
+if is_debuntu ; then
+readonly -A CUDNN_FOR_CUDA=(
+          ["11.8"]="9.5.1.17"
+          ["12.0"]="9.5.1.17"   ["12.4"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+)
+elif is_rocky ; then
+# rocky:
+#   12.0: 8.8.1.3
+#   12.1: 8.9.3.28
+#   12.2: 8.9.7.29
+#   12.3: 9.0.0.312
+#   12.4: 9.1.1.17
+#   12.5: 9.2.1.18
+#   12.6: 9.5.1.17
+readonly -A CUDNN_FOR_CUDA=(
+          ["11.8"]="9.5.1.17"
+          ["12.0"]="8.8.1.3"   ["12.4"]="9.1.1.17"   ["12.6"]="9.5.1.17"
+)
+fi
+# https://developer.nvidia.com/nccl/nccl-download
+# 12.2: 2.19.3, 12.5: 2.21.5
+readonly -A NCCL_FOR_CUDA=(
+          ["11.8"]="2.15.5"
+          ["12.0"]="2.16.5"  ["12.4"]="2.23.4"     ["12.6"]="2.23.4"
+)
+readonly -A CUDA_SUBVER=(
+          ["11.8"]="11.8.0"
+          ["12.0"]="12.0.0"  ["12.4"]="12.4.1"     ["12.6"]="12.6.2"
+)
+
+RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+readonly DEFAULT_CUDA_VERSION='12.4'
+CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
+if ( ( ge_debian12 || ge_rocky9 ) && version_le "${CUDA_VERSION%%.*}" "11" ) ; then
+  # CUDA 11 no longer supported on debian12 - 2024-11-22, rocky9 - 2024-11-27
+  CUDA_VERSION="${DEFAULT_CUDA_VERSION}"
+fi
+
+if ( version_ge "${CUDA_VERSION}" "12" && (le_debian11 || le_ubuntu18) ) ; then
+  # Only CUDA 12.0 supported on older debuntu
+  CUDA_VERSION="12.0"
+fi
+readonly CUDA_VERSION
+readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}"
+
+function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
+function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
+function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )
+
+function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
+function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
+function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
+
+DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}"
+if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then
+                                         DEFAULT_DRIVER="560.28.03"  ; fi
+if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03"  ; fi
+if ( is_rocky    && le_cuda11 )   ; then DEFAULT_DRIVER="525.147.05" ; fi
+if ( is_ubuntu20 && le_cuda11 )   ; then DEFAULT_DRIVER="535.183.06" ; fi
+if ( is_rocky9   && ge_cuda12 )   ; then DEFAULT_DRIVER="565.57.01"  ; fi
+DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
+
+readonly DRIVER_VERSION
+readonly DRIVER=${DRIVER_VERSION%%.*}
+
+readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+# Parameters for NVIDIA-provided cuDNN library
+readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
+function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
+# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
+if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
+  CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
+  # cuDNN v8 is not distribution for ubuntu20+, debian12
+  CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
+  # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
+  CUDNN_VERSION="8.8.0.121"
+fi
+readonly CUDNN_VERSION
+
+readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
+readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+
+# Parameters for NVIDIA-provided Debian GPU driver
+readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+
+readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+
+# Short name for urls
+if is_ubuntu22  ; then
+    # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
+    # https://developer.download.nvidia.com/compute/machine-learning/repos/
+    # use packages from previous release until such time as nvidia
+    # release ubuntu2204 builds
+
+    nccl_shortname="ubuntu2004"
+    shortname="$(os_id)$(os_vercat)"
+elif ge_rocky9 ; then
+    # use packages from previous release until such time as nvidia
+    # release rhel9 builds
+
+    nccl_shortname="rhel8"
+    shortname="rhel9"
+elif is_rocky ; then
+    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
+    nccl_shortname="${shortname}"
+else
+    shortname="$(os_id)$(os_vercat)"
+    nccl_shortname="${shortname}"
+fi
+
+# Parameters for NVIDIA-provided package repositories
+readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
+readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
+
+# Parameters for NVIDIA-provided NCCL library
+readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
+NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
+readonly NCCL_REPO_URL
+readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
+
+function set_cuda_runfile_url() {
+  local RUNFILE_DRIVER_VERSION="${DRIVER_VERSION}"
+  local RUNFILE_CUDA_VERSION="${CUDA_FULL_VERSION}"
+
+  if ge_cuda12 ; then
+    if ( le_debian11 || le_ubuntu18 ) ; then
+      RUNFILE_DRIVER_VERSION="525.60.13"
+      RUNFILE_CUDA_VERSION="12.0.0"
+    elif ( le_rocky8 && version_le "${DATAPROC_IMAGE_VERSION}" "2.0" ) ; then
+      RUNFILE_DRIVER_VERSION="525.147.05"
+      RUNFILE_CUDA_VERSION="12.0.0"
+    fi
+  else
+    RUNFILE_DRIVER_VERSION="520.61.05"
+    RUNFILE_CUDA_VERSION="11.8.0"
+  fi
+
+  readonly RUNFILE_FILENAME="cuda_${RUNFILE_CUDA_VERSION}_${RUNFILE_DRIVER_VERSION}_linux.run"
+  CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${RUNFILE_CUDA_VERSION}"
+  DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${RUNFILE_FILENAME}"
+  readonly DEFAULT_NVIDIA_CUDA_URL
+
+  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
+  readonly NVIDIA_CUDA_URL
+}
+
+set_cuda_runfile_url
+
+# Parameter for NVIDIA-provided Rocky Linux GPU driver
+readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+
+CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
+CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
+if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
+  # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
+  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
+    # When cuDNN version is greater than or equal to 8.4.1.50 use this format
+    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
+  fi
+  # Use legacy url format with one of the tarball name formats depending on version as above
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
+fi
+if ( version_ge "${CUDA_VERSION}" "12.0" ); then
+  # Use modern url format When cuda version is greater than or equal to 12.0
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
+fi
+readonly CUDNN_TARBALL
+readonly CUDNN_TARBALL_URL
+
+# Whether to install NVIDIA-provided or OS-provided GPU driver
+GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
+readonly GPU_DRIVER_PROVIDER
+
+# Stackdriver GPU agent parameters
+readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
+# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
+INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
+readonly INSTALL_GPU_AGENT
+
+# Dataproc configurations
+readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+readonly HIVE_CONF_DIR='/etc/hive/conf'
+readonly SPARK_CONF_DIR='/etc/spark/conf'
+
+NVIDIA_SMI_PATH='/usr/bin'
+MIG_MAJOR_CAPS=0
+IS_MIG_ENABLED=0
+
 function execute_with_retries() (
   set +x
   local -r cmd="$*"
 
   if [[ "$cmd" =~ "^apt-get install" ]] ; then
     apt-get -y clean
-    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+    apt-get -y autoremove
   fi
   for ((i = 0; i < 3; i++)); do
     set -x
@@ -152,1154 +348,154 @@ function execute_with_retries() (
   return 1
 )
 
-function cache_fetched_package() {
-  local src_url="$1"
-  local gcs_fn="$2"
-  local local_fn="$3"
+CUDA_KEYRING_PKG_INSTALLED="0"
+# Download NVIDIA's cuda-keyring .deb and install it with dpkg, at most once per run.
+function install_cuda_keyring_pkg() {
+  [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] && return
+  local keyring_ver=1.1 keyring_deb="${tmpdir}/cuda-keyring.deb"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${NVIDIA_REPO_URL}/cuda-keyring_${keyring_ver}-1_all.deb" -o "${keyring_deb}"
+  dpkg -i "${keyring_deb}"
+  rm -f "${keyring_deb}"
+  CUDA_KEYRING_PKG_INSTALLED="1"
+}
+
+# Purge the cuda-keyring package and reset the flag so it can be installed again.
+function uninstall_cuda_keyring_pkg() {
+  apt-get purge -y -q cuda-keyring ; CUDA_KEYRING_PKG_INSTALLED="0"
+}
+
+CUDA_LOCAL_REPO_INSTALLED="0"
+# Download and dpkg-install NVIDIA's local CUDA repo package, then publish its keyring (once per run).
+function install_local_cuda_repo() {
+  if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
+  local pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
+  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
+  LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
+  LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
+  DIST_KEYRING_DIR="/var/${pkgname}"  # globals kept, but not readonly: a re-install after uninstall must not abort
 
-  while ! command -v gcloud ; do sleep 5s ; done
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
 
-  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
-    time gcloud storage cp "${gcs_fn}" "${local_fn}"
-  else
-    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
-           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
+  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  rm -f "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  cp "${DIST_KEYRING_DIR}"/cuda-*-keyring.gpg /usr/share/keyrings/
+  if is_ubuntu ; then
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
+      -o /etc/apt/preferences.d/cuda-repository-pin-600
   fi
+  CUDA_LOCAL_REPO_INSTALLED="1"  # set last, as the cuDNN repo installers do
 }
+function uninstall_local_cuda_repo(){  # purge the local CUDA repo package, if one was recorded
+  if [[ -n "${CUDA_LOCAL_REPO_PKG_NAME:-}" ]]; then apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" ; fi
+  CUDA_LOCAL_REPO_INSTALLED="0"
+}
 
-function add_contrib_component() {
-  if ! is_debuntu ; then return ; fi
-  if ge_debian12 ; then
-      # Include in sources file components on which nvidia-kernel-open-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib"
+CUDNN_LOCAL_REPO_INSTALLED="0"
+CUDNN_PKG_NAME=""
+# Download and dpkg-install the cuDNN local repo package, then publish its keyring (once per run).
+function install_local_cudnn_repo() {
+  if [[ "${CUDNN_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
+  local pkgname="cudnn-local-repo-${shortname}-${CUDNN}"
+  CUDNN_PKG_NAME="${pkgname}"
+  local local_deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN}/local_installers/${local_deb_fn}"
 
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
-  fi
+  # Fetch the local-repo .deb, install it, then copy the keyring found under /var/cudnn-local-repo-*
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
+
+  dpkg -i "${tmpdir}/local-installer.deb"
+  rm -f "${tmpdir}/local-installer.deb"
+
+  cp /var/cudnn-local-repo-*-"${CUDNN}"*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+
+  CUDNN_LOCAL_REPO_INSTALLED="1"
 }
 
-function set_hadoop_property() {
-  local -r config_file=$1
-  local -r property=$2
-  local -r value=$3
-  "${bdcfg}" set_property \
-    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
-    --name "${property}" --value "${value}" \
-    --clobber
+function uninstall_local_cudnn_repo() {  # purge the cuDNN repo package, if one was recorded
+  if [[ -n "${CUDNN_PKG_NAME}" ]]; then apt-get purge -yq "${CUDNN_PKG_NAME}" ; fi
+  CUDNN_LOCAL_REPO_INSTALLED="0"
 }
 
-function configure_yarn_resources() {
-  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
-  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
-    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
-  fi
-  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
+CUDNN8_LOCAL_REPO_INSTALLED="0"
+CUDNN8_PKG_NAME=""
+# Download and dpkg-install the cuDNN 8 local repo package (Ubuntu/Debian only), then publish its keyring.
+function install_local_cudnn8_repo() {
+  if [[ "${CUDNN8_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
+  local cudnn8_shortname deb_fn local_deb_fn local_deb_url
+  if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
+  elif is_debian ; then cudnn8_shortname="debian11"
+  else return 0 ; fi
+  if   is_cuda12 ; then CUDNN8_CUDA_VER=12.0
+  elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8
+  else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi
+  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"
 
-  set_hadoop_property 'capacity-scheduler.xml' \
-    'yarn.scheduler.capacity.resource-calculator' \
-    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+  CUDNN8_PKG_NAME="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
 
-  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
-}
+  deb_fn="${CUDNN8_PKG_NAME}_1.0-1_amd64.deb"
+  local_deb_fn="${tmpdir}/${deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 "${local_deb_url}" -o "${local_deb_fn}"
 
-# This configuration should be applied only if GPU is attached to the node
-function configure_yarn_nodemanager() {
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.container-executor.class' \
-    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+  dpkg -i "${local_deb_fn}"
 
-  # Fix local dirs access permissions
-  local yarn_local_dirs=()
+  rm -f "${local_deb_fn}"
 
-  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
-    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
-    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
+  cp /var/cudnn-local-repo-*-"${CUDNN}"*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  CUDNN8_LOCAL_REPO_INSTALLED="1"
+}
 
-  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
-    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
-  fi
+function uninstall_local_cudnn8_repo() {  # purge the cuDNN 8 repo package, if one was recorded
+  if [[ -n "${CUDNN8_PKG_NAME}" ]]; then apt-get purge -yq "${CUDNN8_PKG_NAME}" ; fi
+  CUDNN8_LOCAL_REPO_INSTALLED="0"
 }
 
-function clean_up_sources_lists() {
-  #
-  # bigtop (primary)
-  #
-  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
+function install_nvidia_nccl() {  # install the libnccl packages via dnf (Rocky) or apt-get (Ubuntu)
+  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"  # form: <NCCL_VERSION>-1+cuda<CUDA_VERSION>
 
-  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
-    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
+  if is_rocky ; then
+    execute_with_retries \
+      dnf -y -q install \
+        "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}"
+    sync
+  elif is_ubuntu ; then
+    install_cuda_keyring_pkg  # installs the cuda-keyring .deb (skipped if already done this run)
 
-    local regional_bigtop_repo_uri
-    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
-      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
-      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
-      cut -d ' ' -f 2 |
-      head -1)
+    apt-get update -qq
 
-    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+    if is_ubuntu18 ; then  # Ubuntu 18 branch: packages are installed without a version pin
+      execute_with_retries \
+        apt-get install -q -y \
+          libnccl2 libnccl-dev
+      sync
     else
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+      execute_with_retries \
+        apt-get install -q -y \
+          "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}"
+      sync
     fi
-
-    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
-    rm -f "${bigtop_kr_path}"
-    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
-
-    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  else
+    echo "Unsupported OS: '${OS_NAME}'"
+    # NB: this tarball is 10GB in size, but can be used to install NCCL on non-ubuntu systems
+    # wget https://developer.download.nvidia.com/hpc-sdk/24.7/nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
+    # tar xpzf nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
+    # nvhpc_2024_247_Linux_x86_64_cuda_multi/install
+    return  # bare return: the function's status is that of the echo above
   fi
+}
 
-  #
-  # adoptium
-  #
-  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
-  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
-  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
-  rm -f "${adoptium_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
-   | gpg --dearmor -o "${adoptium_kr_path}"
-  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
-   > /etc/apt/sources.list.d/adoptium.list
+function is_src_nvidia() ( set +x ; test "${GPU_DRIVER_PROVIDER}" = "NVIDIA" ; )  # provider is NVIDIA
+function is_src_os()     ( set +x ; test "${GPU_DRIVER_PROVIDER}" = "OS" ; )  # provider is the OS
 
-
-  #
-  # docker
-  #
-  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
-  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
-  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
-
-  rm -f "${docker_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
-    | gpg --dearmor -o "${docker_kr_path}"
-  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
-    > ${docker_repo_file}
-
-  #
-  # google cloud + logging/monitoring
-  #
-  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
-    rm -f /usr/share/keyrings/cloud.google.gpg
-    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
-    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
-      list_file="/etc/apt/sources.list.d/${list}.list"
-      if [[ -f "${list_file}" ]]; then
-        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
-      fi
-    done
-  fi
-
-  #
-  # cran-r
-  #
-  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
-    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
-    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
-    rm -f /usr/share/keyrings/cran-r.gpg
-    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
-      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
-    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
-  fi
-
-  #
-  # mysql
-  #
-  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
-    rm -f /usr/share/keyrings/mysql.gpg
-    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
-      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
-    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
-  fi
-
-  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
-
-}
-
-function set_proxy(){
-  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
-
-  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
-
-  export METADATA_HTTP_PROXY
-  export http_proxy="${METADATA_HTTP_PROXY}"
-  export https_proxy="${METADATA_HTTP_PROXY}"
-  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
-  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
-  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
-  local no_proxy_svc
-  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
-                      bigquery composer      pubsub bigquerydatatransfer dataflow \
-                      storage  datafusion    ; do
-    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
-  done
-
-  export NO_PROXY="${no_proxy}"
-}
-
-function is_ramdisk() {
-  if [[ "${1:-}" == "-f" ]] ; then unset IS_RAMDISK ; fi
-  if   ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "true" ) ; then return 0
-  elif ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "false" ) ; then return 1 ; fi
-
-  if ( test -d /mnt/shm && grep -q /mnt/shm /proc/mounts ) ; then
-    IS_RAMDISK="true"
-    return 0
-  else
-    IS_RAMDISK="false"
-    return 1
-  fi
-}
-
-function mount_ramdisk(){
-  local free_mem
-  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
-
-  # Write to a ramdisk instead of churning the persistent disk
-
-  tmpdir="/mnt/shm"
-  mkdir -p "${tmpdir}/pkgs_dirs"
-  mount -t tmpfs tmpfs "${tmpdir}"
-
-  # Download conda packages to tmpfs
-  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}/pkgs_dirs"
-
-  # Download OS packages to tmpfs
-  if is_debuntu ; then
-    mount -t tmpfs tmpfs /var/cache/apt/archives
-  else
-    mount -t tmpfs tmpfs /var/cache/dnf
-  fi
-  is_ramdisk -f
-}
-
-function check_os() {
-  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
-      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
-      exit 1
-  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
-      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
-      exit 1
-  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
-      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
-      exit 1
-  fi
-
-  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
-  readonly SPARK_VERSION
-  if version_lt "${SPARK_VERSION}" "3.1" || \
-     version_ge "${SPARK_VERSION}" "4.0" ; then
-    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
-    exit 1
-  fi
-
-  # Detect dataproc image version
-  if (! test -v DATAPROC_IMAGE_VERSION) ; then
-    if test -v DATAPROC_VERSION ; then
-      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
-    else
-      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
-      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
-      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
-      else echo "Unknown dataproc image version" ; exit 1 ; fi
-    fi
-  fi
-}
-
-#
-# Generate repo file under /etc/apt/sources.list.d/
-#
-function apt_add_repo() {
-  local -r repo_name="$1"
-  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
-  local -r include_src="${4:-yes}"
-  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
-  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
-
-  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
-  if [[ "${include_src}" == "yes" ]] ; then
-    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
-  fi
-
-  apt-get update -qq
-}
-
-#
-# Generate repo file under /etc/yum.repos.d/
-#
-function dnf_add_repo() {
-  local -r repo_name="$1"
-  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
-  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
-  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
-
-  curl -s -L "${repo_url}" \
-    | dd of="${repo_path}" status=progress
-#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
-}
-
-#
-# Keyrings default to
-# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
-# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
-#
-function os_add_repo() {
-  local -r repo_name="$1"
-  local -r signing_key_url="$2"
-  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
-  local kr_path
-  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
-                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
-
-  mkdir -p "$(dirname "${kr_path}")"
-
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
-    | gpg --import --no-default-keyring --keyring "${kr_path}"
-
-  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
-                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
-}
-
-function configure_dkms_certs() {
-  if test -v PSN && [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping";
-      return 0
-  fi
-
-  mkdir -p "${CA_TMPDIR}"
-
-  # If the private key exists, verify it
-  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
-    echo "Private key material exists"
-
-    local expected_modulus_md5sum
-    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
-    if [[ -n "${expected_modulus_md5sum}" ]]; then
-      modulus_md5sum="${expected_modulus_md5sum}"
-
-      # Verify that cert md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched rsa key"
-      fi
-
-      # Verify that key md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched x509 cert"
-      fi
-    else
-      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
-    fi
-    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-
-    return
-  fi
-
-  # Retrieve cloud secrets keys
-  local sig_priv_secret_name
-  sig_priv_secret_name="${PSN}"
-  local sig_pub_secret_name
-  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
-  local sig_secret_project
-  sig_secret_project="$(get_metadata_attribute secret_project)"
-  local sig_secret_version
-  sig_secret_version="$(get_metadata_attribute secret_version)"
-
-  # If metadata values are not set, do not write mok keys
-  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
-
-  # Write private material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_priv_secret_name}" \
-      | dd status=none of="${CA_TMPDIR}/db.rsa"
-
-  # Write public material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_pub_secret_name}" \
-      | base64 --decode \
-      | dd status=none of="${CA_TMPDIR}/db.der"
-
-  local mok_directory="$(dirname "${mok_key}")"
-  mkdir -p "${mok_directory}"
-
-  # symlink private key and copy public cert from volatile storage to DKMS directory
-  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
-
-  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
-}
-
-function clear_dkms_key {
-  if [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping" >&2
-      return 0
-  fi
-  rm -rf "${CA_TMPDIR}" "${mok_key}"
-}
-
-function check_secure_boot() {
-  local SECURE_BOOT="disabled"
-  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
-
-  PSN="$(get_metadata_attribute private_secret_name)"
-  readonly PSN
-
-  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
-    echo "Error: Secure Boot is not supported on Debian before image 2.2. Consider disabling Secure Boot while creating the cluster."
-    return
-  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
-    echo "Secure boot is enabled, but no signing material provided."
-    echo "Consider either disabling secure boot or provide signing material as per"
-    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
-    return
-  fi
-
-  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
-  readonly CA_TMPDIR
-
-  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
-                      mok_der=/var/lib/shim-signed/mok/MOK.der
-                 else mok_key=/var/lib/dkms/mok.key
-                      mok_der=/var/lib/dkms/mok.pub ; fi
-}
-
-function restart_knox() {
-  systemctl stop knox
-  rm -rf "${KNOX_HOME}/data/deployments/*"
-  systemctl start knox
-}
-
-function install_dependencies() {
-  test -f "${workdir}/complete/install-dependencies" && return 0
-  pkg_list="screen"
-  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
-  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
-  touch "${workdir}/complete/install-dependencies"
-}
-
-function prepare_pip_env() {
-  # Clear pip cache
-  # TODO: make this conditional on which OSs have pip without cache purge
-  test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv"
-  source "${tmpdir}/python-venv/bin/activate"
-
-  pip cache purge || echo "unable to purge pip cache"
-  if is_ramdisk ; then
-    # Download pip packages to tmpfs
-    mkdir -p "${tmpdir}/cache-dir"
-    pip config set global.cache-dir "${tmpdir}/cache-dir" || echo "unable to set global.cache-dir"
-  fi
-}
-
-
-function prepare_common_env() {
-  define_os_comparison_functions
-
-  # Verify OS compatability and Secure boot state
-  check_os
-  check_secure_boot
-
-  readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
-
-  # Dataproc configurations
-  readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-  readonly HIVE_CONF_DIR='/etc/hive/conf'
-  readonly SPARK_CONF_DIR='/etc/spark/conf'
-
-  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-  readonly OS_NAME
-
-  # node role
-  ROLE="$(get_metadata_attribute dataproc-role)"
-  readonly ROLE
-
-  # master node
-  MASTER="$(get_metadata_attribute dataproc-master)"
-  readonly MASTER
-
-  workdir=/opt/install-dpgce
-  tmpdir=/tmp/
-  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
-  readonly temp_bucket
-  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
-  uname_r=$(uname -r)
-  readonly uname_r
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  export DEBIAN_FRONTEND=noninteractive
-
-  # Knox config
-  readonly KNOX_HOME=/usr/lib/knox
-
-  mkdir -p "${workdir}/complete"
-  set_proxy
-  mount_ramdisk
-
-  readonly install_log="${tmpdir}/install.log"
-
-  if test -f "${workdir}/complete/prepare.common" ; then return ; fi
-
-  repair_old_backports
-
-  if is_debuntu ; then
-    clean_up_sources_lists
-    apt-get update -qq
-    apt-get -y clean
-    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
-    if ge_debian12 ; then
-    apt-mark unhold systemd libsystemd0 ; fi
-    if is_ubuntu ; then
-      while ! command -v gcloud ; do sleep 5s ; done
-    fi
-  else
-    dnf clean all
-  fi
-
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-
- ( set +e
-    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
-  )
-
-    install_dependencies
-
-    # Monitor disk usage in a screen session
-    df / > "/run/disk-usage.log"
-    touch "/run/keep-running-df"
-    screen -d -m -LUS keep-running-df \
-      bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
- fi
-
-  touch "${workdir}/complete/prepare.common"
-}
-
-function pip_exit_handler() {
-  if is_ramdisk ; then
-    # remove the tmpfs pip cache-dir
-    pip config unset global.cache-dir || echo "unable to unset global pip cache"
-  fi
-}
-
-function common_exit_handler() {
-  set +ex
-  echo "Exit handler invoked"
-
-  # Restart YARN services if they are running already
-  for svc in resourcemanager nodemanager; do
-    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
-      systemctl  stop "hadoop-yarn-${svc}.service"
-      systemctl start "hadoop-yarn-${svc}.service"
-    fi
-  done
-
-  # If system memory was sufficient to mount memory-backed filesystems
-  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    # Clean up shared memory mounts
-    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
-      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
-        umount -f ${shmdir}
-      fi
-    done
-
-    # restart services stopped during preparation stage
-    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
-  fi
-
-  if is_debuntu ; then
-    # Clean up OS package cache
-    apt-get -y -qq clean
-    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
-    # re-hold systemd package
-    if ge_debian12 ; then
-    apt-mark hold systemd libsystemd0 ; fi
-  else
-    dnf clean all
-  fi
-
-  # When creating image, print disk usage statistics, zero unused disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    # print disk usage statistics for large components
-    if is_ubuntu ; then
-      du -hs \
-        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-        /usr/lib \
-        /opt/nvidia/* \
-        /opt/conda/miniconda3 | sort -h
-    elif is_debian ; then
-      du -x -hs \
-        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
-        /var/lib/{docker,mysql,} \
-        /opt/nvidia/* \
-        /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
-        /usr/bin \
-        /usr \
-        /var \
-        / 2>/dev/null | sort -h
-    else
-      du -hs \
-        /var/lib/docker \
-        /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
-        /usr/lib64/google-cloud-sdk \
-        /opt/nvidia/* \
-        /opt/conda/miniconda3
-    fi
-
-    # Process disk usage logs from installation period
-    rm -f /run/keep-running-df
-    sync
-    sleep 5.01s
-    # compute maximum size of disk during installation
-    # Log file contains logs like the following (minus the preceeding #):
-#Filesystem     1K-blocks    Used Available Use% Mounted on
-#/dev/vda2        7096908 2611344   4182932  39% /
-    df / | tee -a "/run/disk-usage.log"
-
-    perl -e \
-          '@siz=( sort { $a => $b }
-                   map { (split)[2] =~ /^(\d+)/ }
-                  grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting};
-print( "    samples-taken: ", scalar @siz, $/,
-       "starting-disk-used: $starting", $/,
-       "maximum-disk-used:  $max", $/,
-       "minimum-disk-used:  $min", $/,
-       "     increased-by:  $inc", $/ )' < "/run/disk-usage.log"
-
-
-    # zero free disk space
-    dd if=/dev/zero of=/zero
-    sync
-    sleep 3s
-    rm -f /zero
-  fi
-  echo "exit_handler has completed"
-}
-
-
-function set_support_matrix() {
-  # CUDA version and Driver version
-  # https://docs.nvidia.com/deploy/cuda-compatibility/
-  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
-  # https://developer.nvidia.com/cuda-downloads
-
-  # Minimum supported version for open kernel driver is 515.43.04
-  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
-  # Rocky8: 12.0: 525.147.05
-  local latest
-  latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
-  readonly -A DRIVER_FOR_CUDA=(
-          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
-          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
-  )
-  readonly -A DRIVER_SUBVER=(
-          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
-          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
-  )
-  # https://developer.nvidia.com/cudnn-downloads
-  if is_debuntu ; then
-  readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
-          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
-  )
-  elif is_rocky ; then
-  # rocky:
-  #   12.0: 8.8.1.3
-  #   12.1: 8.9.3.28
-  #   12.2: 8.9.7.29
-  #   12.3: 9.0.0.312
-  #   12.4: 9.1.1.17
-  #   12.5: 9.2.1.18
-  #   12.6: 9.5.1.17
-  readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
-          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
-  )
-  fi
-  # https://developer.nvidia.com/nccl/nccl-download
-  # 12.2: 2.19.3, 12.5: 2.21.5
-  readonly -A NCCL_FOR_CUDA=(
-          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
-          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
-  )
-  readonly -A CUDA_SUBVER=(
-          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
-          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
-  )
-}
-
-set_support_matrix
-
-function set_cuda_version() {
-  case "${DATAPROC_IMAGE_VERSION}" in
-    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
-    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
-    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
-    *   )
-      echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
-      exit 1
-      ;;
-  esac
-  local cuda_url
-  cuda_url=$(get_metadata_attribute 'cuda-url' '')
-  if [[ -n "${cuda_url}" ]] ; then
-    # if cuda-url metadata variable has been passed, extract default version from url
-    local CUDA_URL_VERSION
-    CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
-    if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
-      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
-    fi
-  fi
-  readonly DEFAULT_CUDA_VERSION
-
-  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
-  if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then
-    CUDA_FULL_VERSION="${CUDA_VERSION}"
-    CUDA_VERSION="${CUDA_VERSION%.*}"
-  fi
-  readonly CUDA_VERSION
-  if ( ! test -v CUDA_FULL_VERSION ) ; then
-    CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
-  fi
-  readonly CUDA_FULL_VERSION
-}
-
-function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
-function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
-function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )
-
-function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
-function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
-function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
-
-function set_driver_version() {
-  local gpu_driver_url
-  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')
-
-  local cuda_url
-  cuda_url=$(get_metadata_attribute 'cuda-url' '')
-
-  local DEFAULT_DRIVER
-  # Take default from gpu-driver-url metadata value
-  if [[ -n "${gpu_driver_url}" ]] ; then
-    DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')"
-    if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
-  # Take default from cuda-url metadata value as a backup
-  elif [[ -n "${cuda_url}" ]] ; then
-    local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
-    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
-      major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
-      driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
-      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
-        # use the version indicated by the cuda url as the default if it exists
-	DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
-      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
-        # use the maximum sub-version available for the major version indicated in cuda url as the default
-	DEFAULT_DRIVER="${driver_max_maj_version}"
-      fi
-    fi
-  fi
-
-  if ( ! test -v DEFAULT_DRIVER ) ; then
-    # If a default driver version has not been extracted, use the default for this version of CUDA
-    DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
-  fi
-
-  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
-
-  readonly DRIVER_VERSION
-  readonly DRIVER="${DRIVER_VERSION%%.*}"
-
-  export DRIVER_VERSION DRIVER
-
-  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
-    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
-    exit 1
-  fi
-}
-
-function set_cudnn_version() {
-  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
-  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
-
-  # Parameters for NVIDIA-provided cuDNN library
-  readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
-  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-  if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
-  elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
-    # cuDNN v8 is not distribution for ubuntu20+, debian12
-    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
-  elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then
-    # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
-    CUDNN_VERSION="8.8.0.121"
-  fi
-  readonly CUDNN_VERSION
-}
-
-
-function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
-function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
-
-function set_cuda_repo_shortname() {
-# Short name for urls
-# https://developer.download.nvidia.com/compute/cuda/repos/${shortname}
-  if is_rocky ; then
-    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
-  else
-    shortname="$(os_id)$(os_vercat)"
-  fi
-}
-
-function set_nv_urls() {
-  # Parameters for NVIDIA-provided package repositories
-  readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
-  readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
-
-  # Parameter for NVIDIA-provided Rocky Linux GPU driver
-  readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
-}
-
-function set_cuda_runfile_url() {
-  local MAX_DRIVER_VERSION
-  local MAX_CUDA_VERSION
-
-  local MIN_OPEN_DRIVER_VER="515.48.07"
-  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
-  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
-
-  if is_cuda12 ; then
-    if is_debian12 ; then
-      MIN_DRIVER_VERSION="545.23.06"
-      MIN_CUDA_VERSION="12.3.0"
-    elif is_debian10 ; then
-      MAX_DRIVER_VERSION="555.42.02"
-      MAX_CUDA_VERSION="12.5.0"
-    elif is_ubuntu18 ; then
-      MAX_DRIVER_VERSION="530.30.02"
-      MAX_CUDA_VERSION="12.1.1"
-    fi
-  elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
-    if le_debian10 ; then
-      # cuda 11 is not supported for <= debian10
-      MAX_CUDA_VERSION="0"
-      MAX_DRIVER_VERSION="0"
-    fi
-  else
-    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
-  fi
-
-  if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
-    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
-  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
-    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
-  fi
-  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
-    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
-  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
-    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
-  fi
-
-  # driver version named in cuda runfile filename
-  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
-  readonly -A drv_for_cuda=(
-          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
-          ["11.8.0"]="520.61.05"
-          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
-          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
-          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
-          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
-          ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
-          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
-          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
-  )
-
-  # Verify that the file with the indicated combination exists
-  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
-  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
-  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
-  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
-
-  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
-  readonly NVIDIA_CUDA_URL
-
-  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
-  readonly CUDA_RUNFILE
-
-  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
-    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
-    exit 1
-  fi
-
-  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
-    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
-  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
-    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
-  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
-    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
-  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
-    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
-  fi
-}
-
-function set_cudnn_tarball_url() {
-CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
-CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
-if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
-  # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
-  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
-    # When cuDNN version is greater than or equal to 8.4.1.50 use this format
-    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
-  fi
-  # Use legacy url format with one of the tarball name formats depending on version as above
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
-fi
-if ( version_ge "${CUDA_VERSION}" "12.0" ); then
-  # Use modern url format When cuda version is greater than or equal to 12.0
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
-fi
-readonly CUDNN_TARBALL
-readonly CUDNN_TARBALL_URL
-}
-
-function install_cuda_keyring_pkg() {
-  if ( test -v CUDA_KEYRING_PKG_INSTALLED &&
-       [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi
-  local kr_ver=1.1
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
-    -o "${tmpdir}/cuda-keyring.deb"
-  dpkg -i "${tmpdir}/cuda-keyring.deb"
-  rm -f "${tmpdir}/cuda-keyring.deb"
-  CUDA_KEYRING_PKG_INSTALLED="1"
-}
-
-function uninstall_cuda_keyring_pkg() {
-  apt-get purge -yq cuda-keyring
-  CUDA_KEYRING_PKG_INSTALLED="0"
-}
-
-function install_local_cuda_repo() {
-  if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi
-
-  if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  CUDA_LOCAL_REPO_INSTALLED="1"
-  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
-  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
-  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
-  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
-  readonly DIST_KEYRING_DIR="/var/${pkgname}"
-
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-
-  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-  cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
-
-  if is_ubuntu ; then
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
-      -o /etc/apt/preferences.d/cuda-repository-pin-600
-  fi
-
-  touch "${workdir}/complete/install-local-cuda-repo"
-}
-function uninstall_local_cuda_repo(){
-  apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cuda-repo"
-}
-
-function install_local_cudnn_repo() {
-  if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi
-  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
-  CUDNN_PKG_NAME="${pkgname}"
-  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
-
-  # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
-
-  dpkg -i "${tmpdir}/local-installer.deb"
-
-  rm -f "${tmpdir}/local-installer.deb"
-
-  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
-
-  touch "${workdir}/complete/install-local-cudnn-repo"
-}
-
-function uninstall_local_cudnn_repo() {
-  apt-get purge -yq "${CUDNN_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cudnn-repo"
-}
-
-function install_local_cudnn8_repo() {
-  if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi
-
-  if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
-  elif is_debian ; then cudnn8_shortname="debian11"
-  else return 0 ; fi
-  if   is_cuda12 ; then CUDNN8_CUDA_VER=12.0
-  elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8
-  else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi
-  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"
-
-  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
-  CUDNN8_PKG_NAME="${pkgname}"
-
-  deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_fn="${tmpdir}/${deb_fn}"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
-
-  # cache the cudnn package
-  cache_fetched_package "${local_deb_url}" \
-                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
-                        "${local_deb_fn}"
-
-  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
-  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
-  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
-    mkdir -p "${cudnn_path}"
-    mount -t tmpfs tmpfs "${cudnn_path}"
-  fi
-
-  dpkg -i "${local_deb_fn}"
-
-  rm -f "${local_deb_fn}"
-
-  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  touch "${workdir}/complete/install-local-cudnn8-repo"
-}
-
-function uninstall_local_cudnn8_repo() {
-  apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cudnn8-repo"
-}
-
-function install_nvidia_nccl() {
-  readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
-  readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
-
-  if test -f "${workdir}/complete/nccl" ; then return ; fi
-
-  if is_cuda11 && is_debian12 ; then
-    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
-    return
-  fi
-
-  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
-
-  # https://github.com/NVIDIA/nccl/blob/master/README.md
-  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-  # Fermi:     SM_20,             compute_30
-  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
-  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
-  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
-
-  # The following architectures are suppored by open kernel driver
-  # Volta:     SM_70,SM_72,       compute_70,compute_72
-  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
-
-  # The following architectures are supported by CUDA v11.8+
-  # Ada:       SM_89,             compute_89
-  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
-  # Blackwell: SM_100,            compute_100
-                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
-  if version_ge "${CUDA_VERSION}" "11.8" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
-  fi
-  if version_ge "${CUDA_VERSION}" "12.0" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
-  fi
-
-  mkdir -p "${workdir}"
-  pushd "${workdir}"
-
-  test -d "${workdir}/nccl" || {
-    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
-      | tar xz
-    mv "nccl-${NCCL_VERSION}-1" nccl
-  }
-
-  local build_path
-  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
-                       build_path="nccl/build/pkg/rpm/x86_64" ; fi
-
-  test -d "${workdir}/nccl/build" || {
-    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
-    local local_tarball="${workdir}/${build_tarball}"
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
-
-    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
-    if echo "${output}" | grep -q "${gcs_tarball}" ; then
-      # cache hit - unpack from cache
-      echo "cache hit"
-    else
-      # build and cache
-      pushd nccl
-      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
-      install_build_dependencies
-      if is_debuntu ; then
-        # These packages are required to build .deb packages from source
-        execute_with_retries \
-          apt-get install -y -qq build-essential devscripts debhelper fakeroot
-        export NVCC_GENCODE
-        execute_with_retries make -j$(nproc) pkg.debian.build
-      elif is_rocky ; then
-        # These packages are required to build .rpm packages from source
-        execute_with_retries \
-          dnf -y -q install rpm-build rpmdevtools
-        export NVCC_GENCODE
-        execute_with_retries make -j$(nproc) pkg.redhat.build
-      fi
-      tar czvf "/${local_tarball}" "../${build_path}"
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      rm "${local_tarball}"
-      make clean
-      popd
-    fi
-    gcloud storage cat "${gcs_tarball}" | tar xz
-  }
-
-  if is_debuntu ; then
-    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
-  elif is_rocky ; then
-    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
-  fi
-
-  popd
-  touch "${workdir}/complete/nccl"
-}
-
-function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
-function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
-
-function install_nvidia_cudnn() {
-  if test -f "${workdir}/complete/cudnn" ; then return ; fi
-  local major_version
-  major_version="${CUDNN_VERSION%%.*}"
-  local cudnn_pkg_version
-  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
+function install_nvidia_cudnn() {
+  local major_version
+  major_version="${CUDNN_VERSION%%.*}"
+  local cudnn_pkg_version
+  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
 
   if is_rocky ; then
     if is_cudnn8 ; then
@@ -1319,6 +515,7 @@ function install_nvidia_cudnn() {
     if ge_debian12 && is_src_os ; then
       apt-get -y install nvidia-cudnn
     else
+      local CUDNN="${CUDNN_VERSION%.*}"
       if is_cudnn8 ; then
         install_local_cudnn8_repo
 
@@ -1328,8 +525,6 @@ function install_nvidia_cudnn() {
           apt-get -y install --no-install-recommends \
             "libcudnn8=${cudnn_pkg_version}" \
             "libcudnn8-dev=${cudnn_pkg_version}"
-
-        uninstall_local_cudnn8_repo
 	sync
       elif is_cudnn9 ; then
 	install_cuda_keyring_pkg
@@ -1346,15 +541,118 @@ function install_nvidia_cudnn() {
         echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
       fi
     fi
+  elif is_ubuntu ; then
+    local -a packages
+    packages=(
+      "libcudnn${major_version}=${cudnn_pkg_version}"
+      "libcudnn${major_version}-dev=${cudnn_pkg_version}")
+    execute_with_retries \
+      apt-get install -q -y --no-install-recommends "${packages[*]}"
+    sync
   else
-    echo "Unsupported OS: '${_shortname}'"
+    echo "Unsupported OS: '${OS_NAME}'"
     exit 1
   fi
 
   ldconfig
 
-  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
-  touch "${workdir}/complete/cudnn"
+  echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
+}
+
+CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
+PSN="$(get_metadata_attribute private_secret_name)"
+readonly PSN
+function configure_dkms_certs() {
+  if [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping";
+      return 0
+  fi
+
+  mkdir -p "${CA_TMPDIR}"
+
+  # If the private key exists, verify it
+  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
+    echo "Private key material exists"
+
+    local expected_modulus_md5sum
+    expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum)
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
+      modulus_md5sum="${expected_modulus_md5sum}"
+    else
+      modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"
+    fi
+
+    # Verify that cert md5sum matches expected md5sum
+    if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key modulus"
+    fi
+    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
+
+    # Verify that key md5sum matches expected md5sum
+    if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert modulus"
+    fi
+
+    return
+  fi
+
+
+  # Retrieve cloud secrets keys
+  local sig_priv_secret_name
+  sig_priv_secret_name="${PSN}"
+  local sig_pub_secret_name
+  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
+  local sig_secret_project
+  sig_secret_project="$(get_metadata_attribute secret_project)"
+  local sig_secret_version
+  sig_secret_version="$(get_metadata_attribute secret_version)"
+
+  # If metadata values are not set, do not write mok keys
+  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
+
+  # Write private material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_priv_secret_name}" \
+      | dd status=none of="${CA_TMPDIR}/db.rsa"
+
+  # Write public material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_pub_secret_name}" \
+      | base64 --decode \
+      | dd status=none of="${CA_TMPDIR}/db.der"
+
+  # symlink private key and copy public cert from volatile storage for DKMS
+  if is_ubuntu ; then
+    mkdir -p /var/lib/shim-signed/mok
+    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv
+    cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der
+  else
+    mkdir -p /var/lib/dkms/
+    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
+    cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub
+  fi
+}
+
+function clear_dkms_key {
+  if [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping" >&2
+      return 0
+  fi
+  rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv
+}
+
+function add_contrib_component() {
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-kernel-open-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib"
+
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
+  fi
 }
 
 function add_nonfree_components() {
@@ -1370,93 +668,76 @@ function add_nonfree_components() {
   fi
 }
 
-#
-# Install package signing key and add corresponding repository
-# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
 function add_repo_nvidia_container_toolkit() {
-  local nvctk_root="https://nvidia.github.io/libnvidia-container"
-  local signing_key_url="${nvctk_root}/gpgkey"
-  local repo_data
-
-  if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
-                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
+  if is_debuntu ; then
+      local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+      local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list
+      # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+      test -f "${kr_path}" ||
+        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+          | gpg --dearmor -o "${kr_path}"
 
-  os_add_repo nvidia-container-toolkit \
-              "${signing_key_url}" \
-              "${repo_data}" \
-              "no"
+      test -f "${sources_list_path}" ||
+        curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+          | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \
+          | tee "${sources_list_path}"
+  fi
 }
 
 function add_repo_cuda() {
   if is_debuntu ; then
-    install_cuda_keyring_pkg # 11.7+, 12.0+
+    local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
+    local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
+    echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
+    | sudo tee "${sources_list_path}"
+    curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
+      -o "${kr_path}"
   elif is_rocky ; then
     execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
+    execute_with_retries "dnf clean all"
   fi
 }
 
+readonly uname_r=$(uname -r)
 function build_driver_from_github() {
-  # non-GPL driver will have been built on rocky8
-  if is_rocky8 ; then return 0 ; fi
+  if is_ubuntu ; then
+    mok_key=/var/lib/shim-signed/mok/MOK.priv
+    mok_der=/var/lib/shim-signed/mok/MOK.der
+  else
+    mok_key=/var/lib/dkms/mok.key
+    mok_der=/var/lib/dkms/mok.pub
+  fi
+  workdir=/opt/install-nvidia-driver
+  mkdir -p "${workdir}"
   pushd "${workdir}"
-
   test -d "${workdir}/open-gpu-kernel-modules" || {
-    local tarball_fn="${DRIVER_VERSION}.tar.gz"
+    tarball_fn="${DRIVER_VERSION}.tar.gz"
     curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
       "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
       | tar xz
     mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
   }
+  cd open-gpu-kernel-modules
 
-  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
-  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
-    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
-    local local_tarball="${workdir}/${build_tarball}"
-    local def_dir="${modulus_md5sum:-unsigned}"
-    local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
-
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+  time make -j$(nproc) modules \
+    >  /var/log/open-gpu-kernel-modules-build.log \
+    2> /var/log/open-gpu-kernel-modules-build_error.log
+  sync
 
-    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
-      echo "cache hit"
-    else
-      # build the kernel modules
-      pushd open-gpu-kernel-modules
-      install_build_dependencies
-      if ( is_cuda11 && is_ubuntu22 ) ; then
-        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
-        exit 1
-      fi
-      execute_with_retries make -j$(nproc) modules \
-        >  kernel-open/build.log \
-        2> kernel-open/build_error.log
-      # Sign kernel modules
-      if [[ -n "${PSN}" ]]; then
-        configure_dkms_certs
-        for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do
-          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
-          "${mok_key}" \
-          "${mok_der}" \
-          "${module}"
-        done
-	clear_dkms_key
-      fi
-      make modules_install \
-        >>  kernel-open/build.log \
-        2>> kernel-open/build_error.log
-      # Collect build logs and installed binaries
-      tar czvf "${local_tarball}" \
-        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
-        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      rm "${local_tarball}"
-      make clean
-      popd
-    fi
-    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
-    depmod -a
-  }
+  if [[ -n "${PSN}" ]]; then
+    #configure_dkms_certs
+    for module in $(find kernel-open -name '*.ko'); do
+      "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
+      "${mok_key}" \
+      "${mok_der}" \
+      "${module}"
+    done
+    #clear_dkms_key
+  fi
 
+  make modules_install \
+    >> /var/log/open-gpu-kernel-modules-build.log \
+    2>> /var/log/open-gpu-kernel-modules-build_error.log
   popd
 }
 
@@ -1479,10 +760,12 @@ function build_driver_from_packages() {
     add_contrib_component
     apt-get update -qq
     execute_with_retries apt-get install -y -qq --no-install-recommends dkms
+    #configure_dkms_certs
     execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
     sync
 
   elif is_rocky ; then
+    #configure_dkms_certs
     if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
       echo "nvidia-driver:${DRIVER}-dkms installed successfully"
     else
@@ -1490,108 +773,26 @@ function build_driver_from_packages() {
     fi
     sync
   fi
+  #clear_dkms_key
 }
 
 function install_nvidia_userspace_runfile() {
-  # Parameters for NVIDIA-provided Debian GPU driver
-  readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-
-  readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
-
-  USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
-  readonly USERSPACE_FILENAME
-
-  # This .run file contains NV's OpenGL implementation as well as
-  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
-  # including glib (https://docs.gtk.org/glib/), and what appears to
-  # be a copy of the source from the kernel-open directory of for
-  # example DRIVER_VERSION=560.35.03
-  #
-  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
-  #
-  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
-  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
-  if test -f "${workdir}/complete/userspace" ; then return ; fi
-  local local_fn="${tmpdir}/userspace.run"
-
-  cache_fetched_package "${USERSPACE_URL}" \
-                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
-                        "${local_fn}"
-
-  local runfile_args
-  runfile_args=""
-  local cache_hit="0"
-  local local_tarball
-
-  if is_rocky8 ; then
-    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
-    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
-      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
-      local_tarball="${workdir}/${build_tarball}"
-      local def_dir="${modulus_md5sum:-unsigned}"
-      local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
-
-      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
-
-      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
-        cache_hit="1"
-        runfile_args="--no-kernel-modules"
-        echo "cache hit"
-      else
-        install_build_dependencies
-        configure_dkms_certs
-        local signing_options
-        signing_options=""
-        if [[ -n "${PSN}" ]]; then
-          signing_options="--module-signing-hash sha256 \
-          --module-signing-x509-hash sha256 \
-          --module-signing-secret-key \"${mok_key}\" \
-          --module-signing-public-key \"${mok_der}\" \
-          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
-          "
-        fi
-        runfile_args="--no-dkms ${signing_options}"
-      fi
-    }
-  else
-    runfile_args="--no-kernel-modules"
-  fi
-
-  execute_with_retries bash "${local_fn}" -e -q \
-    ${runfile_args} \
-    --ui=none \
-    --install-libglvnd \
-    --tmpdir="${tmpdir}"
-
-  if is_rocky8 ; then
-    if [[ "${cache_hit}" == "1" ]] ; then
-      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
-      depmod -a
-    else
-      clear_dkms_key
-      tar czvf "${local_tarball}" \
-        /var/log/nvidia-installer.log \
-        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-    fi
-  fi
-
-  rm -f "${local_fn}"
-  touch "${workdir}/complete/userspace"
+  if test -f "${tmpdir}/userspace-complete" ; then return ; fi
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${USERSPACE_URL}" -o "${tmpdir}/userspace.run"
+  execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}"
+  rm -f "${tmpdir}/userspace.run"
+  touch "${tmpdir}/userspace-complete"
   sync
 }
 
 function install_cuda_runfile() {
-  if test -f "${workdir}/complete/cuda" ; then return ; fi
-  local local_fn="${tmpdir}/cuda.run"
-
-  cache_fetched_package "${NVIDIA_CUDA_URL}" \
-			"${pkg_bucket}/${CUDA_RUNFILE}" \
-                        "${local_fn}"
-
-  execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
-  rm -f "${local_fn}"
-  touch "${workdir}/complete/cuda"
+  if test -f "${tmpdir}/cuda-complete" ; then return ; fi
+  time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run"
+  execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}"
+  rm -f "${tmpdir}/cuda.run"
+  touch "${tmpdir}/cuda-complete"
   sync
 }
 
@@ -1607,11 +808,12 @@ function install_cuda_toolkit() {
   if is_debuntu ; then
 #    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
     execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
+    sync
   elif is_rocky ; then
     # rocky9: cuda-11-[7,8], cuda-12-[1..6]
     execute_with_retries dnf -y -q install "${cudatk_package}"
+    sync
   fi
-  sync
 }
 
 function load_kernel_module() {
@@ -1628,120 +830,57 @@ function load_kernel_module() {
   # TODO: if peermem is available, also modprobe nvidia-peermem
 }
 
-function install_cuda(){
-  if test -f "${workdir}/complete/cuda-repo" ; then return ; fi
-
-  if ( ge_debian12 && is_src_os ) ; then
-    echo "installed with the driver on ${_shortname}"
-    return 0
-  fi
-
-  # The OS package distributions are unreliable
-  install_cuda_runfile
-
-  # Includes CUDA packages
-  add_repo_cuda
-
-  touch "${workdir}/complete/cuda-repo"
-}
-
-function install_nvidia_container_toolkit() {
-  local container_runtime_default
-    if command -v docker     ; then container_runtime_default='docker'
-  elif command -v containerd ; then container_runtime_default='containerd'
-  elif command -v crio       ; then container_runtime_default='crio'
-                               else container_runtime_default='' ; fi
-  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
-
-  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi
-
-  add_repo_nvidia_container_toolkit
-  if is_debuntu ; then
-    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
-    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
-  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
-  systemctl restart "${CONTAINER_RUNTIME}"
-}
-
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
-  if test -f "${workdir}/complete/gpu-driver" ; then return ; fi
-
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
+    add_repo_nvidia_container_toolkit
     apt-get update -qq
+    #configure_dkms_certs
     apt-get -yq install \
-        dkms \
-        nvidia-open-kernel-dkms \
-        nvidia-open-kernel-support \
-        nvidia-smi \
-        libglvnd0 \
-        libcuda1
-    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
-    return 0
-  fi
+          nvidia-container-toolkit \
+          dkms \
+          nvidia-open-kernel-dkms \
+          nvidia-open-kernel-support \
+          nvidia-smi \
+          libglvnd0 \
+          libcuda1
+    #clear_dkms_key
+  elif ( le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ) ; then
 
-  # OS driver packages do not produce reliable driver ; use runfile
-  install_nvidia_userspace_runfile
+    install_nvidia_userspace_runfile
 
-  build_driver_from_github
-
-  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  touch "${workdir}/complete/gpu-driver"
-}
+    build_driver_from_github
 
-function install_ops_agent(){
-  if test -f "${workdir}/complete/ops-agent" ; then return ; fi
+    install_cuda_runfile
+  elif is_debuntu ; then
+    install_cuda_keyring_pkg
 
-  mkdir -p /opt/google
-  cd /opt/google
-  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
-  curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
-  execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
+    build_driver_from_packages
 
-  touch "${workdir}/complete/ops-agent"
-}
+    install_cuda_toolkit
+  elif is_rocky ; then
+    add_repo_cuda
 
-# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
-function install_gpu_monitoring_agent() {
-  download_gpu_monitoring_agent
-  install_gpu_monitoring_agent_dependency
-  start_gpu_monitoring_agent_service
-}
+    build_driver_from_packages
 
-function download_gpu_monitoring_agent(){
-  if is_rocky ; then
-    execute_with_retries "dnf -y -q install git"
+    install_cuda_toolkit
   else
-    execute_with_retries "apt-get install git -y"
+    echo "Unsupported OS: '${OS_NAME}'"
+    exit 1
+  fi
+  ldconfig
+  if is_src_os ; then
+    echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully"
+  else
+    echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
   fi
-  mkdir -p /opt/google
-  chmod 777 /opt/google
-  cd /opt/google
-  test -d compute-gpu-monitoring || \
-    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
-}
-
-function install_gpu_monitoring_agent_dependency(){
-  cd /opt/google/compute-gpu-monitoring/linux
-  python3 -m venv venv
-  venv/bin/pip install wheel
-  venv/bin/pip install -Ur requirements.txt
-}
-
-function start_gpu_monitoring_agent_service(){
-  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
-  systemctl daemon-reload
-  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service
 }
 
 # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
 function install_gpu_agent() {
-  # Stackdriver GPU agent parameters
-#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
-  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
-  if ( ! command -v pip && is_debuntu ) ; then
-    execute_with_retries "apt-get install -y -qq python3-pip"
+  if ! command -v pip; then
+    execute_with_retries "apt-get install -y -qq python-pip"
   fi
   local install_dir=/opt/gpu-utilization-agent
   mkdir -p "${install_dir}"
@@ -1751,13 +890,7 @@ function install_gpu_agent() {
     "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
     | sed -e 's/-u --format=/--format=/' \
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
-  local venv="${install_dir}/venv"
-  python3 -m venv "${venv}"
-(
-  source "${venv}/bin/activate"
-  python3 -m pip install --upgrade pip
-  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
-)
+  execute_with_retries pip install -r "${install_dir}/requirements.txt"
   sync
 
   # Generate GPU service.
@@ -1768,7 +901,7 @@ Description=GPU Utilization Metric Agent
 [Service]
 Type=simple
 PIDFile=/run/gpu_agent.pid
-ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
+ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"'
 User=root
 Group=root
 WorkingDirectory=/
@@ -1783,57 +916,75 @@ EOF
   systemctl --no-reload --now enable gpu-utilization-agent.service
 }
 
-function configure_gpu_exclusive_mode() {
-  # only run this function when spark < 3.0
-  if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
-  # include exclusive mode on GPU
-  nvsmi -c EXCLUSIVE_PROCESS
-  clear_nvsmi_cache
-}
-
-function fetch_mig_scripts() {
-  mkdir -p /usr/local/yarn-mig-scripts
-  chmod 755 /usr/local/yarn-mig-scripts
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
-  chmod 755 /usr/local/yarn-mig-scripts/*
+function set_hadoop_property() {
+  local -r config_file=$1
+  local -r property=$2
+  local -r value=$3
+  "${bdcfg}" set_property \
+    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
+    --name "${property}" --value "${value}" \
+    --clobber
 }
 
-function install_spark_rapids() {
-  # Update SPARK RAPIDS config
-  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
-  local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
+function configure_yarn() {
+  if [[ -d "${HADOOP_CONF_DIR}" && ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
+    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
+  fi
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
 
-  # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
-  local -r scala_ver="2.12"
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
 
-  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
-    local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
-  fi
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+}
 
-  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
-  readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
+# This configuration should be applied only if GPU is attached to the node
+function configure_yarn_nodemanager() {
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.container-executor.class' \
+    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
 
-  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
-  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
-  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
+  # Fix local dirs access permissions
+  local yarn_local_dirs=()
 
-  local jar_basename
+  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
+    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
+    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
 
-  jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
-  cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-                        "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-                        "/usr/lib/spark/jars/${jar_basename}"
+  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
+    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
+  fi
+}
 
-  jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
-  cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-                        "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-                        "/usr/lib/spark/jars/${jar_basename}"
+function configure_gpu_exclusive_mode() {
+  # check if running spark 3, if not, enable GPU exclusive mode
+  local spark_version
+  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
+  if [[ ${spark_version} != 3.* ]]; then
+    # include exclusive mode on GPU
+    nvsmi -c EXCLUSIVE_PROCESS
+  fi
+}
 
-  jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
-  cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
-                        "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
-                        "/usr/lib/spark/jars/${jar_basename}"
+function fetch_mig_scripts() {
+  mkdir -p /usr/local/yarn-mig-scripts
+  sudo chmod 755 /usr/local/yarn-mig-scripts
+  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
+  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
+  sudo chmod 755 /usr/local/yarn-mig-scripts/*
 }
 
 function configure_gpu_script() {
@@ -1863,7 +1014,6 @@ function configure_gpu_script() {
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
 
 ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
 
@@ -1872,51 +1022,10 @@ EOF
 
   chmod a+rx "${gpus_resources_script}"
 
-  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
-
-  local executor_cores
-  executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
-  local executor_memory
-  executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
-  local task_cpus=2
-  local gpu_amount
-
-  # The current setting of spark.task.resource.gpu.amount (0.333) is
-  # not ideal to get the best performance from the RAPIDS Accelerator
-  # plugin. It's recommended to be 1/{executor core count} unless you
-  # have a special use case.
-#  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
-  gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
-
-# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
-
-  cat >>"${spark_defaults_conf}" <<EOF
-###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
-# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
-# query explain output won't show GPU operator, if the user has doubts
-# they can uncomment the line before seeing the GPU plan explain;
-# having AQE enabled gives user the best performance.
-spark.executor.resource.gpu.amount=${gpu_count}
-spark.plugins=com.nvidia.spark.SQLPlugin
-spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
-spark.executor.cores=${executor_cores}
-spark.executor.memory=${executor_memory_gb}G
-spark.dynamicAllocation.enabled=false
-# please update this config according to your application
-spark.task.resource.gpu.amount=${gpu_amount}
-spark.task.cpus=2
-spark.yarn.unmanagedAM.enabled=false
-###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
-EOF
-}
-
-function configure_yarn_nodemanager_gpu() {
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
-  configure_yarn_nodemanager
+  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
+  if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then
+    echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}"
+  fi
 }
 
 function configure_gpu_isolation() {
@@ -1949,12 +1058,12 @@ EOF
 
 function nvsmi() {
   local nvsmi="/usr/bin/nvidia-smi"
-  if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
+  if   [[ "${nvsmi_works}" == "1" ]] ; then echo "nvidia-smi is working" >&2
   elif [[ ! -f "${nvsmi}" ]]         ; then echo "nvidia-smi not installed" >&2 ; return 0
   elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
   else nvsmi_works="1" ; fi
 
-  if test -v 1 && [[ "$1" == "-L" ]] ; then
+  if [[ "$1" == "-L" ]] ; then
     local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
     if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
     else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi
@@ -1965,35 +1074,14 @@ function nvsmi() {
   "${nvsmi}" $*
 }
 
-function clear_nvsmi_cache() {
-  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then
-    rm "${nvsmi_query_xml}"
-  fi
-}
-
-function query_nvsmi() {
-  if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi
-  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
-  nvsmi -q -x --dtd > "${nvsmi_query_xml}"
-}
-
-function install_build_dependencies() {
-  if test -f "${workdir}/complete/build-dependencies" ; then return ; fi
-
+function install_dependencies() {
   if is_debuntu ; then
-    if is_ubuntu22 && is_cuda12 ; then
-      # On ubuntu22, the default compiler does not build some kernel module versions
-      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
-      execute_with_retries apt-get install -y -qq gcc-12
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
-      update-alternatives --set gcc /usr/bin/gcc-12
-    fi
-
+    execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen
   elif is_rocky ; then
-    execute_with_retries dnf -y -q install gcc
+    execute_with_retries dnf -y -q install pciutils gcc screen
 
     local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
+    local install_log="${tmpdir}/install.log"
     set +e
     eval "${dnf_cmd}" > "${install_log}" 2>&1
     local retval="$?"
@@ -2016,259 +1104,364 @@ function install_build_dependencies() {
 
     execute_with_retries "${dnf_cmd}"
   fi
-  touch "${workdir}/complete/build-dependencies"
 }
 
-function prepare_gpu_env(){
-  set +e
-  gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
-  set -e
-  echo "gpu_count=[${gpu_count}]"
-  nvsmi_works="0"
-  nvsmi_query_xml="${tmpdir}/nvsmi.xml"
-  xmllint="/opt/conda/miniconda3/bin/xmllint"
-  NVIDIA_SMI_PATH='/usr/bin'
-  MIG_MAJOR_CAPS=0
-  IS_MIG_ENABLED=0
-  CUDNN_PKG_NAME=""
-  CUDNN8_PKG_NAME=""
-  CUDA_LOCAL_REPO_INSTALLED="0"
+function main() {
+  # This configuration should be run on all nodes
+  # regardless if they have attached GPUs
+  configure_yarn
 
-  # Whether to install NVIDIA-provided or OS-provided GPU driver
-  GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
-  readonly GPU_DRIVER_PROVIDER
+  # Detect NVIDIA GPU
+  if (lspci | grep -q NVIDIA); then
+    # if this is called without the MIG script then the drivers are not installed
+    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
+    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
+    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
 
-  # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
-  INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
-  readonly INSTALL_GPU_AGENT
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
+          fetch_mig_scripts
+        fi
+      fi
+    fi
 
-  # Verify SPARK compatability
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-  readonly RAPIDS_RUNTIME
+    # if mig is enabled drivers would have already been installed
+    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+      install_nvidia_gpu_driver
 
-  # determine whether we have nvidia-smi installed and working
-  nvsmi
+      load_kernel_module
 
-  set_cuda_version
-  set_driver_version
-  set_cuda_repo_shortname
-  set_nv_urls
-  set_cuda_runfile_url
-  set_cudnn_version
-  set_cudnn_tarball_url
+      if [[ -n ${CUDNN_VERSION} ]]; then
+        install_nvidia_nccl
+        install_nvidia_cudnn
+      fi
+      #Install GPU metrics collection in Stackdriver if needed
+      if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+        install_gpu_agent
+        echo 'GPU metrics agent successfully deployed.'
+      else
+        echo 'GPU metrics agent will not be installed.'
+      fi
 
-  if   is_cuda11 ; then gcc_ver="11"
-  elif is_cuda12 ; then gcc_ver="12" ; fi
-}
+      # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
+      for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
+        rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+      done
 
-# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
-# Users should run apt-mark unhold before they wish to upgrade these packages
-function hold_nvidia_packages() {
-  if ! is_debuntu ; then return ; fi
+      MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")"
+      if test -n "$(nvsmi -L)" ; then
+	# cache the result of the gpu query
+        ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
+        echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
+      fi
+      NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
+      if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+        # enable MIG on every GPU
+	for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do
+	  nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+	done
+
+        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+        MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
+        fetch_mig_scripts
+      else
+        configure_gpu_exclusive_mode
+      fi
+    fi
 
-  apt-mark hold nvidia-*
-  apt-mark hold libnvidia-*
-  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
-    apt-mark hold xserver-xorg-video-nvidia*
+    configure_yarn_nodemanager
+    configure_gpu_script
+    configure_gpu_isolation
+  elif [[ "${ROLE}" == "Master" ]]; then
+    configure_yarn_nodemanager
+    configure_gpu_script
   fi
-}
-
-function delete_mig_instances() (
-  # delete all instances
-  set +e
-  nvidia-smi mig -dci
-
-  case "${?}" in
-    "0" ) echo "compute instances deleted"            ;;
-    "2" ) echo "invalid argument"                     ;;
-    "6" ) echo "No compute instances found to delete" ;;
-    *   ) echo "unrecognized return code"             ;;
-  esac
-
-  nvidia-smi mig -dgi
-  case "${?}" in
-    "0" ) echo "compute instances deleted"        ;;
-    "2" ) echo "invalid argument"                 ;;
-    "6" ) echo "No GPU instances found to delete" ;;
-    *   ) echo "unrecognized return code"         ;;
-  esac
-)
 
-# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles
-function configure_mig_cgi() {
-  delete_mig_instances
-  META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')"
-  if test -n "${META_MIG_CGI_VALUE}"; then
-    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
-  else
-    # https://pci-ids.ucw.cz/v2.2/pci.ids
-    local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)"
-    if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then
-      # run the following command to list placement profiles
-      # nvidia-smi mig -lgipp
-      #
-      # This is the result when using H100 instances on 20241220
-      # GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
-      # GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
-      # GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
-      # GPU  0 Profile ID 14 Placements: {0,2,4}:2
-      # GPU  0 Profile ID  9 Placements: {0,4}:4
-      # GPU  0 Profile ID  5 Placement : {0}:4
-      # GPU  0 Profile ID  0 Placement : {0}:8
-
-      # For H100 3D controllers, consider profile 19, 7x1G instances
-      nvidia-smi mig -cgi 9,9 -C
-    elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then
-      # Dataproc only supports H100s right now ; split in 2 if not specified
-      # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
-      nvidia-smi mig -cgi 9,9 -C
-    else
-      echo "unrecognized 3D controller"
-    fi
+  # Restart YARN services if they are running already
+  if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then
+    systemctl restart hadoop-yarn-resourcemanager.service
+  fi
+  if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then
+    systemctl restart hadoop-yarn-nodemanager.service
   fi
-  clear_nvsmi_cache
 }
 
-function enable_mig() {
-  if test -f "${workdir}/complete/enable-mig" ; then return ; fi
-
-  # Start persistenced if it's not already running
-  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
-  for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
-    # Write an ascii zero to the numa node indicator
-    echo "0" | dd of="${f}" status=none
-  done
-  time nvsmi --gpu-reset # 30s
-  nvsmi -mig 1
-  clear_nvsmi_cache
+function clean_up_sources_lists() {
+  #
+  # bigtop (primary)
+  #
+  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
 
-  touch "${workdir}/complete/enable-mig"
-}
+  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
+    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
 
-function enable_and_configure_mig() {
-  # default MIG to on when this script is used
-  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")
+    local regional_bigtop_repo_uri
+    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
+      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
+      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
+      cut -d ' ' -f 2 |
+      head -1)
 
-  if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi
+    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+    else
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+    fi
 
-  enable_mig
-  query_nvsmi
-  local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
-  mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")"
+    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
+    rm -f "${bigtop_kr_path}"
+    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
 
-  if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs.  Failing" ; exit 1 ; fi
-  if ! (echo "${mig_mode_current}" | grep Enabled)                ; then echo "MIG is configured but NOT enabled.  Failing" ; exit 1 ; fi
+    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  fi
 
-  echo "MIG is fully enabled"
-  configure_mig_cgi
-}
+  #
+  # adoptium
+  #
+  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
+  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
+  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
+  rm -f "${adoptium_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+   | gpg --dearmor -o "${adoptium_kr_path}"
+  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
+   > /etc/apt/sources.list.d/adoptium.list
 
-function setup_gpu_yarn() {
-  # This configuration should be run on all nodes
-  # regardless if they have attached GPUs
-  configure_yarn_resources
 
-  # When there is no GPU, but the installer is executing on a master node:
-  if [[ "${gpu_count}" == "0" ]] ; then
-    if [[ "${ROLE}" == "Master" ]]; then
-      configure_yarn_nodemanager
-    fi
-    return 0
-  fi
+  #
+  # docker
+  #
+  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
+  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
+  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
 
-  if [[ "${nvsmi_works}" == "1" ]] ; then
-    # if this is called without the MIG script then the drivers are not installed
-    query_nvsmi
-    local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
-    set +e
-    migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
-    set -e
-    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
+  rm -f "${docker_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+    | gpg --dearmor -o "${docker_kr_path}"
+  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
+    > ${docker_repo_file}
 
-    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-        if (echo "${migquery_result}" | grep Enabled); then
-          IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
-        fi
+  #
+  # google cloud + logging/monitoring
+  #
+  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
+    rm -f /usr/share/keyrings/cloud.google.gpg
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
+      list_file="/etc/apt/sources.list.d/${list}.list"
+      if [[ -f "${list_file}" ]]; then
+        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
       fi
-    fi
+    done
   fi
 
-  # if mig is enabled drivers would have already been installed
-  if [[ $IS_MIG_ENABLED -eq 0 ]]; then
-    install_nvidia_gpu_driver
-    install_cuda
-    load_kernel_module
-
-    #Install GPU metrics collection in Stackdriver if needed
-    if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-      install_gpu_agent
-#      install_gpu_monitoring_agent
-      echo 'GPU metrics agent successfully deployed.'
-    else
-      echo 'GPU metrics agent has not been installed.'
-    fi
-    configure_gpu_exclusive_mode
+  #
+  # cran-r
+  #
+  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
+    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
+    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
+    rm -f /usr/share/keyrings/cran-r.gpg
+    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
+    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
+  fi
+
+  #
+  # mysql
+  #
+  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
+    rm -f /usr/share/keyrings/mysql.gpg
+    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
+    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
   fi
 
-  install_nvidia_container_toolkit
-  configure_yarn_nodemanager_gpu
-  configure_gpu_script
-  configure_gpu_isolation
+  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
+
 }
 
-function gpu_exit_handler() {
+function exit_handler() {
+  set +ex
+  echo "Exit handler invoked"
+
+  # Purge private key material until next grant
+  clear_dkms_key
+
+  # Clear pip cache
+  pip cache purge || echo "unable to purge pip cache"
+
+  # If system memory was sufficient to mount memory-backed filesystems
   if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    for shmdir in /var/cudnn-local ; do
-      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
+      if grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ; then
         umount -f ${shmdir}
       fi
     done
+
+    # restart services stopped during preparation stage
+    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
   fi
-  hold_nvidia_packages
-}
 
+  if is_debuntu ; then
+    # Clean up OS package cache
+    apt-get -y -qq clean
+    apt-get -y -qq autoremove
+    # re-hold systemd package
+    if ge_debian12 ; then
+    apt-mark hold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
 
-function main() {
-  setup_gpu_yarn
+  # print disk usage statistics for large components
+  if is_ubuntu ; then
+    du -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3 | sort -h
+  elif is_debian ; then
+    du -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3 | sort -h
+  else
+    du -hs \
+      /var/lib/docker \
+      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
+      /usr/lib64/google-cloud-sdk \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3
+  fi
+
+  # Process disk usage logs from installation period
+  rm -f /run/keep-running-df
+  sync
+  sleep 5.01s
+  # compute maximum size of disk during installation
+  # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+  df / | tee -a "/run/disk-usage.log"
 
-  echo "yarn setup complete"
+  perl -e '@siz=( sort { $b <=> $a } # numeric sort, largest first
+                   map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } <STDIN> );
+$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+print( "    samples-taken: ", scalar @siz, $/,
+       "maximum-disk-used: $max", $/,
+       "minimum-disk-used: $min", $/,
+       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
 
-  if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then
-    install_nvidia_nccl
-    install_nvidia_cudnn
-  fi
+  echo "exit_handler has completed"
 
-  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
-    install_spark_rapids
-    configure_gpu_script
-    echo "RAPIDS initialized with Spark runtime"
-  elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
-    # we are not currently tooled for installing dask in this action.
-    echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh"
-  else
-    echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    dd if=/dev/zero of=/zero
+    sync
+    sleep 3s
+    rm -f /zero
   fi
 
-  echo "main complete"
   return 0
 }
 
-function exit_handler() {
-  gpu_exit_handler
-  pip_exit_handler
-  common_exit_handler
-  return 0
+function set_proxy(){
+  export METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy)"
+  export http_proxy="${METADATA_HTTP_PROXY}"
+  export https_proxy="${METADATA_HTTP_PROXY}"
+  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
+  export no_proxy=metadata.google.internal,169.254.169.254
+  export NO_PROXY=metadata.google.internal,169.254.169.254
+}
+
+function mount_ramdisk(){
+  local free_mem
+  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
+  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+
+  # Write to a ramdisk instead of churning the persistent disk
+
+  tmpdir="/mnt/shm"
+  mkdir -p "${tmpdir}"
+  mount -t tmpfs tmpfs "${tmpdir}"
+
+  # Clear pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
+  pip cache purge || echo "unable to purge pip cache"
+
+  # Download pip packages to tmpfs
+  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+
+  # Download OS packages to tmpfs
+  if is_debuntu ; then
+    mount -t tmpfs tmpfs /var/cache/apt/archives
+  else
+    mount -t tmpfs tmpfs /var/cache/dnf
+  fi
 }
 
 function prepare_to_install(){
-  prepare_common_env
-  prepare_pip_env
-  prepare_gpu_env
+  nvsmi_works="0"
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  tmpdir=/tmp/
+  if ! is_debuntu && ! is_rocky ; then
+    echo "Unsupported OS: '$(os_name)'"
+    exit 1
+  fi
+
+  repair_old_backports
+
+  export DEBIAN_FRONTEND=noninteractive
+
   trap exit_handler EXIT
+  mount_ramdisk
+  install_log="${tmpdir}/install.log"
+
+  set_proxy
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    sleep 5s
+    apt-get -y -qq autoremove
+    if ge_debian12 ; then
+    apt-mark unhold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
+    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
+  ) fi
+
+  configure_dkms_certs
+
+  install_dependencies
+
+  # Monitor disk usage in a screen session
+  df / > "/run/disk-usage.log"
+  touch "/run/keep-running-df"
+  screen -d -m -US keep-running-df \
+    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
 }
 
 prepare_to_install
diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 1f3328eaa..f8438915f 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -4,27 +4,27 @@
 from absl.testing import absltest
 from absl.testing import parameterized
 
-import unittest
-
 from integration_tests.dataproc_test_case import DataprocTestCase
 
-DEFAULT_TIMEOUT = 15  # minutes
-DEFAULT_CUDA_VERSION = "12.4"
 
 class NvidiaGpuDriverTestCase(DataprocTestCase):
   COMPONENT = "gpu"
   INIT_ACTIONS = ["gpu/install_gpu_driver.sh"]
   GPU_L4   = "type=nvidia-l4"
   GPU_T4   = "type=nvidia-tesla-t4"
+  GPU_V100 = "type=nvidia-tesla-v100" # not available in us-central1-a
+  GPU_A100 = "type=nvidia-tesla-a100"
   GPU_H100 = "type=nvidia-h100-80gb,count=8"
 
   def verify_instance(self, name):
     # Verify that nvidia-smi works
-    import random
-    # Many failed nvidia-smi attempts have been caused by impatience and temporal collisions
-    time.sleep( 3 + random.randint(1, 30) )
+    time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience
     self.assert_instance_command(name, "nvidia-smi", 1)
 
+  def verify_pyspark(self, name):
+    # Verify that pyspark works
+    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
+
   def verify_mig_instance(self, name):
     self.assert_instance_command(name,
         "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
@@ -41,27 +41,6 @@ def verify_instance_nvcc(self, name, cuda_version):
     self.assert_instance_command(
         name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) )
 
-  def verify_instance_pyspark(self, name):
-    # Verify that pyspark works
-    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
-
-  def verify_instance_cuda_version(self, name, cuda_version):
-    self.assert_instance_command(
-        name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) )
-
-  def verify_instance_driver_version(self, name, driver_version):
-    self.assert_instance_command(
-        name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) )
-
-  def verify_pyspark(self):
-    self.assert_dataproc_job(
-      self.getClusterName(),
-      "pyspark",
-      """--properties="spark.executor.resource.gpu.amount=1" \
-         --properties="spark.task.resource.gpu.amount=0.01" \
-         '{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO)
-    )
-
   def verify_instance_spark(self):
     self.assert_dataproc_job(
       self.getClusterName(),
@@ -77,22 +56,6 @@ def verify_instance_spark(self):
       +   "spark.yarn.unmanagedAM.enabled=false"
     )
 
-  def verify_driver_signature(self, name):
-    cert_path='/var/lib/dkms/mok.pub'
-    if self.getImageOs() == 'ubuntu':
-      cert_path='/var/lib/shim-signed/mok/MOK.der'
-
-    cert_verification_cmd = """
-perl -Mv5.10 -e '
-my $cert = ( qx{openssl x509 -inform DER -in {} -text}
-             =~ /Serial Number:.*? +(.+?)\s*$/ms );
-my $kmod = ( qx{modinfo nvidia}
-             =~ /^sig_key:\s+(\S+)/ms );
-exit 1 unless $cert eq lc $kmod
-'
-"""
-    self.assert_instance_command( name, cert_verification_cmd.format(cert_path) )
-
   @parameterized.parameters(
       ("SINGLE",   ["m"], GPU_T4, None, None),
 #      ("STANDARD", ["m"], GPU_T4, None, None),
@@ -101,14 +64,8 @@ def verify_driver_signature(self, name):
   def test_install_gpu_default_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
-    self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
-
-    if configuration == 'SINGLE' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
-      self.skipTest("known to fail")
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
     metadata = None
     if driver_provider is not None:
@@ -116,18 +73,17 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-standard-32",
+        machine_type="n1-highmem-8",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=90, # This cluster is sized and timed appropriately to build the kernel driver and nccl
-        boot_disk_size="60GB")
+        timeout_in_minutes=90,
+        boot_disk_size="50GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
-      self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION)
-      self.verify_instance_pyspark(machine_name)
-    self.verify_pyspark()
+      if ( self.getImageOs() != 'rocky' ) or ( configuration != 'SINGLE' ) or ( configuration == 'SINGLE' and self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1") ):
+        self.verify_pyspark(machine_name)
 
   @parameterized.parameters(
       ("SINGLE", ["m"], GPU_T4, None, None),
@@ -135,16 +91,13 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
   def test_install_gpu_without_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
+
     self.skipTest("No need to regularly test not installing the agent")
 
-    metadata = "install-gpu-agent=false"
-    if configuration == 'SINGLE' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
-      self.skipTest("known to fail")
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
+    metadata = "install-gpu-agent=false"
     if driver_provider is not None:
       metadata += ",gpu-driver-provider={}".format(driver_provider)
     self.createCluster(
@@ -154,27 +107,22 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=90,
+        timeout_in_minutes=30,
         boot_disk_size="50GB")
     for machine_suffix in machine_suffixes:
-      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-    self.verify_pyspark()
+      self.verify_instance("{}-{}".format(self.getClusterName(),
+                                          machine_suffix))
+
   @parameterized.parameters(
-      ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
+      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
 #      ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"),
 #      ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"),
   )
   def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider):
-    self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
-
-    if configuration == 'KERBEROS' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # ('KERBEROS fails with image version <= 2.1')
-      unittest.expectedFailure(self)
-      self.skipTest("known to fail")
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
@@ -186,47 +134,40 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=90,
+        timeout_in_minutes=30,
         boot_disk_size="50GB",
         scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
-      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-      self.verify_instance_gpu_agent(machine_name)
-    self.verify_pyspark()
+      self.verify_instance("{}-{}".format(self.getClusterName(),
+                                          machine_suffix))
+      self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),
+                                                    machine_suffix))
 
   @parameterized.parameters(
-        ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
-#        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
+#       ("SINGLE", ["m"],               GPU_T4, None,   "12.0"),
+        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
       ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-      ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"),
+#     ("STANDARD", ["w-0", "w-1"],      None,   GPU_T4, "11.8"),
   )
   def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if configuration == 'KERBEROS' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # ('KERBEROS fails with image version <= 2.1')
-      unittest.expectedFailure(self)
-      self.skipTest("known to fail")
+    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
+    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
+      self.skipTest("CUDA == 12.0 not supported on debian 12")
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
+      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
+    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
-
-    if configuration == 'SINGLE' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
-      self.skipTest("known to fail")
-
+      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -236,41 +177,40 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=90,
+        timeout_in_minutes=30,
         boot_disk_size="50GB")
-
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
       self.verify_instance_nvcc(machine_name, cuda_version)
-      self.verify_instance_pyspark(machine_name)
-    self.verify_pyspark()
 
   @parameterized.parameters(
-      ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "11.8"),
-#      ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.0"),
-      ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.4"),
+      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"),
+#      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"),
+      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"),
   )
   def test_install_gpu_with_mig(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider, cuda_version):
-    # Operation [projects/.../regions/.../operations/...] failed:
-    # Invalid value for field 'resource.machineType': \
-    # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \
-    # 'machineTypes/a3-highgpu-8g'. \
-    # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature..
-    # ('This use case not thoroughly tested')
-    unittest.expectedFailure(self)
-    self.skipTest("known to fail")
-
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
+
+    self.skipTest("Test is known to fail.  Skipping so that we can exercise others")
+
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+
+    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
+    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
+      self.skipTest("CUDA == 12.0 not supported on debian 12")
+
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
+      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
+    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
 
     metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version)
 
@@ -278,11 +218,11 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
         configuration,
         self.INIT_ACTIONS,
         master_machine_type="a3-highgpu-8g",
-        worker_machine_type="a3-highgpu-8g",
+        worker_machine_type="a2-highgpu-2g",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=90,
+        timeout_in_minutes=30,
         boot_disk_size="50GB",
         startup_script="gpu/mig.sh")
 
@@ -296,13 +236,12 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
   )
   def test_gpu_allocation(self, configuration, master_accelerator,
                           worker_accelerator, driver_provider):
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if configuration == 'SINGLE' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
-      self.skipTest("known to fail")
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
+    and configuration == 'SINGLE':
+      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
 
     metadata = None
     if driver_provider is not None:
@@ -316,9 +255,9 @@ def test_gpu_allocation(self, configuration, master_accelerator,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         boot_disk_size="50GB",
-        timeout_in_minutes=90)
+        timeout_in_minutes=30)
 
-    self.verify_pyspark()
+    self.verify_instance_spark()
 
   @parameterized.parameters(
     ("SINGLE", ["m"], GPU_T4, None, "11.8"),
@@ -331,83 +270,28 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
-    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
-          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
-    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
+    and configuration == 'SINGLE':
+      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty")
 
-    if configuration == 'SINGLE' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
-      self.skipTest("known to fail")
+    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
+    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
+      self.skipTest("CUDA == 12.0 not supported on debian 12")
 
-    metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
-    self.createCluster(
-      configuration,
-      self.INIT_ACTIONS,
-      machine_type="n1-highmem-8",
-      master_accelerator=master_accelerator,
-      worker_accelerator=worker_accelerator,
-      metadata=metadata,
-      timeout_in_minutes=90,
-      boot_disk_size="50GB",
-      scopes="https://www.googleapis.com/auth/monitoring.write")
-
-    for machine_suffix in machine_suffixes:
-      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-      self.verify_instance_gpu_agent(machine_name)
-    self.verify_pyspark()
-
-  @parameterized.parameters(
-#    ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''),
-#    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
-    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.1.1", 'rocky', '2.0'),
-#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'),
-#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'),
-#    ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'),
-#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
-#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
-  )
-  def tests_driver_signing(self, configuration, machine_suffixes,
-                           master_accelerator, worker_accelerator,
-                           cuda_version, image_os, image_version):
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
+          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
+      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
+    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
-
-    if configuration == 'KERBEROS' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # ('KERBEROS fails with image version <= 2.1')
-      unittest.expectedFailure(self)
-      self.skipTest("known to fail")
-
-    kvp_array=[]
-    import os
-
-    if "private_secret_name" in os.environ:
-      for env_var in ['public_secret_name', 'private_secret_name', 'secret_project', 'secret_version' 'modulus_md5sum']:
-        kvp_array.append( "{}={}".format( env_var, os.environ[env_var] ) )
-
-      if kvp_array[0] == "public_secret_name=":
-        self.skipTest("This test only runs when signing environment has been configured in presubmit.sh")
-    else:
-      self.skipTest("This test only runs when signing environment has been configured in presubmit.sh")
-
-    metadata = ",".join( kvp_array )
-
-    if self.getImageOs() != image_os:
-      self.skipTest("This test is only run on os {}".format(image_os))
-    if self.getImageVersion() != image_version:
-      self.skipTest("This test is only run on Dataproc Image Version {}".format(image_os))
+      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
 
+    metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
       configuration,
       self.INIT_ACTIONS,
@@ -415,16 +299,16 @@ def tests_driver_signing(self, configuration, machine_suffixes,
       master_accelerator=master_accelerator,
       worker_accelerator=worker_accelerator,
       metadata=metadata,
-      timeout_in_minutes=90,
+      timeout_in_minutes=30,
       boot_disk_size="50GB",
       scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
-      hostname="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(hostname)
-      self.verify_instance_gpu_agent(hostname)
-      self.verify_driver_signature(hostname)
+      self.verify_instance("{}-{}".format(self.getClusterName(),
+                                          machine_suffix))
+      self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),
+                                                    machine_suffix))
 
-    self.verify_pyspark()
+    self.verify_instance_spark()
 
 if __name__ == "__main__":
   absltest.main()
diff --git a/gpu/verify_pyspark.py b/gpu/verify_pyspark.py
deleted file mode 100644
index 9f2b18683..000000000
--- a/gpu/verify_pyspark.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3 
-# 
-# Copyright 2025 Google LLC and contributors
-# 
-# Licensed under the Apache License, Version 2.0 (the "License"); 
-# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at 
-# 
-#      http://www.apache.org/licenses/LICENSE-2.0 
-# 
-# Unless required by applicable law or agreed to in writing, software 
-# distributed under the License is distributed on an "AS-IS" BASIS, 
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and 
-# limitations under the License. 
-# 
-import matplotlib.pyplot as plt
-import numpy as np
-
-from pyspark import SparkContext
-from pyspark.sql import SparkSession
-from pyspark import SparkConf, StorageLevel
-from tqdm import tqdm
-from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
-import pyspark.sql.functions as f
-
-spark = SparkSession.builder.appName("spark-rapids").getOrCreate()
-
-#from utils import SimpleTimer, ResultsLogger, visualize_data 
-
-conf = (SparkConf().setMaster("local[*]")
-                   .setAppName("SparkVectorizer")
-                   .set('spark.driver.memory', '300G')
-                   .set('spark.driver.maxResultSize', '20G')
-                   .set('spark.network.timeout', '7200s')
-        )
-
-sc = SparkContext.getOrCreate(conf=conf)
-sc.setLogLevel("FATAL")
-spark = SparkSession(sc)
-print(sc._conf.getAll()) # check context settings 
-
-x = np.linspace(0, 3*np.pi, 500)
-plt.plot(x, np.sin(x**2))
-plt.title('A simple chirp');

From f00e2f80696c537f2143cd408e9831e3fbf57c46 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 9 Jan 2025 13:20:31 -0800
Subject: [PATCH 118/130] including libtemplate-perl as a dependency

---
 cloudbuild/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index aebaffd84..644219305 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -22,7 +22,8 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
     dd of="${bazel_repo_file}" status=none && \
     apt-get update -qq
 RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
-    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
+    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} \
+                           libtemplate-perl > /dev/null 2>&1 && \
     apt-get clean
 
 # Set bazel-${bazel_version} as the default bazel alternative in this container

From 7118ebf5704602394fe8d24cdd79ccc3b27e32af Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 9 Jan 2025 13:39:08 -0800
Subject: [PATCH 119/130] moved to dask-template-20250104

---
 templates/dask/dask.sh.in | 63 ---------------------------------------
 1 file changed, 63 deletions(-)
 delete mode 100644 templates/dask/dask.sh.in

diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in
deleted file mode 100644
index 2f8450dd6..000000000
--- a/templates/dask/dask.sh.in
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/bin/bash
-#
-[% INSERT legal/license_header %]
-#
-[% PROCESS common/template_disclaimer %]
-#
-# This initialization action script will install Dask and other relevant
-# libraries on a Dataproc cluster. This is supported for either "yarn" or
-# "standalone" runtimes Please see dask.org and yarn.dask.org for more
-# information.
-
-set -euxo pipefail
-
-[% INSERT common/util_functions %]
-
-[% INSERT dask/util_functions %]
-
-function main() {
-  # Install Dask
-  install_dask
-
-  # In "standalone" mode, Dask relies on a systemd unit to launch.
-  # In "yarn" mode, it relies on a config.yaml file.
-  if [[ "${DASK_RUNTIME}" == "yarn" ]]; then
-    # Create Dask YARN config file
-    configure_dask_yarn
-  elif [[ "${DASK_RUNTIME}" == "standalone" ]]; then
-    # Create Dask service
-    install_systemd_dask_service
-    start_systemd_dask_service
-
-    configure_knox_for_dask
-
-    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging 'false')"
-    if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then
-      configure_fluentd_for_dask
-    fi
-  else
-    echo "Unsupported Dask Runtime: ${DASK_RUNTIME}"
-    exit 1
-  fi
-
-  echo "Dask for ${DASK_RUNTIME} successfully initialized."
-}
-
-function exit_handler() {
-  pip_exit_handler
-  common_exit_handler
-  return 0
-}
-
-function prepare_to_install(){
-  prepare_common_env
-  prepare_conda_env
-  conda_env="$(get_metadata_attribute conda-env 'dask')"
-  readonly conda_env
-  prepare_dask_env
-  trap exit_handler EXIT
-}
-
-prepare_to_install
-
-main

From f2b50f74d4a8463f0c678e8042b0c61a7ec60cb2 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 9 Jan 2025 13:39:40 -0800
Subject: [PATCH 120/130] moved to gpu-template-20250107

---
 templates/gpu/install_gpu_driver.sh.in | 80 --------------------------
 1 file changed, 80 deletions(-)
 delete mode 100644 templates/gpu/install_gpu_driver.sh.in

diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in
deleted file mode 100644
index a7c4d353f..000000000
--- a/templates/gpu/install_gpu_driver.sh.in
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/bin/bash
-#
-[% INSERT legal/license_header %]
-#
-[% PROCESS common/template_disclaimer %]
-#
-# This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
-
-set -euxo pipefail
-
-[% INSERT common/util_functions %]
-
-[% INSERT common/install_functions %]
-
-[% INSERT gpu/util_functions %]
-
-[% INSERT gpu/install_functions %]
-
-[% INCLUDE gpu/yarn_functions %]
-
-[% INSERT gpu/spark_functions %]
-
-function main() {
-  install_gpu_driver_and_cuda
-
-  #Install GPU metrics collection in Stackdriver if needed
-  if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-    install_gpu_agent
-#    install_gpu_monitoring_agent
-    echo 'GPU metrics agent successfully deployed.'
-  else
-    echo 'GPU metrics agent has not been installed.'
-  fi
-  configure_gpu_exclusive_mode
-
-  setup_gpu_yarn
-
-  echo "yarn setup complete"
-
-  if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then
-    install_nvidia_nccl
-    install_nvidia_cudnn
-  fi
-
-  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
-    install_spark_rapids
-    configure_gpu_script
-    echo "RAPIDS initialized with Spark runtime"
-  elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
-    # we are not currently tooled for installing dask in this action.
-    echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh"
-  else
-    echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
-  fi
-
-  echo "main complete"
-  return 0
-}
-
-function exit_handler() {
-  set +e
-  gpu_install_exit_handler
-  gpu_exit_handler
-  pip_exit_handler
-  yarn_exit_handler
-  common_exit_handler
-  return 0
-}
-
-function prepare_to_install(){
-  prepare_common_env
-  prepare_pip_env
-  prepare_gpu_env
-  prepare_gpu_install_env
-  trap exit_handler EXIT
-}
-
-prepare_to_install
-
-main

From 900c10a0d34bae4dbc50685c3cc42b7e7b45341b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 9 Jan 2025 13:56:26 -0800
Subject: [PATCH 121/130] * include version in template disclaimer * include
 version in action generator

---
 templates/common/template_disclaimer | 9 ++++++---
 templates/generate-action.pl         | 3 +++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/templates/common/template_disclaimer b/templates/common/template_disclaimer
index 3b417deff..1c2d22b04 100644
--- a/templates/common/template_disclaimer
+++ b/templates/common/template_disclaimer
@@ -1,5 +1,8 @@
+#
+# Google Cloud Dataproc Initialization Actions v[% IA_VERSION %]
+#
 # This initialization action is generated from
-# initialization-actions/templates/[% template_path %]
+# initialization-actions/templates/[% template_path %].in
 #
-# Modifications made directly to the generated file will be lost when
-# the template is re-evaluated
+# Modifications made directly to generated files will be lost when the
+# templates are next evaluated.
diff --git a/templates/generate-action.pl b/templates/generate-action.pl
index 690acb409..2e1d344ff 100644
--- a/templates/generate-action.pl
+++ b/templates/generate-action.pl
@@ -7,6 +7,9 @@
 use Template;
 use strict;
 
+# Version of Initialization Actions we will generate
+my $IA_VERSION="0.0.1";
+
 my $action = $ARGV[0];
 my $v = {
   template_path => "${action}",

From bef08b17cc0018b7426469b5fff49ab3b8bc255b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 9 Jan 2025 19:02:43 -0800
Subject: [PATCH 122/130] migrated rapids.sh base template to
 rapids-template-20250106

---
 templates/rapids/rapids.sh.in | 63 -----------------------------------
 1 file changed, 63 deletions(-)
 delete mode 100644 templates/rapids/rapids.sh.in

diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
deleted file mode 100644
index 61b7247c0..000000000
--- a/templates/rapids/rapids.sh.in
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/bin/bash
-#
-[% INSERT legal/license_header %]
-#
-[% PROCESS common/template_disclaimer %]
-#
-# This initialization action script will install rapids on a Dataproc
-# cluster.
-
-set -euxo pipefail
-
-[% INSERT common/util_functions %]
-
-[% INSERT gpu/util_functions %]
-
-[% INSERT dask/util_functions %]
-
-function main() {
-  # Install Dask with RAPIDS
-  install_dask_rapids
-
-  # In "standalone" mode, Dask relies on a systemd unit to launch.
-  # In "yarn" mode, it relies a config.yaml file.
-  if [[ "${DASK_RUNTIME}" == "yarn" ]]; then
-    # Create cuda accelerated Dask YARN config file
-    configure_dask_yarn
-    echo "yarn setup complete"
-  else
-    # Create Dask service
-    install_systemd_dask_service
-    start_systemd_dask_service
-
-    configure_knox_for_dask
-
-    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging 'false')"
-    if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then
-      configure_fluentd_for_dask
-    fi
-  fi
-}
-
-function exit_handler() {
-  gpu_exit_handler
-  pip_exit_handler
-  conda_exit_handler
-  common_exit_handler
-  return 0
-}
-
-function prepare_to_install(){
-  prepare_common_env
-  conda_env="$(get_metadata_attribute conda-env 'dask-rapids')"
-  readonly conda_env
-  prepare_dask_rapids_env
-  prepare_conda_env
-  prepare_pip_env
-  prepare_gpu_env
-  trap exit_handler EXIT
-}
-
-prepare_to_install
-
-main

From aa792c39fb597322c36a4f835503b8011e12c498 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 9 Jan 2025 19:14:11 -0800
Subject: [PATCH 123/130] script to generate all actions from templates

---
 templates/generate-all-actions.sh | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 templates/generate-all-actions.sh

diff --git a/templates/generate-all-actions.sh b/templates/generate-all-actions.sh
new file mode 100644
index 000000000..2b25d99c5
--- /dev/null
+++ b/templates/generate-all-actions.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+for tt in $(find templates -name '*.sh.in') ; do
+  genfile=`perl -e "print( q{${tt}} =~ m:templates/(.*?.sh).in: )"`
+  perl templates/generate-action.pl "${genfile}" | tee "${genfile}" > /tmp/$(basename $genfile)
+done

From 824bcf85e1d74287c86636f3e5e285fd680595db Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 9 Jan 2025 20:16:02 -0800
Subject: [PATCH 124/130] spark prepare steps belong in common

---
 templates/common/util_functions | 26 ++++++++++++++++++++++++++
 templates/gpu/spark_functions   | 29 -----------------------------
 2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 9a6407a7b..7cbef0849 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -491,6 +491,32 @@ function prepare_conda_env() {
 }
 
 function prepare_common_env() {
+  SPARK_NLP_VERSION="3.2.1" # Must include subminor version here
+  SPARK_JARS_DIR=/usr/lib/spark/jars
+  SPARK_CONF_DIR='/etc/spark/conf'
+  SPARK_BIGQUERY_VERSION="$(get_metadata_attribute spark-bigquery-connector-version "${DEFAULT_SPARK_BIGQUERY_VERSION:-0.22.0}")"
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+
+  readonly SPARK_VERSION SPARK_BIGQUERY_VERSION SPARK_CONF_DIR SPARK_JARS_DIR SPARK_NLP_VERSION
+
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    exit 1
+  fi
+
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
+
   # Verify OS compatability and Secure boot state
   check_os
   check_secure_boot
diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions
index 25a99221e..fa29330de 100644
--- a/templates/gpu/spark_functions
+++ b/templates/gpu/spark_functions
@@ -41,32 +41,3 @@ function install_spark_rapids() {
                         "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
                         "/usr/lib/spark/jars/${jar_basename}"
 }
-
-function prepare_spark_env() {
-  SPARK_NLP_VERSION="3.2.1" # Must include subminor version here
-  SPARK_JARS_DIR=/usr/lib/spark/jars
-  SPARK_CONF_DIR='/etc/spark/conf'
-  SPARK_BIGQUERY_VERSION="$(get_metadata_attribute spark-bigquery-connector-version "${DEFAULT_SPARK_BIGQUERY_VERSION:-0.22.0}")"
-  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
-
-  readonly SPARK_VERSION SPARK_BIGQUERY_VERSION SPARK_CONF_DIR SPARK_JARS_DIR SPARK_NLP_VERSION
-
-  if version_lt "${SPARK_VERSION}" "3.1" || \
-     version_ge "${SPARK_VERSION}" "4.0" ; then
-    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
-    exit 1
-  fi
-
-  # Detect dataproc image version
-  if (! test -v DATAPROC_IMAGE_VERSION) ; then
-    if test -v DATAPROC_VERSION ; then
-      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
-    else
-      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
-      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
-      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
-      else echo "Unknown dataproc image version" ; exit 1 ; fi
-    fi
-  fi
-
-}

From 374ff96149207fc8e5a2ab705640f84f7beb4d74 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 11 Jan 2025 19:31:36 -0800
Subject: [PATCH 125/130] less noise in temp directory

---
 templates/generate-all-actions.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/templates/generate-all-actions.sh b/templates/generate-all-actions.sh
index 2b25d99c5..ce5caef35 100644
--- a/templates/generate-all-actions.sh
+++ b/templates/generate-all-actions.sh
@@ -2,5 +2,6 @@
 
 for tt in $(find templates -name '*.sh.in') ; do
   genfile=`perl -e "print( q{${tt}} =~ m:templates/(.*?.sh).in: )"`
-  perl templates/generate-action.pl "${genfile}" | tee "${genfile}" > /tmp/$(basename $genfile)
+  mkdir -p /tmp/init/$(dirname $genfile)
+  perl templates/generate-action.pl "${genfile}" | tee "${genfile}" > "/tmp/init/${genfile}"
 done

From 5a37d94ca77f1a82525bad7336fda2dca92a73ce Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 15 Jan 2025 16:41:45 -0800
Subject: [PATCH 126/130] tested with much older versions of CUDA on an old
 dataproc image from pre-2023

---
 templates/common/util_functions |  2 +-
 templates/gpu/install_functions | 60 ++++++++++++++++++++------
 templates/gpu/mig_functions     | 27 +++++++++++-
 templates/gpu/util_functions    | 76 ++++++++++++++++++---------------
 4 files changed, 114 insertions(+), 51 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 7cbef0849..9c7bfeba9 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -554,7 +554,7 @@ function prepare_common_env() {
 
   if is_debuntu ; then
     clean_up_sources_lists
-    apt-get update -qq
+    apt-get update -qq --allow-releaseinfo-change
     apt-get -y clean
     apt-get -o DPkg::Lock::Timeout=60 -y autoremove
     if ge_debian12 ; then
diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions
index 746eb79bb..1ba76c236 100644
--- a/templates/gpu/install_functions
+++ b/templates/gpu/install_functions
@@ -46,7 +46,7 @@ function set_cuda_runfile_url() {
   local MAX_DRIVER_VERSION
   local MAX_CUDA_VERSION
 
-  local MIN_OPEN_DRIVER_VER="515.48.07"
+  MIN_OPEN_DRIVER_VER="515.43.04"
   local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
   local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
 
@@ -84,7 +84,33 @@ function set_cuda_runfile_url() {
 
   # driver version named in cuda runfile filename
   # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
+#          10.0.130/410.48   =https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux
+#          10.1.234/418.87.00=https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run
+#          10.2.89/440.33.01 =https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
+#          11.0.3/450.51.06  =https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run
+#          11.1.1/455.32.00  =https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
+#          11.2.2/460.32.03  =https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run
+#          11.3.1/465.19.01  =https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.19.01_linux.run
+#          11.4.4/470.82.01  =https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run
+#          11.5.2/495.29.05  =https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run
+#          11.6.2/510.47.03  =https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run
+#          11.7.1/515.65.01  =https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run
+#          11.8.0/520.61.05  =https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
+#          12.0.1/525.85.12  =https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run
+#          12.1.1/530.30.02  =https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
+#          12.2.2/535.104.05 =https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run
+#          12.3.2/545.23.08  =https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run
   readonly -A drv_for_cuda=(
+          ["10.0.130"]="410.48"
+          ["10.1.234"]="418.87.00"
+          ["10.2.89"]="440.33.01"
+          ["11.0.3"]="450.51.06"
+          ["11.1.1"]="455.32.00"
+          ["11.2.2"]="460.32.03"
+          ["11.3.1"]="465.19.01"
+          ["11.4.4"]="470.82.01"
+          ["11.5.2"]="495.29.05"
+          ["11.6.2"]="510.47.03"
           ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
           ["11.8.0"]="520.61.05"
           ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
@@ -108,7 +134,8 @@ function set_cuda_runfile_url() {
   CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
   readonly CUDA_RUNFILE
 
-  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+  # version naming and archive url were erratic prior to 11.0.3
+  if ( version_ge "${CUDA_FULL_VERSION}" "11.0.3" &&  ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ) ; then
     echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
     exit 1
   fi
@@ -292,13 +319,13 @@ function install_nvidia_nccl() {
   # Hopper:    SM_90,SM_90a       compute_90,compute_90a
   # Blackwell: SM_100,            compute_100
                   NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
+  if version_gt "${CUDA_VERSION}" "11.6" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
   if version_ge "${CUDA_VERSION}" "11.8" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
-  fi
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
   if version_ge "${CUDA_VERSION}" "12.0" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
-  fi
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
 
   mkdir -p "${workdir}"
   pushd "${workdir}"
@@ -464,8 +491,8 @@ function add_repo_cuda() {
 }
 
 function build_driver_from_github() {
-  # non-GPL driver will have been built on rocky8
-  if is_rocky8 ; then return 0 ; fi
+  # non-GPL driver will have been built on rocky8 or if driver version is prior to open kernel version
+  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then return 0 ; fi
   pushd "${workdir}"
 
   test -d "${workdir}/open-gpu-kernel-modules" || {
@@ -592,7 +619,7 @@ function install_nvidia_userspace_runfile() {
   local cache_hit="0"
   local local_tarball
 
-  if is_rocky8 ; then
+  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then
     local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
     test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
       local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
@@ -604,7 +631,9 @@ function install_nvidia_userspace_runfile() {
 
       if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
         cache_hit="1"
-        runfile_args="--no-kernel-modules"
+        if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
+          runfile_args="${runfile_args} --no-kernel-modules"
+        fi
         echo "cache hit"
       else
         install_build_dependencies
@@ -619,10 +648,13 @@ function install_nvidia_userspace_runfile() {
           --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
           "
         fi
-        runfile_args="--no-dkms ${signing_options}"
+        runfile_args="${signing_options}"
+        if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
+          runfile_args="${runfile_args} --no-dkms"
+        fi
       fi
     }
-  else
+  elif version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
     runfile_args="--no-kernel-modules"
   fi
 
@@ -632,7 +664,7 @@ function install_nvidia_userspace_runfile() {
     --install-libglvnd \
     --tmpdir="${tmpdir}"
 
-  if is_rocky8 ; then
+  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then
     if [[ "${cache_hit}" == "1" ]] ; then
       gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
       depmod -a
diff --git a/templates/gpu/mig_functions b/templates/gpu/mig_functions
index 233b2d02c..7ec29aa25 100644
--- a/templates/gpu/mig_functions
+++ b/templates/gpu/mig_functions
@@ -66,13 +66,36 @@ function enable_mig() {
   is_complete enable-mig && return
 
   # Start persistenced if it's not already running
-  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
+#  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
   for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
     # Write an ascii zero to the numa node indicator
     echo "0" | dd of="${f}" status=none
   done
-  time nvsmi --gpu-reset # 30s
+  # nvidia-smi --query-compute-apps=pid --format=csv,noheader
+  local -a stopped_svcs=() # YARN services stopped here; restarted after MIG is enabled
+  for svc in resourcemanager nodemanager; do
+    if [[ "$(systemctl show "hadoop-yarn-${svc}.service" -p SubState --value)" == 'running' ]]; then
+      systemctl stop "hadoop-yarn-${svc}.service" ; stopped_svcs+=("${svc}")
+    fi ; done
+  time nvsmi --gpu-reset || { # 30s
+    echo "unable to reset gpu.  Trying to stop services and kernel modules which may have a lock."
+    # TODO: find a way to reset the A100 without reboot
+    for tryno in {1..25} ; do removed="1"
+      for mod in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do
+        if lsmod | grep -q "${mod}" ; then rmmod "${mod}" > /dev/null 2>&1 || removed="0" ; fi ; done
+      if [[ "${removed}" == "1" ]] ; then
+        echo "modules removed successfully"
+        nvsmi --gpu-reset
+        break
+      fi
+    done
+  }
   nvsmi -mig 1
+  # Restart only what was stopped above: those units are no longer 'running',
+  # so re-testing SubState for 'running' here would never start them again.
+  for svc in ${stopped_svcs[@]+"${stopped_svcs[@]}"} ; do
+    systemctl start "hadoop-yarn-${svc}.service"
+  done
   clear_nvsmi_cache
 
   mark_complete enable-mig
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 48473d13b..565ec3ba0 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -1,51 +1,56 @@
 function set_support_matrix() {
   # CUDA version and Driver version
   # https://docs.nvidia.com/deploy/cuda-compatibility/
-  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html#framework-matrix
   # https://developer.nvidia.com/cuda-downloads
 
   # Minimum supported version for open kernel driver is 515.43.04
   # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
-  # Rocky8: 12.0: 525.147.05
   local latest
   latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
   readonly -A DRIVER_FOR_CUDA=(
-          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
-          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+          ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01"
+          ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31"
+          ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03"
+          ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
+          ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08"
+          ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03"
   )
   readonly -A DRIVER_SUBVER=(
-          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
-          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
+          ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" ["430"]="430.64"
+          ["435"]="435.21" ["440"]="440.100" ["450"]="450.119.03"
+          ["455"]="455.45.01" ["460"]="460.91.03" ["465"]="465.31"
+          ["470"]="470.256.02" ["495"]="495.46" ["510"]="510.108.03"
+          ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05"
+          ["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.142"
+          ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.77"
   )
   # https://developer.nvidia.com/cudnn-downloads
-  if is_debuntu ; then
   readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
-          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+          ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5" ["11.0"]="8.0.4"
+          ["11.1"]="8.0.5" ["11.2"]="8.1.1" ["11.3"]="8.2.1" ["11.4"]="8.2.4.15"
+          ["11.5"]="8.3.1.22" ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29"
+          ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28"
+          ["12.2"]="8.9.5" ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70"
+          ["12.5"]="9.2.1.18" ["12.6"]="9.6.0.74"
   )
-  elif is_rocky ; then
-  # rocky:
-  #   12.0: 8.8.1.3
-  #   12.1: 8.9.3.28
-  #   12.2: 8.9.7.29
-  #   12.3: 9.0.0.312
-  #   12.4: 9.1.1.17
-  #   12.5: 9.2.1.18
-  #   12.6: 9.5.1.17
-  readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
-          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
-  )
-  fi
   # https://developer.nvidia.com/nccl/nccl-download
-  # 12.2: 2.19.3, 12.5: 2.21.5
   readonly -A NCCL_FOR_CUDA=(
-          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
-          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
+          ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3"
+          ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4" ["11.5"]="2.11.4"
+          ["11.6"]="2.12.10" ["11.7"]="2.12.12" ["11.8"]="2.21.5"
+          ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.2"]="2.19.3"
+          ["12.3"]="2.19.4" ["12.4"]="2.23.4" ["12.5"]="2.22.3"
+          ["12.6"]="2.23.4"
   )
   readonly -A CUDA_SUBVER=(
-          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
-          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
+          ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89"
+          ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2"
+          ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2"
+          ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0"
+          ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2"
+          ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1"
+          ["12.6"]="12.6.3"
   )
 }
 
@@ -131,7 +136,7 @@ function set_driver_version() {
 
   export DRIVER_VERSION DRIVER
 
-  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  gpu_driver_url="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
   if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
     echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
     exit 1
@@ -197,19 +202,22 @@ function prepare_gpu_env(){
 
   # Verify SPARK compatability
   RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
-  readonly RAPIDS_RUNTIME
+  INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")"
+  readonly RAPIDS_RUNTIME INCLUDE_GPUS
 
   # determine whether we have nvidia-smi installed and working
   nvsmi
 }
 
-# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
-# Users should run apt-mark unhold before they wish to upgrade these packages
+# Hold all NVIDIA-related packages from upgrading either unintentionally or
+# through use of services like unattended-upgrades
+#
+# Users should run apt-mark unhold before upgrading these packages
 function hold_nvidia_packages() {
   if ! is_debuntu ; then return ; fi
 
-  apt-mark hold nvidia-*
-  apt-mark hold libnvidia-*
+  apt-mark hold nvidia-*    > /dev/null 2>&1
+  apt-mark hold libnvidia-* > /dev/null 2>&1
   if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
     apt-mark hold xserver-xorg-video-nvidia*
   fi

From 7662215766b520d006def883c8b0cf8dba440a1f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 16 Jan 2025 10:56:23 -0800
Subject: [PATCH 127/130] exercised older CUDA and mig a100 use case more ;
 added pytorch installation functionality

---
 templates/gpu/install_functions | 112 ++++++++++++++++++++++----------
 templates/gpu/mig_functions     |  55 ++++++++++++++--
 templates/gpu/util_functions    |   7 +-
 3 files changed, 131 insertions(+), 43 deletions(-)

diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions
index 1ba76c236..8effce9b4 100644
--- a/templates/gpu/install_functions
+++ b/templates/gpu/install_functions
@@ -1,14 +1,15 @@
 function set_cudnn_version() {
-  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
   readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
 
   # Parameters for NVIDIA-provided cuDNN library
   DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
   readonly DEFAULT_CUDNN_VERSION
   CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-  if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
+  if ( is_rocky  && version_le "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then
+    CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
   elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
     # cuDNN v8 is not distribution for ubuntu20+, debian12
     CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
@@ -303,30 +304,6 @@ function install_nvidia_nccl() {
 
   local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
 
-  # https://github.com/NVIDIA/nccl/blob/master/README.md
-  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-  # Fermi:     SM_20,             compute_30
-  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
-  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
-  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
-
-  # The following architectures are suppored by open kernel driver
-  # Volta:     SM_70,SM_72,       compute_70,compute_72
-  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
-
-  # The following architectures are supported by CUDA v11.8+
-  # Ada:       SM_89,             compute_89
-  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
-  # Blackwell: SM_100,            compute_100
-                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
-  if version_gt "${CUDA_VERSION}" "11.6" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
-  if version_ge "${CUDA_VERSION}" "11.8" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
-  if version_ge "${CUDA_VERSION}" "12.0" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
-
   mkdir -p "${workdir}"
   pushd "${workdir}"
 
@@ -347,6 +324,30 @@ function install_nvidia_nccl() {
     local local_tarball="${workdir}/${build_tarball}"
     local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
 
+    # https://github.com/NVIDIA/nccl/blob/master/README.md
+    # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+    # Fermi:     SM_20,             compute_30
+    # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+    # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+    # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+    # The following architectures are suppored by open kernel driver
+    # Volta:     SM_70,SM_72,       compute_70,compute_72
+    # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+    # The following architectures are supported by CUDA v11.8+
+    # Ada:       SM_89,             compute_89
+    # Hopper:    SM_90,SM_90a       compute_90,compute_90a
+    # Blackwell: SM_100,            compute_100
+                    NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
+    if version_gt "${CUDA_VERSION}" "11.6" ; then
+      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
+    if version_ge "${CUDA_VERSION}" "11.8" ; then
+      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
+    if version_ge "${CUDA_VERSION}" "12.0" ; then
+      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
+
     output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
     if echo "${output}" | grep -q "${gcs_tarball}" ; then
       # cache hit - unpack from cache
@@ -369,11 +370,12 @@ function install_nvidia_nccl() {
         export NVCC_GENCODE
         execute_with_retries make -j$(nproc) pkg.redhat.build
       fi
-      tar czvf "/${local_tarball}" "../${build_path}"
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      rm "${local_tarball}"
+      tar czvf "${local_tarball}" "../${build_path}"
       make clean
       popd
+      tar xzvf "${local_tarball}"
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
     fi
     gcloud storage cat "${gcs_tarball}" | tar xz
   }
@@ -415,16 +417,16 @@ function install_nvidia_cudnn() {
       apt-get -y install nvidia-cudnn
     else
       if is_cudnn8 ; then
-        install_local_cudnn8_repo
+        add_repo_cuda
 
         apt-get update -qq
+       # Ignore version requested and use the latest version in the package index
+       cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)"
 
         execute_with_retries \
           apt-get -y install --no-install-recommends \
             "libcudnn8=${cudnn_pkg_version}" \
             "libcudnn8-dev=${cudnn_pkg_version}"
-
-        uninstall_local_cudnn8_repo
 	sync
       elif is_cudnn9 ; then
 	install_cuda_keyring_pkg
@@ -452,6 +454,48 @@ function install_nvidia_cudnn() {
   mark_complete cudnn
 }
 
+function install_pytorch() {
+  if test -f "${workdir}/complete/pytorch" ; then return ; fi
+  local env
+  env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
+  local mc3=/opt/conda/miniconda3
+  local envpath="${mc3}/envs/${env}"
+  # Set numa node to 0 for all GPUs
+  for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
+  local verb=create
+  if test -d "${envpath}" ; then verb=install ; fi
+
+  readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
+  case "${INCLUDE_PYTORCH^^}" in
+    "1" | "YES" | "TRUE" )
+      local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+      local local_tarball="${workdir}/${build_tarball}"
+      local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+
+      output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+      if echo "${output}" | grep -q "${gcs_tarball}" ; then
+        # cache hit - unpack from cache
+        echo "cache hit"
+        mkdir -p "${envpath}"
+        gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+      else
+        cudart_spec="cuda-cudart"
+        if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+        "${mc3}/bin/mamba" "${verb}" -n "${env}" \
+          -c conda-forge -c nvidia -c rapidsai \
+          numba pytorch tensorflow[and-cuda] rapids pyspark \
+          "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+        pushd "${envpath}"
+        tar czf "${local_tarball}" .
+        popd
+        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      fi
+      ;;
+    * ) echo "skip pytorch install" ;;
+  esac
+  touch "${workdir}/complete/pytorch"
+}
+
 function add_nonfree_components() {
   if is_src_nvidia ; then return; fi
   if ge_debian12 ; then
diff --git a/templates/gpu/mig_functions b/templates/gpu/mig_functions
index 7ec29aa25..7d94b7dcf 100644
--- a/templates/gpu/mig_functions
+++ b/templates/gpu/mig_functions
@@ -65,38 +65,81 @@ function configure_mig_cgi() {
 function enable_mig() {
   is_complete enable-mig && return
 
-  # Start persistenced if it's not already running
-#  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
+  # All devices on the same numa node
   for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
     # Write an ascii zero to the numa node indicator
     echo "0" | dd of="${f}" status=none
   done
+
+  echo "Stopping services and kernel modules in preparation for enabling mig."
+  if ( ps auwx | grep -i nvidia\\-persistenced ) ; then killall -9 nvidia-persistenced ; fi
+
   # nvidia-smi --query-compute-apps=pid --format=csv,noheader
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
       systemctl stop "hadoop-yarn-${svc}.service"
     fi
   done
+  # can lsof be used to determine what processes have a file with name =~ /nvidia/ under the /dev/ directory ?
+  # if so, stop the service which launches the process with the open filehandle
+
+  MIG_GPU_LIST="`nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n ""`"
+  NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
+
+# root@cluster-1718310842-m:/tmp# for m in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do sudo rmmod $m ; done
+# rmmod: ERROR: Module nvidia_drm is not currently loaded
+# rmmod: ERROR: Module nvidia_modeset is not currently loaded
+# rmmod: ERROR: Module nvidia_uvm is not currently loaded
+# rmmod: ERROR: Module nvidia is not currently loaded
+# root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --gpu-reset
+# Resetting GPU 00000000:00:04.0 is not supported.
+# root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --multi-instance-gpu=1
+# Warning: MIG mode is in pending enable state for GPU 00000000:00:04.0:Not Supported
+# Reboot the system or try nvidia-smi --gpu-reset to make MIG mode effective on GPU 00000000:00:04.0
+# All done.
+# root@cluster-1718310842-m:/tmp# echo $?
+# 0
+# root@cluster-1718310842-m:/tmp# /usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader
+# Disabled
+
+  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+
   time nvsmi --gpu-reset || { # 30s
-    echo "unable to reset gpu.  Trying to stop services and kernel modules which may have a lock."
     # TODO: find a way to reset the A100 without reboot
+    removed="1"
     for tryno in {1..25} ; do ; removed="1"
       for mod in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do
         if lsmod | grep -q "${mod}" ; then rmmod $mod > /dev/null 2>&1 || removed="0" ; fi ; done
       if [[ "${removed}" == "1" ]] ; then
         echo "modules removed successfully"
-        nvsmi --gpu-reset
-        break
+        nvsmi --gpu-reset && break
       fi
     done
   }
-  nvsmi -mig 1
+
+  if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+    for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do
+      if version_le "${CUDA_VERSION}" "11.6" ; then
+        nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
+      else
+        nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+      fi
+    done
+  fi
+  if test -n "$(nvsmi -L)" ; then
+    # cache the result of the gpu query
+    ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
+    echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
+    chmod a+r "/var/run/nvidia-gpu-index.txt"
+  fi
   for svc in resourcemanager nodemanager; do
     if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
       systemctl start "hadoop-yarn-${svc}.service"
     fi
   done
   clear_nvsmi_cache
+  # Start persistenced if it's not already running
+  if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi
 
   mark_complete enable-mig
 }
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 565ec3ba0..eea7b3dd5 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -200,10 +200,11 @@ function prepare_gpu_env(){
     readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
   fi
 
-  # Verify SPARK compatability
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
+  # Set variables from metadata
+  RAPIDS_RUNTIME="$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")"
   INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")"
-  readonly RAPIDS_RUNTIME INCLUDE_GPUS
+  INCLUDE_PYTORCH="$(get_metadata_attribute 'include-pytorch' 'no')"
+  readonly RAPIDS_RUNTIME INCLUDE_GPUS INCLUDE_PYTORCH
 
   # determine whether we have nvidia-smi installed and working
   nvsmi

From 0c3eb5162580d6e15692f1b2d2c3f7aa5ea9dd80 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 18 Jan 2025 21:35:48 -0800
Subject: [PATCH 128/130] create function to harden sshd config ; execute it
 before repairing old backports

---
 templates/common/util_functions | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 9c7bfeba9..b99387d79 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -490,6 +490,24 @@ function prepare_conda_env() {
   fi
 }
 
+function harden_sshd_config() {
+  # disable sha1 use in kex and kex-gss features
+  declare -rA feature_map=(["kex"]="kexalgorithms" ["kex-gss"]="gssapikexalgorithms")
+  for ftr in "${!feature_map[@]}" ; do
+    export feature=${feature_map[$ftr]}
+    sshd_config_line=$(
+      (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
+       ssh -Q "${ftr}" ) \
+      | sort | uniq | grep -iv sha1 | perl -e '@a=<STDIN>;
+      print("$ENV{feature} ",join(q",",map{ chomp; $_ }@a), $/) if @a')
+    grep -v "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
+    echo "$sshd_config_line" >> /tmp/sshd_config_new
+    # TODO: test whether sshd will reload with this change before mv
+    mv /tmp/sshd_config_new /etc/ssh/sshd_config
+  done
+  systemctl reload ssh
+}
+
 function prepare_common_env() {
   SPARK_NLP_VERSION="3.2.1" # Must include subminor version here
   SPARK_JARS_DIR=/usr/lib/spark/jars
@@ -550,9 +568,10 @@ function prepare_common_env() {
 
   is_complete prepare.common && return
 
-  repair_old_backports
+  harden_sshd_config
 
   if is_debuntu ; then
+    repair_old_backports
     clean_up_sources_lists
     apt-get update -qq --allow-releaseinfo-change
     apt-get -y clean

From 576bbb61036e84531b37f324ce3482271d5cd0a6 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 22 Jan 2025 16:40:01 -0800
Subject: [PATCH 129/130] reviewed #1275 and brought closer to parity

---
 templates/common/util_functions |  22 +++---
 templates/generate-action.pl    |   6 +-
 templates/gpu/install_functions | 134 ++++++++++++++------------------
 templates/gpu/util_functions    |  74 +++++++++---------
 templates/gpu/yarn_functions    |   3 +-
 5 files changed, 114 insertions(+), 125 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index b99387d79..aeea8a294 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -466,11 +466,11 @@ function install_dependencies() {
 }
 
 function prepare_pip_env() {
-  # Clear pip cache
-  # TODO: make this conditional on which OSs have pip without cache purge
-  test -d "${workdir}/python-venv" || python3 -m venv "${workdir}/python-venv"
+  test -d "${workdir}/python-venv" || /opt/conda/miniconda3/bin/python3 -m venv "${workdir}/python-venv"
   source "${workdir}/python-venv/bin/activate"
 
+  # Clear pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
   pip cache purge || echo "unable to purge pip cache"
   if is_ramdisk ; then
     # Download pip packages to tmpfs
@@ -491,21 +491,25 @@ function prepare_conda_env() {
 }
 
 function harden_sshd_config() {
-  # disable sha1 use in kex and kex-gss features
-  declare -rA feature_map=(["kex"]="kexalgorithms" ["kex-gss"]="gssapikexalgorithms")
+  # disable sha1 and md5 use in kex and kex-gss features
+  declare -A feature_map=(["kex"]="kexalgorithms")
+  if ( is_rocky || version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ) ; then
+    feature_map["kex-gss"]="gssapikexalgorithms" ; fi
   for ftr in "${!feature_map[@]}" ; do
     export feature=${feature_map[$ftr]}
     sshd_config_line=$(
       (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
        ssh -Q "${ftr}" ) \
-      | sort | uniq | grep -iv sha1 | perl -e '@a=<STDIN>;
-      print("$ENV{feature} ",join(q",",map{ chomp; $_ }@a), $/) if @a')
-    grep -v "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
+      | sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}<STDIN>;
+      print("$ENV{feature} ",join(",",map{ chomp; $_ }@a), $/) if "@a"')
+    grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
     echo "$sshd_config_line" >> /tmp/sshd_config_new
     # TODO: test whether sshd will reload with this change before mv
     mv /tmp/sshd_config_new /etc/ssh/sshd_config
   done
-  systemctl reload ssh
+  local svc=ssh
+  if is_rocky ; then svc="sshd" ; fi
+  systemctl reload "${svc}"
 }
 
 function prepare_common_env() {
diff --git a/templates/generate-action.pl b/templates/generate-action.pl
index 2e1d344ff..334d6ecac 100644
--- a/templates/generate-action.pl
+++ b/templates/generate-action.pl
@@ -8,7 +8,7 @@
 use strict;
 
 # Version of Initialization Actions we will generate
-my $IA_VERSION="0.0.1";
+my $IA_VERSION="0.1.1";
 
 my $action = $ARGV[0];
 my $v = {
@@ -22,7 +22,7 @@ sub usage{
 This script evaluates a template to generate an initialization action.
 The output is printed to STDOUT.
 
-Action templates reside under templates/$action and end in .sh.in
+Action templates reside under templates/\${action}.in
 
 The <action> argument is the destination action name, not the source.
 EOF
@@ -34,7 +34,7 @@ sub usage{
 
 my $tt = Template->new( {
   INCLUDE_PATH => "$ENV{PWD}/templates",
-  VARIABLES => $v,
+  VARIABLES    => $v,
   INTERPOLATE  => 0,
 }) || die "$Template::ERROR$/";
 
diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions
index 8effce9b4..68183bc1f 100644
--- a/templates/gpu/install_functions
+++ b/templates/gpu/install_functions
@@ -4,16 +4,15 @@ function set_cudnn_version() {
   readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
 
   # Parameters for NVIDIA-provided cuDNN library
-  DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
-  readonly DEFAULT_CUDNN_VERSION
+  readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
   CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
   # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
   if ( is_rocky  && version_le "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then
     CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
-  elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
+  elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
     # cuDNN v8 is not distribution for ubuntu20+, debian12
     CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
-  elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then
+  elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
     # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
     CUDNN_VERSION="8.8.0.121"
   fi
@@ -106,7 +105,7 @@ function set_cuda_runfile_url() {
           ["10.1.234"]="418.87.00"
           ["10.2.89"]="440.33.01"
           ["11.0.3"]="450.51.06"
-          ["11.1.1"]="455.42.00"
+          ["11.1.1"]="455.32.00"
           ["11.2.2"]="460.32.03"
           ["11.3.1"]="465.19.01"
           ["11.4.4"]="470.82.01"
@@ -130,17 +129,21 @@ function set_cuda_runfile_url() {
   local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
 
   NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
-  readonly NVIDIA_CUDA_URL
-
-  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
-  readonly CUDA_RUNFILE
 
   # version naming and archive url were erratic prior to 11.0.3
-  if ( version_ge "${CUDA_FULL_VERSION}" "11.0.3" &&  ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ) ; then
+  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
     echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
+    if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then
+      echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead"
+    fi
     exit 1
   fi
 
+  readonly NVIDIA_CUDA_URL
+
+  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
+  readonly CUDA_RUNFILE
+
   if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
     echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
   elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
@@ -152,50 +155,24 @@ function set_cuda_runfile_url() {
   fi
 }
 
-function set_cudnn_tarball_url() {
-CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
-CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
-if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
-  # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
-  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
-    # When cuDNN version is greater than or equal to 8.4.1.50 use this format
-    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
-  fi
-  # Use legacy url format with one of the tarball name formats depending on version as above
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
-fi
-if ( version_ge "${CUDA_VERSION}" "12.0" ); then
-  # Use modern url format When cuda version is greater than or equal to 12.0
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
-fi
-readonly CUDNN_TARBALL
-readonly CUDNN_TARBALL_URL
-}
-
 function install_cuda_keyring_pkg() {
-  if ( test -v CUDA_KEYRING_PKG_INSTALLED &&
-       [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi
+  is_complete cuda-keyring-installed && return
   local kr_ver=1.1
   curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
     -o "${tmpdir}/cuda-keyring.deb"
   dpkg -i "${tmpdir}/cuda-keyring.deb"
   rm -f "${tmpdir}/cuda-keyring.deb"
-  CUDA_KEYRING_PKG_INSTALLED="1"
+  mark_complete cuda-keyring-installed
 }
 
 function uninstall_cuda_keyring_pkg() {
   apt-get purge -yq cuda-keyring
-  CUDA_KEYRING_PKG_INSTALLED="0"
+  mark_incomplete cuda-keyring-installed
 }
 
 function install_local_cuda_repo() {
   is_complete install-local-cuda-repo && return
-
-  if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  CUDA_LOCAL_REPO_INSTALLED="1"
   pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
   CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
   readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
@@ -219,7 +196,7 @@ function install_local_cuda_repo() {
 }
 function uninstall_local_cuda_repo(){
   apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cuda-repo"
+  mark_incomplete install-local-cuda-repo
 }
 
 function install_local_cudnn_repo() {
@@ -268,7 +245,7 @@ function install_local_cudnn8_repo() {
 
   # cache the cudnn package
   cache_fetched_package "${local_deb_url}" \
-                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
+                        "${pkg_bucket}/nvidia/cudnn/${CUDNN8_CUDA_VER}/${deb_fn}" \
                         "${local_deb_fn}"
 
   local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
@@ -322,41 +299,42 @@ function install_nvidia_nccl() {
   test -d "${workdir}/nccl/build" || {
     local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
     local local_tarball="${workdir}/${build_tarball}"
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
-
-    # https://github.com/NVIDIA/nccl/blob/master/README.md
-    # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-    # Fermi:     SM_20,             compute_30
-    # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
-    # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
-    # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
-
-    # The following architectures are suppored by open kernel driver
-    # Volta:     SM_70,SM_72,       compute_70,compute_72
-    # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
-
-    # The following architectures are supported by CUDA v11.8+
-    # Ada:       SM_89,             compute_89
-    # Hopper:    SM_90,SM_90a       compute_90,compute_90a
-    # Blackwell: SM_100,            compute_100
-                    NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
-    if version_gt "${CUDA_VERSION}" "11.6" ; then
-      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
-    if version_ge "${CUDA_VERSION}" "11.8" ; then
-      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
-    if version_ge "${CUDA_VERSION}" "12.0" ; then
-      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
+    local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}"
 
     output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
     if echo "${output}" | grep -q "${gcs_tarball}" ; then
       # cache hit - unpack from cache
       echo "cache hit"
+      gcloud storage cat "${gcs_tarball}" | tar xvz
     else
       # build and cache
       pushd nccl
       # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
       install_build_dependencies
+      # https://github.com/NVIDIA/nccl/blob/master/README.md
+      # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+      # Fermi:     SM_20,             compute_30
+      # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+      # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+      # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+      # The following architectures are supported by the open kernel driver
+      # Volta:     SM_70,SM_72,       compute_70,compute_72
+      # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+      # The following architectures are supported by CUDA v11.8+
+      # Ada:       SM_89,             compute_89
+      # Hopper:    SM_90,SM_90a       compute_90,compute_90a
+      # Blackwell: SM_100,            compute_100
+                      NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
+      if version_gt "${CUDA_VERSION}" "11.6" ; then
+        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
+      if version_ge "${CUDA_VERSION}" "11.8" ; then
+        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
+      if version_ge "${CUDA_VERSION}" "12.0" ; then
+        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
+
       if is_debuntu ; then
         # These packages are required to build .deb packages from source
         execute_with_retries \
@@ -391,8 +369,8 @@ function install_nvidia_nccl() {
 }
 
 function install_nvidia_cudnn() {
+  if le_debian10 ; then return ; fi
   is_complete cudnn && return
-
   local major_version
   major_version="${CUDNN_VERSION%%.*}"
   local cudnn_pkg_version
@@ -427,9 +405,10 @@ function install_nvidia_cudnn() {
           apt-get -y install --no-install-recommends \
             "libcudnn8=${cudnn_pkg_version}" \
             "libcudnn8-dev=${cudnn_pkg_version}"
-	sync
+
+        sync
       elif is_cudnn9 ; then
-	install_cuda_keyring_pkg
+        install_cuda_keyring_pkg
 
         apt-get update -qq
 
@@ -438,7 +417,8 @@ function install_nvidia_cudnn() {
           "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
           "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
           "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
-	sync
+
+        sync
       else
         echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
       fi
@@ -462,8 +442,6 @@ function install_pytorch() {
   local envpath="${mc3}/envs/${env}"
   # Set numa node to 0 for all GPUs
   for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
-  local verb=create
-  if test -d "${envpath}" ; then verb=install ; fi
 
   readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
   case "${INCLUDE_PYTORCH^^}" in
@@ -479,6 +457,8 @@ function install_pytorch() {
         mkdir -p "${envpath}"
         gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
       else
+        local verb=create
+        if test -d "${envpath}" ; then verb=install ; fi
         cudart_spec="cuda-cudart"
         if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
         "${mc3}/bin/mamba" "${verb}" -n "${env}" \
@@ -536,7 +516,7 @@ function add_repo_cuda() {
 
 function build_driver_from_github() {
   # non-GPL driver will have been built on rocky8 or if driver version is prior to open kernel version
-  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then return 0 ; fi
+  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then return 0 ; fi
   pushd "${workdir}"
 
   test -d "${workdir}/open-gpu-kernel-modules" || {
@@ -554,7 +534,7 @@ function build_driver_from_github() {
     local def_dir="${modulus_md5sum:-unsigned}"
     local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
 
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
     if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
       echo "cache hit"
@@ -618,6 +598,7 @@ function build_driver_from_packages() {
     add_contrib_component
     apt-get update -qq
     execute_with_retries apt-get install -y -qq --no-install-recommends dkms
+    configure_dkms_certs
     execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
     sync
 
@@ -629,6 +610,7 @@ function build_driver_from_packages() {
     fi
     sync
   fi
+  clear_dkms_key
 }
 
 function install_nvidia_userspace_runfile() {
@@ -708,7 +690,7 @@ function install_nvidia_userspace_runfile() {
     --install-libglvnd \
     --tmpdir="${tmpdir}"
 
-  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then
+  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then
     if [[ "${cache_hit}" == "1" ]] ; then
       gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
       depmod -a
@@ -732,7 +714,7 @@ function install_cuda_runfile() {
   local local_fn="${tmpdir}/cuda.run"
 
   cache_fetched_package "${NVIDIA_CUDA_URL}" \
-			"${pkg_bucket}/${CUDA_RUNFILE}" \
+                        "${pkg_bucket}/${CUDA_RUNFILE}" \
                         "${local_fn}"
 
   execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index eea7b3dd5..69d55a2cb 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -7,50 +7,52 @@ function set_support_matrix() {
   # Minimum supported version for open kernel driver is 515.43.04
   # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
   local latest
-  latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
+  latest="$(curl -s https://us.download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
   readonly -A DRIVER_FOR_CUDA=(
-          ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01"
-          ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31"
-          ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03"
-          ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
-          ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08"
-          ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03"
+      ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01"
+      ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31"
+      ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03"
+      ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
+      ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08"
+      ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03"
   )
   readonly -A DRIVER_SUBVER=(
-          ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" ["430"]="430.64"
-          ["435"]="435.21" ["440"]="440.100" ["450"]="450.119.03"
-          ["455"]="455.45.01" ["460"]="460.91.03" ["465"]="465.31"
-          ["470"]="470.256.02" ["495"]="495.46" ["510"]="510.108.03"
-          ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05"
-          ["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.142"
-          ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.77"
+      ["410"]="410.104" ["415"]="415.27" ["418"]="418.113"
+      ["430"]="430.64" ["435"]="435.21" ["440"]="440.100"
+      ["450"]="450.119.03" ["455"]="455.45.01" ["460"]="460.91.03"
+      ["465"]="465.31" ["470"]="470.256.02" ["495"]="495.46"
+      ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05"
+      ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06"
+      ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03"
+      ["565"]="565.77"
   )
   # https://developer.nvidia.com/cudnn-downloads
   readonly -A CUDNN_FOR_CUDA=(
-          ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5" ["11.0"]="8.0.4"
-          ["11.1"]="8.0.5" ["11.2"]="8.1.1" ["11.3"]="8.2.1" ["11.4"]="8.2.4.15"
-          ["11.5"]="8.3.1.22" ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29"
-          ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28"
-          ["12.2"]="8.9.5" ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70"
-          ["12.5"]="9.2.1.18" ["12.6"]="9.6.0.74"
+      ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5"
+      ["11.0"]="8.0.4" ["11.1"]="8.0.5" ["11.2"]="8.1.1"
+      ["11.3"]="8.2.1" ["11.4"]="8.2.4.15" ["11.5"]="8.3.1.22"
+      ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17"
+      ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5"
+      ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18"
+      ["12.6"]="9.6.0.74"
   )
   # https://developer.nvidia.com/nccl/nccl-download
   readonly -A NCCL_FOR_CUDA=(
-          ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3"
-          ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4" ["11.5"]="2.11.4"
-          ["11.6"]="2.12.10" ["11.7"]="2.12.12" ["11.8"]="2.21.5"
-          ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.2"]="2.19.3"
-          ["12.3"]="2.19.4" ["12.4"]="2.23.4" ["12.5"]="2.22.3"
-          ["12.6"]="2.23.4"
+      ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3"
+      ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4"
+      ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12"
+      ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3"
+      ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4"
+      ["12.5"]="2.22.3" ["12.6"]="2.23.4"
   )
   readonly -A CUDA_SUBVER=(
-          ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89"
-          ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2"
-          ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2"
-          ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0"
-          ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2"
-          ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1"
-          ["12.6"]="12.6.3"
+      ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89"
+      ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2"
+      ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2"
+      ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0"
+      ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2"
+      ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1"
+      ["12.6"]="12.6.3"
   )
 }
 
@@ -71,7 +73,7 @@ function set_cuda_version() {
     local CUDA_URL_VERSION
     CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
     if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
-      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
+      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION}"
     fi
   fi
   readonly DEFAULT_CUDA_VERSION
@@ -114,10 +116,10 @@ function set_driver_version() {
     if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
       major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
       driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
-      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+      if curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
         # use the version indicated by the cuda url as the default if it exists
 	DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
-      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+      elif curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
         # use the maximum sub-version available for the major version indicated in cuda url as the default
 	DEFAULT_DRIVER="${driver_max_maj_version}"
       fi
diff --git a/templates/gpu/yarn_functions b/templates/gpu/yarn_functions
index d9040b1d6..c4194a2ea 100644
--- a/templates/gpu/yarn_functions
+++ b/templates/gpu/yarn_functions
@@ -52,6 +52,7 @@ EOF
   chmod a+rx "${gpus_resources_script}"
 
   local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
+  if version_lt "${SPARK_VERSION}" "3.0" ; then return ; fi
 
   local executor_cores
   executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
@@ -75,9 +76,9 @@ EOF
 # query explain output won't show GPU operator, if the user has doubts
 # they can uncomment the line before seeing the GPU plan explain;
 # having AQE enabled gives user the best performance.
+spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
 spark.executor.resource.gpu.amount=${gpu_count}
 spark.plugins=com.nvidia.spark.SQLPlugin
-spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
 spark.executor.cores=${executor_cores}
 spark.executor.memory=${executor_memory_gb}G
 spark.dynamicAllocation.enabled=false

From 989b445b20a2be99b22f169ab9e85f8def9be534 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 28 Jan 2025 17:39:25 -0800
Subject: [PATCH 130/130] changes from testing PR #1275

---
 templates/common/util_functions |  7 ++-
 templates/gpu/install_functions | 96 +++++++++++++++++++++------------
 templates/gpu/spark_functions   |  5 +-
 templates/gpu/util_functions    |  2 +-
 templates/gpu/yarn_functions    | 30 ++++++++++-
 5 files changed, 98 insertions(+), 42 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index aeea8a294..42f01278b 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -7,9 +7,9 @@ function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | c
 # ( version_ge 2.0 2.1 ) evaluates to false
 # ( version_ge 2.2 2.1 ) evaluates to true
 function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
-function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
+function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge "$1" "$2" ; )
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
-function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
+function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le "$1" "$2" ; )
 
 function define_os_comparison_functions() {
 
@@ -500,8 +500,7 @@ function harden_sshd_config() {
     sshd_config_line=$(
       (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
        ssh -Q "${ftr}" ) \
-      | sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}<STDIN>;
-      print("$ENV{feature} ",join(",",map{ chomp; $_ }@a), $/) if "@a"')
+      | sort -u | grep -v -ie sha1 -e md5 | paste -sd "," - | sed -e "/./s/^/${feature} /")
     grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
     echo "$sshd_config_line" >> /tmp/sshd_config_new
     # TODO: test whether sshd will reload with this change before mv
diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions
index 68183bc1f..0ed0e8c8f 100644
--- a/templates/gpu/install_functions
+++ b/templates/gpu/install_functions
@@ -119,7 +119,7 @@ function set_cuda_runfile_url() {
           ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
           ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
           ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
-          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05"
   )
 
   # Verify that the file with the indicated combination exists
@@ -200,6 +200,7 @@ function uninstall_local_cuda_repo(){
 }
 
 function install_local_cudnn_repo() {
+  # https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
   is_complete install-local-cudnn-repo && return
 
   pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
@@ -368,6 +369,7 @@ function install_nvidia_nccl() {
   mark_complete nccl
 }
 
+# https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
 function install_nvidia_cudnn() {
   if le_debian10 ; then return ; fi
   is_complete cudnn && return
@@ -435,45 +437,64 @@ function install_nvidia_cudnn() {
 }
 
 function install_pytorch() {
-  if test -f "${workdir}/complete/pytorch" ; then return ; fi
+  is_complete pytorch && return
+
   local env
   env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
   local mc3=/opt/conda/miniconda3
   local envpath="${mc3}/envs/${env}"
+  if [[ "${env}" == "base" ]]; then
+    echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi
   # Set numa node to 0 for all GPUs
   for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
 
-  readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
-  case "${INCLUDE_PYTORCH^^}" in
-    "1" | "YES" | "TRUE" )
-      local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
-      local local_tarball="${workdir}/${build_tarball}"
-      local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+  local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+  local local_tarball="${workdir}/${build_tarball}"
+  local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
 
-      output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
-      if echo "${output}" | grep -q "${gcs_tarball}" ; then
-        # cache hit - unpack from cache
-        echo "cache hit"
-        mkdir -p "${envpath}"
-        gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
-      else
-        local verb=create
-        if test -d "${envpath}" ; then verb=install ; fi
-        cudart_spec="cuda-cudart"
-        if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
-        "${mc3}/bin/mamba" "${verb}" -n "${env}" \
-          -c conda-forge -c nvidia -c rapidsai \
-          numba pytorch tensorflow[and-cuda] rapids pyspark \
-          "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
-        pushd "${envpath}"
-        tar czf "${local_tarball}" .
-        popd
-        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      fi
-      ;;
-    * ) echo "skip pytorch install" ;;
-  esac
-  touch "${workdir}/complete/pytorch"
+  if [[ "$(hostname -s)" =~ ^test && "$(nproc)" -lt 32 ]] ; then
+    # test hosts with fewer than 32 cores: stagger start, then poll while a .building marker exists in GCS
+    sleep $(( ( RANDOM % 11 ) + 10 ))
+    while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
+      sleep 5m
+    done
+  fi
+
+  output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+  if echo "${output}" | grep -q "${gcs_tarball}" ; then
+    # cache hit - unpack from cache
+    echo "cache hit"
+    mkdir -p "${envpath}"
+    gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+  else
+    touch "${local_tarball}.building"
+    gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+    local verb=create
+    if test -d "${envpath}" ; then verb=install ; fi
+    cudart_spec="cuda-cudart"
+    if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+
+    # Install pytorch and company to this environment
+    "${mc3}/bin/mamba" "${verb}" -n "${env}" \
+      -c conda-forge -c nvidia -c rapidsai \
+      numba pytorch tensorflow[and-cuda] rapids pyspark \
+      "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+
+    # Install jupyter kernel in this environment
+    "${envpath}/bin/python3" -m pip install ipykernel
+
+    # package environment and cache in GCS
+    pushd "${envpath}"
+    tar czf "${local_tarball}" .
+    popd
+    gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+  fi
+
+  # register the environment as a selectable kernel
+  "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})"
+
+  mark_complete pytorch
 }
 
 function add_nonfree_components() {
@@ -508,7 +529,16 @@ function add_repo_nvidia_container_toolkit() {
 
 function add_repo_cuda() {
   if is_debuntu ; then
-    install_cuda_keyring_pkg # 11.7+, 12.0+
+    if version_le "${CUDA_VERSION}" 11.6 ; then # pre-11.7: set up the apt source and key by hand
+      local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
+      local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
+      echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
+      | sudo tee "${sources_list_path}"
+      curl -fsSL --retry 3 "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
+        -o "${kr_path}"
+    else
+      install_cuda_keyring_pkg # 11.7+, 12.0+
+    fi
   elif is_rocky ; then
     execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
   fi
diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions
index fa29330de..731e01756 100644
--- a/templates/gpu/spark_functions
+++ b/templates/gpu/spark_functions
@@ -7,14 +7,15 @@ function download_spark_jar() {
 
 function install_spark_rapids() {
   # Update SPARK RAPIDS config
-  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_SPARK_RAPIDS_VERSION
+  DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
   local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
 
   if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
-    local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
   fi
 
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
index 69d55a2cb..0270b41f3 100644
--- a/templates/gpu/util_functions
+++ b/templates/gpu/util_functions
@@ -60,7 +60,7 @@ function set_cuda_version() {
   case "${DATAPROC_IMAGE_VERSION}" in
     "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
     "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
-    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
     *   )
       echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
       exit 1
diff --git a/templates/gpu/yarn_functions b/templates/gpu/yarn_functions
index c4194a2ea..d7accf8f1 100644
--- a/templates/gpu/yarn_functions
+++ b/templates/gpu/yarn_functions
@@ -12,6 +12,25 @@ function configure_yarn_gpu_resources() {
     'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
 
   set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+
+  # Older CapacityScheduler does not permit use of gpu resources ; switch to FairScheduler on 2.0 and below
+  if version_lt "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
+    local fs_xml="${HADOOP_CONF_DIR}/fair-scheduler.xml" # allocation file, written below
+    set_hadoop_property 'yarn-site.xml' \
+      'yarn.resourcemanager.scheduler.class' 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler'
+    set_hadoop_property 'yarn-site.xml' \
+      "yarn.scheduler.fair.user-as-default-queue" "false"
+    set_hadoop_property 'yarn-site.xml' \
+      "yarn.scheduler.fair.allocation.file" "${fs_xml}"
+    set_hadoop_property 'yarn-site.xml' \
+      'yarn.scheduler.fair.resource-calculator' 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+    cat > "${fs_xml}" <<EOF
+<!-- ${fs_xml} -->
+<allocations>
+  <queueMaxAppsDefault>1</queueMaxAppsDefault>
+</allocations>
+EOF
+  fi
 }
 
 function configure_gpu_script() {
@@ -44,9 +63,15 @@ function configure_gpu_script() {
 #
 # Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
 
+set -e
+resources_json="/dev/shm/nvidia/gpusResources.json"
+if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi
+
+mkdir -p "$(dirname ${resources_json})"
+
 ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
 
-echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
+echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}"
 EOF
 
   chmod a+rx "${gpus_resources_script}"
@@ -78,7 +103,6 @@ EOF
 # having AQE enabled gives user the best performance.
 spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
 spark.executor.resource.gpu.amount=${gpu_count}
-spark.plugins=com.nvidia.spark.SQLPlugin
 spark.executor.cores=${executor_cores}
 spark.executor.memory=${executor_memory_gb}G
 spark.dynamicAllocation.enabled=false
@@ -86,6 +110,7 @@ spark.dynamicAllocation.enabled=false
 spark.task.resource.gpu.amount=${gpu_amount}
 spark.task.cpus=2
 spark.yarn.unmanagedAM.enabled=false
+spark.plugins=com.nvidia.spark.SQLPlugin
 ###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
 EOF
 }
@@ -97,6 +122,7 @@ function configure_yarn_nodemanager_gpu() {
     'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
   set_hadoop_property 'yarn-site.xml' \
     'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
+
   configure_yarn_nodemanager
 }