From 4f49f650d172b865bbb753d37d760d19a36c3ddf Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 19 Dec 2024 18:34:05 -0800 Subject: [PATCH 001/130] [template] generate gpu/install_gpu_driver.sh from templates --- templates/common/template_disclaimer | 5 + templates/common/util_functions | 365 ++++++++ templates/generate-action.pl | 25 + templates/gpu/install_gpu_driver.sh.in | 280 ++++++ templates/gpu/util_functions | 1192 ++++++++++++++++++++++++ templates/legal/license_header | 11 + templates/secure-boot/util_functions | 105 +++ 7 files changed, 1983 insertions(+) create mode 100644 templates/common/template_disclaimer create mode 100644 templates/common/util_functions create mode 100644 templates/generate-action.pl create mode 100644 templates/gpu/install_gpu_driver.sh.in create mode 100644 templates/gpu/util_functions create mode 100644 templates/legal/license_header create mode 100644 templates/secure-boot/util_functions diff --git a/templates/common/template_disclaimer b/templates/common/template_disclaimer new file mode 100644 index 000000000..3b417deff --- /dev/null +++ b/templates/common/template_disclaimer @@ -0,0 +1,5 @@ +# This initialization action is generated from +# initialization-actions/templates/[% template_path %] +# +# Modifications made directly to the generated file will be lost when +# the template is re-evaluated diff --git a/templates/common/util_functions b/templates/common/util_functions new file mode 100644 index 000000000..5b85cad65 --- /dev/null +++ b/templates/common/util_functions @@ -0,0 +1,365 @@ +function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) + +function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; ) +function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) +function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) +function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) + +readonly -A supported_os=( + ['debian']="10 11 12" + ['rocky']="8 9" + ['ubuntu']="18.04 20.04 22.04" +) + +# dynamically define OS version test utility functions +if [[ "$(os_id)" == "rocky" ]]; +then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') +else _os_version="$(os_version)"; fi +for os_id_val in 'rocky' 'ubuntu' 'debian' ; do + eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" + + for osver in $(echo "${supported_os["${os_id_val}"]}") ; do + eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" + eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" + eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" + done +done + +function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) + +function os_vercat() ( set +x + if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' + elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' + else os_version ; fi ; ) + +function repair_old_backports { + if ! is_debuntu ; then return ; fi + # This script uses 'apt-get update' and is therefore potentially dependent on + # backports repositories which have been archived. In order to mitigate this + # problem, we will use archive.debian.org for the oldoldstable repo + + # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 + debdists="https://deb.debian.org/debian/dists" + oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); + oldstable=$( curl -s "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); + stable=$( curl -s "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); + + matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) + + for filename in "${matched_files[@]}"; do + # Fetch from archive.debian.org for ${oldoldstable}-backports + perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports } + {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}" + done +} + +function print_metadata_value() { + local readonly tmpfile=$(mktemp) + http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ + -s -o ${tmpfile} 2>/dev/null) + local readonly return_code=$? + # If the command completed successfully, print the metadata value to stdout. + if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then + cat ${tmpfile} + fi + rm -f ${tmpfile} + return ${return_code} +} + +function print_metadata_value_if_exists() { + local return_code=1 + local readonly url=$1 + print_metadata_value ${url} + return_code=$? + return ${return_code} +} + +# replicates /usr/share/google/get_metadata_value +function get_metadata_value() ( + set +x + local readonly varname=$1 + local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 + # Print the instance metadata value. + print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} + return_code=$? + # If the instance doesn't have the value, try the project. + if [[ ${return_code} != 0 ]]; then + print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} + return_code=$? + fi + + return ${return_code} +) + +function get_metadata_attribute() ( + set +x + local -r attribute_name="$1" + local -r default_value="${2:-}" + get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" +) + +function execute_with_retries() ( + set +x + local -r cmd="$*" + + if [[ "$cmd" =~ "^apt-get install" ]] ; then + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + fi + for ((i = 0; i < 3; i++)); do + set -x + time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + set +x + if [[ $retval == 0 ]] ; then return 0 ; fi + sleep 5 + done + return 1 +) + +function cache_fetched_package() { + local src_url="$1" + local gcs_fn="$2" + local local_fn="$3" + + if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then + time gcloud storage cp "${gcs_fn}" "${local_fn}" + else + time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ + gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) + fi +} + +function add_contrib_component() { + if ge_debian12 ; then + # Include in sources file components on which nvidia-kernel-open-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list + fi +} + +function set_hadoop_property() { + local -r config_file=$1 + local -r property=$2 + local -r value=$3 + "${bdcfg}" set_property \ + --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ + --name "${property}" --value "${value}" \ + --clobber +} + +function configure_yarn_resources() { + if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' +} + +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' + + # Fix local dirs access permissions + local yarn_local_dirs=() + + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') + + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi +} + +function clean_up_sources_lists() { + # + # bigtop (primary) + # + local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" + + if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then + region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" + + local regional_bigtop_repo_uri + regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | + sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | + grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | + cut -d ' ' -f 2 | + head -1) + + if [[ "${regional_bigtop_repo_uri}" == */ ]]; then + local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" + else + local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" + fi + + local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" + rm -f "${bigtop_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ + "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" + + sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + fi + + # + # adoptium + # + # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu + local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" + local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" + rm -f "${adoptium_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ + | gpg --dearmor -o "${adoptium_kr_path}" + echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ + > /etc/apt/sources.list.d/adoptium.list + + + # + # docker + # + local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" + local docker_repo_file="/etc/apt/sources.list.d/docker.list" + local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" + + rm -f "${docker_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ + | gpg --dearmor -o "${docker_kr_path}" + echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ + > ${docker_repo_file} + + # + # google cloud + logging/monitoring + # + if ls /etc/apt/sources.list.d/google-cloud*.list ; then + rm -f /usr/share/keyrings/cloud.google.gpg + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg + for list in google-cloud google-cloud-logging google-cloud-monitoring ; do + list_file="/etc/apt/sources.list.d/${list}.list" + if [[ -f "${list_file}" ]]; then + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" + fi + done + fi + + # + # cran-r + # + if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" + if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi + rm -f /usr/share/keyrings/cran-r.gpg + curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ + gpg --dearmor -o /usr/share/keyrings/cran-r.gpg + sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list + fi + + # + # mysql + # + if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then + rm -f /usr/share/keyrings/mysql.gpg + curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ + gpg --dearmor -o /usr/share/keyrings/mysql.gpg + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list + fi + + if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi + +} + +function set_proxy(){ + METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" + + if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi + + export METADATA_HTTP_PROXY + export http_proxy="${METADATA_HTTP_PROXY}" + export https_proxy="${METADATA_HTTP_PROXY}" + export HTTP_PROXY="${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="${METADATA_HTTP_PROXY}" + no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" + local no_proxy_svc + for no_proxy_svc in compute secretmanager dns servicedirectory logging \ + bigquery composer pubsub bigquerydatatransfer dataflow \ + storage datafusion ; do + no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" + done + + export NO_PROXY="${no_proxy}" +} + +function mount_ramdisk(){ + local free_mem + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + + # Write to a ramdisk instead of churning the persistent disk + + tmpdir="/mnt/shm" + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" + + # Download conda packages to tmpfs + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" + + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge + pip cache purge || echo "unable to purge pip cache" + + # Download pip packages to tmpfs + pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives + else + mount -t tmpfs tmpfs /var/cache/dnf + fi +} + +function check_os() { + if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then + echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." + exit 1 + elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then + echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." + exit 1 + elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then + echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." + exit 1 + fi +} + +readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" + +# Dataproc configurations +readonly HADOOP_CONF_DIR='/etc/hadoop/conf' +readonly HIVE_CONF_DIR='/etc/hive/conf' +readonly SPARK_CONF_DIR='/etc/spark/conf' diff --git a/templates/generate-action.pl b/templates/generate-action.pl new file mode 100644 index 000000000..407dfe310 --- /dev/null +++ b/templates/generate-action.pl @@ -0,0 +1,25 @@ +#!/usr/bin/perl -w +# -*-CPerl-*- + +# Usage: Run this script from the root directory of the git clone: +# perl templates/generate-action.pl gpu/install_gpu_driver.sh + +use Template; +use strict; +use v5.10; + +my $tt = Template->new( { + INCLUDE_PATH => "$ENV{PWD}/templates", + INTERPOLATE => 0, +}) || die "$Template::ERROR$/"; + +my $action = $ARGV[0]; + +sub usage{ + die "Usage: $0 "; +} + +usage unless( -f "$ENV{PWD}/templates/${action}.in" ); + +$tt->process("${action}.in") + || die $tt->error(), "\n"; diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in new file mode 100644 index 000000000..e4924f51e --- /dev/null +++ b/templates/gpu/install_gpu_driver.sh.in @@ -0,0 +1,280 @@ +#!/bin/bash +# +[% template_path="gpu/install_gpu_driver.sh.in" %] +[% INSERT legal/license_header %] +# +[% PROCESS common/template_disclaimer %] +# +# This script installs NVIDIA GPU drivers and collects GPU utilization metrics. + +set -euxo pipefail + +[% INSERT common/util_functions %] + +[% INSERT gpu/util_functions %] + +[% INSERT 'secure-boot/util_functions' %] + +function main() { + # This configuration should be run on all nodes + # regardless if they have attached GPUs + configure_yarn_resources + + # Detect NVIDIA GPU + if (lspci | grep -q NVIDIA); then + # if this is called without the MIG script then the drivers are not installed + migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" + if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" + + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then + if (echo "${migquery_result}" | grep Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` + fetch_mig_scripts + fi + fi + fi + + # if mig is enabled drivers would have already been installed + if [[ $IS_MIG_ENABLED -eq 0 ]]; then + install_nvidia_gpu_driver + install_nvidia_container_toolkit + install_cuda + load_kernel_module + + if [[ -n ${CUDNN_VERSION} ]]; then + install_nvidia_nccl + install_nvidia_cudnn + fi + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + #install_ops_agent + install_gpu_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent will not be installed.' + fi + + # for some use cases, the kernel module needs to be removed before first use of nvidia-smi + for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do + rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + done + + MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")" + if test -n "$(nvsmi -L)" ; then + # cache the result of the gpu query + ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') + echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt" + fi + NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")" + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + # enable MIG on every GPU + for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do + nvsmi -i "${GPU_ID}" --multi-instance-gpu 1 + done + + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)" + fetch_mig_scripts + else + configure_gpu_exclusive_mode + fi + fi + + configure_yarn_nodemanager + configure_gpu_script + configure_gpu_isolation + elif [[ "${ROLE}" == "Master" ]]; then + configure_yarn_nodemanager + configure_gpu_script + fi + + # Restart YARN services if they are running already + if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then + systemctl restart hadoop-yarn-resourcemanager.service + fi + if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then + systemctl restart hadoop-yarn-nodemanager.service + fi +} + +function exit_handler() { + # Purge private key material until next grant + clear_dkms_key + + set +ex + echo "Exit handler invoked" + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + done + + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi + + if is_debuntu ; then + # Clean up OS package cache + apt-get -y -qq clean + apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove + # re-hold systemd package + if ge_debian12 ; then + apt-mark hold systemd libsystemd0 ; fi + else + dnf clean all + fi + + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 | sort -h + elif is_debian ; then + du -x -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /var/lib/{docker,mysql,} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ + /usr/bin \ + /usr \ + /var \ + / 2>/dev/null | sort -h + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib64/google-cloud-sdk \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + fi + + # Process disk usage logs from installation period + rm -f /run/keep-running-df + sync + sleep 5.01s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" + + perl -e '@siz=( sort { $a => $b } + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" + + echo "exit_handler has completed" + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero + fi + + return 0 +} + +function prepare_to_install(){ + # Verify OS compatability and Secure boot state + check_os + check_secure_boot + + prepare_gpu_env + + OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" + readonly OS_NAME + + # node role + ROLE="$(get_metadata_attribute dataproc-role)" + readonly ROLE + + workdir=/opt/install-dpgce + tmpdir=/tmp/ + temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + readonly temp_bucket + readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" + uname_r=$(uname -r) + readonly uname_r + readonly bdcfg="/usr/local/bin/bdconfig" + export DEBIAN_FRONTEND=noninteractive + + mkdir -p "${workdir}" + trap exit_handler EXIT + set_proxy + mount_ramdisk + + readonly install_log="${tmpdir}/install.log" + + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi + + if test -f "${workdir}/prepare-complete" ; then return ; fi + + repair_old_backports + + if is_debuntu ; then + clean_up_sources_lists + apt-get update -qq + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + if ge_debian12 ; then + apt-mark unhold systemd libsystemd0 ; fi + else + dnf clean all + fi + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e + time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero + ) fi + + install_dependencies + + # Monitor disk usage in a screen session + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" + screen -d -m -LUS keep-running-df \ + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" + + touch "${workdir}/prepare-complete" +} + +prepare_to_install + +main diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions new file mode 100644 index 000000000..5727da537 --- /dev/null +++ b/templates/gpu/util_functions @@ -0,0 +1,1192 @@ +function set_support_matrix() { + # CUDA version and Driver version + # https://docs.nvidia.com/deploy/cuda-compatibility/ + # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html + # https://developer.nvidia.com/cuda-downloads + + # Minimum supported version for open kernel driver is 515.43.04 + # https://github.com/NVIDIA/open-gpu-kernel-modules/tags + # Rocky8: 12.0: 525.147.05 + local latest + latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" + readonly -A DRIVER_FOR_CUDA=( + ["11.7"]="515.65.01" ["11.8"]="525.147.05" + ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ) + readonly -A DRIVER_SUBVER=( + ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" + ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" + ) + # https://developer.nvidia.com/cudnn-downloads + if is_debuntu ; then + readonly -A CUDNN_FOR_CUDA=( + ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" + ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" + ) + elif is_rocky ; then + # rocky: + # 12.0: 8.8.1.3 + # 12.1: 8.9.3.28 + # 12.2: 8.9.7.29 + # 12.3: 9.0.0.312 + # 12.4: 9.1.1.17 + # 12.5: 9.2.1.18 + # 12.6: 9.5.1.17 + readonly -A CUDNN_FOR_CUDA=( + ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" + ) + fi + # https://developer.nvidia.com/nccl/nccl-download + # 12.2: 2.19.3, 12.5: 2.21.5 + readonly -A NCCL_FOR_CUDA=( + ["11.7"]="2.21.5" ["11.8"]="2.21.5" + ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" + ) + readonly -A CUDA_SUBVER=( + ["11.7"]="11.7.1" ["11.8"]="11.8.0" + ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" + ) +} + +set_support_matrix + +function set_cuda_version() { + local cuda_url + cuda_url=$(get_metadata_attribute 'cuda-url' '') + if [[ -n "${cuda_url}" ]] ; then + # if cuda-url metadata variable has been passed, extract default version from url + local CUDA_URL_VERSION + CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" + if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then + DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" + CUDA_FULL_VERSION="${CUDA_URL_VERSION}" + fi + fi + + if ( ! test -v DEFAULT_CUDA_VERSION ) ; then + DEFAULT_CUDA_VERSION='12.4' + fi + readonly DEFAULT_CUDA_VERSION + + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") + readonly CUDA_VERSION + if ( ! test -v CUDA_FULL_VERSION ) ; then + CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} + fi + readonly CUDA_FULL_VERSION + +} + +set_cuda_version + +function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) +function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) +function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; ) + +function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) +function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; ) +function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; ) + +function set_driver_version() { + local gpu_driver_url + gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '') + + local cuda_url + cuda_url=$(get_metadata_attribute 'cuda-url' '') + + local DEFAULT_DRIVER + # Take default from gpu-driver-url metadata value + if [[ -n "${gpu_driver_url}" ]] ; then + DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')" + if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi + # Take default from cuda-url metadata value as a backup + elif [[ -n "${cuda_url}" ]] ; then + local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')" + if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then + major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" + driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} + if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then + # use the version indicated by the cuda url as the default if it exists + DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" + elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then + # use the maximum sub-version available for the major version indicated in cuda url as the default + DEFAULT_DRIVER="${driver_max_maj_version}" + fi + fi + fi + + if ( ! test -v DEFAULT_DRIVER ) ; then + # If a default driver version has not been extracted, use the default for this version of CUDA + DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} + fi + + DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") + + readonly DRIVER_VERSION + readonly DRIVER="${DRIVER_VERSION%%.*}" + + export DRIVER_VERSION DRIVER + + gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" + exit 1 + fi +} + +set_driver_version + +readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" +readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" + +# Parameters for NVIDIA-provided cuDNN library +readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} +CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") +function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) +function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) +# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} +if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then + CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" +elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then + # cuDNN v8 is not distribution for ubuntu20+, debian12 + CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" +elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then + # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 + CUDNN_VERSION="8.8.0.121" +fi +readonly CUDNN_VERSION + +readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} +readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) + +# Parameters for NVIDIA-provided Debian GPU driver +readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + +readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") + +USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" +readonly USERSPACE_FILENAME + +# Short name for urls +if is_ubuntu22 ; then + # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at + # https://developer.download.nvidia.com/compute/machine-learning/repos/ + # use packages from previous release until such time as nvidia + # release ubuntu2204 builds + + shortname="$(os_id)$(os_vercat)" + nccl_shortname="ubuntu2004" +elif ge_rocky9 ; then + # use packages from previous release until such time as nvidia + # release rhel9 builds + + shortname="rhel9" + nccl_shortname="rhel8" +elif is_rocky ; then + shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" + nccl_shortname="${shortname}" +else + shortname="$(os_id)$(os_vercat)" + nccl_shortname="${shortname}" +fi + +# Parameters for NVIDIA-provided package repositories +readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' +readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" + +# Parameters for NVIDIA-provided NCCL library +readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb" +NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") +readonly NCCL_REPO_URL +readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub + +function set_cuda_runfile_url() { + local MAX_DRIVER_VERSION + local MAX_CUDA_VERSION + + local MIN_OPEN_DRIVER_VER="515.48.07" + local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" + local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER + + if is_cuda12 ; then + if is_debian12 ; then + MIN_DRIVER_VERSION="545.23.06" + MIN_CUDA_VERSION="12.3.0" + elif is_debian10 ; then + MAX_DRIVER_VERSION="555.42.02" + MAX_CUDA_VERSION="12.5.0" + elif is_ubuntu18 ; then + MAX_DRIVER_VERSION="530.30.02" + MAX_CUDA_VERSION="12.1.1" + fi + elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + if le_debian10 ; then + # cuda 11 is not supported for <= debian10 + MAX_CUDA_VERSION="0" + MAX_DRIVER_VERSION="0" + fi + else + echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + fi + + if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then + echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + fi + if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then + echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" + elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then + echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" + fi + + # driver version named in cuda runfile filename + # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) + readonly -A drv_for_cuda=( + ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" + ["11.8.0"]="520.61.05" + ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" + ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" + ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" + ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" + ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ + ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not + ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" + ) + + # Verify that the file with the indicated combination exists + local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]} + CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run" + local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" + local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" + + NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") + readonly NVIDIA_CUDA_URL + + CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" + readonly CUDA_RUNFILE + + if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" + exit 1 + fi + + if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then + echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" + elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then + echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then + echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then + echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" + fi +} + +set_cuda_runfile_url + +# Parameter for NVIDIA-provided Rocky Linux GPU driver +readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" + +CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" +CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" +if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then + # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" + if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then + # When cuDNN version is greater than or equal to 8.4.1.50 use this format + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" + fi + # Use legacy url format with one of the tarball name formats depending on version as above + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" +fi +if ( version_ge "${CUDA_VERSION}" "12.0" ); then + # Use modern url format When cuda version is greater than or equal to 12.0 + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" +fi +readonly CUDNN_TARBALL +readonly CUDNN_TARBALL_URL + +# Whether to install NVIDIA-provided or OS-provided GPU driver +GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') +readonly GPU_DRIVER_PROVIDER + +# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver +INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') +readonly INSTALL_GPU_AGENT + +NVIDIA_SMI_PATH='/usr/bin' +MIG_MAJOR_CAPS=0 +IS_MIG_ENABLED=0 + +CUDA_KEYRING_PKG_INSTALLED="0" +function install_cuda_keyring_pkg() { + if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi + local kr_ver=1.1 + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ + -o "${tmpdir}/cuda-keyring.deb" + dpkg -i "${tmpdir}/cuda-keyring.deb" + rm -f "${tmpdir}/cuda-keyring.deb" + CUDA_KEYRING_PKG_INSTALLED="1" +} + +function uninstall_cuda_keyring_pkg() { + apt-get purge -yq cuda-keyring + CUDA_KEYRING_PKG_INSTALLED="0" +} + +function install_local_cuda_repo() { + if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi + + if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi + CUDA_LOCAL_REPO_INSTALLED="1" + pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" + CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" + readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" + readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" + readonly DIST_KEYRING_DIR="/var/${pkgname}" + + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" + + dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" + rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" + cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ + + if is_ubuntu ; then + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ + -o /etc/apt/preferences.d/cuda-repository-pin-600 + fi + + touch "${workdir}/install-local-cuda-repo-complete" +} +function uninstall_local_cuda_repo(){ + apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" + rm -f "${workdir}/install-local-cuda-repo-complete" +} + +CUDNN_PKG_NAME="" +function install_local_cudnn_repo() { + if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi + pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" + CUDNN_PKG_NAME="${pkgname}" + local_deb_fn="${pkgname}_1.0-1_amd64.deb" + local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" + + # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${local_deb_url}" -o "${tmpdir}/local-installer.deb" + + dpkg -i "${tmpdir}/local-installer.deb" + + rm -f "${tmpdir}/local-installer.deb" + + cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings + + touch "${workdir}/install-local-cudnn-repo-complete" +} + +function uninstall_local_cudnn_repo() { + apt-get purge -yq "${CUDNN_PKG_NAME}" + rm -f "${workdir}/install-local-cudnn-repo-complete" +} + +CUDNN8_LOCAL_REPO_INSTALLED="0" +CUDNN8_PKG_NAME="" +function install_local_cudnn8_repo() { + if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi + + if is_ubuntu ; then cudnn8_shortname="ubuntu2004" + elif is_debian ; then cudnn8_shortname="debian11" + else return 0 ; fi + if is_cuda12 ; then CUDNN8_CUDA_VER=12.0 + elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8 + else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi + cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" + + pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" + CUDNN8_PKG_NAME="${pkgname}" + + deb_fn="${pkgname}_1.0-1_amd64.deb" + local_deb_fn="${tmpdir}/${deb_fn}" + local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" + + # cache the cudnn package + cache_fetched_package "${local_deb_url}" \ + "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ + "${local_deb_fn}" + + local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" + # If we are using a ram disk, mount another where we will unpack the cudnn local installer + if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then + mkdir -p "${cudnn_path}" + mount -t tmpfs tmpfs "${cudnn_path}" + fi + + dpkg -i "${local_deb_fn}" + + rm -f "${local_deb_fn}" + + cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings + touch "${workdir}/install-local-cudnn8-repo-complete" +} + +function uninstall_local_cudnn8_repo() { + apt-get purge -yq "${CUDNN8_PKG_NAME}" + rm -f "${workdir}/install-local-cudnn8-repo-complete" +} + +function install_nvidia_nccl() { + if test -f "${workdir}/nccl-complete" ; then return ; fi + + if is_cuda11 && is_debian12 ; then + echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" + return + fi + + local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" + + # https://github.com/NVIDIA/nccl/blob/master/README.md + # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Fermi: SM_20, compute_30 + # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 + # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 + # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 + + # The following architectures are suppored by open kernel driver + # Volta: SM_70,SM_72, compute_70,compute_72 + # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 + + # The following architectures are supported by CUDA v11.8+ + # Ada: SM_89, compute_89 + # Hopper: SM_90,SM_90a compute_90,compute_90a + # Blackwell: SM_100, compute_100 + NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" + if version_ge "${CUDA_VERSION}" "11.8" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" + fi + if version_ge "${CUDA_VERSION}" "12.0" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" + fi + + mkdir -p "${workdir}" + pushd "${workdir}" + + test -d "${workdir}/nccl" || { + local tarball_fn="v${NCCL_VERSION}-1.tar.gz" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ + | tar xz + mv "nccl-${NCCL_VERSION}-1" nccl + } + + local build_path + if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else + build_path="nccl/build/pkg/rpm/x86_64" ; fi + + test -d "${workdir}/nccl/build" || { + local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" + + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + if echo "${output}" | grep -q "${gcs_tarball}" ; then + # cache hit - unpack from cache + echo "cache hit" + else + # build and cache + pushd nccl + # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install + install_build_dependencies + if is_debuntu ; then + # These packages are required to build .deb packages from source + execute_with_retries \ + apt-get install -y -qq build-essential devscripts debhelper fakeroot + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.debian.build + elif is_rocky ; then + # These packages are required to build .rpm packages from source + execute_with_retries \ + dnf -y -q install rpm-build rpmdevtools + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.redhat.build + fi + tar czvf "/${local_tarball}" "../${build_path}" + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd + fi + gcloud storage cat "${gcs_tarball}" | tar xz + } + + if is_debuntu ; then + dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb" + elif is_rocky ; then + rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm" + fi + + popd + touch "${workdir}/nccl-complete" +} + +function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) +function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) + +function install_nvidia_cudnn() { + if test -f "${workdir}/cudnn-complete" ; then return ; fi + local major_version + major_version="${CUDNN_VERSION%%.*}" + local cudnn_pkg_version + cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" + + if is_rocky ; then + if is_cudnn8 ; then + execute_with_retries dnf -y -q install \ + "libcudnn${major_version}" \ + "libcudnn${major_version}-devel" + sync + elif is_cudnn9 ; then + execute_with_retries dnf -y -q install \ + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" + sync + else + echo "Unsupported cudnn version: '${major_version}'" + fi + elif is_debuntu; then + if ge_debian12 && is_src_os ; then + apt-get -y install nvidia-cudnn + else + if is_cudnn8 ; then + install_local_cudnn8_repo + + apt-get update -qq + + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn8=${cudnn_pkg_version}" \ + "libcudnn8-dev=${cudnn_pkg_version}" + + uninstall_local_cudnn8_repo + sync + elif is_cudnn9 ; then + install_cuda_keyring_pkg + + apt-get update -qq + + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" + sync + else + echo "Unsupported cudnn version: [${CUDNN_VERSION}]" + fi + fi + else + echo "Unsupported OS: '${_shortname}'" + exit 1 + fi + + ldconfig + + echo "NVIDIA cuDNN successfully installed for ${_shortname}." + touch "${workdir}/cudnn-complete" +} + +function add_nonfree_components() { + if is_src_nvidia ; then return; fi + if ge_debian12 ; then + # Include in sources file components on which nvidia-open-kernel-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib non-free non-free-firmware" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list + fi +} + +function add_repo_nvidia_container_toolkit() { + if is_debuntu ; then + local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list + # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html + test -f "${kr_path}" || + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o "${kr_path}" + + test -f "${sources_list_path}" || + curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \ + | tee "${sources_list_path}" + apt-get update + else + curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ + tee /etc/yum.repos.d/nvidia-container-toolkit.repo + fi +} + +function add_repo_cuda() { + if is_debuntu ; then + install_cuda_keyring_pkg # 11.7+, 12.0+ + elif is_rocky ; then + execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" + fi +} + +function build_driver_from_github() { + # non-GPL driver will have been built on rocky8 + if is_rocky8 ; then return 0 ; fi + pushd "${workdir}" + + test -d "${workdir}/open-gpu-kernel-modules" || { + local tarball_fn="${DRIVER_VERSION}.tar.gz" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ + | tar xz + mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules + } + + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local build_dir + if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] + then build_dir="${modulus_md5sum}" + else build_dir="unsigned" ; fi + + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + echo "cache hit" + else + # build the kernel modules + pushd open-gpu-kernel-modules + install_build_dependencies + if is_cuda11 && is_ubuntu22 ; then + echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" + exit 1 + fi + execute_with_retries make -j$(nproc) modules \ + > kernel-open/build.log \ + 2> kernel-open/build_error.log + # Sign kernel modules + if [[ -n "${PSN}" ]]; then + for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do + "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ + "${mok_key}" \ + "${mok_der}" \ + "${module}" + done + fi + make modules_install \ + >> kernel-open/build.log \ + 2>> kernel-open/build_error.log + # Collect build logs and installed binaries + tar czvf "${local_tarball}" \ + "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd + fi + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a + } + + popd +} + +function build_driver_from_packages() { + if is_debuntu ; then + if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then + local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else + local pkglist=("nvidia-driver-${DRIVER}-open") ; fi + if is_debian ; then + pkglist=( + "firmware-nvidia-gsp=${DRIVER_VERSION}-1" + "nvidia-smi=${DRIVER_VERSION}-1" + "nvidia-alternative=${DRIVER_VERSION}-1" + "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1" + "nvidia-kernel-support=${DRIVER_VERSION}-1" + "nvidia-modprobe=${DRIVER_VERSION}-1" + "libnvidia-ml1=${DRIVER_VERSION}-1" + ) + fi + add_contrib_component + apt-get update -qq + execute_with_retries apt-get install -y -qq --no-install-recommends dkms + #configure_dkms_certs + execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" + sync + + elif is_rocky ; then + #configure_dkms_certs + if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then + echo "nvidia-driver:${DRIVER}-dkms installed successfully" + else + execute_with_retries dnf -y -q module install 'nvidia-driver:latest' + fi + sync + fi + #clear_dkms_key +} + +function install_nvidia_userspace_runfile() { + + # This .run file contains NV's OpenGL implementation as well as + # nvidia optimized implementations of the gtk+ 2,3 stack(s) not + # including glib (https://docs.gtk.org/glib/), and what appears to + # be a copy of the source from the kernel-open directory of for + # example DRIVER_VERSION=560.35.03 + # + # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz + # + # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run + # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. + if test -f "${workdir}/userspace-complete" ; then return ; fi + local local_fn="${tmpdir}/userspace.run" + + cache_fetched_package "${USERSPACE_URL}" \ + "${pkg_bucket}/${USERSPACE_FILENAME}" \ + "${local_fn}" + + local runfile_args + runfile_args="" + local cache_hit="0" + local local_tarball + + if is_rocky8 ; then + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local_tarball="${workdir}/${build_tarball}" + local build_dir + if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] + then build_dir="${modulus_md5sum}" + else build_dir="unsigned" ; fi + + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + cache_hit="1" + runfile_args="--no-kernel-modules" + echo "cache hit" + else + install_build_dependencies + + local signing_options + signing_options="" + if [[ -n "${PSN}" ]]; then + signing_options="--module-signing-hash sha256 \ + --module-signing-x509-hash sha256 \ + --module-signing-secret-key \"${mok_key}\" \ + --module-signing-public-key \"${mok_der}\" \ + --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ + " + fi + + runfile_args="--no-dkms ${signing_options}" + fi + } + else + runfile_args="--no-kernel-modules" + fi + + execute_with_retries bash "${local_fn}" -e -q \ + ${runfile_args} \ + --ui=none \ + --install-libglvnd \ + --tmpdir="${tmpdir}" + + if is_rocky8 ; then + if [[ "${cache_hit}" == "1" ]] ; then + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a + else + tar czvf "${local_tarball}" \ + /var/log/nvidia-installer.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + fi + fi + + rm -f "${local_fn}" + touch "${workdir}/userspace-complete" + sync +} + +function install_cuda_runfile() { + if test -f "${workdir}/cuda-complete" ; then return ; fi + local local_fn="${tmpdir}/cuda.run" + + cache_fetched_package "${NVIDIA_CUDA_URL}" \ + "${pkg_bucket}/${CUDA_RUNFILE}" \ + "${local_fn}" + + execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" + rm -f "${local_fn}" + touch "${workdir}/cuda-complete" + sync +} + +function install_cuda_toolkit() { + local cudatk_package=cuda-toolkit + if ge_debian12 && is_src_os ; then + cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1" + elif [[ -n "${CUDA_VERSION}" ]]; then + cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}" + fi + cuda_package="cuda=${CUDA_FULL_VERSION}-1" + readonly cudatk_package + if is_debuntu ; then +# if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi + execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} + elif is_rocky ; then + # rocky9: cuda-11-[7,8], cuda-12-[1..6] + execute_with_retries dnf -y -q install "${cudatk_package}" + fi + sync +} + +function load_kernel_module() { + # for some use cases, the kernel module needs to be removed before first use of nvidia-smi + for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do + rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + done + + depmod -a + modprobe nvidia + for suffix in uvm modeset drm; do + modprobe "nvidia-${suffix}" + done + # TODO: if peermem is available, also modprobe nvidia-peermem +} + +function install_cuda(){ + if test -f "${workdir}/cuda-repo-complete" ; then return ; fi + + if ( ge_debian12 && is_src_os ) ; then + echo "installed with the driver on ${_shortname}" + return 0 + fi + + # The OS package distributions are unreliable + install_cuda_runfile + + # Includes CUDA packages + add_repo_cuda + + touch "${workdir}/cuda-repo-complete" +} + +function install_nvidia_container_toolkit() { + local container_runtime_default + if command -v docker ; then container_runtime_default='docker' + elif command -v containerd ; then container_runtime_default='containerd' + elif command -v crio ; then container_runtime_default='crio' + else container_runtime_default='' ; fi + CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}") + + if test -z "${CONTAINER_RUNTIME}" ; then return ; fi + + add_repo_nvidia_container_toolkit + if is_debuntu ; then + execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else + execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi + nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" + systemctl restart "${CONTAINER_RUNTIME}" +} + +# Install NVIDIA GPU driver provided by NVIDIA +function install_nvidia_gpu_driver() { + if test -f "${workdir}/gpu-driver-complete" ; then return ; fi + + if ( ge_debian12 && is_src_os ) ; then + add_nonfree_components + apt-get update -qq + apt-get -yq install \ + dkms \ + nvidia-open-kernel-dkms \ + nvidia-open-kernel-support \ + nvidia-smi \ + libglvnd0 \ + libcuda1 + echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully" + return 0 + fi + + # OS driver packages do not produce reliable driver ; use runfile + install_nvidia_userspace_runfile + + build_driver_from_github + + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + touch "${workdir}/gpu-driver-complete" +} + +function install_ops_agent(){ + if test -f "${workdir}/ops-agent-complete" ; then return ; fi + + mkdir -p /opt/google + cd /opt/google + # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation + curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install + + touch "${workdir}/ops-agent-complete" +} + +# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics +function install_gpu_agent() { + # Stackdriver GPU agent parameters +# local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' + local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics' + if ( ! command -v pip && is_debuntu ) ; then + execute_with_retries "apt-get install -y -qq python3-pip" + fi + local install_dir=/opt/gpu-utilization-agent + mkdir -p "${install_dir}" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ + | sed -e 's/-u --format=/--format=/' \ + | dd status=none of="${install_dir}/report_gpu_metrics.py" + local venv="${install_dir}/venv" + python3 -m venv "${venv}" +( + source "${venv}/bin/activate" + python3 -m pip install --upgrade pip + execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" +) + sync + + # Generate GPU service. + cat </lib/systemd/system/gpu-utilization-agent.service +[Unit] +Description=GPU Utilization Metric Agent + +[Service] +Type=simple +PIDFile=/run/gpu_agent.pid +ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' +User=root +Group=root +WorkingDirectory=/ +Restart=always + +[Install] +WantedBy=multi-user.target +EOF + # Reload systemd manager configuration + systemctl daemon-reload + # Enable gpu-utilization-agent service + systemctl --no-reload --now enable gpu-utilization-agent.service +} + +function configure_gpu_exclusive_mode() { + # check if running spark 3, if not, enable GPU exclusive mode + local spark_version + spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) + if [[ ${spark_version} != 3.* ]]; then + # include exclusive mode on GPU + nvidia-smi -c EXCLUSIVE_PROCESS + fi +} + +function fetch_mig_scripts() { + mkdir -p /usr/local/yarn-mig-scripts + sudo chmod 755 /usr/local/yarn-mig-scripts + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh + sudo chmod 755 /usr/local/yarn-mig-scripts/* +} + +function configure_gpu_script() { + # Download GPU discovery script + local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' + mkdir -p ${spark_gpu_script_dir} + # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still + # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: + # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh + local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh" + cat > "${gpus_resources_script}" <<'EOF' +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') + +echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} +EOF + + chmod a+rx "${gpus_resources_script}" + + local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" + if version_ge "${SPARK_VERSION}" "3.0" ; then + local gpu_count + gpu_count="$(lspci | grep NVIDIA | wc -l)" + local executor_cores + executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" + local executor_memory + executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" + local task_cpus=2 + local gpu_amount + gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" + + cat >>"${spark_defaults_conf}" <> "${HADOOP_CONF_DIR}/container-executor.cfg" + printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + else + printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg" + fi + + # Configure a systemd unit to ensure that permissions are set on restart + cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<&2 ; return 0 + elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 + else nvsmi_works="1" ; fi + + if [[ "$1" == "-L" ]] ; then + local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" + if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" + else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi + + return 0 + fi + + "${nvsmi}" $* +} + +function install_build_dependencies() { + if test -f "${workdir}/build-dependencies-complete" ; then return ; fi + + if is_debuntu ; then + if is_ubuntu22 && is_cuda12 ; then + # On ubuntu22, the default compiler does not build some kernel module versions + # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 + execute_with_retries apt-get install -y -qq gcc-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-12 + fi + + elif is_rocky ; then + execute_with_retries dnf -y -q install gcc + + local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" + set +e + eval "${dnf_cmd}" > "${install_log}" 2>&1 + local retval="$?" + set -e + + if [[ "${retval}" == "0" ]] ; then return ; fi + + if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then + # this kernel-devel may have been migrated to the vault + local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" + local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" + dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ + "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" + )" + fi + + execute_with_retries "${dnf_cmd}" + fi + touch "${workdir}/build-dependencies-complete" +} + +function install_dependencies() { + pkg_list="pciutils screen" + if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} + elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi +} + +function prepare_gpu_env(){ + # Verify SPARK compatability + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" + readonly SPARK_VERSION + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 + fi + + readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 + nvsmi_works="0" + + if is_cuda11 ; then gcc_ver="11" + elif is_cuda12 ; then gcc_ver="12" ; fi +} diff --git a/templates/legal/license_header b/templates/legal/license_header new file mode 100644 index 000000000..4c05ecc74 --- /dev/null +++ b/templates/legal/license_header @@ -0,0 +1,11 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/templates/secure-boot/util_functions b/templates/secure-boot/util_functions new file mode 100644 index 000000000..f96a48200 --- /dev/null +++ b/templates/secure-boot/util_functions @@ -0,0 +1,105 @@ +function configure_dkms_certs() { + if test -v PSN && [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping"; + return 0 + fi + + mkdir -p "${CA_TMPDIR}" + + # If the private key exists, verify it + if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then + echo "Private key material exists" + + local expected_modulus_md5sum + expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) + if [[ -n "${expected_modulus_md5sum}" ]]; then + modulus_md5sum="${expected_modulus_md5sum}" + + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched rsa key" + fi + + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched x509 cert" + fi + else + modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" + fi + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + + return + fi + + # Retrieve cloud secrets keys + local sig_priv_secret_name + sig_priv_secret_name="${PSN}" + local sig_pub_secret_name + sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" + local sig_secret_project + sig_secret_project="$(get_metadata_attribute secret_project)" + local sig_secret_version + sig_secret_version="$(get_metadata_attribute secret_version)" + + # If metadata values are not set, do not write mok keys + if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi + + # Write private material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_priv_secret_name}" \ + | dd status=none of="${CA_TMPDIR}/db.rsa" + + # Write public material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_pub_secret_name}" \ + | base64 --decode \ + | dd status=none of="${CA_TMPDIR}/db.der" + + local mok_directory="$(dirname "${mok_key}")" + mkdir -p "${mok_directory}" + + # symlink private key and copy public cert from volatile storage to DKMS directory + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + cp -f "${CA_TMPDIR}/db.der" "${mok_der}" + + modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" +} + +function clear_dkms_key { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping" >&2 + return 0 + fi + rm -rf "${CA_TMPDIR}" "${mok_key}" +} + +function check_secure_boot() { + local SECURE_BOOT="disabled" + SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') + + PSN="$(get_metadata_attribute private_secret_name)" + readonly PSN + + if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then + echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." + exit 1 + elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then + echo "Secure boot is enabled, but no signing material provided." + echo "Please either disable secure boot or provide signing material as per" + echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" + return 1 + fi + + CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" + readonly CA_TMPDIR + + if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv + mok_der=/var/lib/shim-signed/mok/MOK.der + else mok_key=/var/lib/dkms/mok.key + mok_der=/var/lib/dkms/mok.pub ; fi + + configure_dkms_certs +} From 1dae02baddd6dfe86f2b131dee816b052cda53ef Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 19 Dec 2024 19:17:12 -0800 Subject: [PATCH 002/130] new hold nvidia packages function ; moved variable definition around a bit --- templates/common/util_functions | 20 ++++++++++++++++++++ templates/gpu/install_gpu_driver.sh.in | 13 +------------ templates/gpu/util_functions | 17 ++++++++++------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 5b85cad65..df84feff5 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -355,6 +355,26 @@ function check_os() { echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." exit 1 fi + + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" + readonly SPARK_VERSION + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 + fi + + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi } readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index e4924f51e..23ae59d8f 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -134,6 +134,7 @@ function exit_handler() { # re-hold systemd package if ge_debian12 ; then apt-mark hold systemd libsystemd0 ; fi + hold_nvidia_packages else dnf clean all fi @@ -232,18 +233,6 @@ function prepare_to_install(){ readonly install_log="${tmpdir}/install.log" - # Detect dataproc image version - if (! test -v DATAPROC_IMAGE_VERSION) ; then - if test -v DATAPROC_VERSION ; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" - else - if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" - elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" - elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" - else echo "Unknown dataproc image version" ; exit 1 ; fi - fi - fi - if test -f "${workdir}/prepare-complete" ; then return ; fi repair_old_backports diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 5727da537..17e38f8ca 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1176,13 +1176,6 @@ function install_dependencies() { function prepare_gpu_env(){ # Verify SPARK compatability RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - readonly SPARK_VERSION - if version_lt "${SPARK_VERSION}" "3.1" || \ - version_ge "${SPARK_VERSION}" "4.0" ; then - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." - exit 1 - fi readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 nvsmi_works="0" @@ -1190,3 +1183,13 @@ function prepare_gpu_env(){ if is_cuda11 ; then gcc_ver="11" elif is_cuda12 ; then gcc_ver="12" ; fi } + +# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades +# Users should run apt-mark unhold before they wish to upgrade these packages +function hold_nvidia_packages() { + apt-mark hold nvidia-* + apt-mark hold libnvidia-* + if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then + apt-mark hold xserver-xorg-video-nvidia* + fi +} From e97e376b528d403e159d015332ac15d05f649f2d Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 19 Dec 2024 19:34:32 -0800 Subject: [PATCH 003/130] added two new gpu functions: configure_mig_cgi and enable_mig --- templates/gpu/util_functions | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 17e38f8ca..eb7584745 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1193,3 +1193,17 @@ function hold_nvidia_packages() { apt-mark hold xserver-xorg-video-nvidia* fi } + +function configure_mig_cgi() { + if (/usr/share/google/get_metadata_value attributes/MIG_CGI); then + META_MIG_CGI_VALUE=$(/usr/share/google/get_metadata_value attributes/MIG_CGI) + nvidia-smi mig -cgi $META_MIG_CGI_VALUE -C + else + # Dataproc only supports A100's right now split in 2 if not specified + nvidia-smi mig -cgi 9,9 -C + fi +} + +function enable_mig() { + nvidia-smi -mig 1 +} From 310bb9d10eb47db1be705273c8f001b1491538a7 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 19 Dec 2024 19:34:58 -0800 Subject: [PATCH 004/130] templatized version of mig.sh --- templates/spark-rapids/mig.sh.in | 240 +++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 templates/spark-rapids/mig.sh.in diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in new file mode 100644 index 000000000..f77b232fa --- /dev/null +++ b/templates/spark-rapids/mig.sh.in @@ -0,0 +1,240 @@ +#!/bin/bash +# +[% template_path="spark-rapids/mig.sh.in" %] +[% INSERT legal/license_header %] +# +# This script should be specified in --metadata=startup-script-url= option and +# --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it. +# The script does a reboot to fully enable MIG and then configures the MIG device based on the +# user specified MIG_CGI profiles specified via: --metadata=^:^MIG_CGI='9,9'. If MIG_CGI +# is not specified it assumes it's using an A100 and configures 2 instances with profile id 9. +# It is assumed this script is used in conjuntion with install_gpu_driver.sh, which does the +# YARN setup to fully utilize the MIG instances on YARN. +# +[% PROCESS common/template_disclaimer %] +# +# This script installs NVIDIA GPU drivers and collects GPU utilization metrics. + +set -euxo pipefail + +[% INSERT common/util_functions %] + +[% INSERT gpu/util_functions %] + +[% INSERT 'secure-boot/util_functions' %] + +function exit_handler() { + # Purge private key material until next grant + clear_dkms_key + + set +ex + echo "Exit handler invoked" + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + done + + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi + + if is_debuntu ; then + # Clean up OS package cache + apt-get -y -qq clean + apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove + # re-hold systemd package + if ge_debian12 ; then + apt-mark hold systemd libsystemd0 ; fi + hold_nvidia_packages + else + dnf clean all + fi + + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 | sort -h + elif is_debian ; then + du -x -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /var/lib/{docker,mysql,} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ + /usr/bin \ + /usr \ + /var \ + / 2>/dev/null | sort -h + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib64/google-cloud-sdk \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + fi + + # Process disk usage logs from installation period + rm -f /run/keep-running-df + sync + sleep 5.01s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" + + perl -e '@siz=( sort { $a => $b } + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" + + echo "exit_handler has completed" + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero + fi + + return 0 +} + +function prepare_to_install(){ + # Verify OS compatability and Secure boot state + check_os + check_secure_boot + + prepare_gpu_env + + OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" + readonly OS_NAME + + # node role + ROLE="$(get_metadata_attribute dataproc-role)" + readonly ROLE + + workdir=/opt/install-dpgce + tmpdir=/tmp/ + temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + readonly temp_bucket + readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" + uname_r=$(uname -r) + readonly uname_r + readonly bdcfg="/usr/local/bin/bdconfig" + export DEBIAN_FRONTEND=noninteractive + + mkdir -p "${workdir}" + trap exit_handler EXIT + set_proxy + mount_ramdisk + + readonly install_log="${tmpdir}/install.log" + + if test -f "${workdir}/prepare-complete" ; then return ; fi + + repair_old_backports + + if is_debuntu ; then + clean_up_sources_lists + apt-get update -qq + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + if ge_debian12 ; then + apt-mark unhold systemd libsystemd0 ; fi + hold_nvidia_packages + else + dnf clean all + fi + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e + time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero + ) fi + + install_dependencies + + # Monitor disk usage in a screen session + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" + screen -d -m -LUS keep-running-df \ + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" + + touch "${workdir}/prepare-complete" +} + +function main() { + # default MIG to on when this script is used + META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") + + if (lspci | grep -q NVIDIA); then + if [[ $META_MIG_VALUE -ne 0 ]]; then + # if the first invocation, the NVIDIA drivers and tools are not installed + if [[ -f "/usr/bin/nvidia-smi" ]]; then + # check to see if we already enabled mig mode and rebooted so we don't end + # up in infinite reboot loop + NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l` + if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then + if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then + echo "MIG is enabled on all GPUs, configuring instances" + configure_mig_cgi + exit 0 + else + echo "GPUs present but MIG is not enabled" + fi + else + echo "More than 1 GPU with MIG configured differently between them" + fi + fi + fi + + install_nvidia_gpu_driver + + if [[ ${META_MIG_VALUE} -ne 0 ]]; then + enable_mig + NUM_GPUS_WITH_DIFF_MIG_MODES="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l)" + if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then + if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then + echo "MIG is fully enabled, we don't need to reboot" + configure_mig_cgi + else + echo "MIG is configured on but NOT enabled. Failing" + exit 1 + fi + else + echo "MIG is NOT enabled all on GPUs. Failing" + exit 1 + fi + else + echo "Not enabling MIG" + fi + fi +} + +prepare_to_install + +main From 912ebe7f44cdbeae3a41dbf1e49a26ebd6e83254 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 19 Dec 2024 19:43:38 -0800 Subject: [PATCH 005/130] comment fix-up --- templates/spark-rapids/mig.sh.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in index f77b232fa..815065965 100644 --- a/templates/spark-rapids/mig.sh.in +++ b/templates/spark-rapids/mig.sh.in @@ -2,6 +2,7 @@ # [% template_path="spark-rapids/mig.sh.in" %] [% INSERT legal/license_header %] +# This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures. # # This script should be specified in --metadata=startup-script-url= option and # --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it. @@ -12,8 +13,6 @@ # YARN setup to fully utilize the MIG instances on YARN. # [% PROCESS common/template_disclaimer %] -# -# This script installs NVIDIA GPU drivers and collects GPU utilization metrics. set -euxo pipefail From 87965de1b6af8e7599d149ff141fdccd66028e90 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 20 Dec 2024 15:40:51 -0800 Subject: [PATCH 006/130] nvidia-container-toolkit repo setup changes are working on rocky8 --- templates/common/util_functions | 97 +++++++++++++++++++++----- templates/gpu/install_gpu_driver.sh.in | 11 ++- templates/gpu/util_functions | 32 ++++----- 3 files changed, 99 insertions(+), 41 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index df84feff5..8cc3ede9e 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -7,25 +7,30 @@ function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $ function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) -readonly -A supported_os=( - ['debian']="10 11 12" - ['rocky']="8 9" - ['ubuntu']="18.04 20.04 22.04" -) - -# dynamically define OS version test utility functions -if [[ "$(os_id)" == "rocky" ]]; -then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') -else _os_version="$(os_version)"; fi -for os_id_val in 'rocky' 'ubuntu' 'debian' ; do - eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" - - for osver in $(echo "${supported_os["${os_id_val}"]}") ; do - eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" - eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" - eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" +function define_os_comparison_functions() { + + readonly -A supported_os=( + ['debian']="10 11 12" + ['rocky']="8 9" + ['ubuntu']="18.04 20.04 22.04" + ) + + # dynamically define OS version test utility functions + if [[ "$(os_id)" == "rocky" ]]; + then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') + else _os_version="$(os_version)"; fi + for os_id_val in 'rocky' 'ubuntu' 'debian' ; do + eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" + + for osver in $(echo "${supported_os["${os_id_val}"]}") ; do + eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" + eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" + eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" + done done -done +} + +define_os_comparison_functions function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) @@ -132,6 +137,7 @@ function cache_fetched_package() { } function add_contrib_component() { + if ! is_debuntu ; then return ; fi if ge_debian12 ; then # Include in sources file components on which nvidia-kernel-open-dkms depends local -r debian_sources="/etc/apt/sources.list.d/debian.sources" @@ -377,6 +383,61 @@ function check_os() { fi } +# +# Generate repo file under /etc/apt/sources.list.d/ +# +function apt_add_repo() { + local -r repo_name="$1" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local -r include_src="${4:-yes}" + local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" + + echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" + if [[ "${include_src}" == "yes" ]] ; then + echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + fi +} + +# +# Generate repo file under /etc/yum.repos.d/ +# +function dnf_add_repo() { + local -r repo_name="$1" + local -r repo_url="$3" # "http(s)://host/path/filename.repo" + local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" + + curl -s -L "${repo_url}" \ + | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ + | dd of="${repo_path}" status=progress +} + +# +# Install package signing key and add corresponding repository +# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html +# +# Keyrings default to +# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or +# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) +# +function os_add_repo() { + local -r repo_name="$1" + local -r signing_key_url="$2" + local kr_path + if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi + + mkdir -p "$(dirname "${kr_path}")" + + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ + | gpg --import --no-default-keyring --keyring "${kr_path}" + + if is_debuntu ; then apt_add_repo $* + else dnf_add_repo $* ; fi +} + + readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" # Dataproc configurations diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 23ae59d8f..c52f79675 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -93,12 +93,11 @@ function main() { fi # Restart YARN services if they are running already - if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then - systemctl restart hadoop-yarn-resourcemanager.service - fi - if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then - systemctl restart hadoop-yarn-nodemanager.service - fi + for svc in resourcemanager nodemanager; do + if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then + systemctl restart hadoop-yarn-${svc}.service + fi + done } function exit_handler() { diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index eb7584745..7faed760c 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -70,6 +70,10 @@ function set_cuda_version() { readonly DEFAULT_CUDA_VERSION CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") + if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then + CUDA_FULL_VERSION="${CUDA_VERSION}" + CUDA_VERSION="${CUDA_VERSION%.*}" + fi readonly CUDA_VERSION if ( ! test -v CUDA_FULL_VERSION ) ; then CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} @@ -614,23 +618,17 @@ function add_nonfree_components() { } function add_repo_nvidia_container_toolkit() { - if is_debuntu ; then - local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg - local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list - # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html - test -f "${kr_path}" || - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ - | gpg --dearmor -o "${kr_path}" - - test -f "${sources_list_path}" || - curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ - | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \ - | tee "${sources_list_path}" - apt-get update - else - curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ - tee /etc/yum.repos.d/nvidia-container-toolkit.repo - fi + local nvctk_root="https://nvidia.github.io/libnvidia-container" + local signing_key_url="${nvctk_root}/gpgkey" + local repo_data + + if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" + else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi + + os_add_repo nvidia-container-toolkit \ + "${signing_key_url}" \ + "${repo_data}" \ + "no" } function add_repo_cuda() { From 93fe4cc6242b1c06df8920b8ee289954c23321f2 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 20 Dec 2024 16:15:32 -0800 Subject: [PATCH 007/130] defining variables in the generator script instead of duplicating in the root template ; do not hold nvidia packages in the prepare function --- templates/generate-action.pl | 19 +++++++++---------- templates/gpu/install_gpu_driver.sh.in | 1 - templates/spark-rapids/mig.sh.in | 2 -- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/templates/generate-action.pl b/templates/generate-action.pl index 407dfe310..7cc954a67 100644 --- a/templates/generate-action.pl +++ b/templates/generate-action.pl @@ -6,20 +6,19 @@ use Template; use strict; -use v5.10; + +my $action = $ARGV[0]; +my $v = { template_path => "${action}.in" }; + +sub usage{ die "Usage: $0 " } + +usage unless( $action && -f "$ENV{PWD}/templates/$v->{template_path}" ); my $tt = Template->new( { INCLUDE_PATH => "$ENV{PWD}/templates", + VARIABLES => $v, INTERPOLATE => 0, }) || die "$Template::ERROR$/"; -my $action = $ARGV[0]; - -sub usage{ - die "Usage: $0 "; -} - -usage unless( -f "$ENV{PWD}/templates/${action}.in" ); -$tt->process("${action}.in") - || die $tt->error(), "\n"; +$tt->process($v->{template_path}) or die( $tt->error(), "\n" ); diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index c52f79675..a5d4172dd 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -1,6 +1,5 @@ #!/bin/bash # -[% template_path="gpu/install_gpu_driver.sh.in" %] [% INSERT legal/license_header %] # [% PROCESS common/template_disclaimer %] diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in index 815065965..fff1186dc 100644 --- a/templates/spark-rapids/mig.sh.in +++ b/templates/spark-rapids/mig.sh.in @@ -1,6 +1,5 @@ #!/bin/bash # -[% template_path="spark-rapids/mig.sh.in" %] [% INSERT legal/license_header %] # This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures. # @@ -165,7 +164,6 @@ function prepare_to_install(){ apt-get -o DPkg::Lock::Timeout=60 -y autoremove if ge_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi - hold_nvidia_packages else dnf clean all fi From b82aadc2e18e8905f3e2c71f1802c5abfb4f9a6e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 20 Dec 2024 17:36:16 -0800 Subject: [PATCH 008/130] tested with debian12 --- templates/common/util_functions | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 8cc3ede9e..b777968e5 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -397,6 +397,8 @@ function apt_add_repo() { if [[ "${include_src}" == "yes" ]] ; then echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" fi + + apt-get update -qq } # @@ -424,6 +426,7 @@ function dnf_add_repo() { function os_add_repo() { local -r repo_name="$1" local -r signing_key_url="$2" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" local kr_path if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi @@ -433,8 +436,8 @@ function os_add_repo() { curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ | gpg --import --no-default-keyring --keyring "${kr_path}" - if is_debuntu ; then apt_add_repo $* - else dnf_add_repo $* ; fi + if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" + else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi } From dd98436cd728ce5e1366a6d3e602b4231024a105 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 20 Dec 2024 20:57:34 -0800 Subject: [PATCH 009/130] tested on 8x H100s with bookworm --- templates/gpu/util_functions | 53 ++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 7faed760c..f2f3e2a9c 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1192,13 +1192,56 @@ function hold_nvidia_packages() { fi } +function delete_mig_instances() ( + # delete all instances + set +e + nvidia-smi mig -dci + + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No compute instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac + + nvidia-smi mig -dgi + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No GPU instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac +) + +# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles function configure_mig_cgi() { - if (/usr/share/google/get_metadata_value attributes/MIG_CGI); then - META_MIG_CGI_VALUE=$(/usr/share/google/get_metadata_value attributes/MIG_CGI) - nvidia-smi mig -cgi $META_MIG_CGI_VALUE -C + delete_mig_instances + META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" + if test -n "${META_MIG_CGI_VALUE}"; then + nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C else - # Dataproc only supports A100's right now split in 2 if not specified - nvidia-smi mig -cgi 9,9 -C + if lspci | grep -q H100 ; then + # run the following command to list placement profiles + # nvidia-smi mig -lgipp + # + # This is the result when using H100 instances on 20241220 + # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 + # GPU 0 Profile ID 14 Placements: {0,2,4}:2 + # GPU 0 Profile ID 9 Placements: {0,4}:4 + # GPU 0 Profile ID 5 Placement : {0}:4 + # GPU 0 Profile ID 0 Placement : {0}:8 + + # For H100 3D controllers, use profile 19, 7x1G instances + nvidia-smi mig -cgi 19 -C + elif lspci | grep -q A100 ; then + # Dataproc only supports A100s right now split in 2 if not specified + # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances + nvidia-smi mig -cgi 9,9 -C + else + echo "unrecognized 3D controller" + fi fi } From b4dabad7eaf0f05ae6a889a7538f9362affd1111 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 21 Dec 2024 18:13:58 -0800 Subject: [PATCH 010/130] created and called function enable_and_configure_mig --- spark-rapids/mig.sh | 2201 ++++++++++++++++++++++++++---- templates/spark-rapids/mig.sh.in | 68 +- 2 files changed, 1965 insertions(+), 304 deletions(-) diff --git a/spark-rapids/mig.sh b/spark-rapids/mig.sh index 85300348d..473513438 100644 --- a/spark-rapids/mig.sh +++ b/spark-rapids/mig.sh @@ -13,6 +13,7 @@ # limitations under the License. # This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures. +# # This script should be specified in --metadata=startup-script-url= option and # --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it. # The script does a reboot to fully enable MIG and then configures the MIG device based on the @@ -21,370 +22,2030 @@ # It is assumed this script is used in conjuntion with install_gpu_driver.sh, which does the # YARN setup to fully utilize the MIG instances on YARN. # -# Much of this code is copied from install_gpu_driver.sh to do the driver and CUDA installation. -# It's copied in order to not affect the existing scripts when not using MIG. +# This initialization action is generated from +# initialization-actions/templates/spark-rapids/mig.sh.in +# +# Modifications made directly to the generated file will be lost when +# the template is re-evaluated + set -euxo pipefail -function get_metadata_attribute() { - local -r attribute_name=$1 - local -r default_value=$2 - /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" -} - -# Fetch Linux Family distro and Dataproc Image version -readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') -readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)" -DATAPROC_IMAGE_VERSION=$(/usr/share/google/get_metadata_value image|grep -Eo 'dataproc-[0-9]-[0-9]'|grep -Eo '[0-9]-[0-9]'|sed -e 's/-/./g') -echo "${DATAPROC_IMAGE_VERSION}" >> /usr/local/share/startup-mig-log - -# CUDA version and Driver version config -CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.2.2') #12.2.2 -NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '535.104.05') #535.104.05 -CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 - -# Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) -if [[ "${OS_NAME}" == "ubuntu" ]]; then - UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04 - UBUNTU_VERSION=${UBUNTU_VERSION%.*} - if [[ "${UBUNTU_VERSION}" == "18" ]]; then - CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 - NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 - CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 - fi -fi +function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) -SECURE_BOOT="disabled" -SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') +function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; ) +function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) +function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) +function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) -function execute_with_retries() { - local -r cmd=$1 - for ((i = 0; i < 10; i++)); do - if eval "$cmd"; then - return 0 - fi +function define_os_comparison_functions() { + + readonly -A supported_os=( + ['debian']="10 11 12" + ['rocky']="8 9" + ['ubuntu']="18.04 20.04 22.04" + ) + + # dynamically define OS version test utility functions + if [[ "$(os_id)" == "rocky" ]]; + then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') + else _os_version="$(os_version)"; fi + for os_id_val in 'rocky' 'ubuntu' 'debian' ; do + eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" + + for osver in $(echo "${supported_os["${os_id_val}"]}") ; do + eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" + eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" + eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" + done + done +} + +define_os_comparison_functions + +function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) + +function os_vercat() ( set +x + if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' + elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' + else os_version ; fi ; ) + +function repair_old_backports { + if ! is_debuntu ; then return ; fi + # This script uses 'apt-get update' and is therefore potentially dependent on + # backports repositories which have been archived. In order to mitigate this + # problem, we will use archive.debian.org for the oldoldstable repo + + # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 + debdists="https://deb.debian.org/debian/dists" + oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); + oldstable=$( curl -s "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); + stable=$( curl -s "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); + + matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) + + for filename in "${matched_files[@]}"; do + # Fetch from archive.debian.org for ${oldoldstable}-backports + perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports } + {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}" + done +} + +function print_metadata_value() { + local readonly tmpfile=$(mktemp) + http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ + -s -o ${tmpfile} 2>/dev/null) + local readonly return_code=$? + # If the command completed successfully, print the metadata value to stdout. + if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then + cat ${tmpfile} + fi + rm -f ${tmpfile} + return ${return_code} +} + +function print_metadata_value_if_exists() { + local return_code=1 + local readonly url=$1 + print_metadata_value ${url} + return_code=$? + return ${return_code} +} + +# replicates /usr/share/google/get_metadata_value +function get_metadata_value() ( + set +x + local readonly varname=$1 + local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 + # Print the instance metadata value. + print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} + return_code=$? + # If the instance doesn't have the value, try the project. + if [[ ${return_code} != 0 ]]; then + print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} + return_code=$? + fi + + return ${return_code} +) + +function get_metadata_attribute() ( + set +x + local -r attribute_name="$1" + local -r default_value="${2:-}" + get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" +) + +function execute_with_retries() ( + set +x + local -r cmd="$*" + + if [[ "$cmd" =~ "^apt-get install" ]] ; then + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + fi + for ((i = 0; i < 3; i++)); do + set -x + time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + set +x + if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done return 1 +) + +function cache_fetched_package() { + local src_url="$1" + local gcs_fn="$2" + local local_fn="$3" + + if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then + time gcloud storage cp "${gcs_fn}" "${local_fn}" + else + time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ + gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) + fi } -# Enables a systemd service on bootup to install new headers. -# This service recompiles kernel modules for Ubuntu and Debian, which are necessary for the functioning of nvidia-smi. -function setup_systemd_update_headers() { - cat </lib/systemd/system/install-headers.service -[Unit] -Description=Install Linux headers for the current kernel -After=network-online.target +function add_contrib_component() { + if ! is_debuntu ; then return ; fi + if ge_debian12 ; then + # Include in sources file components on which nvidia-kernel-open-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib" -[Service] -ExecStart=/bin/bash -c 'count=0; while [ \$count -lt 3 ]; do /usr/bin/apt-get install -y -q linux-headers-\$(/bin/uname -r) && break; count=\$((count+1)); sleep 5; done' -Type=oneshot -RemainAfterExit=yes + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list + fi +} -[Install] -WantedBy=multi-user.target -EOF +function set_hadoop_property() { + local -r config_file=$1 + local -r property=$2 + local -r value=$3 + "${bdcfg}" set_property \ + --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ + --name "${property}" --value "${value}" \ + --clobber +} - # Reload systemd to recognize the new unit file - systemctl daemon-reload +function configure_yarn_resources() { + if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - # Enable and start the service - systemctl enable --now install-headers.service + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' } -# Install NVIDIA GPU driver provided by NVIDIA -function install_nvidia_gpu_driver() { +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - ## common steps for all linux family distros - readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*} + # Fix local dirs access permissions + local yarn_local_dirs=() - ## installation steps based OS_NAME - if [[ ${OS_NAME} == "debian" ]]; then + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - DEBIAN_VERSION=$(lsb_release -r|awk '{print $2}') # 10 or 11 - export DEBIAN_FRONTEND=noninteractive + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi +} - execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" +function clean_up_sources_lists() { + # + # bigtop (primary) + # + local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" - readonly LOCAL_INSTALLER_DEB="cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb" - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb + if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then + region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" - dpkg -i /tmp/local-installer.deb - cp /var/cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/ - add-apt-repository contrib - execute_with_retries "apt-get update" + local regional_bigtop_repo_uri + regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | + sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | + grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | + cut -d ' ' -f 2 | + head -1) - if [[ ${DEBIAN_VERSION} == 10 ]]; then - apt remove -y libglvnd0 + if [[ "${regional_bigtop_repo_uri}" == */ ]]; then + local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" + else + local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" fi - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" + local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" + rm -f "${bigtop_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ + "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" - # enable a systemd service that updates kernel headers after reboot - setup_systemd_update_headers - - elif [[ ${OS_NAME} == "ubuntu" ]]; then + sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + fi - UBUNTU_VERSION=$(lsb_release -r|awk '{print $2}') # 20.04 or 22.04 - UBUNTU_VERSION=${UBUNTU_VERSION%.*} # 20 or 22 + # + # adoptium + # + # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu + local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" + local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" + rm -f "${adoptium_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ + | gpg --dearmor -o "${adoptium_kr_path}" + echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ + > /etc/apt/sources.list.d/adoptium.list - execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - readonly UBUNTU_REPO_CUDA_PIN="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-ubuntu${UBUNTU_VERSION}04.pin" - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 + # + # docker + # + local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" + local docker_repo_file="/etc/apt/sources.list.d/docker.list" + local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" - readonly LOCAL_INSTALLER_DEB="cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb" - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb + rm -f "${docker_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ + | gpg --dearmor -o "${docker_kr_path}" + echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ + > ${docker_repo_file} - dpkg -i /tmp/local-installer.deb - cp /var/cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/ - execute_with_retries "apt-get update" - - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" + # + # google cloud + logging/monitoring + # + if ls /etc/apt/sources.list.d/google-cloud*.list ; then + rm -f /usr/share/keyrings/cloud.google.gpg + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg + for list in google-cloud google-cloud-logging google-cloud-monitoring ; do + list_file="/etc/apt/sources.list.d/${list}.list" + if [[ -f "${list_file}" ]]; then + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" + fi + done + fi - # enable a systemd service that updates kernel headers after reboot - setup_systemd_update_headers + # + # cran-r + # + if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" + if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi + rm -f /usr/share/keyrings/cran-r.gpg + curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ + gpg --dearmor -o /usr/share/keyrings/cran-r.gpg + sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list + fi - elif [[ ${OS_NAME} == "rocky" ]]; then + # + # mysql + # + if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then + rm -f /usr/share/keyrings/mysql.gpg + curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ + gpg --dearmor -o /usr/share/keyrings/mysql.gpg + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list + fi - ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8.8 or 9.1 - ROCKY_VERSION=${ROCKY_VERSION%.*} # 8 or 9 + if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi - readonly NVIDIA_ROCKY_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/rhel${ROCKY_VERSION}/x86_64/cuda-rhel${ROCKY_VERSION}.repo" - execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" - execute_with_retries "dnf clean all" - execute_with_retries "dnf -y -q module install nvidia-driver:${NVIDIA_DRIVER_VERSION_PREFIX}" - execute_with_retries "dnf -y -q install cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" - modprobe nvidia +} + +function set_proxy(){ + METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" + + if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi + + export METADATA_HTTP_PROXY + export http_proxy="${METADATA_HTTP_PROXY}" + export https_proxy="${METADATA_HTTP_PROXY}" + export HTTP_PROXY="${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="${METADATA_HTTP_PROXY}" + no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" + local no_proxy_svc + for no_proxy_svc in compute secretmanager dns servicedirectory logging \ + bigquery composer pubsub bigquerydatatransfer dataflow \ + storage datafusion ; do + no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" + done + + export NO_PROXY="${no_proxy}" +} + +function mount_ramdisk(){ + local free_mem + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + # Write to a ramdisk instead of churning the persistent disk + + tmpdir="/mnt/shm" + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" + + # Download conda packages to tmpfs + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" + + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge + pip cache purge || echo "unable to purge pip cache" + + # Download pip packages to tmpfs + pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives else - echo "Unsupported OS: '${OS_NAME}'" + mount -t tmpfs tmpfs /var/cache/dnf + fi +} + +function check_os() { + if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then + echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." + exit 1 + elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then + echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." + exit 1 + elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then + echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." + exit 1 + fi + + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" + readonly SPARK_VERSION + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." exit 1 fi - ldconfig - echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi } -function enable_mig() { - nvidia-smi -mig 1 +# +# Generate repo file under /etc/apt/sources.list.d/ +# +function apt_add_repo() { + local -r repo_name="$1" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local -r include_src="${4:-yes}" + local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" + + echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" + if [[ "${include_src}" == "yes" ]] ; then + echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + fi + + apt-get update -qq } -function configure_mig_cgi() { - if (/usr/share/google/get_metadata_value attributes/MIG_CGI); then - META_MIG_CGI_VALUE=$(/usr/share/google/get_metadata_value attributes/MIG_CGI) - nvidia-smi mig -cgi $META_MIG_CGI_VALUE -C - else - # Dataproc only supports A100's right now split in 2 if not specified - nvidia-smi mig -cgi 9,9 -C +# +# Generate repo file under /etc/yum.repos.d/ +# +function dnf_add_repo() { + local -r repo_name="$1" + local -r repo_url="$3" # "http(s)://host/path/filename.repo" + local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" + + curl -s -L "${repo_url}" \ + | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ + | dd of="${repo_path}" status=progress +} + +# +# Install package signing key and add corresponding repository +# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html +# +# Keyrings default to +# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or +# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) +# +function os_add_repo() { + local -r repo_name="$1" + local -r signing_key_url="$2" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local kr_path + if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi + + mkdir -p "$(dirname "${kr_path}")" + + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ + | gpg --import --no-default-keyring --keyring "${kr_path}" + + if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" + else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi +} + + +readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" + +# Dataproc configurations +readonly HADOOP_CONF_DIR='/etc/hadoop/conf' +readonly HIVE_CONF_DIR='/etc/hive/conf' +readonly SPARK_CONF_DIR='/etc/spark/conf' + + +function set_support_matrix() { + # CUDA version and Driver version + # https://docs.nvidia.com/deploy/cuda-compatibility/ + # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html + # https://developer.nvidia.com/cuda-downloads + + # Minimum supported version for open kernel driver is 515.43.04 + # https://github.com/NVIDIA/open-gpu-kernel-modules/tags + # Rocky8: 12.0: 525.147.05 + local latest + latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" + readonly -A DRIVER_FOR_CUDA=( + ["11.7"]="515.65.01" ["11.8"]="525.147.05" + ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ) + readonly -A DRIVER_SUBVER=( + ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" + ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" + ) + # https://developer.nvidia.com/cudnn-downloads + if is_debuntu ; then + readonly -A CUDNN_FOR_CUDA=( + ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" + ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" + ) + elif is_rocky ; then + # rocky: + # 12.0: 8.8.1.3 + # 12.1: 8.9.3.28 + # 12.2: 8.9.7.29 + # 12.3: 9.0.0.312 + # 12.4: 9.1.1.17 + # 12.5: 9.2.1.18 + # 12.6: 9.5.1.17 + readonly -A CUDNN_FOR_CUDA=( + ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" + ) fi + # https://developer.nvidia.com/nccl/nccl-download + # 12.2: 2.19.3, 12.5: 2.21.5 + readonly -A NCCL_FOR_CUDA=( + ["11.7"]="2.21.5" ["11.8"]="2.21.5" + ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" + ) + readonly -A CUDA_SUBVER=( + ["11.7"]="11.7.1" ["11.8"]="11.8.0" + ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" + ) } -function upgrade_kernel() { - # Determine which kernel is installed - if [[ "${OS_NAME}" == "debian" ]]; then - CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( / Debian (\S+) / )'` - elif [[ "${OS_NAME}" == "ubuntu" ]]; then - CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( /^Linux version (\S+) / )'` - elif [[ ${OS_NAME} == rocky ]]; then - KERN_VER=$(yum info --installed kernel | awk '/^Version/ {print $3}') - KERN_REL=$(yum info --installed kernel | awk '/^Release/ {print $3}') - CURRENT_KERNEL_VERSION="${KERN_VER}-${KERN_REL}" - else - echo "unsupported OS: ${OS_NAME}!" - exit -1 - fi - - # Get latest version available in repos - if [[ "${OS_NAME}" == "debian" ]]; then - apt-get -qq update - TARGET_VERSION=$(apt-cache show --no-all-versions linux-image-amd64 | awk '/^Version/ {print $2}') - elif [[ "${OS_NAME}" == "ubuntu" ]]; then - apt-get -qq update - LATEST_VERSION=$(apt-cache show --no-all-versions linux-image-gcp | awk '/^Version/ {print $2}') - TARGET_VERSION=`echo ${LATEST_VERSION} | perl -ne 'printf(q{%s-%s-gcp},/(\d+\.\d+\.\d+)\.(\d+)/)'` - elif [[ "${OS_NAME}" == "rocky" ]]; then - if yum info --available kernel ; then - KERN_VER=$(yum info --available kernel | awk '/^Version/ {print $3}') - KERN_REL=$(yum info --available kernel | awk '/^Release/ {print $3}') - TARGET_VERSION="${KERN_VER}-${KERN_REL}" - else - TARGET_VERSION="${CURRENT_KERNEL_VERSION}" +set_support_matrix + +function set_cuda_version() { + local cuda_url + cuda_url=$(get_metadata_attribute 'cuda-url' '') + if [[ -n "${cuda_url}" ]] ; then + # if cuda-url metadata variable has been passed, extract default version from url + local CUDA_URL_VERSION + CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" + if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then + DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" + CUDA_FULL_VERSION="${CUDA_URL_VERSION}" fi fi - # Skip this script if we are already on the target version - if [[ "${CURRENT_KERNEL_VERSION}" == "${TARGET_VERSION}" ]]; then - echo "target kernel version [${TARGET_VERSION}] is installed" + if ( ! test -v DEFAULT_CUDA_VERSION ) ; then + DEFAULT_CUDA_VERSION='12.4' + fi + readonly DEFAULT_CUDA_VERSION - # Reboot may have interrupted dpkg. Bring package system to a good state - if [[ "${OS_NAME}" == "debian" || "${OS_NAME}" == "ubuntu" ]]; then - dpkg --configure -a - fi + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") + if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then + CUDA_FULL_VERSION="${CUDA_VERSION}" + CUDA_VERSION="${CUDA_VERSION%.*}" + fi + readonly CUDA_VERSION + if ( ! test -v CUDA_FULL_VERSION ) ; then + CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} + fi + readonly CUDA_FULL_VERSION - return 0 +} + +set_cuda_version + +function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) +function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) +function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; ) + +function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) +function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; ) +function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; ) + +function set_driver_version() { + local gpu_driver_url + gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '') + + local cuda_url + cuda_url=$(get_metadata_attribute 'cuda-url' '') + + local DEFAULT_DRIVER + # Take default from gpu-driver-url metadata value + if [[ -n "${gpu_driver_url}" ]] ; then + DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')" + if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi + # Take default from cuda-url metadata value as a backup + elif [[ -n "${cuda_url}" ]] ; then + local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')" + if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then + major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" + driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} + if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then + # use the version indicated by the cuda url as the default if it exists + DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" + elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then + # use the maximum sub-version available for the major version indicated in cuda url as the default + DEFAULT_DRIVER="${driver_max_maj_version}" + fi + fi fi - # Install the latest kernel - if [[ ${OS_NAME} == debian ]]; then - apt-get install -y linux-image-amd64 - elif [[ "${OS_NAME}" == "ubuntu" ]]; then - apt-get install -y linux-image-gcp - elif [[ "${OS_NAME}" == "rocky" ]]; then - dnf -y -q install kernel + if ( ! test -v DEFAULT_DRIVER ) ; then + # If a default driver version has not been extracted, use the default for this version of CUDA + DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} fi - # Make it possible to reboot before init actions are complete - #1033 - DP_ROOT=/usr/local/share/google/dataproc - STARTUP_SCRIPT="${DP_ROOT}/startup-script.sh" - POST_HDFS_STARTUP_SCRIPT="${DP_ROOT}/post-hdfs-startup-script.sh" + DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") - for startup_script in ${STARTUP_SCRIPT} ${POST_HDFS_STARTUP_SCRIPT} ; do - sed -i -e 's:/usr/bin/env bash:/usr/bin/env bash\nexit 0:' ${startup_script} - done + readonly DRIVER_VERSION + readonly DRIVER="${DRIVER_VERSION%%.*}" - cp /var/log/dataproc-initialization-script-0.log /var/log/dataproc-initialization-script-0.log.0 + export DRIVER_VERSION DRIVER - systemctl reboot + gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" + exit 1 + fi } -# Verify if compatible linux distros and secure boot options are used -function check_os_and_secure_boot() { - if [[ "${OS_NAME}" == "debian" ]]; then - DEBIAN_VERSION=$(lsb_release -r | awk '{print $2}') # 10 or 11 - if [[ "${DEBIAN_VERSION}" != "10" && "${DEBIAN_VERSION}" != "11" ]]; then - echo "Error: The Debian version (${DEBIAN_VERSION}) is not supported. Please use a compatible Debian version." - exit 1 - fi - elif [[ "${OS_NAME}" == "ubuntu" ]]; then - UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04 - UBUNTU_VERSION=${UBUNTU_VERSION%.*} - if [[ "${UBUNTU_VERSION}" != "18" && "${UBUNTU_VERSION}" != "20" && "${UBUNTU_VERSION}" != "22" ]]; then - echo "Error: The Ubuntu version (${UBUNTU_VERSION}) is not supported. Please use a compatible Ubuntu version." - exit 1 +set_driver_version + +readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" +readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" + +# Parameters for NVIDIA-provided cuDNN library +readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} +CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") +function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) +function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) +# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} +if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then + CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" +elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then + # cuDNN v8 is not distribution for ubuntu20+, debian12 + CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" +elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then + # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 + CUDNN_VERSION="8.8.0.121" +fi +readonly CUDNN_VERSION + +readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} +readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) + +# Parameters for NVIDIA-provided Debian GPU driver +readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + +readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") + +USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" +readonly USERSPACE_FILENAME + +# Short name for urls +if is_ubuntu22 ; then + # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at + # https://developer.download.nvidia.com/compute/machine-learning/repos/ + # use packages from previous release until such time as nvidia + # release ubuntu2204 builds + + shortname="$(os_id)$(os_vercat)" + nccl_shortname="ubuntu2004" +elif ge_rocky9 ; then + # use packages from previous release until such time as nvidia + # release rhel9 builds + + shortname="rhel9" + nccl_shortname="rhel8" +elif is_rocky ; then + shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" + nccl_shortname="${shortname}" +else + shortname="$(os_id)$(os_vercat)" + nccl_shortname="${shortname}" +fi + +# Parameters for NVIDIA-provided package repositories +readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' +readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" + +# Parameters for NVIDIA-provided NCCL library +readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb" +NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") +readonly NCCL_REPO_URL +readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub + +function set_cuda_runfile_url() { + local MAX_DRIVER_VERSION + local MAX_CUDA_VERSION + + local MIN_OPEN_DRIVER_VER="515.48.07" + local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" + local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER + + if is_cuda12 ; then + if is_debian12 ; then + MIN_DRIVER_VERSION="545.23.06" + MIN_CUDA_VERSION="12.3.0" + elif is_debian10 ; then + MAX_DRIVER_VERSION="555.42.02" + MAX_CUDA_VERSION="12.5.0" + elif is_ubuntu18 ; then + MAX_DRIVER_VERSION="530.30.02" + MAX_CUDA_VERSION="12.1.1" fi - elif [[ "${OS_NAME}" == "rocky" ]]; then - ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8 or 9 - ROCKY_VERSION=${ROCKY_VERSION%.*} - if [[ "${ROCKY_VERSION}" != "8" && "${ROCKY_VERSION}" != "9" ]]; then - echo "Error: The Rocky Linux version (${ROCKY_VERSION}) is not supported. Please use a compatible Rocky Linux version." - exit 1 + elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + if le_debian10 ; then + # cuda 11 is not supported for <= debian10 + MAX_CUDA_VERSION="0" + MAX_DRIVER_VERSION="0" fi + else + echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + fi + + if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then + echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + fi + if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then + echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" + elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then + echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" fi - if [[ "${SECURE_BOOT}" == "enabled" ]]; then - echo "Error: Secure Boot is enabled. Please disable Secure Boot while creating the cluster." + # driver version named in cuda runfile filename + # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) + readonly -A drv_for_cuda=( + ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" + ["11.8.0"]="520.61.05" + ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" + ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" + ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" + ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" + ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ + ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not + ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" + ) + + # Verify that the file with the indicated combination exists + local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]} + CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run" + local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" + local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" + + NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") + readonly NVIDIA_CUDA_URL + + CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" + readonly CUDA_RUNFILE + + if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" exit 1 fi + + if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then + echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" + elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then + echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then + echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then + echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" + fi } -# Detect dataproc image version from its various names -if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" +set_cuda_runfile_url + +# Parameter for NVIDIA-provided Rocky Linux GPU driver +readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" + +CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" +CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" +if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then + # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" + if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then + # When cuDNN version is greater than or equal to 8.4.1.50 use this format + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" + fi + # Use legacy url format with one of the tarball name formats depending on version as above + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" +fi +if ( version_ge "${CUDA_VERSION}" "12.0" ); then + # Use modern url format When cuda version is greater than or equal to 12.0 + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" fi +readonly CUDNN_TARBALL +readonly CUDNN_TARBALL_URL -function remove_old_backports { - # This script uses 'apt-get update' and is therefore potentially dependent on - # backports repositories which have been archived. In order to mitigate this - # problem, we will remove any reference to backports repos older than oldstable +# Whether to install NVIDIA-provided or OS-provided GPU driver +GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') +readonly GPU_DRIVER_PROVIDER - # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 - oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}'); - stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}'); - - matched_files="$(grep -rsil '\-backports' /etc/apt/sources.list*)" - if [[ -n "$matched_files" ]]; then - for filename in "$matched_files"; do - grep -e "$oldstable-backports" -e "$stable-backports" "$filename" || \ - sed -i -e 's/^.*-backports.*$//' "$filename" - done +# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver +INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') +readonly INSTALL_GPU_AGENT + +NVIDIA_SMI_PATH='/usr/bin' +MIG_MAJOR_CAPS=0 +IS_MIG_ENABLED=0 + +CUDA_KEYRING_PKG_INSTALLED="0" +function install_cuda_keyring_pkg() { + if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi + local kr_ver=1.1 + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ + -o "${tmpdir}/cuda-keyring.deb" + dpkg -i "${tmpdir}/cuda-keyring.deb" + rm -f "${tmpdir}/cuda-keyring.deb" + CUDA_KEYRING_PKG_INSTALLED="1" +} + +function uninstall_cuda_keyring_pkg() { + apt-get purge -yq cuda-keyring + CUDA_KEYRING_PKG_INSTALLED="0" +} + +function install_local_cuda_repo() { + if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi + + if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi + CUDA_LOCAL_REPO_INSTALLED="1" + pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" + CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" + readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" + readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" + readonly DIST_KEYRING_DIR="/var/${pkgname}" + + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" + + dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" + rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" + cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ + + if is_ubuntu ; then + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ + -o /etc/apt/preferences.d/cuda-repository-pin-600 fi + + touch "${workdir}/install-local-cuda-repo-complete" +} +function uninstall_local_cuda_repo(){ + apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" + rm -f "${workdir}/install-local-cuda-repo-complete" } -function main() { - if [[ ${OS_NAME} == debian ]] && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then - remove_old_backports +CUDNN_PKG_NAME="" +function install_local_cudnn_repo() { + if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi + pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" + CUDNN_PKG_NAME="${pkgname}" + local_deb_fn="${pkgname}_1.0-1_amd64.deb" + local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" + + # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${local_deb_url}" -o "${tmpdir}/local-installer.deb" + + dpkg -i "${tmpdir}/local-installer.deb" + + rm -f "${tmpdir}/local-installer.deb" + + cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings + + touch "${workdir}/install-local-cudnn-repo-complete" +} + +function uninstall_local_cudnn_repo() { + apt-get purge -yq "${CUDNN_PKG_NAME}" + rm -f "${workdir}/install-local-cudnn-repo-complete" +} + +CUDNN8_LOCAL_REPO_INSTALLED="0" +CUDNN8_PKG_NAME="" +function install_local_cudnn8_repo() { + if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi + + if is_ubuntu ; then cudnn8_shortname="ubuntu2004" + elif is_debian ; then cudnn8_shortname="debian11" + else return 0 ; fi + if is_cuda12 ; then CUDNN8_CUDA_VER=12.0 + elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8 + else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi + cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" + + pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" + CUDNN8_PKG_NAME="${pkgname}" + + deb_fn="${pkgname}_1.0-1_amd64.deb" + local_deb_fn="${tmpdir}/${deb_fn}" + local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" + + # cache the cudnn package + cache_fetched_package "${local_deb_url}" \ + "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ + "${local_deb_fn}" + + local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" + # If we are using a ram disk, mount another where we will unpack the cudnn local installer + if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then + mkdir -p "${cudnn_path}" + mount -t tmpfs tmpfs "${cudnn_path}" + fi + + dpkg -i "${local_deb_fn}" + + rm -f "${local_deb_fn}" + + cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings + touch "${workdir}/install-local-cudnn8-repo-complete" +} + +function uninstall_local_cudnn8_repo() { + apt-get purge -yq "${CUDNN8_PKG_NAME}" + rm -f "${workdir}/install-local-cudnn8-repo-complete" +} + +function install_nvidia_nccl() { + if test -f "${workdir}/nccl-complete" ; then return ; fi + + if is_cuda11 && is_debian12 ; then + echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" + return + fi + + local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" + + # https://github.com/NVIDIA/nccl/blob/master/README.md + # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Fermi: SM_20, compute_30 + # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 + # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 + # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 + + # The following architectures are suppored by open kernel driver + # Volta: SM_70,SM_72, compute_70,compute_72 + # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 + + # The following architectures are supported by CUDA v11.8+ + # Ada: SM_89, compute_89 + # Hopper: SM_90,SM_90a compute_90,compute_90a + # Blackwell: SM_100, compute_100 + NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" + if version_ge "${CUDA_VERSION}" "11.8" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" + fi + if version_ge "${CUDA_VERSION}" "12.0" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" + fi + + mkdir -p "${workdir}" + pushd "${workdir}" + + test -d "${workdir}/nccl" || { + local tarball_fn="v${NCCL_VERSION}-1.tar.gz" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ + | tar xz + mv "nccl-${NCCL_VERSION}-1" nccl + } + + local build_path + if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else + build_path="nccl/build/pkg/rpm/x86_64" ; fi + + test -d "${workdir}/nccl/build" || { + local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" + + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + if echo "${output}" | grep -q "${gcs_tarball}" ; then + # cache hit - unpack from cache + echo "cache hit" + else + # build and cache + pushd nccl + # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install + install_build_dependencies + if is_debuntu ; then + # These packages are required to build .deb packages from source + execute_with_retries \ + apt-get install -y -qq build-essential devscripts debhelper fakeroot + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.debian.build + elif is_rocky ; then + # These packages are required to build .rpm packages from source + execute_with_retries \ + dnf -y -q install rpm-build rpmdevtools + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.redhat.build + fi + tar czvf "/${local_tarball}" "../${build_path}" + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd + fi + gcloud storage cat "${gcs_tarball}" | tar xz + } + + if is_debuntu ; then + dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb" + elif is_rocky ; then + rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm" fi - check_os_and_secure_boot - - if [[ "${OS_NAME}" == "rocky" ]]; then - if dnf list kernel-devel-$(uname -r) && dnf list kernel-headers-$(uname -r); then - echo "kernel devel and headers packages are available. Proceed without kernel upgrade." + popd + touch "${workdir}/nccl-complete" +} + +function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) +function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) + +function install_nvidia_cudnn() { + if test -f "${workdir}/cudnn-complete" ; then return ; fi + local major_version + major_version="${CUDNN_VERSION%%.*}" + local cudnn_pkg_version + cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" + + if is_rocky ; then + if is_cudnn8 ; then + execute_with_retries dnf -y -q install \ + "libcudnn${major_version}" \ + "libcudnn${major_version}-devel" + sync + elif is_cudnn9 ; then + execute_with_retries dnf -y -q install \ + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" + sync else - upgrade_kernel + echo "Unsupported cudnn version: '${major_version}'" + fi + elif is_debuntu; then + if ge_debian12 && is_src_os ; then + apt-get -y install nvidia-cudnn + else + if is_cudnn8 ; then + install_local_cudnn8_repo + + apt-get update -qq + + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn8=${cudnn_pkg_version}" \ + "libcudnn8-dev=${cudnn_pkg_version}" + + uninstall_local_cudnn8_repo + sync + elif is_cudnn9 ; then + install_cuda_keyring_pkg + + apt-get update -qq + + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" + sync + else + echo "Unsupported cudnn version: [${CUDNN_VERSION}]" + fi fi - fi - - if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then - export DEBIAN_FRONTEND=noninteractive - execute_with_retries "apt-get update" - execute_with_retries "apt-get install -y -q pciutils" - elif [[ ${OS_NAME} == rocky ]] ; then - execute_with_retries "dnf -y -q install pciutils" + else + echo "Unsupported OS: '${_shortname}'" + exit 1 fi - # default MIG to on when this script is used - META_MIG_VALUE=1 - if (/usr/share/google/get_metadata_value attributes/ENABLE_MIG); then - META_MIG_VALUE=$(/usr/share/google/get_metadata_value attributes/ENABLE_MIG) - fi - - if (lspci | grep -q NVIDIA); then - if [[ $META_MIG_VALUE -ne 0 ]]; then - # if the first invocation, the NVIDIA drivers and tools are not installed - if [[ -f "/usr/bin/nvidia-smi" ]]; then - # check to see if we already enabled mig mode and rebooted so we don't end - # up in infinite reboot loop - NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l` - if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then - if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then - echo "MIG is enabled on all GPUs, configuring instances" - configure_mig_cgi - exit 0 - else - echo "GPUs present but MIG is not enabled" - fi - else - echo "More than 1 GPU with MIG configured differently between them" + ldconfig + + echo "NVIDIA cuDNN successfully installed for ${_shortname}." + touch "${workdir}/cudnn-complete" +} + +function add_nonfree_components() { + if is_src_nvidia ; then return; fi + if ge_debian12 ; then + # Include in sources file components on which nvidia-open-kernel-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib non-free non-free-firmware" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list + fi +} + +function add_repo_nvidia_container_toolkit() { + local nvctk_root="https://nvidia.github.io/libnvidia-container" + local signing_key_url="${nvctk_root}/gpgkey" + local repo_data + + if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" + else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi + + os_add_repo nvidia-container-toolkit \ + "${signing_key_url}" \ + "${repo_data}" \ + "no" +} + +function add_repo_cuda() { + if is_debuntu ; then + install_cuda_keyring_pkg # 11.7+, 12.0+ + elif is_rocky ; then + execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" + fi +} + +function build_driver_from_github() { + # non-GPL driver will have been built on rocky8 + if is_rocky8 ; then return 0 ; fi + pushd "${workdir}" + + test -d "${workdir}/open-gpu-kernel-modules" || { + local tarball_fn="${DRIVER_VERSION}.tar.gz" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ + | tar xz + mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules + } + + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local build_dir + if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] + then build_dir="${modulus_md5sum}" + else build_dir="unsigned" ; fi + + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + echo "cache hit" + else + # build the kernel modules + pushd open-gpu-kernel-modules + install_build_dependencies + if is_cuda11 && is_ubuntu22 ; then + echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" + exit 1 + fi + execute_with_retries make -j$(nproc) modules \ + > kernel-open/build.log \ + 2> kernel-open/build_error.log + # Sign kernel modules + if [[ -n "${PSN}" ]]; then + for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do + "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ + "${mok_key}" \ + "${mok_der}" \ + "${module}" + done + fi + make modules_install \ + >> kernel-open/build.log \ + 2>> kernel-open/build_error.log + # Collect build logs and installed binaries + tar czvf "${local_tarball}" \ + "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd + fi + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a + } + + popd +} + +function build_driver_from_packages() { + if is_debuntu ; then + if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then + local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else + local pkglist=("nvidia-driver-${DRIVER}-open") ; fi + if is_debian ; then + pkglist=( + "firmware-nvidia-gsp=${DRIVER_VERSION}-1" + "nvidia-smi=${DRIVER_VERSION}-1" + "nvidia-alternative=${DRIVER_VERSION}-1" + "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1" + "nvidia-kernel-support=${DRIVER_VERSION}-1" + "nvidia-modprobe=${DRIVER_VERSION}-1" + "libnvidia-ml1=${DRIVER_VERSION}-1" + ) + fi + add_contrib_component + apt-get update -qq + execute_with_retries apt-get install -y -qq --no-install-recommends dkms + #configure_dkms_certs + execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" + sync + + elif is_rocky ; then + #configure_dkms_certs + if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then + echo "nvidia-driver:${DRIVER}-dkms installed successfully" + else + execute_with_retries dnf -y -q module install 'nvidia-driver:latest' + fi + sync + fi + #clear_dkms_key +} + +function install_nvidia_userspace_runfile() { + + # This .run file contains NV's OpenGL implementation as well as + # nvidia optimized implementations of the gtk+ 2,3 stack(s) not + # including glib (https://docs.gtk.org/glib/), and what appears to + # be a copy of the source from the kernel-open directory of for + # example DRIVER_VERSION=560.35.03 + # + # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz + # + # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run + # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. + if test -f "${workdir}/userspace-complete" ; then return ; fi + local local_fn="${tmpdir}/userspace.run" + + cache_fetched_package "${USERSPACE_URL}" \ + "${pkg_bucket}/${USERSPACE_FILENAME}" \ + "${local_fn}" + + local runfile_args + runfile_args="" + local cache_hit="0" + local local_tarball + + if is_rocky8 ; then + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local_tarball="${workdir}/${build_tarball}" + local build_dir + if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] + then build_dir="${modulus_md5sum}" + else build_dir="unsigned" ; fi + + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + cache_hit="1" + runfile_args="--no-kernel-modules" + echo "cache hit" + else + install_build_dependencies + + local signing_options + signing_options="" + if [[ -n "${PSN}" ]]; then + signing_options="--module-signing-hash sha256 \ + --module-signing-x509-hash sha256 \ + --module-signing-secret-key \"${mok_key}\" \ + --module-signing-public-key \"${mok_der}\" \ + --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ + " fi + + runfile_args="--no-dkms ${signing_options}" fi + } + else + runfile_args="--no-kernel-modules" + fi + + execute_with_retries bash "${local_fn}" -e -q \ + ${runfile_args} \ + --ui=none \ + --install-libglvnd \ + --tmpdir="${tmpdir}" + + if is_rocky8 ; then + if [[ "${cache_hit}" == "1" ]] ; then + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a + else + tar czvf "${local_tarball}" \ + /var/log/nvidia-installer.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') + gcloud storage cp "${local_tarball}" "${gcs_tarball}" fi fi - - # Detect NVIDIA GPU - if (lspci | grep -q NVIDIA); then - if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then - execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - elif [[ ${OS_NAME} == rocky ]]; then - echo "kernel devel and headers not required on rocky. installing from binary" + + rm -f "${local_fn}" + touch "${workdir}/userspace-complete" + sync +} + +function install_cuda_runfile() { + if test -f "${workdir}/cuda-complete" ; then return ; fi + local local_fn="${tmpdir}/cuda.run" + + cache_fetched_package "${NVIDIA_CUDA_URL}" \ + "${pkg_bucket}/${CUDA_RUNFILE}" \ + "${local_fn}" + + execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" + rm -f "${local_fn}" + touch "${workdir}/cuda-complete" + sync +} + +function install_cuda_toolkit() { + local cudatk_package=cuda-toolkit + if ge_debian12 && is_src_os ; then + cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1" + elif [[ -n "${CUDA_VERSION}" ]]; then + cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}" + fi + cuda_package="cuda=${CUDA_FULL_VERSION}-1" + readonly cudatk_package + if is_debuntu ; then +# if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi + execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} + elif is_rocky ; then + # rocky9: cuda-11-[7,8], cuda-12-[1..6] + execute_with_retries dnf -y -q install "${cudatk_package}" + fi + sync +} + +function load_kernel_module() { + # for some use cases, the kernel module needs to be removed before first use of nvidia-smi + for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do + rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + done + + depmod -a + modprobe nvidia + for suffix in uvm modeset drm; do + modprobe "nvidia-${suffix}" + done + # TODO: if peermem is available, also modprobe nvidia-peermem +} + +function install_cuda(){ + if test -f "${workdir}/cuda-repo-complete" ; then return ; fi + + if ( ge_debian12 && is_src_os ) ; then + echo "installed with the driver on ${_shortname}" + return 0 + fi + + # The OS package distributions are unreliable + install_cuda_runfile + + # Includes CUDA packages + add_repo_cuda + + touch "${workdir}/cuda-repo-complete" +} + +function install_nvidia_container_toolkit() { + local container_runtime_default + if command -v docker ; then container_runtime_default='docker' + elif command -v containerd ; then container_runtime_default='containerd' + elif command -v crio ; then container_runtime_default='crio' + else container_runtime_default='' ; fi + CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}") + + if test -z "${CONTAINER_RUNTIME}" ; then return ; fi + + add_repo_nvidia_container_toolkit + if is_debuntu ; then + execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else + execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi + nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" + systemctl restart "${CONTAINER_RUNTIME}" +} + +# Install NVIDIA GPU driver provided by NVIDIA +function install_nvidia_gpu_driver() { + if test -f "${workdir}/gpu-driver-complete" ; then return ; fi + + if ( ge_debian12 && is_src_os ) ; then + add_nonfree_components + apt-get update -qq + apt-get -yq install \ + dkms \ + nvidia-open-kernel-dkms \ + nvidia-open-kernel-support \ + nvidia-smi \ + libglvnd0 \ + libcuda1 + echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully" + return 0 + fi + + # OS driver packages do not produce reliable driver ; use runfile + install_nvidia_userspace_runfile + + build_driver_from_github + + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + touch "${workdir}/gpu-driver-complete" +} + +function install_ops_agent(){ + if test -f "${workdir}/ops-agent-complete" ; then return ; fi + + mkdir -p /opt/google + cd /opt/google + # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation + curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install + + touch "${workdir}/ops-agent-complete" +} + +# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics +function install_gpu_agent() { + # Stackdriver GPU agent parameters +# local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' + local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics' + if ( ! command -v pip && is_debuntu ) ; then + execute_with_retries "apt-get install -y -qq python3-pip" + fi + local install_dir=/opt/gpu-utilization-agent + mkdir -p "${install_dir}" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ + | sed -e 's/-u --format=/--format=/' \ + | dd status=none of="${install_dir}/report_gpu_metrics.py" + local venv="${install_dir}/venv" + python3 -m venv "${venv}" +( + source "${venv}/bin/activate" + python3 -m pip install --upgrade pip + execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" +) + sync + + # Generate GPU service. + cat </lib/systemd/system/gpu-utilization-agent.service +[Unit] +Description=GPU Utilization Metric Agent + +[Service] +Type=simple +PIDFile=/run/gpu_agent.pid +ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' +User=root +Group=root +WorkingDirectory=/ +Restart=always + +[Install] +WantedBy=multi-user.target +EOF + # Reload systemd manager configuration + systemctl daemon-reload + # Enable gpu-utilization-agent service + systemctl --no-reload --now enable gpu-utilization-agent.service +} + +function configure_gpu_exclusive_mode() { + # check if running spark 3, if not, enable GPU exclusive mode + local spark_version + spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) + if [[ ${spark_version} != 3.* ]]; then + # include exclusive mode on GPU + nvidia-smi -c EXCLUSIVE_PROCESS + fi +} + +function fetch_mig_scripts() { + mkdir -p /usr/local/yarn-mig-scripts + sudo chmod 755 /usr/local/yarn-mig-scripts + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh + sudo chmod 755 /usr/local/yarn-mig-scripts/* +} + +function configure_gpu_script() { + # Download GPU discovery script + local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' + mkdir -p ${spark_gpu_script_dir} + # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still + # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: + # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh + local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh" + cat > "${gpus_resources_script}" <<'EOF' +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') + +echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} +EOF + + chmod a+rx "${gpus_resources_script}" + + local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" + if version_ge "${SPARK_VERSION}" "3.0" ; then + local gpu_count + gpu_count="$(lspci | grep NVIDIA | wc -l)" + local executor_cores + executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" + local executor_memory + executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" + local task_cpus=2 + local gpu_amount + gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" + + cat >>"${spark_defaults_conf}" <> "${HADOOP_CONF_DIR}/container-executor.cfg" + printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + else + printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg" + fi + + # Configure a systemd unit to ensure that permissions are set on restart + cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<&2 ; return 0 + elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 + else nvsmi_works="1" ; fi + + if [[ "$1" == "-L" ]] ; then + local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" + if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" + else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi + + return 0 + fi + + "${nvsmi}" $* +} + +function install_build_dependencies() { + if test -f "${workdir}/build-dependencies-complete" ; then return ; fi + + if is_debuntu ; then + if is_ubuntu22 && is_cuda12 ; then + # On ubuntu22, the default compiler does not build some kernel module versions + # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 + execute_with_retries apt-get install -y -qq gcc-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-12 + fi + + elif is_rocky ; then + execute_with_retries dnf -y -q install gcc + + local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" + set +e + eval "${dnf_cmd}" > "${install_log}" 2>&1 + local retval="$?" + set -e + + if [[ "${retval}" == "0" ]] ; then return ; fi + + if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then + # this kernel-devel may have been migrated to the vault + local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" + local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" + dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ + "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" + )" + fi + + execute_with_retries "${dnf_cmd}" + fi + touch "${workdir}/build-dependencies-complete" +} + +function install_dependencies() { + pkg_list="pciutils screen" + if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} + elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi +} + +function prepare_gpu_env(){ + # Verify SPARK compatability + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + + readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 + nvsmi_works="0" + + if is_cuda11 ; then gcc_ver="11" + elif is_cuda12 ; then gcc_ver="12" ; fi +} + +# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades +# Users should run apt-mark unhold before they wish to upgrade these packages +function hold_nvidia_packages() { + apt-mark hold nvidia-* + apt-mark hold libnvidia-* + if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then + apt-mark hold xserver-xorg-video-nvidia* + fi +} + +function delete_mig_instances() ( + # delete all instances + set +e + nvidia-smi mig -dci + + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No compute instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac + + nvidia-smi mig -dgi + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No GPU instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac +) + +# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles +function configure_mig_cgi() { + delete_mig_instances + META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" + if test -n "${META_MIG_CGI_VALUE}"; then + nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C + else + if lspci | grep -q H100 ; then + # run the following command to list placement profiles + # nvidia-smi mig -lgipp + # + # This is the result when using H100 instances on 20241220 + # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 + # GPU 0 Profile ID 14 Placements: {0,2,4}:2 + # GPU 0 Profile ID 9 Placements: {0,4}:4 + # GPU 0 Profile ID 5 Placement : {0}:4 + # GPU 0 Profile ID 0 Placement : {0}:8 + + # For H100 3D controllers, use profile 19, 7x1G instances + nvidia-smi mig -cgi 19 -C + elif lspci | grep -q A100 ; then + # Dataproc only supports A100s right now split in 2 if not specified + # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances + nvidia-smi mig -cgi 9,9 -C + else + echo "unrecognized 3D controller" + fi + fi +} + +function enable_mig() { + nvidia-smi -mig 1 +} + + +function configure_dkms_certs() { + if test -v PSN && [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping"; + return 0 + fi + + mkdir -p "${CA_TMPDIR}" + + # If the private key exists, verify it + if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then + echo "Private key material exists" + + local expected_modulus_md5sum + expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) + if [[ -n "${expected_modulus_md5sum}" ]]; then + modulus_md5sum="${expected_modulus_md5sum}" + + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched rsa key" + fi + + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched x509 cert" + fi + else + modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" fi + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + + return + fi - install_nvidia_gpu_driver + # Retrieve cloud secrets keys + local sig_priv_secret_name + sig_priv_secret_name="${PSN}" + local sig_pub_secret_name + sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" + local sig_secret_project + sig_secret_project="$(get_metadata_attribute secret_project)" + local sig_secret_version + sig_secret_version="$(get_metadata_attribute secret_version)" - if [[ ${META_MIG_VALUE} -ne 0 ]]; then - enable_mig - NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l` - if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then - if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then - echo "MIG is fully enabled, we don't need to reboot" + # If metadata values are not set, do not write mok keys + if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi + + # Write private material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_priv_secret_name}" \ + | dd status=none of="${CA_TMPDIR}/db.rsa" + + # Write public material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_pub_secret_name}" \ + | base64 --decode \ + | dd status=none of="${CA_TMPDIR}/db.der" + + local mok_directory="$(dirname "${mok_key}")" + mkdir -p "${mok_directory}" + + # symlink private key and copy public cert from volatile storage to DKMS directory + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + cp -f "${CA_TMPDIR}/db.der" "${mok_der}" + + modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" +} + +function clear_dkms_key { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping" >&2 + return 0 + fi + rm -rf "${CA_TMPDIR}" "${mok_key}" +} + +function check_secure_boot() { + local SECURE_BOOT="disabled" + SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') + + PSN="$(get_metadata_attribute private_secret_name)" + readonly PSN + + if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then + echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." + exit 1 + elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then + echo "Secure boot is enabled, but no signing material provided." + echo "Please either disable secure boot or provide signing material as per" + echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" + return 1 + fi + + CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" + readonly CA_TMPDIR + + if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv + mok_der=/var/lib/shim-signed/mok/MOK.der + else mok_key=/var/lib/dkms/mok.key + mok_der=/var/lib/dkms/mok.pub ; fi + + configure_dkms_certs +} + + +function exit_handler() { + # Purge private key material until next grant + clear_dkms_key + + set +ex + echo "Exit handler invoked" + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + done + + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi + + if is_debuntu ; then + # Clean up OS package cache + apt-get -y -qq clean + apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove + # re-hold systemd package + if ge_debian12 ; then + apt-mark hold systemd libsystemd0 ; fi + hold_nvidia_packages + else + dnf clean all + fi + + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 | sort -h + elif is_debian ; then + du -x -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /var/lib/{docker,mysql,} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ + /usr/bin \ + /usr \ + /var \ + / 2>/dev/null | sort -h + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib64/google-cloud-sdk \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + fi + + # Process disk usage logs from installation period + rm -f /run/keep-running-df + sync + sleep 5.01s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" + + perl -e '@siz=( sort { $a => $b } + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" + + echo "exit_handler has completed" + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero + fi + + return 0 +} + +function prepare_to_install(){ + # Verify OS compatability and Secure boot state + check_os + check_secure_boot + + prepare_gpu_env + + OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" + readonly OS_NAME + + # node role + ROLE="$(get_metadata_attribute dataproc-role)" + readonly ROLE + + workdir=/opt/install-dpgce + tmpdir=/tmp/ + temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + readonly temp_bucket + readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" + uname_r=$(uname -r) + readonly uname_r + readonly bdcfg="/usr/local/bin/bdconfig" + export DEBIAN_FRONTEND=noninteractive + + mkdir -p "${workdir}" + trap exit_handler EXIT + set_proxy + mount_ramdisk + + readonly install_log="${tmpdir}/install.log" + + if test -f "${workdir}/prepare-complete" ; then return ; fi + + repair_old_backports + + if is_debuntu ; then + clean_up_sources_lists + apt-get update -qq + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + if ge_debian12 ; then + apt-mark unhold systemd libsystemd0 ; fi + else + dnf clean all + fi + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e + time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero + ) fi + + install_dependencies + + # Monitor disk usage in a screen session + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" + screen -d -m -LUS keep-running-df \ + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" + + touch "${workdir}/prepare-complete" +} + +function main() { + # default MIG to on when this script is used + META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") + + if ! (lspci | grep -q NVIDIA) ; then return ; fi + if [[ $META_MIG_VALUE -ne 0 ]]; then + # if the first invocation, the NVIDIA drivers and tools are not installed + if [[ -f "/usr/bin/nvidia-smi" ]]; then + # check to see if we already enabled mig mode and rebooted so we don't end + # up in infinite reboot loop + mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)" + NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)" + if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then + if (echo "${mig_mode_current}" | grep Enabled); then + echo "MIG is enabled on all GPUs, configuring instances" configure_mig_cgi + exit 0 else - echo "MIG is configured on but NOT enabled, we need to reboot" - reboot + echo "GPUs present but MIG is not enabled" fi else - echo "MIG is NOT enabled all on GPUs, we need to reboot" - reboot + echo "More than 1 GPU with MIG configured differently between them" fi - else - echo "Not enabling MIG" fi fi + + install_nvidia_gpu_driver + + if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi + + enable_mig + + mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)" + + NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)" + if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -ne 1 ]] ; then echo "MIG is NOT enabled all on GPUs. Failing" ; exit 1 ; fi + if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured on but NOT enabled. Failing" ; exit 1 ; fi + + echo "MIG is fully enabled" + configure_mig_cgi } +prepare_to_install + main diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in index fff1186dc..0779a1c28 100644 --- a/templates/spark-rapids/mig.sh.in +++ b/templates/spark-rapids/mig.sh.in @@ -184,52 +184,52 @@ function prepare_to_install(){ touch "${workdir}/prepare-complete" } -function main() { +function enable_and_configure_mig() { # default MIG to on when this script is used META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") - if (lspci | grep -q NVIDIA); then - if [[ $META_MIG_VALUE -ne 0 ]]; then - # if the first invocation, the NVIDIA drivers and tools are not installed - if [[ -f "/usr/bin/nvidia-smi" ]]; then - # check to see if we already enabled mig mode and rebooted so we don't end - # up in infinite reboot loop - NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l` - if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then - if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then - echo "MIG is enabled on all GPUs, configuring instances" - configure_mig_cgi - exit 0 - else - echo "GPUs present but MIG is not enabled" - fi - else - echo "More than 1 GPU with MIG configured differently between them" - fi - fi - fi + if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi + + enable_mig + + mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)" - install_nvidia_gpu_driver + NUM_GPUS_WITH_DIFF_MIG_MODES= + if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled all on GPUs. Failing" ; exit 1 ; fi + if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured on but NOT enabled. Failing" ; exit 1 ; fi - if [[ ${META_MIG_VALUE} -ne 0 ]]; then - enable_mig - NUM_GPUS_WITH_DIFF_MIG_MODES="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l)" - if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then - if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then - echo "MIG is fully enabled, we don't need to reboot" + echo "MIG is fully enabled" + configure_mig_cgi +} + +function main() { + # default MIG to on when this script is used + META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") + + if ! (lspci | grep -q NVIDIA) ; then return ; fi + if [[ $META_MIG_VALUE -ne 0 ]]; then + # if the first invocation, the NVIDIA drivers and tools are not installed + if [[ -f "/usr/bin/nvidia-smi" ]]; then + # check to see if we already enabled mig mode and rebooted so we don't end + # up in infinite reboot loop + mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)" + NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)" + if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then + if (echo "${mig_mode_current}" | grep Enabled); then + echo "MIG is enabled on all GPUs, configuring instances" configure_mig_cgi + exit 0 else - echo "MIG is configured on but NOT enabled. Failing" - exit 1 + echo "GPUs present but MIG is not enabled" fi else - echo "MIG is NOT enabled all on GPUs. Failing" - exit 1 + echo "More than 1 GPU with MIG configured differently between them" fi - else - echo "Not enabling MIG" fi fi + + install_nvidia_gpu_driver + enable_and_configure_mig } prepare_to_install From edeab284b81404e3148e5cbd12c4c928d23bd50c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Dec 2024 15:19:12 -0800 Subject: [PATCH 011/130] moved comment to correct function --- templates/common/util_functions | 3 --- templates/gpu/util_functions | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index b777968e5..9133072d9 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -415,9 +415,6 @@ function dnf_add_repo() { | dd of="${repo_path}" status=progress } -# -# Install package signing key and add corresponding repository -# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html # # Keyrings default to # /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index f2f3e2a9c..c475cc269 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -617,6 +617,9 @@ function add_nonfree_components() { fi } +# +# Install package signing key and add corresponding repository +# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html function add_repo_nvidia_container_toolkit() { local nvctk_root="https://nvidia.github.io/libnvidia-container" local signing_key_url="${nvctk_root}/gpgkey" From 0e8946cdbdc5cd31ff8d2eb3960bfb368e8264fa Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Dec 2024 20:25:58 -0800 Subject: [PATCH 012/130] do not point to local rpm pgp key --- templates/common/util_functions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 9133072d9..3373fb24e 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -411,8 +411,8 @@ function dnf_add_repo() { local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" curl -s -L "${repo_url}" \ - | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ | dd of="${repo_path}" status=progress +# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ } # From c44195a913fb3604fcf9d66a744620e41caa1f7f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 21:01:03 -0800 Subject: [PATCH 013/130] store completion signal files in their own directory --- templates/gpu/util_functions | 52 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index c475cc269..5631ab414 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -342,7 +342,7 @@ function uninstall_cuda_keyring_pkg() { } function install_local_cuda_repo() { - if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi + if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi CUDA_LOCAL_REPO_INSTALLED="1" @@ -365,16 +365,16 @@ function install_local_cuda_repo() { -o /etc/apt/preferences.d/cuda-repository-pin-600 fi - touch "${workdir}/install-local-cuda-repo-complete" + touch "${workdir}/complete/install-local-cuda-repo" } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" - rm -f "${workdir}/install-local-cuda-repo-complete" + rm -f "${workdir}/complete/install-local-cuda-repo" } CUDNN_PKG_NAME="" function install_local_cudnn_repo() { - if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi + if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" @@ -390,18 +390,18 @@ function install_local_cudnn_repo() { cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/install-local-cudnn-repo-complete" + touch "${workdir}/complete/install-local-cudnn-repo" } function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" - rm -f "${workdir}/install-local-cudnn-repo-complete" + rm -f "${workdir}/complete/install-local-cudnn-repo" } CUDNN8_LOCAL_REPO_INSTALLED="0" CUDNN8_PKG_NAME="" function install_local_cudnn8_repo() { - if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi + if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" @@ -435,16 +435,16 @@ function install_local_cudnn8_repo() { rm -f "${local_deb_fn}" cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/install-local-cudnn8-repo-complete" + touch "${workdir}/complete/install-local-cudnn8-repo" } function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" - rm -f "${workdir}/install-local-cudnn8-repo-complete" + rm -f "${workdir}/complete/install-local-cudnn8-repo" } function install_nvidia_nccl() { - if test -f "${workdir}/nccl-complete" ; then return ; fi + if test -f "${workdir}/complete/nccl" ; then return ; fi if is_cuda11 && is_debian12 ; then echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" @@ -535,14 +535,14 @@ function install_nvidia_nccl() { fi popd - touch "${workdir}/nccl-complete" + touch "${workdir}/complete/nccl" } function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { - if test -f "${workdir}/cudnn-complete" ; then return ; fi + if test -f "${workdir}/complete/cudnn" ; then return ; fi local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version @@ -601,7 +601,7 @@ function install_nvidia_cudnn() { ldconfig echo "NVIDIA cuDNN successfully installed for ${_shortname}." - touch "${workdir}/cudnn-complete" + touch "${workdir}/complete/cudnn" } function add_nonfree_components() { @@ -754,7 +754,7 @@ function install_nvidia_userspace_runfile() { # # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. - if test -f "${workdir}/userspace-complete" ; then return ; fi + if test -f "${workdir}/complete/userspace" ; then return ; fi local local_fn="${tmpdir}/userspace.run" cache_fetched_package "${USERSPACE_URL}" \ @@ -822,12 +822,12 @@ function install_nvidia_userspace_runfile() { fi rm -f "${local_fn}" - touch "${workdir}/userspace-complete" + touch "${workdir}/complete/userspace" sync } function install_cuda_runfile() { - if test -f "${workdir}/cuda-complete" ; then return ; fi + if test -f "${workdir}/complete/cuda" ; then return ; fi local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ @@ -836,7 +836,7 @@ function install_cuda_runfile() { execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" rm -f "${local_fn}" - touch "${workdir}/cuda-complete" + touch "${workdir}/complete/cuda" sync } @@ -874,7 +874,7 @@ function load_kernel_module() { } function install_cuda(){ - if test -f "${workdir}/cuda-repo-complete" ; then return ; fi + if test -f "${workdir}/complete/cuda-repo" ; then return ; fi if ( ge_debian12 && is_src_os ) ; then echo "installed with the driver on ${_shortname}" @@ -887,7 +887,7 @@ function install_cuda(){ # Includes CUDA packages add_repo_cuda - touch "${workdir}/cuda-repo-complete" + touch "${workdir}/complete/cuda-repo" } function install_nvidia_container_toolkit() { @@ -910,7 +910,7 @@ function install_nvidia_container_toolkit() { # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if test -f "${workdir}/gpu-driver-complete" ; then return ; fi + if test -f "${workdir}/complete/gpu-driver" ; then return ; fi if ( ge_debian12 && is_src_os ) ; then add_nonfree_components @@ -932,11 +932,11 @@ function install_nvidia_gpu_driver() { build_driver_from_github echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" - touch "${workdir}/gpu-driver-complete" + touch "${workdir}/complete/gpu-driver" } function install_ops_agent(){ - if test -f "${workdir}/ops-agent-complete" ; then return ; fi + if test -f "${workdir}/complete/ops-agent" ; then return ; fi mkdir -p /opt/google cd /opt/google @@ -944,7 +944,7 @@ function install_ops_agent(){ curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install - touch "${workdir}/ops-agent-complete" + touch "${workdir}/complete/ops-agent" } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics @@ -1127,7 +1127,7 @@ function nvsmi() { } function install_build_dependencies() { - if test -f "${workdir}/build-dependencies-complete" ; then return ; fi + if test -f "${workdir}/complete/build-dependencies" ; then return ; fi if is_debuntu ; then if is_ubuntu22 && is_cuda12 ; then @@ -1165,7 +1165,7 @@ function install_build_dependencies() { execute_with_retries "${dnf_cmd}" fi - touch "${workdir}/build-dependencies-complete" + touch "${workdir}/complete/build-dependencies" } function install_dependencies() { @@ -1183,6 +1183,8 @@ function prepare_gpu_env(){ if is_cuda11 ; then gcc_ver="11" elif is_cuda12 ; then gcc_ver="12" ; fi + + mkdir -p "${workdir}/complete" } # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades From 31d1a9e65425cffb32e09d1a5599b622058c5d97 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 21:47:53 -0800 Subject: [PATCH 014/130] excessive sudo --- templates/gpu/util_functions | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 5631ab414..4ce22f01a 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1007,10 +1007,10 @@ function configure_gpu_exclusive_mode() { function fetch_mig_scripts() { mkdir -p /usr/local/yarn-mig-scripts - sudo chmod 755 /usr/local/yarn-mig-scripts + chmod 755 /usr/local/yarn-mig-scripts wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - sudo chmod 755 /usr/local/yarn-mig-scripts/* + chmod 755 /usr/local/yarn-mig-scripts/* } function configure_gpu_script() { From 4a3a8cdbc1e6cad61efa4b8072ff4279700f65b2 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 22:23:50 -0800 Subject: [PATCH 015/130] install spark rapids in all cases --- templates/gpu/install_gpu_driver.sh.in | 2 + templates/spark-rapids/spark-rapids.sh.in | 300 ++++++++++++++++++++++ templates/spark-rapids/util_functions | 49 ++++ 3 files changed, 351 insertions(+) create mode 100644 templates/spark-rapids/spark-rapids.sh.in create mode 100644 templates/spark-rapids/util_functions diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index a5d4172dd..e852ed73c 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -84,10 +84,12 @@ function main() { fi configure_yarn_nodemanager + install_spark_rapids configure_gpu_script configure_gpu_isolation elif [[ "${ROLE}" == "Master" ]]; then configure_yarn_nodemanager + install_spark_rapids configure_gpu_script fi diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in new file mode 100644 index 000000000..fc37f109f --- /dev/null +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -0,0 +1,300 @@ +#!/bin/bash +# +[% INSERT legal/license_header %] +# This script installs NVIDIA GPU drivers (version 535.104.05) along with CUDA 12.2. +# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only +# Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions. +# Note that the script is designed to work when secure boot is disabled during cluster creation. +# It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu. +# +[% PROCESS common/template_disclaimer %] + +set -euxo pipefail + +[% INSERT common/util_functions %] + +[% INSERT 'secure-boot/util_functions' %] + +[% INSERT gpu/util_functions %] + +[% INSERT 'spark-rapids/util_functions' %] + +check_secure_boot + +# Stackdriver GPU agent parameters +# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver +INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') +readonly INSTALL_GPU_AGENT + +# Dataproc configurations +readonly HADOOP_CONF_DIR='/etc/hadoop/conf' +readonly HIVE_CONF_DIR='/etc/hive/conf' +readonly SPARK_CONF_DIR='/etc/spark/conf' + +NVIDIA_SMI_PATH='/usr/bin' +MIG_MAJOR_CAPS=0 +IS_MIG_ENABLED=0 + +function setup_gpu_yarn() { + # This configuration should be run on all nodes + # regardless if they have attached GPUs + configure_yarn_resources + + # Detect NVIDIA GPU + if (lspci | grep -q NVIDIA); then + # if this is called without the MIG script then the drivers are not installed + migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" + if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" + + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then + if (echo "${migquery_result}" | grep Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` + fetch_mig_scripts + fi + fi + fi + + if is_debuntu ; then + execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" + elif is_rocky ; then + echo "kernel devel and headers not required on rocky. installing from binary" + fi + + # if mig is enabled drivers would have already been installed + if [[ $IS_MIG_ENABLED -eq 0 ]]; then + install_nvidia_gpu_driver + install_cuda + load_kernel_module + + #Install GPU metrics collection in Stackdriver if needed + if [[ ${INSTALL_GPU_AGENT} == true ]]; then + #install_gpu_agent + install_gpu_monitoring_agent + + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent will not be installed.' + fi + configure_gpu_exclusive_mode + fi + + configure_yarn_nodemanager + configure_gpu_script + configure_gpu_isolation + elif [[ "${ROLE}" == "Master" ]]; then + configure_yarn_nodemanager + configure_gpu_script + fi + + # Restart YARN services if they are running already + for svc in resourcemanager nodemanager; do + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl restart "hadoop-yarn-${svc}.service" + fi + done +} + +function main() { + repair_old_backports + check_os + check_secure_boot + + setup_gpu_yarn + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + install_spark_rapids + configure_gpu_script + echo "RAPIDS initialized with Spark runtime" + else + echo "Unsupported RAPIDS Runtime: ${RUNTIME}" + exit 1 + fi + + for svc in resourcemanager nodemanager; do + if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then + systemctl restart hadoop-yarn-${svc}.service + fi + done +} + +function exit_handler() { + # Purge private key material until next grant + clear_dkms_key + + set +ex + echo "Exit handler invoked" + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + done + + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi + + if is_debuntu ; then + # Clean up OS package cache + apt-get -y -qq clean + apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove + # re-hold systemd package + if ge_debian12 ; then + apt-mark hold systemd libsystemd0 ; fi + hold_nvidia_packages + else + dnf clean all + fi + + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 | sort -h + elif is_debian ; then + du -x -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /var/lib/{docker,mysql,} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ + /usr/bin \ + /usr \ + /var \ + / 2>/dev/null | sort -h + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib64/google-cloud-sdk \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + fi + + # Process disk usage logs from installation period + rm -f /run/keep-running-df + sync + sleep 5.01s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" + + perl -e '@siz=( sort { $a => $b } + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" + + echo "exit_handler has completed" + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero + fi + + return 0 +} + +# Update SPARK RAPIDS config +readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" +readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) +readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) + +# Fetch instance roles and runtime +readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) +readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) + +# CUDA version and Driver version config +CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.4.1') #12.2.2 +NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '550.54.15') #535.104.05 +CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 + +function prepare_to_install(){ + # Verify OS compatability and Secure boot state + check_os + check_secure_boot + + prepare_gpu_env + + OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" + readonly OS_NAME + + # node role + ROLE="$(get_metadata_attribute dataproc-role)" + readonly ROLE + + workdir=/opt/install-dpgce + tmpdir=/tmp/ + temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + readonly temp_bucket + readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" + uname_r=$(uname -r) + readonly uname_r + readonly bdcfg="/usr/local/bin/bdconfig" + export DEBIAN_FRONTEND=noninteractive + + mkdir -p "${workdir}" + trap exit_handler EXIT + set_proxy + mount_ramdisk + + readonly install_log="${tmpdir}/install.log" + + if test -f "${workdir}/prepare-complete" ; then return ; fi + + repair_old_backports + + if is_debuntu ; then + clean_up_sources_lists + apt-get update -qq + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + if ge_debian12 ; then + apt-mark unhold systemd libsystemd0 ; fi + else + dnf clean all + fi + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e + time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero + ) fi + + install_dependencies + + # Monitor disk usage in a screen session + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" + screen -d -m -LUS keep-running-df \ + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" + + touch "${workdir}/prepare-complete" +} + +main diff --git a/templates/spark-rapids/util_functions b/templates/spark-rapids/util_functions new file mode 100644 index 000000000..93c87db8a --- /dev/null +++ b/templates/spark-rapids/util_functions @@ -0,0 +1,49 @@ +function install_spark_rapids() { + local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' + + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ + -P /usr/lib/spark/jars/ +} + +# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics +function install_gpu_monitoring_agent() { + download_gpu_monitoring_agent + install_gpu_monitoring_agent_dependency + start_gpu_monitoring_agent_service +} + +function download_gpu_monitoring_agent(){ + if [[ ${OS_NAME} == rocky ]]; then + execute_with_retries "dnf -y -q install git" + else + execute_with_retries "apt-get install git -y" + fi + mkdir -p /opt/google + chmod 777 /opt/google + cd /opt/google + test -d compute-gpu-monitoring || \ + execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" +} + +function install_gpu_monitoring_agent_dependency(){ + cd /opt/google/compute-gpu-monitoring/linux + python3 -m venv venv + venv/bin/pip install wheel + venv/bin/pip install -Ur requirements.txt +} + +function start_gpu_monitoring_agent_service(){ + cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system + systemctl daemon-reload + systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service +} + From 6ab36a5868be4190a1c4c0bb235c07bfe3b31331 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 25 Dec 2024 12:36:49 -0800 Subject: [PATCH 016/130] merged spark-rapids functions into general gpu util_functions template --- templates/gpu/util_functions | 152 ++++++++++++++++++++++---- templates/spark-rapids/util_functions | 49 --------- 2 files changed, 129 insertions(+), 72 deletions(-) delete mode 100644 templates/spark-rapids/util_functions diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 4ce22f01a..53e7daa93 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -141,25 +141,29 @@ function set_driver_version() { set_driver_version -readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" -readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" +function set_cudnn_version() { + readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" + readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" + + # Parameters for NVIDIA-provided cuDNN library + readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} + CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") + # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} + if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then + CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" + elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then + # cuDNN v8 is not distribution for ubuntu20+, debian12 + CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" + elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then + # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 + CUDNN_VERSION="8.8.0.121" + fi + readonly CUDNN_VERSION +} +set_cudnn_version -# Parameters for NVIDIA-provided cuDNN library -readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} -CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) -# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} -if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" -elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then - # cuDNN v8 is not distribution for ubuntu20+, debian12 - CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" -elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then - # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 - CUDNN_VERSION="8.8.0.121" -fi -readonly CUDNN_VERSION readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) @@ -947,6 +951,39 @@ function install_ops_agent(){ touch "${workdir}/complete/ops-agent" } +# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics +function install_gpu_monitoring_agent() { + download_gpu_monitoring_agent + install_gpu_monitoring_agent_dependency + start_gpu_monitoring_agent_service +} + +function download_gpu_monitoring_agent(){ + if is_rocky ; then + execute_with_retries "dnf -y -q install git" + else + execute_with_retries "apt-get install git -y" + fi + mkdir -p /opt/google + chmod 777 /opt/google + cd /opt/google + test -d compute-gpu-monitoring || \ + execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" +} + +function install_gpu_monitoring_agent_dependency(){ + cd /opt/google/compute-gpu-monitoring/linux + python3 -m venv venv + venv/bin/pip install wheel + venv/bin/pip install -Ur requirements.txt +} + +function start_gpu_monitoring_agent_service(){ + cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system + systemctl daemon-reload + systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service +} + # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_agent() { # Stackdriver GPU agent parameters @@ -1013,6 +1050,28 @@ function fetch_mig_scripts() { chmod 755 /usr/local/yarn-mig-scripts/* } +function install_spark_rapids() { + # Update SPARK RAPIDS config + readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) + readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 + readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) + + local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' + + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ + -P /usr/lib/spark/jars/ +} + function configure_gpu_script() { # Download GPU discovery script local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' @@ -1049,9 +1108,9 @@ EOF chmod a+rx "${gpus_resources_script}" local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" - if version_ge "${SPARK_VERSION}" "3.0" ; then - local gpu_count - gpu_count="$(lspci | grep NVIDIA | wc -l)" + local gpu_count + gpu_count="$(lspci | grep NVIDIA | wc -l)" + if version_ge "${gpu_count}" "1" ; then local executor_cores executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" local executor_memory @@ -1066,8 +1125,9 @@ EOF # query explain output won't show GPU operator, if the user has doubts # they can uncomment the line before seeing the GPU plan explain; # having AQE enabled gives user the best performance. -spark.executor.resource.gpu.discoveryScript=${gpus_resources_script} spark.executor.resource.gpu.amount=${gpu_count} +spark.plugins=com.nvidia.spark.SQLPlugin +spark.executor.resource.gpu.discoveryScript=${gpus_resources_script} spark.executor.cores=${executor_cores} spark.executor.memory=${executor_memory_gb}G spark.dynamicAllocation.enabled=false @@ -1178,13 +1238,10 @@ function prepare_gpu_env(){ # Verify SPARK compatability RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 nvsmi_works="0" if is_cuda11 ; then gcc_ver="11" elif is_cuda12 ; then gcc_ver="12" ; fi - - mkdir -p "${workdir}/complete" } # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades @@ -1253,3 +1310,52 @@ function configure_mig_cgi() { function enable_mig() { nvidia-smi -mig 1 } + +function setup_gpu_yarn() { + # This configuration should be run on all nodes + # regardless if they have attached GPUs + configure_yarn_resources + + # Detect NVIDIA GPU + if (lspci | grep -q NVIDIA); then + # if this is called without the MIG script then the drivers are not installed + migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" + if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" + + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then + if (echo "${migquery_result}" | grep Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` + fetch_mig_scripts + fi + fi + fi + + # if mig is enabled drivers would have already been installed + if [[ $IS_MIG_ENABLED -eq 0 ]]; then + install_nvidia_gpu_driver + install_cuda + load_kernel_module + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent +# install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode + fi + + configure_yarn_nodemanager + configure_gpu_script + configure_gpu_isolation + elif [[ "${ROLE}" == "Master" ]]; then + configure_yarn_nodemanager + configure_gpu_script + fi +} diff --git a/templates/spark-rapids/util_functions b/templates/spark-rapids/util_functions deleted file mode 100644 index 93c87db8a..000000000 --- a/templates/spark-rapids/util_functions +++ /dev/null @@ -1,49 +0,0 @@ -function install_spark_rapids() { - local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' - local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ - -P /usr/lib/spark/jars/ -} - -# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics -function install_gpu_monitoring_agent() { - download_gpu_monitoring_agent - install_gpu_monitoring_agent_dependency - start_gpu_monitoring_agent_service -} - -function download_gpu_monitoring_agent(){ - if [[ ${OS_NAME} == rocky ]]; then - execute_with_retries "dnf -y -q install git" - else - execute_with_retries "apt-get install git -y" - fi - mkdir -p /opt/google - chmod 777 /opt/google - cd /opt/google - test -d compute-gpu-monitoring || \ - execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" -} - -function install_gpu_monitoring_agent_dependency(){ - cd /opt/google/compute-gpu-monitoring/linux - python3 -m venv venv - venv/bin/pip install wheel - venv/bin/pip install -Ur requirements.txt -} - -function start_gpu_monitoring_agent_service(){ - cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system - systemctl daemon-reload - systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service -} - From ef366947c871f69e185b262892615bb87f974ea1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 25 Dec 2024 12:43:13 -0800 Subject: [PATCH 017/130] correcting variable name --- templates/gpu/install_gpu_driver.sh.in | 102 ++++------------ templates/spark-rapids/spark-rapids.sh.in | 139 +++++----------------- 2 files changed, 51 insertions(+), 190 deletions(-) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index e852ed73c..616fc5eb2 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -10,93 +10,33 @@ set -euxo pipefail [% INSERT common/util_functions %] -[% INSERT gpu/util_functions %] - [% INSERT 'secure-boot/util_functions' %] -function main() { - # This configuration should be run on all nodes - # regardless if they have attached GPUs - configure_yarn_resources - - # Detect NVIDIA GPU - if (lspci | grep -q NVIDIA); then - # if this is called without the MIG script then the drivers are not installed - migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" - if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts - fi - fi - fi - - # if mig is enabled drivers would have already been installed - if [[ $IS_MIG_ENABLED -eq 0 ]]; then - install_nvidia_gpu_driver - install_nvidia_container_toolkit - install_cuda - load_kernel_module - - if [[ -n ${CUDNN_VERSION} ]]; then - install_nvidia_nccl - install_nvidia_cudnn - fi - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - #install_ops_agent - install_gpu_agent - echo 'GPU metrics agent successfully deployed.' - else - echo 'GPU metrics agent will not be installed.' - fi - - # for some use cases, the kernel module needs to be removed before first use of nvidia-smi - for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" - done +[% INSERT gpu/util_functions %] - MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")" - if test -n "$(nvsmi -L)" ; then - # cache the result of the gpu query - ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') - echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt" - fi - NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")" - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - # enable MIG on every GPU - for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do - nvsmi -i "${GPU_ID}" --multi-instance-gpu 1 - done - - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)" - fetch_mig_scripts - else - configure_gpu_exclusive_mode - fi - fi +function main() { + setup_gpu_yarn + if [[ -n ${CUDNN_VERSION} ]]; then + install_nvidia_nccl + install_nvidia_cudnn + fi + install_nvidia_container_toolkit - configure_yarn_nodemanager + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then install_spark_rapids - configure_gpu_script - configure_gpu_isolation - elif [[ "${ROLE}" == "Master" ]]; then - configure_yarn_nodemanager - install_spark_rapids - configure_gpu_script + configure_spark + echo "RAPIDS initialized with Spark runtime" + elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then + # we are not currently tooled for installing dask in this action. + echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh" + else + echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" fi # Restart YARN services if they are running already for svc in resourcemanager nodemanager; do - if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then - systemctl restart hadoop-yarn-${svc}.service + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl restart "hadoop-yarn-${svc}.service" fi done } @@ -226,14 +166,14 @@ function prepare_to_install(){ readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive - mkdir -p "${workdir}" + mkdir -p "${workdir}/complete" trap exit_handler EXIT set_proxy mount_ramdisk readonly install_log="${tmpdir}/install.log" - if test -f "${workdir}/prepare-complete" ; then return ; fi + if test -f "${workdir}/complete/prepare" ; then return ; fi repair_old_backports @@ -261,7 +201,7 @@ function prepare_to_install(){ screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" - touch "${workdir}/prepare-complete" + touch "${workdir}/complete/prepare" } prepare_to_install diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index fc37f109f..1781909d2 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -1,13 +1,24 @@ #!/bin/bash # [% INSERT legal/license_header %] -# This script installs NVIDIA GPU drivers (version 535.104.05) along with CUDA 12.2. -# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only -# Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions. -# Note that the script is designed to work when secure boot is disabled during cluster creation. -# It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu. # [% PROCESS common/template_disclaimer %] +# +# This script installs NVIDIA GPU drivers (version 550.135) along with +# CUDA 12.4. +# +# Additionally, it installs the RAPIDS Spark plugin, configures Spark +# and YARN, installs an agent to collect GPU utilization metrics. The +# installer is compatible with Debian, Ubuntu, and Rocky Linux +# distributions. +# +# Note that the script is designed to work both when secure boot is +# enabled with a custom image and when disabled during cluster +# creation. +# +# For details see +# github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot +# set -euxo pipefail @@ -17,105 +28,24 @@ set -euxo pipefail [% INSERT gpu/util_functions %] -[% INSERT 'spark-rapids/util_functions' %] - -check_secure_boot - -# Stackdriver GPU agent parameters -# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver -INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') -readonly INSTALL_GPU_AGENT - -# Dataproc configurations -readonly HADOOP_CONF_DIR='/etc/hadoop/conf' -readonly HIVE_CONF_DIR='/etc/hive/conf' -readonly SPARK_CONF_DIR='/etc/spark/conf' - -NVIDIA_SMI_PATH='/usr/bin' -MIG_MAJOR_CAPS=0 -IS_MIG_ENABLED=0 - -function setup_gpu_yarn() { - # This configuration should be run on all nodes - # regardless if they have attached GPUs - configure_yarn_resources - - # Detect NVIDIA GPU - if (lspci | grep -q NVIDIA); then - # if this is called without the MIG script then the drivers are not installed - migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" - if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts - fi - fi - fi - - if is_debuntu ; then - execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - elif is_rocky ; then - echo "kernel devel and headers not required on rocky. installing from binary" - fi - - # if mig is enabled drivers would have already been installed - if [[ $IS_MIG_ENABLED -eq 0 ]]; then - install_nvidia_gpu_driver - install_cuda - load_kernel_module - - #Install GPU metrics collection in Stackdriver if needed - if [[ ${INSTALL_GPU_AGENT} == true ]]; then - #install_gpu_agent - install_gpu_monitoring_agent - - echo 'GPU metrics agent successfully deployed.' - else - echo 'GPU metrics agent will not be installed.' - fi - configure_gpu_exclusive_mode - fi - - configure_yarn_nodemanager - configure_gpu_script - configure_gpu_isolation - elif [[ "${ROLE}" == "Master" ]]; then - configure_yarn_nodemanager - configure_gpu_script - fi - - # Restart YARN services if they are running already - for svc in resourcemanager nodemanager; do - if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then - systemctl restart "hadoop-yarn-${svc}.service" - fi - done -} - function main() { - repair_old_backports - check_os - check_secure_boot - setup_gpu_yarn + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then install_spark_rapids - configure_gpu_script + configure_spark echo "RAPIDS initialized with Spark runtime" + elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then + # we are not currently tooled for installing dask in this action. + echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh" else - echo "Unsupported RAPIDS Runtime: ${RUNTIME}" - exit 1 + echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" fi + # Restart YARN services if they are running already for svc in resourcemanager nodemanager; do - if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then - systemctl restart hadoop-yarn-${svc}.service + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl restart "hadoop-yarn-${svc}.service" fi done } @@ -221,20 +151,9 @@ print( " samples-taken: ", scalar @siz, $/, return 0 } -# Update SPARK RAPIDS config -readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" -readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) -readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) - # Fetch instance roles and runtime -readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) -# CUDA version and Driver version config -CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.4.1') #12.2.2 -NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '550.54.15') #535.104.05 -CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 - function prepare_to_install(){ # Verify OS compatability and Secure boot state check_os @@ -259,14 +178,14 @@ function prepare_to_install(){ readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive - mkdir -p "${workdir}" + mkdir -p "${workdir}/complete" trap exit_handler EXIT set_proxy mount_ramdisk readonly install_log="${tmpdir}/install.log" - if test -f "${workdir}/prepare-complete" ; then return ; fi + if test -f "${workdir}/complete/prepare" ; then return ; fi repair_old_backports @@ -294,7 +213,9 @@ function prepare_to_install(){ screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" - touch "${workdir}/prepare-complete" + touch "${workdir}/complete/prepare" } +prepare_to_install + main From af69141efd78d9edb4dee2793938be664552a837 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 25 Dec 2024 12:55:46 -0800 Subject: [PATCH 018/130] using new function name --- templates/gpu/install_gpu_driver.sh.in | 2 +- templates/spark-rapids/spark-rapids.sh.in | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 616fc5eb2..09ba877ba 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -24,7 +24,7 @@ function main() { if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then install_spark_rapids - configure_spark + configure_gpu_script echo "RAPIDS initialized with Spark runtime" elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then # we are not currently tooled for installing dask in this action. diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 1781909d2..73e360c42 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -33,7 +33,7 @@ function main() { if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then install_spark_rapids - configure_spark + configure_gpu_script echo "RAPIDS initialized with Spark runtime" elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then # we are not currently tooled for installing dask in this action. From d59d5e6d90a12f1f142b9d281e42be3054e5a721 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 25 Dec 2024 16:51:44 -0800 Subject: [PATCH 019/130] driver version for 12.4.0 had not been tested in a while and had become incorrect --- templates/gpu/util_functions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 53e7daa93..8836a0caa 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -258,7 +258,7 @@ function set_cuda_runfile_url() { ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" - ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ + ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ) From 8a9e00a08197f38621d87bb8b436ef923a4010fc Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 25 Dec 2024 19:33:19 -0800 Subject: [PATCH 020/130] expanding non-default version tests ; adding utility function to verify pyspark --- spark-rapids/test_spark_rapids.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 6e03f2d62..b8e0fe133 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -20,6 +20,10 @@ class SparkRapidsTestCase(DataprocTestCase): def verify_spark_instance(self, name): self.assert_instance_command(name, "nvidia-smi") + def verify_pyspark(self, name): + # Verify that pyspark works + self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -114,13 +118,22 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): # Only need to do this once self.verify_spark_job_sql() - @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14")) + @parameterized.parameters( + ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"), + ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"), + ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03") + ) def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version, driver_version): if self.getImageOs() == "rocky": self.skipTest("Not supported for Rocky OS") + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ + ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): + self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") @@ -134,7 +147,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="50GB", + boot_disk_size="60GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: From 1113855ba26bfa29e2e734ac2b5475633a96aec3 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 25 Dec 2024 19:41:57 -0800 Subject: [PATCH 021/130] reduced boot disk size to 50GB --- spark-rapids/test_spark_rapids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index b8e0fe133..ce78f3f52 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -147,7 +147,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="60GB", + boot_disk_size="50GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: From 7034739d34c13339ab87c41cc25cb262e3573274 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 25 Dec 2024 20:57:43 -0800 Subject: [PATCH 022/130] skipping old cuda on new images ; sizing instances to build --- spark-rapids/test_spark_rapids.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index ce78f3f52..f3aa19c6f 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -76,7 +76,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): self.INIT_ACTIONS, optional_components=optional_components, metadata=metadata, - machine_type="n1-standard-4", + machine_type="n1-standard-32", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, boot_disk_size="50GB", @@ -106,7 +106,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): self.INIT_ACTIONS, optional_components=optional_components, metadata=metadata, - machine_type="n1-standard-4", + machine_type="n1-standard-32", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, boot_disk_size="50GB", @@ -134,6 +134,11 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ + and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") @@ -144,7 +149,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, configuration, self.INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-4", + machine_type="n1-standard-32", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, boot_disk_size="50GB", From 2873f490553080ff892086eab106db7d4ceb7f93 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 25 Dec 2024 21:25:11 -0800 Subject: [PATCH 023/130] skipping older debuntu when cuda version not specified --- spark-rapids/test_spark_rapids.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index f3aa19c6f..2d67a0df2 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -62,12 +62,13 @@ def verify_spark_job_sql(self): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids(self, configuration, machine_suffixes, accelerator): - if self.getImageOs() == "rocky": - self.skipTest("Not supported for Rocky OS") - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") + if ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ + ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): + self.skipTest("CUDA 12.4 (default) not supported on older debian/ubuntu releases") + optional_components = None metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" @@ -92,12 +93,13 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): - if self.getImageOs() == "rocky": - self.skipTest("Not supported for Rocky OS") - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") + if ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ + ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): + self.skipTest("CUDA 12.4 (default) not supported on older debian/ubuntu releases") + optional_components = None metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" @@ -120,16 +122,14 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): @parameterized.parameters( ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"), + ("STANDARD", ["w-0"], GPU_T4, "12.0.1", "525.147.05"), ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"), ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03") ) def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version, driver_version): - if self.getImageOs() == "rocky": - self.skipTest("Not supported for Rocky OS") - - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0.1") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") From 576b32f6417248899d8f1784f241066eedf7b9fe Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 14:51:40 -0800 Subject: [PATCH 024/130] refactor into functions --- templates/gpu/util_functions | 134 ++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 65 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 8836a0caa..cdf0d847f 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -65,7 +65,23 @@ function set_cuda_version() { fi if ( ! test -v DEFAULT_CUDA_VERSION ) ; then - DEFAULT_CUDA_VERSION='12.4' + DEFAULT_CUDA_VERSION='12.4.1' + fi + # EXCEPTIONS + # Change default CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) + case "${DATAPROC_IMAGE_VERSION}" in + "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; + "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; + "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; + * ) + echo "unrecognized Dataproc image version" + exit 1 + ;; + esac + + if le_ubuntu18 ; then + DEFAULT_CUDA_VERSION="12.1.1" + CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 fi readonly DEFAULT_CUDA_VERSION @@ -82,8 +98,6 @@ function set_cuda_version() { } -set_cuda_version - function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; ) @@ -139,8 +153,6 @@ function set_driver_version() { fi } -set_driver_version - function set_cudnn_version() { readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" @@ -160,54 +172,29 @@ function set_cudnn_version() { fi readonly CUDNN_VERSION } -set_cudnn_version + function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) -readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} -readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) - -# Parameters for NVIDIA-provided Debian GPU driver -readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - -readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") - -USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" -readonly USERSPACE_FILENAME - +function set_cuda_repo_shortname() { # Short name for urls -if is_ubuntu22 ; then - # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at - # https://developer.download.nvidia.com/compute/machine-learning/repos/ - # use packages from previous release until such time as nvidia - # release ubuntu2204 builds - - shortname="$(os_id)$(os_vercat)" - nccl_shortname="ubuntu2004" -elif ge_rocky9 ; then - # use packages from previous release until such time as nvidia - # release rhel9 builds - - shortname="rhel9" - nccl_shortname="rhel8" -elif is_rocky ; then +# https://developer.download.nvidia.com/compute/cuda/repos/${shortname} + if is_rocky ; then shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" - nccl_shortname="${shortname}" -else + else shortname="$(os_id)$(os_vercat)" - nccl_shortname="${shortname}" -fi + fi +} -# Parameters for NVIDIA-provided package repositories -readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' -readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" +function set_nv_urls() { + # Parameters for NVIDIA-provided package repositories + readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' + readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" -# Parameters for NVIDIA-provided NCCL library -readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb" -NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") -readonly NCCL_REPO_URL -readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub + # Parameter for NVIDIA-provided Rocky Linux GPU driver + readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" +} function set_cuda_runfile_url() { local MAX_DRIVER_VERSION @@ -291,11 +278,7 @@ function set_cuda_runfile_url() { fi } -set_cuda_runfile_url - -# Parameter for NVIDIA-provided Rocky Linux GPU driver -readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" - +function set_cudnn_tarball_url() { CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then @@ -315,20 +298,8 @@ if ( version_ge "${CUDA_VERSION}" "12.0" ); then fi readonly CUDNN_TARBALL readonly CUDNN_TARBALL_URL +} -# Whether to install NVIDIA-provided or OS-provided GPU driver -GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') -readonly GPU_DRIVER_PROVIDER - -# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver -INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') -readonly INSTALL_GPU_AGENT - -NVIDIA_SMI_PATH='/usr/bin' -MIG_MAJOR_CAPS=0 -IS_MIG_ENABLED=0 - -CUDA_KEYRING_PKG_INSTALLED="0" function install_cuda_keyring_pkg() { if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi local kr_ver=1.1 @@ -376,7 +347,6 @@ function uninstall_local_cuda_repo(){ rm -f "${workdir}/complete/install-local-cuda-repo" } -CUDNN_PKG_NAME="" function install_local_cudnn_repo() { if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" @@ -402,8 +372,6 @@ function uninstall_local_cudnn_repo() { rm -f "${workdir}/complete/install-local-cudnn-repo" } -CUDNN8_LOCAL_REPO_INSTALLED="0" -CUDNN8_PKG_NAME="" function install_local_cudnn8_repo() { if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi @@ -448,6 +416,9 @@ function uninstall_local_cudnn8_repo() { } function install_nvidia_nccl() { + readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} + readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) + if test -f "${workdir}/complete/nccl" ; then return ; fi if is_cuda11 && is_debian12 ; then @@ -747,6 +718,13 @@ function build_driver_from_packages() { } function install_nvidia_userspace_runfile() { + # Parameters for NVIDIA-provided Debian GPU driver + readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + + readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") + + USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" + readonly USERSPACE_FILENAME # This .run file contains NV's OpenGL implementation as well as # nvidia optimized implementations of the gtk+ 2,3 stack(s) not @@ -1118,6 +1096,7 @@ EOF local task_cpus=2 local gpu_amount gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" + if version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ; then gpu_amount="0.5" ; fi cat >>"${spark_defaults_conf}" < Date: Thu, 26 Dec 2024 15:22:53 -0800 Subject: [PATCH 025/130] moved secure-boot utility functions and common environment setup into common/util_functions --- templates/common/util_functions | 179 +++++++++++++++++++++- templates/gpu/install_gpu_driver.sh.in | 61 +------- templates/secure-boot/util_functions | 105 ------------- templates/spark-rapids/spark-rapids.sh.in | 65 +------- 4 files changed, 176 insertions(+), 234 deletions(-) delete mode 100644 templates/secure-boot/util_functions diff --git a/templates/common/util_functions b/templates/common/util_functions index 3373fb24e..4ae90e722 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -30,8 +30,6 @@ function define_os_comparison_functions() { done } -define_os_comparison_functions - function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) function os_vercat() ( set +x @@ -437,10 +435,177 @@ function os_add_repo() { else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi } +function configure_dkms_certs() { + if test -v PSN && [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping"; + return 0 + fi + + mkdir -p "${CA_TMPDIR}" + + # If the private key exists, verify it + if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then + echo "Private key material exists" + + local expected_modulus_md5sum + expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) + if [[ -n "${expected_modulus_md5sum}" ]]; then + modulus_md5sum="${expected_modulus_md5sum}" + + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched rsa key" + fi + + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched x509 cert" + fi + else + modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" + fi + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + + return + fi + + # Retrieve cloud secrets keys + local sig_priv_secret_name + sig_priv_secret_name="${PSN}" + local sig_pub_secret_name + sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" + local sig_secret_project + sig_secret_project="$(get_metadata_attribute secret_project)" + local sig_secret_version + sig_secret_version="$(get_metadata_attribute secret_version)" + + # If metadata values are not set, do not write mok keys + if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi + + # Write private material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_priv_secret_name}" \ + | dd status=none of="${CA_TMPDIR}/db.rsa" + + # Write public material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_pub_secret_name}" \ + | base64 --decode \ + | dd status=none of="${CA_TMPDIR}/db.der" + + local mok_directory="$(dirname "${mok_key}")" + mkdir -p "${mok_directory}" + + # symlink private key and copy public cert from volatile storage to DKMS directory + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + cp -f "${CA_TMPDIR}/db.der" "${mok_der}" + + modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" +} + +function clear_dkms_key { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping" >&2 + return 0 + fi + rm -rf "${CA_TMPDIR}" "${mok_key}" +} + +function check_secure_boot() { + local SECURE_BOOT="disabled" + SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') + + PSN="$(get_metadata_attribute private_secret_name)" + readonly PSN + + if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then + echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." + exit 1 + elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then + echo "Secure boot is enabled, but no signing material provided." + echo "Please either disable secure boot or provide signing material as per" + echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" + return 1 + fi + + CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" + readonly CA_TMPDIR -readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" + if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv + mok_der=/var/lib/shim-signed/mok/MOK.der + else mok_key=/var/lib/dkms/mok.key + mok_der=/var/lib/dkms/mok.pub ; fi -# Dataproc configurations -readonly HADOOP_CONF_DIR='/etc/hadoop/conf' -readonly HIVE_CONF_DIR='/etc/hive/conf' -readonly SPARK_CONF_DIR='/etc/spark/conf' + configure_dkms_certs +} + +function prepare_common_env() { + define_os_comparison_functions + + # Verify OS compatability and Secure boot state + check_os + check_secure_boot + + readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" + + # Dataproc configurations + readonly HADOOP_CONF_DIR='/etc/hadoop/conf' + readonly HIVE_CONF_DIR='/etc/hive/conf' + readonly SPARK_CONF_DIR='/etc/spark/conf' + + OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" + readonly OS_NAME + + # node role + ROLE="$(get_metadata_attribute dataproc-role)" + readonly ROLE + + workdir=/opt/install-dpgce + tmpdir=/tmp/ + temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + readonly temp_bucket + readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" + uname_r=$(uname -r) + readonly uname_r + readonly bdcfg="/usr/local/bin/bdconfig" + export DEBIAN_FRONTEND=noninteractive + + mkdir -p "${workdir}/complete" + trap exit_handler EXIT + set_proxy + mount_ramdisk + + readonly install_log="${tmpdir}/install.log" + + if test -f "${workdir}/complete/prepare.common" ; then return ; fi + + repair_old_backports + + if is_debuntu ; then + clean_up_sources_lists + apt-get update -qq + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + if ge_debian12 ; then + apt-mark unhold systemd libsystemd0 ; fi + else + dnf clean all + fi + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e + time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero + ) fi + + install_dependencies + + # Monitor disk usage in a screen session + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" + screen -d -m -LUS keep-running-df \ + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" + + touch "${workdir}/complete/prepare.common" +} diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 09ba877ba..bb17b2ab6 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -10,8 +10,6 @@ set -euxo pipefail [% INSERT common/util_functions %] -[% INSERT 'secure-boot/util_functions' %] - [% INSERT gpu/util_functions %] function main() { @@ -143,65 +141,8 @@ print( " samples-taken: ", scalar @siz, $/, } function prepare_to_install(){ - # Verify OS compatability and Secure boot state - check_os - check_secure_boot - + prepare_common_env prepare_gpu_env - - OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" - readonly OS_NAME - - # node role - ROLE="$(get_metadata_attribute dataproc-role)" - readonly ROLE - - workdir=/opt/install-dpgce - tmpdir=/tmp/ - temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" - readonly temp_bucket - readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" - uname_r=$(uname -r) - readonly uname_r - readonly bdcfg="/usr/local/bin/bdconfig" - export DEBIAN_FRONTEND=noninteractive - - mkdir -p "${workdir}/complete" - trap exit_handler EXIT - set_proxy - mount_ramdisk - - readonly install_log="${tmpdir}/install.log" - - if test -f "${workdir}/complete/prepare" ; then return ; fi - - repair_old_backports - - if is_debuntu ; then - clean_up_sources_lists - apt-get update -qq - apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove - if ge_debian12 ; then - apt-mark unhold systemd libsystemd0 ; fi - else - dnf clean all - fi - - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e - time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero - ) fi - - install_dependencies - - # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" - touch "/run/keep-running-df" - screen -d -m -LUS keep-running-df \ - bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" - - touch "${workdir}/complete/prepare" } prepare_to_install diff --git a/templates/secure-boot/util_functions b/templates/secure-boot/util_functions deleted file mode 100644 index f96a48200..000000000 --- a/templates/secure-boot/util_functions +++ /dev/null @@ -1,105 +0,0 @@ -function configure_dkms_certs() { - if test -v PSN && [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping"; - return 0 - fi - - mkdir -p "${CA_TMPDIR}" - - # If the private key exists, verify it - if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then - echo "Private key material exists" - - local expected_modulus_md5sum - expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) - if [[ -n "${expected_modulus_md5sum}" ]]; then - modulus_md5sum="${expected_modulus_md5sum}" - - # Verify that cert md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched rsa key" - fi - - # Verify that key md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched x509 cert" - fi - else - modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" - fi - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - - return - fi - - # Retrieve cloud secrets keys - local sig_priv_secret_name - sig_priv_secret_name="${PSN}" - local sig_pub_secret_name - sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" - local sig_secret_project - sig_secret_project="$(get_metadata_attribute secret_project)" - local sig_secret_version - sig_secret_version="$(get_metadata_attribute secret_version)" - - # If metadata values are not set, do not write mok keys - if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi - - # Write private material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_priv_secret_name}" \ - | dd status=none of="${CA_TMPDIR}/db.rsa" - - # Write public material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_pub_secret_name}" \ - | base64 --decode \ - | dd status=none of="${CA_TMPDIR}/db.der" - - local mok_directory="$(dirname "${mok_key}")" - mkdir -p "${mok_directory}" - - # symlink private key and copy public cert from volatile storage to DKMS directory - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - cp -f "${CA_TMPDIR}/db.der" "${mok_der}" - - modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" -} - -function clear_dkms_key { - if [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping" >&2 - return 0 - fi - rm -rf "${CA_TMPDIR}" "${mok_key}" -} - -function check_secure_boot() { - local SECURE_BOOT="disabled" - SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') - - PSN="$(get_metadata_attribute private_secret_name)" - readonly PSN - - if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then - echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." - exit 1 - elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then - echo "Secure boot is enabled, but no signing material provided." - echo "Please either disable secure boot or provide signing material as per" - echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" - return 1 - fi - - CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" - readonly CA_TMPDIR - - if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv - mok_der=/var/lib/shim-signed/mok/MOK.der - else mok_key=/var/lib/dkms/mok.key - mok_der=/var/lib/dkms/mok.pub ; fi - - configure_dkms_certs -} diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 73e360c42..729c556ed 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -24,8 +24,6 @@ set -euxo pipefail [% INSERT common/util_functions %] -[% INSERT 'secure-boot/util_functions' %] - [% INSERT gpu/util_functions %] function main() { @@ -151,69 +149,12 @@ print( " samples-taken: ", scalar @siz, $/, return 0 } -# Fetch instance roles and runtime -readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) - function prepare_to_install(){ - # Verify OS compatability and Secure boot state - check_os - check_secure_boot - + prepare_common_env prepare_gpu_env - OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" - readonly OS_NAME - - # node role - ROLE="$(get_metadata_attribute dataproc-role)" - readonly ROLE - - workdir=/opt/install-dpgce - tmpdir=/tmp/ - temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" - readonly temp_bucket - readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" - uname_r=$(uname -r) - readonly uname_r - readonly bdcfg="/usr/local/bin/bdconfig" - export DEBIAN_FRONTEND=noninteractive - - mkdir -p "${workdir}/complete" - trap exit_handler EXIT - set_proxy - mount_ramdisk - - readonly install_log="${tmpdir}/install.log" - - if test -f "${workdir}/complete/prepare" ; then return ; fi - - repair_old_backports - - if is_debuntu ; then - clean_up_sources_lists - apt-get update -qq - apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove - if ge_debian12 ; then - apt-mark unhold systemd libsystemd0 ; fi - else - dnf clean all - fi - - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e - time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero - ) fi - - install_dependencies - - # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" - touch "/run/keep-running-df" - screen -d -m -LUS keep-running-df \ - bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" - - touch "${workdir}/complete/prepare" + # Fetch instance roles and runtime + readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) } prepare_to_install From bf98d8591dfe59582bd043e43867adb02b05e995 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 15:31:21 -0800 Subject: [PATCH 026/130] refactored exit_handler --- templates/common/util_functions | 94 +++++++++++++++++++++ templates/gpu/install_gpu_driver.sh.in | 99 +---------------------- templates/gpu/util_functions | 10 +++ templates/spark-rapids/spark-rapids.sh.in | 99 +---------------------- 4 files changed, 108 insertions(+), 194 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 4ae90e722..929eff37a 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -609,3 +609,97 @@ function prepare_common_env() { touch "${workdir}/complete/prepare.common" } + +function common_exit_handler() { + # Purge private key material until next grant + clear_dkms_key + + set +ex + echo "Exit handler invoked" + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + done + + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi + + if is_debuntu ; then + # Clean up OS package cache + apt-get -y -qq clean + apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove + # re-hold systemd package + if ge_debian12 ; then + apt-mark hold systemd libsystemd0 ; fi + hold_nvidia_packages + else + dnf clean all + fi + + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /opt/conda/miniconda3 | sort -h + elif is_debian ; then + du -x -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \ + /var/lib/{docker,mysql,} \ + /opt/nvidia/* \ + /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ + /usr/bin \ + /usr \ + /var \ + / 2>/dev/null | sort -h + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \ + /usr/lib64/google-cloud-sdk \ + /opt/nvidia/* \ + /opt/conda/miniconda3 + fi + + # Process disk usage logs from installation period + rm -f /run/keep-running-df + sync + sleep 5.01s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" + + perl -e '@siz=( sort { $a => $b } + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" + + echo "exit_handler has completed" + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero + fi +} diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index bb17b2ab6..b2fa93b35 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -40,103 +40,8 @@ function main() { } function exit_handler() { - # Purge private key material until next grant - clear_dkms_key - - set +ex - echo "Exit handler invoked" - - # Clear pip cache - pip cache purge || echo "unable to purge pip cache" - - # If system memory was sufficient to mount memory-backed filesystems - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to unset global pip cache" - - # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do - if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then - umount -f ${shmdir} - fi - done - - # restart services stopped during preparation stage - # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' - fi - - if is_debuntu ; then - # Clean up OS package cache - apt-get -y -qq clean - apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove - # re-hold systemd package - if ge_debian12 ; then - apt-mark hold systemd libsystemd0 ; fi - hold_nvidia_packages - else - dnf clean all - fi - - # print disk usage statistics for large components - if is_ubuntu ; then - du -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 | sort -h - elif is_debian ; then - du -x -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /var/lib/{docker,mysql,} \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ - /usr/bin \ - /usr \ - /var \ - / 2>/dev/null | sort -h - else - du -hs \ - /var/lib/docker \ - /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ - /usr/lib64/google-cloud-sdk \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 - fi - - # Process disk usage logs from installation period - rm -f /run/keep-running-df - sync - sleep 5.01s - # compute maximum size of disk during installation - # Log file contains logs like the following (minus the preceeding #): -#Filesystem 1K-blocks Used Available Use% Mounted on -#/dev/vda2 7096908 2611344 4182932 39% / - df / | tee -a "/run/disk-usage.log" - - perl -e '@siz=( sort { $a => $b } - map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; -print( " samples-taken: ", scalar @siz, $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" - - echo "exit_handler has completed" - - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - dd if=/dev/zero of=/zero - sync - sleep 3s - rm -f /zero - fi - + gpu_exit_handler + common_exit_handler return 0 } diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index cdf0d847f..1b204622e 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1363,3 +1363,13 @@ function setup_gpu_yarn() { configure_gpu_script fi } + +function gpu_exit_handler() { + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + for shmdir in /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + fi + fi +} diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 729c556ed..bc1d98e94 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -49,103 +49,8 @@ function main() { } function exit_handler() { - # Purge private key material until next grant - clear_dkms_key - - set +ex - echo "Exit handler invoked" - - # Clear pip cache - pip cache purge || echo "unable to purge pip cache" - - # If system memory was sufficient to mount memory-backed filesystems - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to unset global pip cache" - - # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do - if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then - umount -f ${shmdir} - fi - done - - # restart services stopped during preparation stage - # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' - fi - - if is_debuntu ; then - # Clean up OS package cache - apt-get -y -qq clean - apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove - # re-hold systemd package - if ge_debian12 ; then - apt-mark hold systemd libsystemd0 ; fi - hold_nvidia_packages - else - dnf clean all - fi - - # print disk usage statistics for large components - if is_ubuntu ; then - du -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 | sort -h - elif is_debian ; then - du -x -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /var/lib/{docker,mysql,} \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ - /usr/bin \ - /usr \ - /var \ - / 2>/dev/null | sort -h - else - du -hs \ - /var/lib/docker \ - /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ - /usr/lib64/google-cloud-sdk \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 - fi - - # Process disk usage logs from installation period - rm -f /run/keep-running-df - sync - sleep 5.01s - # compute maximum size of disk during installation - # Log file contains logs like the following (minus the preceeding #): -#Filesystem 1K-blocks Used Available Use% Mounted on -#/dev/vda2 7096908 2611344 4182932 39% / - df / | tee -a "/run/disk-usage.log" - - perl -e '@siz=( sort { $a => $b } - map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; -print( " samples-taken: ", scalar @siz, $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" - - echo "exit_handler has completed" - - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - dd if=/dev/zero of=/zero - sync - sleep 3s - rm -f /zero - fi - + gpu_exit_handler + common_exit_handler return 0 } From 4320953abb720da5e4ee452580de6c67cf3d4b57 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 15:51:58 -0800 Subject: [PATCH 027/130] declaring constants prior to running functions --- templates/gpu/util_functions | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 1b204622e..5489ea33d 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1214,13 +1214,14 @@ function install_dependencies() { } function prepare_gpu_env(){ - set_cuda_version - set_driver_version - set_cuda_repo_shortname - set_nv_urls - set_cuda_runfile_url - set_cudnn_version - set_cudnn_tarball_url + nvsmi_works="0" + NVIDIA_SMI_PATH='/usr/bin' + MIG_MAJOR_CAPS=0 + IS_MIG_ENABLED=0 + CUDA_KEYRING_PKG_INSTALLED="0" + CUDNN_PKG_NAME="" + CUDNN8_LOCAL_REPO_INSTALLED="0" + CUDNN8_PKG_NAME="" # Whether to install NVIDIA-provided or OS-provided GPU driver GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') @@ -1230,19 +1231,17 @@ function prepare_gpu_env(){ INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') readonly INSTALL_GPU_AGENT - NVIDIA_SMI_PATH='/usr/bin' - MIG_MAJOR_CAPS=0 - IS_MIG_ENABLED=0 - - CUDA_KEYRING_PKG_INSTALLED="0" - CUDNN_PKG_NAME="" - CUDNN8_LOCAL_REPO_INSTALLED="0" - CUDNN8_PKG_NAME="" - # Verify SPARK compatability RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + readonly RAPIDS_RUNTIME - nvsmi_works="0" + set_cuda_version + set_driver_version + set_cuda_repo_shortname + set_nv_urls + set_cuda_runfile_url + set_cudnn_version + set_cudnn_tarball_url if is_cuda11 ; then gcc_ver="11" elif is_cuda12 ; then gcc_ver="12" ; fi @@ -1370,6 +1369,6 @@ function gpu_exit_handler() { if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then umount -f ${shmdir} fi - fi + done fi } From 2b0947bb194d856855e8ae0da5b9b12619d97fd3 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 16:10:15 -0800 Subject: [PATCH 028/130] removed old variables, included a current one which does not get exercise --- templates/gpu/util_functions | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 5489ea33d..07d6f92ec 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1218,10 +1218,9 @@ function prepare_gpu_env(){ NVIDIA_SMI_PATH='/usr/bin' MIG_MAJOR_CAPS=0 IS_MIG_ENABLED=0 - CUDA_KEYRING_PKG_INSTALLED="0" CUDNN_PKG_NAME="" - CUDNN8_LOCAL_REPO_INSTALLED="0" CUDNN8_PKG_NAME="" + CUDA_LOCAL_REPO_INSTALLED="0" # Whether to install NVIDIA-provided or OS-provided GPU driver GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') From 5dbc1f28ea24ae2699b71d865b230f26776b9478 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 16:23:52 -0800 Subject: [PATCH 029/130] do not break if variable undefined --- templates/gpu/util_functions | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 07d6f92ec..6516ad948 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -301,7 +301,8 @@ readonly CUDNN_TARBALL_URL } function install_cuda_keyring_pkg() { - if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi + if ( test -v CUDA_KEYRING_PKG_INSTALLED && + [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi local kr_ver=1.1 curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ From c5d46d3e62c76783b839f4a04d876e93f7403393 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 19:00:08 -0800 Subject: [PATCH 030/130] order of operations error fixed with parantheses. --- templates/gpu/util_functions | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 6516ad948..55b241989 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -81,7 +81,7 @@ function set_cuda_version() { if le_ubuntu18 ; then DEFAULT_CUDA_VERSION="12.1.1" - CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 + CUDA_VERSION_MAJOR="${DEFAULT_CUDA_VERSION%.*}" #12.1 fi readonly DEFAULT_CUDA_VERSION @@ -648,7 +648,7 @@ function build_driver_from_github() { # build the kernel modules pushd open-gpu-kernel-modules install_build_dependencies - if is_cuda11 && is_ubuntu22 ; then + if ( is_cuda11 && is_ubuntu22 ) ; then echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" exit 1 fi From 7be62b33afb27561e0e8908dae6ff8f7bc4b60e1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 19:29:46 -0800 Subject: [PATCH 031/130] using lower xgboost version for older dataproc images --- templates/gpu/util_functions | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 55b241989..f093c00dc 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1033,7 +1033,14 @@ function install_spark_rapids() { # Update SPARK RAPIDS config readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) - readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 + + if version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then + DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 + elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then + DEFAULT_XGBOOST_VERSION="1.6.2" + fi + + readonly DEFAULT_XGBOOST_VERSION readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' From 41c327a6c7d2422e9c1d165db79d8bac29dd5465 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 20:24:09 -0800 Subject: [PATCH 032/130] test whether the variable is defined before testing its value --- templates/gpu/install_gpu_driver.sh.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index b2fa93b35..963becaf7 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -14,7 +14,7 @@ set -euxo pipefail function main() { setup_gpu_yarn - if [[ -n ${CUDNN_VERSION} ]]; then + if ( test -v CUDNN_VERSION && [[ -n ${CUDNN_VERSION} ]] ) ; then install_nvidia_nccl install_nvidia_cudnn fi From b70477bf1f2efebc810e02de86cc7c96daf3ff77 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 20:34:59 -0800 Subject: [PATCH 033/130] refactor the xgboost installer a little --- templates/gpu/util_functions | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index f093c00dc..8af157063 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1034,8 +1034,12 @@ function install_spark_rapids() { readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) + # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu + local -r scala_ver="2.12" + if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then + DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.3 if version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then - DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 + DEFAULT_XGBOOST_VERSION="1.7.6" elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then DEFAULT_XGBOOST_VERSION="1.6.2" fi @@ -1048,13 +1052,13 @@ function install_spark_rapids() { local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ + "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \ -P /usr/lib/spark/jars/ wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ + "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \ -P /usr/lib/spark/jars/ wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ + "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" \ -P /usr/lib/spark/jars/ } From 073ed1f6521dc5db3e15bc2a289fc40ee673e700 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 23:09:25 -0800 Subject: [PATCH 034/130] only minor changes --- spark-rapids/spark-rapids.sh | 10 ++++------ templates/gpu/util_functions | 28 ++++++++++++++-------------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 0b4aabd57..6fdfbb78c 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -232,12 +232,10 @@ CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 # EXCEPTIONS # Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) -if [[ "${OS_NAME}" == "ubuntu" ]]; then - if is_ubuntu18 ; then - CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 - NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 - CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 - fi +if is_ubuntu18 ; then + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 + NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 + CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 fi # Verify Secure boot diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 8af157063..82f05d6e8 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1036,9 +1036,9 @@ function install_spark_rapids() { # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" - if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then + if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.3 - if version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then + elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then DEFAULT_XGBOOST_VERSION="1.7.6" elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then DEFAULT_XGBOOST_VERSION="1.6.2" @@ -1100,17 +1100,18 @@ EOF local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" local gpu_count gpu_count="$(lspci | grep NVIDIA | wc -l)" - if version_ge "${gpu_count}" "1" ; then - local executor_cores - executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" - local executor_memory - executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" - local task_cpus=2 - local gpu_amount - gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" - if version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ; then gpu_amount="0.5" ; fi - - cat >>"${spark_defaults_conf}" <>"${spark_defaults_conf}" < Date: Thu, 26 Dec 2024 23:10:18 -0800 Subject: [PATCH 035/130] explicitly notifying at the completion of the main function --- templates/gpu/install_gpu_driver.sh.in | 5 ++++- templates/spark-rapids/spark-rapids.sh.in | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 963becaf7..53e2f33c7 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -14,7 +14,7 @@ set -euxo pipefail function main() { setup_gpu_yarn - if ( test -v CUDNN_VERSION && [[ -n ${CUDNN_VERSION} ]] ) ; then + if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then install_nvidia_nccl install_nvidia_cudnn fi @@ -37,6 +37,8 @@ function main() { systemctl restart "hadoop-yarn-${svc}.service" fi done + echo "main complete" + return 0 } function exit_handler() { @@ -48,6 +50,7 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env prepare_gpu_env + trap exit_handler EXIT } prepare_to_install diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index bc1d98e94..c5691fea9 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -46,6 +46,8 @@ function main() { systemctl restart "hadoop-yarn-${svc}.service" fi done + echo "main complete" + return 0 } function exit_handler() { @@ -57,6 +59,7 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env prepare_gpu_env + trap exit_handler EXIT # Fetch instance roles and runtime readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) From f8a9b7dc82f00a43acda6ac5cc7efd8a04c6f53a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 23:26:37 -0800 Subject: [PATCH 036/130] moved trap outside of the template --- templates/common/util_functions | 1 - 1 file changed, 1 deletion(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 929eff37a..6a490ad71 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -573,7 +573,6 @@ function prepare_common_env() { export DEBIAN_FRONTEND=noninteractive mkdir -p "${workdir}/complete" - trap exit_handler EXIT set_proxy mount_ramdisk From 19520b4f92fe5c3978c77d6aece88aabacaf9291 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 26 Dec 2024 23:42:16 -0800 Subject: [PATCH 037/130] stop / start instead of restart --- templates/gpu/install_gpu_driver.sh.in | 5 ++++- templates/spark-rapids/spark-rapids.sh.in | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 53e2f33c7..6a9fde18e 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -32,9 +32,12 @@ function main() { fi # Restart YARN services if they are running already + nodes_include_gcs="gs:/$(get_metadata_attribute dataproc-bucket)/google-cloud-dataproc-metainfo/$(get_metadata_attribute dataproc-cluster-uuid)/nodes_include" + gsutil ls "${nodes_include_gcs}" for svc in resourcemanager nodemanager; do if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then - systemctl restart "hadoop-yarn-${svc}.service" + systemctl stop "hadoop-yarn-${svc}.service" + systemctl start "hadoop-yarn-${svc}.service" fi done echo "main complete" diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index c5691fea9..f20689cf0 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -43,7 +43,8 @@ function main() { # Restart YARN services if they are running already for svc in resourcemanager nodemanager; do if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then - systemctl restart "hadoop-yarn-${svc}.service" + systemctl stop "hadoop-yarn-${svc}.service" + systemctl start "hadoop-yarn-${svc}.service" fi done echo "main complete" From f659ec55fb2dec937a3e3b9223168f86dc7fe8a4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 01:06:52 -0800 Subject: [PATCH 038/130] skipping install on gpu-less systems more quickly --- templates/common/util_functions | 2 -- templates/gpu/install_gpu_driver.sh.in | 3 +++ templates/gpu/util_functions | 4 +++- templates/spark-rapids/spark-rapids.sh.in | 2 ++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 6a490ad71..114fce6a5 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -598,8 +598,6 @@ function prepare_common_env() { time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero ) fi - install_dependencies - # Monitor disk usage in a screen session df / > "/run/disk-usage.log" touch "/run/keep-running-df" diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 6a9fde18e..8ce2088d7 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -14,6 +14,9 @@ set -euxo pipefail function main() { setup_gpu_yarn + + echo "yarn setup complete" + if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then install_nvidia_nccl install_nvidia_cudnn diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 82f05d6e8..cea01b03d 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1100,7 +1100,6 @@ EOF local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" local gpu_count gpu_count="$(lspci | grep NVIDIA | wc -l)" - if version_lt "${gpu_count}" "1" ; then return ; fi local executor_cores executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" @@ -1223,6 +1222,7 @@ function install_dependencies() { pkg_list="pciutils screen" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi + lspci | grep -q NVIDIA || exit 0 } function prepare_gpu_env(){ @@ -1246,6 +1246,8 @@ function prepare_gpu_env(){ RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly RAPIDS_RUNTIME + install_dependencies + set_cuda_version set_driver_version set_cuda_repo_shortname diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index f20689cf0..c5a204703 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -29,6 +29,8 @@ set -euxo pipefail function main() { setup_gpu_yarn + echo "yarn setup complete" + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then install_spark_rapids configure_gpu_script From af817f0c49b4a8365072a40cf226183b306e6fb1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 01:58:07 -0800 Subject: [PATCH 039/130] install_dependencies is called from base template prep function --- templates/gpu/install_gpu_driver.sh.in | 1 + templates/gpu/util_functions | 2 -- templates/spark-rapids/spark-rapids.sh.in | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 8ce2088d7..7974640ec 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -54,6 +54,7 @@ function exit_handler() { } function prepare_to_install(){ + install_dependencies prepare_common_env prepare_gpu_env trap exit_handler EXIT diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index cea01b03d..8c7de3b48 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1246,8 +1246,6 @@ function prepare_gpu_env(){ RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly RAPIDS_RUNTIME - install_dependencies - set_cuda_version set_driver_version set_cuda_repo_shortname diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index c5a204703..77649fb19 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -60,6 +60,7 @@ function exit_handler() { } function prepare_to_install(){ + install_dependencies prepare_common_env prepare_gpu_env trap exit_handler EXIT From 2e7441ba75c245d0e719aa4b911df6718455bff6 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 02:29:29 -0800 Subject: [PATCH 040/130] re-thought about the dependencies install time --- templates/common/util_functions | 9 +++++++++ templates/gpu/install_gpu_driver.sh.in | 1 - templates/gpu/util_functions | 7 ------- templates/spark-rapids/spark-rapids.sh.in | 1 - 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 114fce6a5..bb5073cd6 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -541,6 +541,13 @@ function check_secure_boot() { configure_dkms_certs } +function install_dependencies() { + pkg_list="pciutils screen" + if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} + elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi + lspci | grep -q NVIDIA || exit 0 +} + function prepare_common_env() { define_os_comparison_functions @@ -598,6 +605,8 @@ function prepare_common_env() { time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero ) fi + install_dependencies + # Monitor disk usage in a screen session df / > "/run/disk-usage.log" touch "/run/keep-running-df" diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 7974640ec..8ce2088d7 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -54,7 +54,6 @@ function exit_handler() { } function prepare_to_install(){ - install_dependencies prepare_common_env prepare_gpu_env trap exit_handler EXIT diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 8c7de3b48..d8371a258 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1218,13 +1218,6 @@ function install_build_dependencies() { touch "${workdir}/complete/build-dependencies" } -function install_dependencies() { - pkg_list="pciutils screen" - if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} - elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi - lspci | grep -q NVIDIA || exit 0 -} - function prepare_gpu_env(){ nvsmi_works="0" NVIDIA_SMI_PATH='/usr/bin' diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 77649fb19..c5a204703 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -60,7 +60,6 @@ function exit_handler() { } function prepare_to_install(){ - install_dependencies prepare_common_env prepare_gpu_env trap exit_handler EXIT From 29631a0c615a4730c87f50fc09910ddcf8b8a22f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 14:20:38 -0800 Subject: [PATCH 041/130] refactored configure_gpu_exclusive_mode to fewer lines --- templates/gpu/util_functions | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index d8371a258..03417020b 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -981,6 +981,7 @@ function install_gpu_agent() { | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" python3 -m venv "${venv}" + ( source "${venv}/bin/activate" python3 -m pip install --upgrade pip @@ -1012,13 +1013,10 @@ EOF } function configure_gpu_exclusive_mode() { - # check if running spark 3, if not, enable GPU exclusive mode - local spark_version - spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) - if [[ ${spark_version} != 3.* ]]; then - # include exclusive mode on GPU - nvidia-smi -c EXCLUSIVE_PROCESS - fi + # only run this function when spark < 3.0 + if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi + # include exclusive mode on GPU + nvidia-smi -c EXCLUSIVE_PROCESS } function fetch_mig_scripts() { From 7ea7653f47fa9bb2c2c6bdaa6c8dc21bdfaf59d5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 15:01:17 -0800 Subject: [PATCH 042/130] refactored gpu-related code out of common function library ; less reactive to not having GPU --- templates/common/util_functions | 8 +-- templates/gpu/install_gpu_driver.sh.in | 1 - templates/gpu/util_functions | 87 +++++++++++++++----------- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index bb5073cd6..9c2e89372 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -173,11 +173,6 @@ function configure_yarn_resources() { # This configuration should be applied only if GPU is attached to the node function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' set_hadoop_property 'yarn-site.xml' \ @@ -542,10 +537,11 @@ function check_secure_boot() { } function install_dependencies() { + test -f "${workdir}/complete/install-dependencies" && return 0 pkg_list="pciutils screen" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi - lspci | grep -q NVIDIA || exit 0 + touch "${workdir}/complete/install-dependencies" } function prepare_common_env() { diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 8ce2088d7..ae4693f16 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -21,7 +21,6 @@ function main() { install_nvidia_nccl install_nvidia_cudnn fi - install_nvidia_container_toolkit if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then install_spark_rapids diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 03417020b..71b5c89d0 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1096,8 +1096,6 @@ EOF chmod a+rx "${gpus_resources_script}" local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" - local gpu_count - gpu_count="$(lspci | grep NVIDIA | wc -l)" local executor_cores executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" @@ -1128,6 +1126,15 @@ spark.yarn.unmanagedAM.enabled=false EOF } +function configure_yarn_nodemanager_gpu() { + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}" + configure_yarn_nodemanager +} + function configure_gpu_isolation() { # enable GPU isolation sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg" @@ -1217,6 +1224,7 @@ function install_build_dependencies() { } function prepare_gpu_env(){ + gpu_count="$(lspci | grep -q NVIDIA | wc -l)" nvsmi_works="0" NVIDIA_SMI_PATH='/usr/bin' MIG_MAJOR_CAPS=0 @@ -1321,48 +1329,51 @@ function setup_gpu_yarn() { # regardless if they have attached GPUs configure_yarn_resources - # Detect NVIDIA GPU - if (lspci | grep -q NVIDIA); then - # if this is called without the MIG script then the drivers are not installed - migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" - if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts - fi - fi + # When there is no GPU, but the installer is executing on a master node: + if [[ "${gpu_count}" == "0" ]] ; then + if [[ "${ROLE}" == "Master" ]]; then + configure_yarn_nodemanager fi + return 0 + fi - # if mig is enabled drivers would have already been installed - if [[ $IS_MIG_ENABLED -eq 0 ]]; then - install_nvidia_gpu_driver - install_cuda - load_kernel_module - - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - install_gpu_agent -# install_gpu_monitoring_agent - echo 'GPU metrics agent successfully deployed.' - else - echo 'GPU metrics agent has not been installed.' + # if this is called without the MIG script then the drivers are not installed + migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" + if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" + + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then + if (echo "${migquery_result}" | grep Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` + fetch_mig_scripts fi - configure_gpu_exclusive_mode fi + fi - configure_yarn_nodemanager - configure_gpu_script - configure_gpu_isolation - elif [[ "${ROLE}" == "Master" ]]; then - configure_yarn_nodemanager - configure_gpu_script + # if mig is enabled drivers would have already been installed + if [[ $IS_MIG_ENABLED -eq 0 ]]; then + install_nvidia_gpu_driver + install_cuda + load_kernel_module + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent +# install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode fi + + install_nvidia_container_toolkit + configure_yarn_nodemanager_gpu + configure_gpu_script + configure_gpu_isolation } function gpu_exit_handler() { From 70349a655fda55974339a765f371b7cf95289403 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 15:11:24 -0800 Subject: [PATCH 043/130] being more surgical about signing material usage --- templates/common/util_functions | 5 ----- templates/gpu/util_functions | 9 ++++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 9c2e89372..c093b43f3 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -532,8 +532,6 @@ function check_secure_boot() { mok_der=/var/lib/shim-signed/mok/MOK.der else mok_key=/var/lib/dkms/mok.key mok_der=/var/lib/dkms/mok.pub ; fi - - configure_dkms_certs } function install_dependencies() { @@ -613,9 +611,6 @@ function prepare_common_env() { } function common_exit_handler() { - # Purge private key material until next grant - clear_dkms_key - set +ex echo "Exit handler invoked" diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 71b5c89d0..d5975e566 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -657,12 +657,14 @@ function build_driver_from_github() { 2> kernel-open/build_error.log # Sign kernel modules if [[ -n "${PSN}" ]]; then + configure_dkms_certs for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ "${mok_key}" \ "${mok_der}" \ "${module}" done + clear_dkms_key fi make modules_install \ >> kernel-open/build.log \ @@ -702,12 +704,10 @@ function build_driver_from_packages() { add_contrib_component apt-get update -qq execute_with_retries apt-get install -y -qq --no-install-recommends dkms - #configure_dkms_certs execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" sync elif is_rocky ; then - #configure_dkms_certs if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else @@ -715,7 +715,6 @@ function build_driver_from_packages() { fi sync fi - #clear_dkms_key } function install_nvidia_userspace_runfile() { @@ -767,7 +766,7 @@ function install_nvidia_userspace_runfile() { echo "cache hit" else install_build_dependencies - + configure_dkms_certs local signing_options signing_options="" if [[ -n "${PSN}" ]]; then @@ -778,7 +777,6 @@ function install_nvidia_userspace_runfile() { --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ " fi - runfile_args="--no-dkms ${signing_options}" fi } @@ -797,6 +795,7 @@ function install_nvidia_userspace_runfile() { gcloud storage cat "${gcs_tarball}" | tar -C / -xzv depmod -a else + clear_dkms_key tar czvf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') From b5473c58f7fcedf7f6e74d822c706dbed4b00b7c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 15:48:35 -0800 Subject: [PATCH 044/130] removed dependency on pciutils ; defined is_debuntu with other os comparison functions --- templates/common/util_functions | 5 ++--- templates/gpu/util_functions | 9 ++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index c093b43f3..b35407074 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -28,10 +28,9 @@ function define_os_comparison_functions() { eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" done done + eval "function is_debuntu() ( set +x ; is_debian || is_ubuntu ; )" } -function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) - function os_vercat() ( set +x if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' @@ -536,7 +535,7 @@ function check_secure_boot() { function install_dependencies() { test -f "${workdir}/complete/install-dependencies" && return 0 - pkg_list="pciutils screen" + pkg_list="screen" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi touch "${workdir}/complete/install-dependencies" diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index d5975e566..f10c15f06 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1223,7 +1223,8 @@ function install_build_dependencies() { } function prepare_gpu_env(){ - gpu_count="$(lspci | grep -q NVIDIA | wc -l)" + gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" + echo "gpu_count=[${gpu_count}]" nvsmi_works="0" NVIDIA_SMI_PATH='/usr/bin' MIG_MAJOR_CAPS=0 @@ -1294,7 +1295,9 @@ function configure_mig_cgi() { if test -n "${META_MIG_CGI_VALUE}"; then nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C else - if lspci | grep -q H100 ; then + # https://pci-ids.ucw.cz/v2.2/pci.ids + local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)" + if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then # run the following command to list placement profiles # nvidia-smi mig -lgipp # @@ -1309,7 +1312,7 @@ function configure_mig_cgi() { # For H100 3D controllers, use profile 19, 7x1G instances nvidia-smi mig -cgi 19 -C - elif lspci | grep -q A100 ; then + if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then # Dataproc only supports A100s right now split in 2 if not specified # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances nvidia-smi mig -cgi 9,9 -C From be3dbf665a7df16b638ac452e0622f36b68dfae4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 15:55:03 -0800 Subject: [PATCH 045/130] again I meant elif --- templates/gpu/util_functions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index f10c15f06..e6d3f38fa 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1312,7 +1312,7 @@ function configure_mig_cgi() { # For H100 3D controllers, use profile 19, 7x1G instances nvidia-smi mig -cgi 19 -C - if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then + elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then # Dataproc only supports A100s right now split in 2 if not specified # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances nvidia-smi mig -cgi 9,9 -C From 3ca8c913c28935ba95ab980b67bc0f6b0920973f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 16:12:02 -0800 Subject: [PATCH 046/130] fall back on metadata value if modulus_md5sum variable undefined --- templates/gpu/util_functions | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index e6d3f38fa..90e02b4b8 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -753,10 +753,8 @@ function install_nvidia_userspace_runfile() { test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" local_tarball="${workdir}/${build_tarball}" - local build_dir - if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] - then build_dir="${modulus_md5sum}" - else build_dir="unsigned" ; fi + local def_dir="${modulus_md5sum:-unsigned}" + local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" @@ -980,7 +978,6 @@ function install_gpu_agent() { | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" python3 -m venv "${venv}" - ( source "${venv}/bin/activate" python3 -m pip install --upgrade pip From 83d5ccc91af28e0218d119224fe0822b49ccec12 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 16:21:11 -0800 Subject: [PATCH 047/130] switch to other build_dir variable assignment --- templates/gpu/util_functions | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 90e02b4b8..529ea30ea 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -635,10 +635,8 @@ function build_driver_from_github() { test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" local local_tarball="${workdir}/${build_tarball}" - local build_dir - if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] - then build_dir="${modulus_md5sum}" - else build_dir="unsigned" ; fi + local def_dir="${modulus_md5sum:-unsigned}" + local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" From 93fdb304bdf2311821192dc47f6435db5addd3b5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 16:37:52 -0800 Subject: [PATCH 048/130] parens --- templates/gpu/util_functions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 529ea30ea..d2c2fe32e 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1098,7 +1098,7 @@ EOF local task_cpus=2 local gpu_amount gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" - if version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ; then gpu_amount="0.5" ; fi + if ( version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ) ; then gpu_amount="0.5" ; fi cat >>"${spark_defaults_conf}" < Date: Fri, 27 Dec 2024 17:01:52 -0800 Subject: [PATCH 049/130] allow failure when grepping PCI devices for 10DE --- templates/common/util_functions | 92 +++++++++++++++++---------------- templates/gpu/util_functions | 2 + 2 files changed, 50 insertions(+), 44 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index b35407074..cd26dbf83 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -594,17 +594,20 @@ function prepare_common_env() { fi # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + + ( set +e time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero - ) fi + ) - install_dependencies + install_dependencies - # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" - touch "/run/keep-running-df" - screen -d -m -LUS keep-running-df \ - bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" + # Monitor disk usage in a screen session + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" + screen -d -m -LUS keep-running-df \ + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" + fi touch "${workdir}/complete/prepare.common" } @@ -644,43 +647,45 @@ function common_exit_handler() { dnf clean all fi - # print disk usage statistics for large components - if is_ubuntu ; then - du -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /usr/lib \ - /opt/nvidia/* \ - /opt/conda/miniconda3 | sort -h - elif is_debian ; then - du -x -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \ - /var/lib/{docker,mysql,} \ - /opt/nvidia/* \ - /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ - /usr/bin \ - /usr \ - /var \ - / 2>/dev/null | sort -h - else - du -hs \ - /var/lib/docker \ - /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \ - /usr/lib64/google-cloud-sdk \ - /opt/nvidia/* \ - /opt/conda/miniconda3 - fi + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /opt/conda/miniconda3 | sort -h + elif is_debian ; then + du -x -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \ + /var/lib/{docker,mysql,} \ + /opt/nvidia/* \ + /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ + /usr/bin \ + /usr \ + /var \ + / 2>/dev/null | sort -h + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \ + /usr/lib64/google-cloud-sdk \ + /opt/nvidia/* \ + /opt/conda/miniconda3 + fi - # Process disk usage logs from installation period - rm -f /run/keep-running-df - sync - sleep 5.01s - # compute maximum size of disk during installation - # Log file contains logs like the following (minus the preceeding #): + # Process disk usage logs from installation period + rm -f /run/keep-running-df + sync + sleep 5.01s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): #Filesystem 1K-blocks Used Available Use% Mounted on #/dev/vda2 7096908 2611344 4182932 39% / - df / | tee -a "/run/disk-usage.log" + df / | tee -a "/run/disk-usage.log" - perl -e '@siz=( sort { $a => $b } + perl -e \ + '@siz=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } ); $max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; @@ -689,13 +694,12 @@ print( " samples-taken: ", scalar @siz, $/, "minimum-disk-used: $min", $/, " increased-by: $inc", $/ )' < "/run/disk-usage.log" - echo "exit_handler has completed" - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + # zero free disk space dd if=/dev/zero of=/zero sync sleep 3s rm -f /zero fi + echo "exit_handler has completed" } diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index d2c2fe32e..a7f611902 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1218,7 +1218,9 @@ function install_build_dependencies() { } function prepare_gpu_env(){ + set +e gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" + set -e echo "gpu_count=[${gpu_count}]" nvsmi_works="0" NVIDIA_SMI_PATH='/usr/bin' From 668db727d8078117c2b1fff93e9e52201c891754 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 17:36:38 -0800 Subject: [PATCH 050/130] removed listing of nodes_include ; does not work in custom-images context --- templates/gpu/install_gpu_driver.sh.in | 2 -- 1 file changed, 2 deletions(-) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index ae4693f16..be3baca89 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -34,8 +34,6 @@ function main() { fi # Restart YARN services if they are running already - nodes_include_gcs="gs:/$(get_metadata_attribute dataproc-bucket)/google-cloud-dataproc-metainfo/$(get_metadata_attribute dataproc-cluster-uuid)/nodes_include" - gsutil ls "${nodes_include_gcs}" for svc in resourcemanager nodemanager; do if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then systemctl stop "hadoop-yarn-${svc}.service" From 2193c2883671594e712597a6981952b024724c2c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 19:10:10 -0800 Subject: [PATCH 051/130] min spark version supported by newer rapids is insufficient ; xgboost version is fine where it is at --- templates/gpu/util_functions | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index a7f611902..3c5bb7f06 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1023,8 +1023,7 @@ function fetch_mig_scripts() { function install_spark_rapids() { # Update SPARK RAPIDS config - readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" - readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) + DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" @@ -1033,9 +1032,12 @@ function install_spark_rapids() { elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then DEFAULT_XGBOOST_VERSION="1.7.6" elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then - DEFAULT_XGBOOST_VERSION="1.6.2" + DEFAULT_XGBOOST_VERSION="1.7.6" + DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 fi + readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) + readonly DEFAULT_XGBOOST_VERSION readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) From 992d83acd3a1cac2a2607f31d46573b1691741ab Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 20:48:57 -0800 Subject: [PATCH 052/130] skipping fewer tests --- spark-rapids/test_spark_rapids.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 2d67a0df2..9b9481716 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -62,13 +62,6 @@ def verify_spark_job_sql(self): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids(self, configuration, machine_suffixes, accelerator): - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in 2.0 and earlier images") - - if ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ - ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA 12.4 (default) not supported on older debian/ubuntu releases") - optional_components = None metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" @@ -93,13 +86,6 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in 2.0 and earlier images") - - if ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ - ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA 12.4 (default) not supported on older debian/ubuntu releases") - optional_components = None metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" @@ -129,19 +115,16 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version, driver_version): - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0.1") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.1.1") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.1.1 not supported on older debian/ubuntu releases") if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in 2.0 and earlier images") - metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" ",cuda-version={0},driver-version={1}".format(cuda_version, driver_version)) From 7170872b973851103545811ed04a61964781357a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 21:15:49 -0800 Subject: [PATCH 053/130] simplified rapids / xgboost default version logic --- templates/gpu/util_functions | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 3c5bb7f06..1f6672051 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1023,22 +1023,17 @@ function fetch_mig_scripts() { function install_spark_rapids() { # Update SPARK RAPIDS config - DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_XGBOOST_VERSION="1.7.6" # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" - if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then - DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.3 - elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then - DEFAULT_XGBOOST_VERSION="1.7.6" - elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" ; then - DEFAULT_XGBOOST_VERSION="1.7.6" + + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 fi readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) - - readonly DEFAULT_XGBOOST_VERSION readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' From b33cb27b68c09d0591682779a3e0f92beadcf03b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 27 Dec 2024 23:59:40 -0800 Subject: [PATCH 054/130] ubuntu sometimes takes a while to bring gcloud online --- templates/common/util_functions | 3 +++ 1 file changed, 3 insertions(+) diff --git a/templates/common/util_functions b/templates/common/util_functions index cd26dbf83..9f7075f0b 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -589,6 +589,9 @@ function prepare_common_env() { apt-get -o DPkg::Lock::Timeout=60 -y autoremove if ge_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi + if is_ubuntu ; then + while ! command -v gcloud ; do sleep 5s ; done + fi else dnf clean all fi From c56440a80ab3dc2533021d6bea8473d57b15b482 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 28 Dec 2024 00:00:14 -0800 Subject: [PATCH 055/130] only using 24.08.1 on 2.2 images ; fix a typo in a comment --- templates/gpu/util_functions | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 1f6672051..26c4d02f9 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -246,7 +246,7 @@ function set_cuda_runfile_url() { ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ - ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not + ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ) @@ -1023,14 +1023,14 @@ function fetch_mig_scripts() { function install_spark_rapids() { # Update SPARK RAPIDS config - local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 local DEFAULT_XGBOOST_VERSION="1.7.6" # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" - if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then - DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.2" ]] ; then + DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" fi readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) From f10df49a93603a746274a8eb88fb20c682cd7133 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 29 Dec 2024 15:21:11 -0800 Subject: [PATCH 056/130] refactored ; these files should be quite similar now --- templates/gpu/install_gpu_driver.sh.in | 7 - templates/spark-rapids/mig.sh.in | 212 ++-------------------- templates/spark-rapids/spark-rapids.sh.in | 8 +- 3 files changed, 12 insertions(+), 215 deletions(-) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index be3baca89..ffdda45e4 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -33,13 +33,6 @@ function main() { echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" fi - # Restart YARN services if they are running already - for svc in resourcemanager nodemanager; do - if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then - systemctl stop "hadoop-yarn-${svc}.service" - systemctl start "hadoop-yarn-${svc}.service" - fi - done echo "main complete" return 0 } diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in index 0779a1c28..27da6ffd0 100644 --- a/templates/spark-rapids/mig.sh.in +++ b/templates/spark-rapids/mig.sh.in @@ -19,217 +19,27 @@ set -euxo pipefail [% INSERT gpu/util_functions %] -[% INSERT 'secure-boot/util_functions' %] - -function exit_handler() { - # Purge private key material until next grant - clear_dkms_key - - set +ex - echo "Exit handler invoked" - - # Clear pip cache - pip cache purge || echo "unable to purge pip cache" - - # If system memory was sufficient to mount memory-backed filesystems - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to unset global pip cache" - - # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do - if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then - umount -f ${shmdir} - fi - done - - # restart services stopped during preparation stage - # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' - fi - - if is_debuntu ; then - # Clean up OS package cache - apt-get -y -qq clean - apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove - # re-hold systemd package - if ge_debian12 ; then - apt-mark hold systemd libsystemd0 ; fi - hold_nvidia_packages - else - dnf clean all - fi - - # print disk usage statistics for large components - if is_ubuntu ; then - du -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 | sort -h - elif is_debian ; then - du -x -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /var/lib/{docker,mysql,} \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ - /usr/bin \ - /usr \ - /var \ - / 2>/dev/null | sort -h - else - du -hs \ - /var/lib/docker \ - /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ - /usr/lib64/google-cloud-sdk \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 - fi - - # Process disk usage logs from installation period - rm -f /run/keep-running-df - sync - sleep 5.01s - # compute maximum size of disk during installation - # Log file contains logs like the following (minus the preceeding #): -#Filesystem 1K-blocks Used Available Use% Mounted on -#/dev/vda2 7096908 2611344 4182932 39% / - df / | tee -a "/run/disk-usage.log" +function main() { + setup_gpu_yarn - perl -e '@siz=( sort { $a => $b } - map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; -print( " samples-taken: ", scalar @siz, $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" + echo "yarn setup complete" - echo "exit_handler has completed" + enable_and_configure_mig - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - dd if=/dev/zero of=/zero - sync - sleep 3s - rm -f /zero - fi + echo "main complete" + return 0 +} +function exit_handler() { + gpu_exit_handler + common_exit_handler return 0 } function prepare_to_install(){ - # Verify OS compatability and Secure boot state - check_os - check_secure_boot - + prepare_common_env prepare_gpu_env - - OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" - readonly OS_NAME - - # node role - ROLE="$(get_metadata_attribute dataproc-role)" - readonly ROLE - - workdir=/opt/install-dpgce - tmpdir=/tmp/ - temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" - readonly temp_bucket - readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" - uname_r=$(uname -r) - readonly uname_r - readonly bdcfg="/usr/local/bin/bdconfig" - export DEBIAN_FRONTEND=noninteractive - - mkdir -p "${workdir}" trap exit_handler EXIT - set_proxy - mount_ramdisk - - readonly install_log="${tmpdir}/install.log" - - if test -f "${workdir}/prepare-complete" ; then return ; fi - - repair_old_backports - - if is_debuntu ; then - clean_up_sources_lists - apt-get update -qq - apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove - if ge_debian12 ; then - apt-mark unhold systemd libsystemd0 ; fi - else - dnf clean all - fi - - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e - time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero - ) fi - - install_dependencies - - # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" - touch "/run/keep-running-df" - screen -d -m -LUS keep-running-df \ - bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" - - touch "${workdir}/prepare-complete" -} - -function enable_and_configure_mig() { - # default MIG to on when this script is used - META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") - - if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi - - enable_mig - - mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)" - - NUM_GPUS_WITH_DIFF_MIG_MODES= - if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled all on GPUs. Failing" ; exit 1 ; fi - if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured on but NOT enabled. Failing" ; exit 1 ; fi - - echo "MIG is fully enabled" - configure_mig_cgi -} - -function main() { - # default MIG to on when this script is used - META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") - - if ! (lspci | grep -q NVIDIA) ; then return ; fi - if [[ $META_MIG_VALUE -ne 0 ]]; then - # if the first invocation, the NVIDIA drivers and tools are not installed - if [[ -f "/usr/bin/nvidia-smi" ]]; then - # check to see if we already enabled mig mode and rebooted so we don't end - # up in infinite reboot loop - mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)" - NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)" - if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then - if (echo "${mig_mode_current}" | grep Enabled); then - echo "MIG is enabled on all GPUs, configuring instances" - configure_mig_cgi - exit 0 - else - echo "GPUs present but MIG is not enabled" - fi - else - echo "More than 1 GPU with MIG configured differently between them" - fi - fi - fi - - install_nvidia_gpu_driver - enable_and_configure_mig } prepare_to_install diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index c5a204703..ac8ec5c3f 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -19,6 +19,7 @@ # For details see # github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot # +[% PROCESS common/template_disclaimer %] set -euxo pipefail @@ -42,13 +43,6 @@ function main() { echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" fi - # Restart YARN services if they are running already - for svc in resourcemanager nodemanager; do - if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then - systemctl stop "hadoop-yarn-${svc}.service" - systemctl start "hadoop-yarn-${svc}.service" - fi - done echo "main complete" return 0 } From f3a103ec996f7079a21769ca2d49cb4931285468 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 12:01:51 -0800 Subject: [PATCH 057/130] returning spark-rapids/* to master ; this version of these templates was used to generate a spark-rapids/spark-rapids.sh which passes all master tests --- gpu/install_gpu_driver.sh | 2418 ++++++++++++++++++++----------- spark-rapids/mig.sh | 2201 ++++------------------------ spark-rapids/spark-rapids.sh | 10 +- templates/common/util_functions | 8 + templates/gpu/util_functions | 56 +- 5 files changed, 1917 insertions(+), 2776 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 25efb2a49..8d3d5aa84 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +# +# This initialization action is generated from +# initialization-actions/templates/gpu/install_gpu_driver.sh.in +# +# Modifications made directly to the generated file will be lost when +# the template is re-evaluated + # # This script installs NVIDIA GPU drivers and collects GPU utilization metrics. @@ -25,27 +33,29 @@ function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $ function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) -readonly -A supported_os=( - ['debian']="10 11 12" - ['rocky']="8 9" - ['ubuntu']="18.04 20.04 22.04" -) - -# dynamically define OS version test utility functions -if [[ "$(os_id)" == "rocky" ]]; -then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') -else _os_version="$(os_version)"; fi -for os_id_val in 'rocky' 'ubuntu' 'debian' ; do - eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" - - for osver in $(echo "${supported_os["${os_id_val}"]}") ; do - eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" - eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" - eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" +function define_os_comparison_functions() { + + readonly -A supported_os=( + ['debian']="10 11 12" + ['rocky']="8 9" + ['ubuntu']="18.04 20.04 22.04" + ) + + # dynamically define OS version test utility functions + if [[ "$(os_id)" == "rocky" ]]; + then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') + else _os_version="$(os_version)"; fi + for os_id_val in 'rocky' 'ubuntu' 'debian' ; do + eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" + + for osver in $(echo "${supported_os["${os_id_val}"]}") ; do + eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" + eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" + eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" + done done -done - -function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) + eval "function is_debuntu() ( set +x ; is_debian || is_ubuntu ; )" +} function os_vercat() ( set +x if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' @@ -53,7 +63,7 @@ function os_vercat() ( set +x else os_version ; fi ; ) function repair_old_backports { - if ge_debian12 || ! is_debuntu ; then return ; fi + if ! is_debuntu ; then return ; fi # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this # problem, we will use archive.debian.org for the oldoldstable repo @@ -94,6 +104,7 @@ function print_metadata_value_if_exists() { return ${return_code} } +# replicates /usr/share/google/get_metadata_value function get_metadata_value() ( set +x local readonly varname=$1 @@ -117,67 +128,719 @@ function get_metadata_attribute() ( get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" ) -OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') -distribution=$(. /etc/os-release;echo $ID$VERSION_ID) -readonly OS_NAME - -# node role -ROLE="$(get_metadata_attribute dataproc-role)" -readonly ROLE - -# CUDA version and Driver version -# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html -# https://developer.nvidia.com/cuda-downloads -# Rocky8: 12.0: 525.147.05 -readonly -A DRIVER_FOR_CUDA=( - ["11.8"]="560.35.03" - ["12.0"]="525.60.13" ["12.4"]="560.35.03" ["12.6"]="560.35.03" -) -# https://developer.nvidia.com/cudnn-downloads -if is_debuntu ; then -readonly -A CUDNN_FOR_CUDA=( - ["11.8"]="9.5.1.17" - ["12.0"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.6"]="9.5.1.17" -) -elif is_rocky ; then -# rocky: -# 12.0: 8.8.1.3 -# 12.1: 8.9.3.28 -# 12.2: 8.9.7.29 -# 12.3: 9.0.0.312 -# 12.4: 9.1.1.17 -# 12.5: 9.2.1.18 -# 12.6: 9.5.1.17 -readonly -A CUDNN_FOR_CUDA=( - ["11.8"]="9.5.1.17" - ["12.0"]="8.8.1.3" ["12.4"]="9.1.1.17" ["12.6"]="9.5.1.17" -) -fi -# https://developer.nvidia.com/nccl/nccl-download -# 12.2: 2.19.3, 12.5: 2.21.5 -readonly -A NCCL_FOR_CUDA=( - ["11.8"]="2.15.5" - ["12.0"]="2.16.5" ["12.4"]="2.23.4" ["12.6"]="2.23.4" -) -readonly -A CUDA_SUBVER=( - ["11.8"]="11.8.0" - ["12.0"]="12.0.0" ["12.4"]="12.4.1" ["12.6"]="12.6.2" +function execute_with_retries() ( + set +x + local -r cmd="$*" + + if [[ "$cmd" =~ "^apt-get install" ]] ; then + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + fi + for ((i = 0; i < 3; i++)); do + set -x + time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + set +x + if [[ $retval == 0 ]] ; then return 0 ; fi + sleep 5 + done + return 1 ) -RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') -readonly DEFAULT_CUDA_VERSION='12.4' -CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") -if ( ( ge_debian12 || ge_rocky9 ) && version_le "${CUDA_VERSION%%.*}" "11" ) ; then - # CUDA 11 no longer supported on debian12 - 2024-11-22, rocky9 - 2024-11-27 - CUDA_VERSION="${DEFAULT_CUDA_VERSION}" -fi +function cache_fetched_package() { + local src_url="$1" + local gcs_fn="$2" + local local_fn="$3" -if ( version_ge "${CUDA_VERSION}" "12" && (le_debian11 || le_ubuntu18) ) ; then - # Only CUDA 12.0 supported on older debuntu - CUDA_VERSION="12.0" -fi -readonly CUDA_VERSION -readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}" + if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then + time gcloud storage cp "${gcs_fn}" "${local_fn}" + else + time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ + gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) + fi +} + +function add_contrib_component() { + if ! is_debuntu ; then return ; fi + if ge_debian12 ; then + # Include in sources file components on which nvidia-kernel-open-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list + fi +} + +function set_hadoop_property() { + local -r config_file=$1 + local -r property=$2 + local -r value=$3 + "${bdcfg}" set_property \ + --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ + --name "${property}" --value "${value}" \ + --clobber +} + +function configure_yarn_resources() { + if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' +} + +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' + + # Fix local dirs access permissions + local yarn_local_dirs=() + + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') + + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi +} + +function clean_up_sources_lists() { + # + # bigtop (primary) + # + local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" + + if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then + region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" + + local regional_bigtop_repo_uri + regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | + sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | + grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | + cut -d ' ' -f 2 | + head -1) + + if [[ "${regional_bigtop_repo_uri}" == */ ]]; then + local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" + else + local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" + fi + + local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" + rm -f "${bigtop_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ + "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" + + sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + fi + + # + # adoptium + # + # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu + local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" + local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" + rm -f "${adoptium_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ + | gpg --dearmor -o "${adoptium_kr_path}" + echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ + > /etc/apt/sources.list.d/adoptium.list + + + # + # docker + # + local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" + local docker_repo_file="/etc/apt/sources.list.d/docker.list" + local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" + + rm -f "${docker_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ + | gpg --dearmor -o "${docker_kr_path}" + echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ + > ${docker_repo_file} + + # + # google cloud + logging/monitoring + # + if ls /etc/apt/sources.list.d/google-cloud*.list ; then + rm -f /usr/share/keyrings/cloud.google.gpg + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg + for list in google-cloud google-cloud-logging google-cloud-monitoring ; do + list_file="/etc/apt/sources.list.d/${list}.list" + if [[ -f "${list_file}" ]]; then + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" + fi + done + fi + + # + # cran-r + # + if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" + if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi + rm -f /usr/share/keyrings/cran-r.gpg + curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ + gpg --dearmor -o /usr/share/keyrings/cran-r.gpg + sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list + fi + + # + # mysql + # + if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then + rm -f /usr/share/keyrings/mysql.gpg + curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ + gpg --dearmor -o /usr/share/keyrings/mysql.gpg + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list + fi + + if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi + +} + +function set_proxy(){ + METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" + + if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi + + export METADATA_HTTP_PROXY + export http_proxy="${METADATA_HTTP_PROXY}" + export https_proxy="${METADATA_HTTP_PROXY}" + export HTTP_PROXY="${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="${METADATA_HTTP_PROXY}" + no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" + local no_proxy_svc + for no_proxy_svc in compute secretmanager dns servicedirectory logging \ + bigquery composer pubsub bigquerydatatransfer dataflow \ + storage datafusion ; do + no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" + done + + export NO_PROXY="${no_proxy}" +} + +function mount_ramdisk(){ + local free_mem + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + + # Write to a ramdisk instead of churning the persistent disk + + tmpdir="/mnt/shm" + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" + + # Download conda packages to tmpfs + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" + + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge + pip cache purge || echo "unable to purge pip cache" + + # Download pip packages to tmpfs + pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives + else + mount -t tmpfs tmpfs /var/cache/dnf + fi +} + +function check_os() { + if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then + echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." + exit 1 + elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then + echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." + exit 1 + elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then + echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." + exit 1 + fi + + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" + readonly SPARK_VERSION + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 + fi + + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi +} + +# +# Generate repo file under /etc/apt/sources.list.d/ +# +function apt_add_repo() { + local -r repo_name="$1" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local -r include_src="${4:-yes}" + local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" + + echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" + if [[ "${include_src}" == "yes" ]] ; then + echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + fi + + apt-get update -qq +} + +# +# Generate repo file under /etc/yum.repos.d/ +# +function dnf_add_repo() { + local -r repo_name="$1" + local -r repo_url="$3" # "http(s)://host/path/filename.repo" + local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" + + curl -s -L "${repo_url}" \ + | dd of="${repo_path}" status=progress +# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ +} + +# +# Keyrings default to +# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or +# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) +# +function os_add_repo() { + local -r repo_name="$1" + local -r signing_key_url="$2" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local kr_path + if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi + + mkdir -p "$(dirname "${kr_path}")" + + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ + | gpg --import --no-default-keyring --keyring "${kr_path}" + + if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" + else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi +} + +function configure_dkms_certs() { + if test -v PSN && [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping"; + return 0 + fi + + mkdir -p "${CA_TMPDIR}" + + # If the private key exists, verify it + if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then + echo "Private key material exists" + + local expected_modulus_md5sum + expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) + if [[ -n "${expected_modulus_md5sum}" ]]; then + modulus_md5sum="${expected_modulus_md5sum}" + + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched rsa key" + fi + + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched x509 cert" + fi + else + modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" + fi + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + + return + fi + + # Retrieve cloud secrets keys + local sig_priv_secret_name + sig_priv_secret_name="${PSN}" + local sig_pub_secret_name + sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" + local sig_secret_project + sig_secret_project="$(get_metadata_attribute secret_project)" + local sig_secret_version + sig_secret_version="$(get_metadata_attribute secret_version)" + + # If metadata values are not set, do not write mok keys + if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi + + # Write private material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_priv_secret_name}" \ + | dd status=none of="${CA_TMPDIR}/db.rsa" + + # Write public material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_pub_secret_name}" \ + | base64 --decode \ + | dd status=none of="${CA_TMPDIR}/db.der" + + local mok_directory="$(dirname "${mok_key}")" + mkdir -p "${mok_directory}" + + # symlink private key and copy public cert from volatile storage to DKMS directory + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + cp -f "${CA_TMPDIR}/db.der" "${mok_der}" + + modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" +} + +function clear_dkms_key { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping" >&2 + return 0 + fi + rm -rf "${CA_TMPDIR}" "${mok_key}" +} + +function check_secure_boot() { + local SECURE_BOOT="disabled" + SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') + + PSN="$(get_metadata_attribute private_secret_name)" + readonly PSN + + if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then + echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." + exit 1 + elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then + echo "Secure boot is enabled, but no signing material provided." + echo "Please either disable secure boot or provide signing material as per" + echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" + return 1 + fi + + CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" + readonly CA_TMPDIR + + if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv + mok_der=/var/lib/shim-signed/mok/MOK.der + else mok_key=/var/lib/dkms/mok.key + mok_der=/var/lib/dkms/mok.pub ; fi +} + +function install_dependencies() { + test -f "${workdir}/complete/install-dependencies" && return 0 + pkg_list="screen" + if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} + elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi + touch "${workdir}/complete/install-dependencies" +} + +function prepare_common_env() { + define_os_comparison_functions + + # Verify OS compatability and Secure boot state + check_os + check_secure_boot + + readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" + + # Dataproc configurations + readonly HADOOP_CONF_DIR='/etc/hadoop/conf' + readonly HIVE_CONF_DIR='/etc/hive/conf' + readonly SPARK_CONF_DIR='/etc/spark/conf' + + OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" + readonly OS_NAME + + # node role + ROLE="$(get_metadata_attribute dataproc-role)" + readonly ROLE + + workdir=/opt/install-dpgce + tmpdir=/tmp/ + temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + readonly temp_bucket + readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" + uname_r=$(uname -r) + readonly uname_r + readonly bdcfg="/usr/local/bin/bdconfig" + export DEBIAN_FRONTEND=noninteractive + + mkdir -p "${workdir}/complete" + set_proxy + mount_ramdisk + + readonly install_log="${tmpdir}/install.log" + + if test -f "${workdir}/complete/prepare.common" ; then return ; fi + + repair_old_backports + + if is_debuntu ; then + clean_up_sources_lists + apt-get update -qq + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + if ge_debian12 ; then + apt-mark unhold systemd libsystemd0 ; fi + if is_ubuntu ; then + while ! command -v gcloud ; do sleep 5s ; done + fi + else + dnf clean all + fi + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + + ( set +e + time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero + ) + + install_dependencies + + # Monitor disk usage in a screen session + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" + screen -d -m -LUS keep-running-df \ + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" + fi + + touch "${workdir}/complete/prepare.common" +} + +function common_exit_handler() { + set +ex + echo "Exit handler invoked" + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # Restart YARN services if they are running already + for svc in resourcemanager nodemanager; do + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl stop "hadoop-yarn-${svc}.service" + systemctl start "hadoop-yarn-${svc}.service" + fi + done + + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + done + + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi + + if is_debuntu ; then + # Clean up OS package cache + apt-get -y -qq clean + apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove + # re-hold systemd package + if ge_debian12 ; then + apt-mark hold systemd libsystemd0 ; fi + hold_nvidia_packages + else + dnf clean all + fi + + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /opt/conda/miniconda3 | sort -h + elif is_debian ; then + du -x -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \ + /var/lib/{docker,mysql,} \ + /opt/nvidia/* \ + /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ + /usr/bin \ + /usr \ + /var \ + / 2>/dev/null | sort -h + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \ + /usr/lib64/google-cloud-sdk \ + /opt/nvidia/* \ + /opt/conda/miniconda3 + fi + + # Process disk usage logs from installation period + rm -f /run/keep-running-df + sync + sleep 5.01s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" + + perl -e \ + '@siz=( sort { $a => $b } + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" + + + # zero free disk space + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero + fi + echo "exit_handler has completed" +} + + +function set_support_matrix() { + # CUDA version and Driver version + # https://docs.nvidia.com/deploy/cuda-compatibility/ + # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html + # https://developer.nvidia.com/cuda-downloads + + # Minimum supported version for open kernel driver is 515.43.04 + # https://github.com/NVIDIA/open-gpu-kernel-modules/tags + # Rocky8: 12.0: 525.147.05 + local latest + latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" + readonly -A DRIVER_FOR_CUDA=( + ["11.7"]="515.65.01" ["11.8"]="525.147.05" + ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ) + readonly -A DRIVER_SUBVER=( + ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" + ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" + ) + # https://developer.nvidia.com/cudnn-downloads + if is_debuntu ; then + readonly -A CUDNN_FOR_CUDA=( + ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" + ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" + ) + elif is_rocky ; then + # rocky: + # 12.0: 8.8.1.3 + # 12.1: 8.9.3.28 + # 12.2: 8.9.7.29 + # 12.3: 9.0.0.312 + # 12.4: 9.1.1.17 + # 12.5: 9.2.1.18 + # 12.6: 9.5.1.17 + readonly -A CUDNN_FOR_CUDA=( + ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" + ) + fi + # https://developer.nvidia.com/nccl/nccl-download + # 12.2: 2.19.3, 12.5: 2.21.5 + readonly -A NCCL_FOR_CUDA=( + ["11.7"]="2.21.5" ["11.8"]="2.21.5" + ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" + ) + readonly -A CUDA_SUBVER=( + ["11.7"]="11.7.1" ["11.8"]="11.8.0" + ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" + ) +} + +set_support_matrix + +function set_cuda_version() { + local cuda_url + cuda_url=$(get_metadata_attribute 'cuda-url' '') + if [[ -n "${cuda_url}" ]] ; then + # if cuda-url metadata variable has been passed, extract default version from url + local CUDA_URL_VERSION + CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" + if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then + DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" + CUDA_FULL_VERSION="${CUDA_URL_VERSION}" + fi + fi + + if ( ! test -v DEFAULT_CUDA_VERSION ) ; then + DEFAULT_CUDA_VERSION='12.4.1' + fi + # EXCEPTIONS + # Change default CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) + case "${DATAPROC_IMAGE_VERSION}" in + "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; + "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; + "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; + * ) + echo "unrecognized Dataproc image version" + exit 1 + ;; + esac + + if le_ubuntu18 ; then + DEFAULT_CUDA_VERSION="12.1.1" + CUDA_VERSION_MAJOR="${DEFAULT_CUDA_VERSION%.*}" #12.1 + fi + readonly DEFAULT_CUDA_VERSION + + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") + if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then + CUDA_FULL_VERSION="${CUDA_VERSION}" + CUDA_VERSION="${CUDA_VERSION%.*}" + fi + readonly CUDA_VERSION + if ( ! test -v CUDA_FULL_VERSION ) ; then + CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} + fi + readonly CUDA_FULL_VERSION + +} function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) @@ -187,110 +850,179 @@ function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; ) function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; ) -DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}" -if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then - DEFAULT_DRIVER="560.28.03" ; fi -if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03" ; fi -if ( is_rocky && le_cuda11 ) ; then DEFAULT_DRIVER="525.147.05" ; fi -if ( is_ubuntu20 && le_cuda11 ) ; then DEFAULT_DRIVER="535.183.06" ; fi -if ( is_rocky9 && ge_cuda12 ) ; then DEFAULT_DRIVER="565.57.01" ; fi -DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") +function set_driver_version() { + local gpu_driver_url + gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '') + + local cuda_url + cuda_url=$(get_metadata_attribute 'cuda-url' '') + + local DEFAULT_DRIVER + # Take default from gpu-driver-url metadata value + if [[ -n "${gpu_driver_url}" ]] ; then + DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')" + if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi + # Take default from cuda-url metadata value as a backup + elif [[ -n "${cuda_url}" ]] ; then + local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')" + if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then + major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" + driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} + if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then + # use the version indicated by the cuda url as the default if it exists + DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" + elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then + # use the maximum sub-version available for the major version indicated in cuda url as the default + DEFAULT_DRIVER="${driver_max_maj_version}" + fi + fi + fi -readonly DRIVER_VERSION -readonly DRIVER=${DRIVER_VERSION%%.*} + if ( ! test -v DEFAULT_DRIVER ) ; then + # If a default driver version has not been extracted, use the default for this version of CUDA + DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} + fi -readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" -readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" + DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") -# Parameters for NVIDIA-provided cuDNN library -readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} -CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") -function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) -function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) -# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} -if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" -elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then - # cuDNN v8 is not distribution for ubuntu20+, debian12 - CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" -elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then - # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 - CUDNN_VERSION="8.8.0.121" -fi -readonly CUDNN_VERSION + readonly DRIVER_VERSION + readonly DRIVER="${DRIVER_VERSION%%.*}" -readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} -readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) + export DRIVER_VERSION DRIVER -# Parameters for NVIDIA-provided Debian GPU driver -readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" + exit 1 + fi +} -readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") +function set_cudnn_version() { + readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" + readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" + + # Parameters for NVIDIA-provided cuDNN library + readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} + CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") + # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} + if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then + CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" + elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then + # cuDNN v8 is not distribution for ubuntu20+, debian12 + CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" + elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then + # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 + CUDNN_VERSION="8.8.0.121" + fi + readonly CUDNN_VERSION +} -# Short name for urls -if is_ubuntu22 ; then - # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at - # https://developer.download.nvidia.com/compute/machine-learning/repos/ - # use packages from previous release until such time as nvidia - # release ubuntu2204 builds - nccl_shortname="ubuntu2004" - shortname="$(os_id)$(os_vercat)" -elif ge_rocky9 ; then - # use packages from previous release until such time as nvidia - # release rhel9 builds +function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) +function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) - nccl_shortname="rhel8" - shortname="rhel9" -elif is_rocky ; then +function set_cuda_repo_shortname() { +# Short name for urls +# https://developer.download.nvidia.com/compute/cuda/repos/${shortname} + if is_rocky ; then shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" - nccl_shortname="${shortname}" -else + else shortname="$(os_id)$(os_vercat)" - nccl_shortname="${shortname}" -fi + fi +} -# Parameters for NVIDIA-provided package repositories -readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' -readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" +function set_nv_urls() { + # Parameters for NVIDIA-provided package repositories + readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' + readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" -# Parameters for NVIDIA-provided NCCL library -readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb" -NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") -readonly NCCL_REPO_URL -readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub + # Parameter for NVIDIA-provided Rocky Linux GPU driver + readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" +} function set_cuda_runfile_url() { - local RUNFILE_DRIVER_VERSION="${DRIVER_VERSION}" - local RUNFILE_CUDA_VERSION="${CUDA_FULL_VERSION}" - - if ge_cuda12 ; then - if ( le_debian11 || le_ubuntu18 ) ; then - RUNFILE_DRIVER_VERSION="525.60.13" - RUNFILE_CUDA_VERSION="12.0.0" - elif ( le_rocky8 && version_le "${DATAPROC_IMAGE_VERSION}" "2.0" ) ; then - RUNFILE_DRIVER_VERSION="525.147.05" - RUNFILE_CUDA_VERSION="12.0.0" + local MAX_DRIVER_VERSION + local MAX_CUDA_VERSION + + local MIN_OPEN_DRIVER_VER="515.48.07" + local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" + local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER + + if is_cuda12 ; then + if is_debian12 ; then + MIN_DRIVER_VERSION="545.23.06" + MIN_CUDA_VERSION="12.3.0" + elif is_debian10 ; then + MAX_DRIVER_VERSION="555.42.02" + MAX_CUDA_VERSION="12.5.0" + elif is_ubuntu18 ; then + MAX_DRIVER_VERSION="530.30.02" + MAX_CUDA_VERSION="12.1.1" + fi + elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + if le_debian10 ; then + # cuda 11 is not supported for <= debian10 + MAX_CUDA_VERSION="0" + MAX_DRIVER_VERSION="0" fi else - RUNFILE_DRIVER_VERSION="520.61.05" - RUNFILE_CUDA_VERSION="11.8.0" + echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + fi + + if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then + echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + fi + if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then + echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" + elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then + echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" fi - readonly RUNFILE_FILENAME="cuda_${RUNFILE_CUDA_VERSION}_${RUNFILE_DRIVER_VERSION}_linux.run" - CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${RUNFILE_CUDA_VERSION}" - DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${RUNFILE_FILENAME}" - readonly DEFAULT_NVIDIA_CUDA_URL + # driver version named in cuda runfile filename + # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) + readonly -A drv_for_cuda=( + ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" + ["11.8.0"]="520.61.05" + ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" + ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" + ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" + ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" + ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ + ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not + ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" + ) + + # Verify that the file with the indicated combination exists + local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]} + CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run" + local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" + local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") readonly NVIDIA_CUDA_URL -} -set_cuda_runfile_url + CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" + readonly CUDA_RUNFILE + + if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" + exit 1 + fi -# Parameter for NVIDIA-provided Rocky Linux GPU driver -readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" + if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then + echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" + elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then + echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then + echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then + echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" + fi +} +function set_cudnn_tarball_url() { CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then @@ -298,59 +1030,23 @@ if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then # When cuDNN version is greater than or equal to 8.4.1.50 use this format - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" - fi - # Use legacy url format with one of the tarball name formats depending on version as above - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" -fi -if ( version_ge "${CUDA_VERSION}" "12.0" ); then - # Use modern url format When cuda version is greater than or equal to 12.0 - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" -fi -readonly CUDNN_TARBALL -readonly CUDNN_TARBALL_URL - -# Whether to install NVIDIA-provided or OS-provided GPU driver -GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') -readonly GPU_DRIVER_PROVIDER - -# Stackdriver GPU agent parameters -readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' -# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver -INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') -readonly INSTALL_GPU_AGENT - -# Dataproc configurations -readonly HADOOP_CONF_DIR='/etc/hadoop/conf' -readonly HIVE_CONF_DIR='/etc/hive/conf' -readonly SPARK_CONF_DIR='/etc/spark/conf' - -NVIDIA_SMI_PATH='/usr/bin' -MIG_MAJOR_CAPS=0 -IS_MIG_ENABLED=0 - -function execute_with_retries() ( - set +x - local -r cmd="$*" - - if [[ "$cmd" =~ "^apt-get install" ]] ; then - apt-get -y clean - apt-get -y autoremove - fi - for ((i = 0; i < 3; i++)); do - set -x - time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } - set +x - if [[ $retval == 0 ]] ; then return 0 ; fi - sleep 5 - done - return 1 -) + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" + fi + # Use legacy url format with one of the tarball name formats depending on version as above + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" +fi +if ( version_ge "${CUDA_VERSION}" "12.0" ); then + # Use modern url format When cuda version is greater than or equal to 12.0 + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" +fi +readonly CUDNN_TARBALL +readonly CUDNN_TARBALL_URL +} -CUDA_KEYRING_PKG_INSTALLED="0" function install_cuda_keyring_pkg() { - if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi + if ( test -v CUDA_KEYRING_PKG_INSTALLED && + [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi local kr_ver=1.1 curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ @@ -365,8 +1061,9 @@ function uninstall_cuda_keyring_pkg() { CUDA_KEYRING_PKG_INSTALLED="0" } -CUDA_LOCAL_REPO_INSTALLED="0" function install_local_cuda_repo() { + if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi + if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi CUDA_LOCAL_REPO_INSTALLED="1" pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" @@ -387,20 +1084,20 @@ function install_local_cuda_repo() { "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ -o /etc/apt/preferences.d/cuda-repository-pin-600 fi + + touch "${workdir}/complete/install-local-cuda-repo" } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" - CUDA_LOCAL_REPO_INSTALLED="0" + rm -f "${workdir}/complete/install-local-cuda-repo" } -CUDNN_LOCAL_REPO_INSTALLED="0" -CUDNN_PKG_NAME="" function install_local_cudnn_repo() { - if [[ "${CUDNN_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi - pkgname="cudnn-local-repo-${shortname}-${CUDNN}" + if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi + pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN}/local_installers/${local_deb_fn}" + local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ @@ -410,20 +1107,19 @@ function install_local_cudnn_repo() { rm -f "${tmpdir}/local-installer.deb" - cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings + cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - CUDNN_LOCAL_REPO_INSTALLED="1" + touch "${workdir}/complete/install-local-cudnn-repo" } function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" - CUDNN_LOCAL_REPO_INSTALLED="0" + rm -f "${workdir}/complete/install-local-cudnn-repo" } -CUDNN8_LOCAL_REPO_INSTALLED="0" -CUDNN8_PKG_NAME="" function install_local_cudnn8_repo() { - if [[ "${CUDNN8_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi + if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi + if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" else return 0 ; fi @@ -437,61 +1133,136 @@ function install_local_cudnn8_repo() { deb_fn="${pkgname}_1.0-1_amd64.deb" local_deb_fn="${tmpdir}/${deb_fn}" - local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${local_deb_url}" -o "${local_deb_fn}" + local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" + + # cache the cudnn package + cache_fetched_package "${local_deb_url}" \ + "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ + "${local_deb_fn}" + + local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" + # If we are using a ram disk, mount another where we will unpack the cudnn local installer + if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then + mkdir -p "${cudnn_path}" + mount -t tmpfs tmpfs "${cudnn_path}" + fi dpkg -i "${local_deb_fn}" rm -f "${local_deb_fn}" - cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - CUDNN8_LOCAL_REPO_INSTALLED="1" + cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings + touch "${workdir}/complete/install-local-cudnn8-repo" } function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" - CUDNN8_LOCAL_REPO_INSTALLED="0" + rm -f "${workdir}/complete/install-local-cudnn8-repo" } function install_nvidia_nccl() { + readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} + readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) + + if test -f "${workdir}/complete/nccl" ; then return ; fi + + if is_cuda11 && is_debian12 ; then + echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" + return + fi + local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" - if is_rocky ; then - execute_with_retries \ - dnf -y -q install \ - "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" - sync - elif is_ubuntu ; then - install_cuda_keyring_pkg + # https://github.com/NVIDIA/nccl/blob/master/README.md + # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Fermi: SM_20, compute_30 + # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 + # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 + # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 + + # The following architectures are suppored by open kernel driver + # Volta: SM_70,SM_72, compute_70,compute_72 + # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 + + # The following architectures are supported by CUDA v11.8+ + # Ada: SM_89, compute_89 + # Hopper: SM_90,SM_90a compute_90,compute_90a + # Blackwell: SM_100, compute_100 + NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" + if version_ge "${CUDA_VERSION}" "11.8" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" + fi + if version_ge "${CUDA_VERSION}" "12.0" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" + fi - apt-get update -qq + mkdir -p "${workdir}" + pushd "${workdir}" - if is_ubuntu18 ; then - execute_with_retries \ - apt-get install -q -y \ - libnccl2 libnccl-dev - sync + test -d "${workdir}/nccl" || { + local tarball_fn="v${NCCL_VERSION}-1.tar.gz" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ + | tar xz + mv "nccl-${NCCL_VERSION}-1" nccl + } + + local build_path + if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else + build_path="nccl/build/pkg/rpm/x86_64" ; fi + + test -d "${workdir}/nccl/build" || { + local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" + + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + if echo "${output}" | grep -q "${gcs_tarball}" ; then + # cache hit - unpack from cache + echo "cache hit" else - execute_with_retries \ - apt-get install -q -y \ - "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" - sync + # build and cache + pushd nccl + # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install + install_build_dependencies + if is_debuntu ; then + # These packages are required to build .deb packages from source + execute_with_retries \ + apt-get install -y -qq build-essential devscripts debhelper fakeroot + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.debian.build + elif is_rocky ; then + # These packages are required to build .rpm packages from source + execute_with_retries \ + dnf -y -q install rpm-build rpmdevtools + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.redhat.build + fi + tar czvf "/${local_tarball}" "../${build_path}" + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd fi - else - echo "Unsupported OS: '${OS_NAME}'" - # NB: this tarball is 10GB in size, but can be used to install NCCL on non-ubuntu systems - # wget https://developer.download.nvidia.com/hpc-sdk/24.7/nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz - # tar xpzf nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz - # nvhpc_2024_247_Linux_x86_64_cuda_multi/install - return + gcloud storage cat "${gcs_tarball}" | tar xz + } + + if is_debuntu ; then + dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb" + elif is_rocky ; then + rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm" fi + + popd + touch "${workdir}/complete/nccl" } function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { + if test -f "${workdir}/complete/cudnn" ; then return ; fi local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version @@ -515,7 +1286,6 @@ function install_nvidia_cudnn() { if ge_debian12 && is_src_os ; then apt-get -y install nvidia-cudnn else - local CUDNN="${CUDNN_VERSION%.*}" if is_cudnn8 ; then install_local_cudnn8_repo @@ -525,6 +1295,8 @@ function install_nvidia_cudnn() { apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" + + uninstall_local_cudnn8_repo sync elif is_cudnn9 ; then install_cuda_keyring_pkg @@ -541,118 +1313,15 @@ function install_nvidia_cudnn() { echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi fi - elif is_ubuntu ; then - local -a packages - packages=( - "libcudnn${major_version}=${cudnn_pkg_version}" - "libcudnn${major_version}-dev=${cudnn_pkg_version}") - execute_with_retries \ - apt-get install -q -y --no-install-recommends "${packages[*]}" - sync else - echo "Unsupported OS: '${OS_NAME}'" + echo "Unsupported OS: '${_shortname}'" exit 1 fi ldconfig - echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." -} - -CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" -PSN="$(get_metadata_attribute private_secret_name)" -readonly PSN -function configure_dkms_certs() { - if [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping"; - return 0 - fi - - mkdir -p "${CA_TMPDIR}" - - # If the private key exists, verify it - if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then - echo "Private key material exists" - - local expected_modulus_md5sum - expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum) - if [[ -n "${expected_modulus_md5sum}" ]]; then - modulus_md5sum="${expected_modulus_md5sum}" - else - modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3" - fi - - # Verify that cert md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched rsa key modulus" - fi - ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key - - # Verify that key md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched x509 cert modulus" - fi - - return - fi - - - # Retrieve cloud secrets keys - local sig_priv_secret_name - sig_priv_secret_name="${PSN}" - local sig_pub_secret_name - sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" - local sig_secret_project - sig_secret_project="$(get_metadata_attribute secret_project)" - local sig_secret_version - sig_secret_version="$(get_metadata_attribute secret_version)" - - # If metadata values are not set, do not write mok keys - if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi - - # Write private material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_priv_secret_name}" \ - | dd status=none of="${CA_TMPDIR}/db.rsa" - - # Write public material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_pub_secret_name}" \ - | base64 --decode \ - | dd status=none of="${CA_TMPDIR}/db.der" - - # symlink private key and copy public cert from volatile storage for DKMS - if is_ubuntu ; then - mkdir -p /var/lib/shim-signed/mok - ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv - cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der - else - mkdir -p /var/lib/dkms/ - ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key - cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub - fi -} - -function clear_dkms_key { - if [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping" >&2 - return 0 - fi - rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv -} - -function add_contrib_component() { - if ge_debian12 ; then - # Include in sources file components on which nvidia-kernel-open-dkms depends - local -r debian_sources="/etc/apt/sources.list.d/debian.sources" - local components="main contrib" - - sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" - elif is_debian ; then - sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list - fi + echo "NVIDIA cuDNN successfully installed for ${_shortname}." + touch "${workdir}/complete/cudnn" } function add_nonfree_components() { @@ -668,76 +1337,93 @@ function add_nonfree_components() { fi } +# +# Install package signing key and add corresponding repository +# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html function add_repo_nvidia_container_toolkit() { - if is_debuntu ; then - local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg - local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list - # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html - test -f "${kr_path}" || - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ - | gpg --dearmor -o "${kr_path}" + local nvctk_root="https://nvidia.github.io/libnvidia-container" + local signing_key_url="${nvctk_root}/gpgkey" + local repo_data - test -f "${sources_list_path}" || - curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ - | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \ - | tee "${sources_list_path}" - fi + if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" + else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi + + os_add_repo nvidia-container-toolkit \ + "${signing_key_url}" \ + "${repo_data}" \ + "no" } function add_repo_cuda() { if is_debuntu ; then - local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg - local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" - echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ - | sudo tee "${sources_list_path}" - curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \ - -o "${kr_path}" + install_cuda_keyring_pkg # 11.7+, 12.0+ elif is_rocky ; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" - execute_with_retries "dnf clean all" fi } -readonly uname_r=$(uname -r) function build_driver_from_github() { - if is_ubuntu ; then - mok_key=/var/lib/shim-signed/mok/MOK.priv - mok_der=/var/lib/shim-signed/mok/MOK.der - else - mok_key=/var/lib/dkms/mok.key - mok_der=/var/lib/dkms/mok.pub - fi - workdir=/opt/install-nvidia-driver - mkdir -p "${workdir}" + # non-GPL driver will have been built on rocky8 + if is_rocky8 ; then return 0 ; fi pushd "${workdir}" + test -d "${workdir}/open-gpu-kernel-modules" || { - tarball_fn="${DRIVER_VERSION}.tar.gz" + local tarball_fn="${DRIVER_VERSION}.tar.gz" curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ | tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules } - cd open-gpu-kernel-modules - time make -j$(nproc) modules \ - > /var/log/open-gpu-kernel-modules-build.log \ - 2> /var/log/open-gpu-kernel-modules-build_error.log - sync + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local def_dir="${modulus_md5sum:-unsigned}" + local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") - if [[ -n "${PSN}" ]]; then - #configure_dkms_certs - for module in $(find kernel-open -name '*.ko'); do - "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ - "${mok_key}" \ - "${mok_der}" \ - "${module}" - done - #clear_dkms_key - fi + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + echo "cache hit" + else + # build the kernel modules + pushd open-gpu-kernel-modules + install_build_dependencies + if ( is_cuda11 && is_ubuntu22 ) ; then + echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" + exit 1 + fi + execute_with_retries make -j$(nproc) modules \ + > kernel-open/build.log \ + 2> kernel-open/build_error.log + # Sign kernel modules + if [[ -n "${PSN}" ]]; then + configure_dkms_certs + for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do + "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ + "${mok_key}" \ + "${mok_der}" \ + "${module}" + done + clear_dkms_key + fi + make modules_install \ + >> kernel-open/build.log \ + 2>> kernel-open/build_error.log + # Collect build logs and installed binaries + tar czvf "${local_tarball}" \ + "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd + fi + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a + } - make modules_install \ - >> /var/log/open-gpu-kernel-modules-build.log \ - 2>> /var/log/open-gpu-kernel-modules-build_error.log popd } @@ -760,12 +1446,10 @@ function build_driver_from_packages() { add_contrib_component apt-get update -qq execute_with_retries apt-get install -y -qq --no-install-recommends dkms - #configure_dkms_certs execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" sync elif is_rocky ; then - #configure_dkms_certs if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else @@ -773,26 +1457,108 @@ function build_driver_from_packages() { fi sync fi - #clear_dkms_key } function install_nvidia_userspace_runfile() { - if test -f "${tmpdir}/userspace-complete" ; then return ; fi - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${USERSPACE_URL}" -o "${tmpdir}/userspace.run" - execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}" - rm -f "${tmpdir}/userspace.run" - touch "${tmpdir}/userspace-complete" + # Parameters for NVIDIA-provided Debian GPU driver + readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + + readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") + + USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" + readonly USERSPACE_FILENAME + + # This .run file contains NV's OpenGL implementation as well as + # nvidia optimized implementations of the gtk+ 2,3 stack(s) not + # including glib (https://docs.gtk.org/glib/), and what appears to + # be a copy of the source from the kernel-open directory of for + # example DRIVER_VERSION=560.35.03 + # + # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz + # + # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run + # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. + if test -f "${workdir}/complete/userspace" ; then return ; fi + local local_fn="${tmpdir}/userspace.run" + + cache_fetched_package "${USERSPACE_URL}" \ + "${pkg_bucket}/${USERSPACE_FILENAME}" \ + "${local_fn}" + + local runfile_args + runfile_args="" + local cache_hit="0" + local local_tarball + + if is_rocky8 ; then + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local_tarball="${workdir}/${build_tarball}" + local def_dir="${modulus_md5sum:-unsigned}" + local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") + + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + cache_hit="1" + runfile_args="--no-kernel-modules" + echo "cache hit" + else + install_build_dependencies + configure_dkms_certs + local signing_options + signing_options="" + if [[ -n "${PSN}" ]]; then + signing_options="--module-signing-hash sha256 \ + --module-signing-x509-hash sha256 \ + --module-signing-secret-key \"${mok_key}\" \ + --module-signing-public-key \"${mok_der}\" \ + --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ + " + fi + runfile_args="--no-dkms ${signing_options}" + fi + } + else + runfile_args="--no-kernel-modules" + fi + + execute_with_retries bash "${local_fn}" -e -q \ + ${runfile_args} \ + --ui=none \ + --install-libglvnd \ + --tmpdir="${tmpdir}" + + if is_rocky8 ; then + if [[ "${cache_hit}" == "1" ]] ; then + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a + else + clear_dkms_key + tar czvf "${local_tarball}" \ + /var/log/nvidia-installer.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + fi + fi + + rm -f "${local_fn}" + touch "${workdir}/complete/userspace" sync } function install_cuda_runfile() { - if test -f "${tmpdir}/cuda-complete" ; then return ; fi - time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run" - execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}" - rm -f "${tmpdir}/cuda.run" - touch "${tmpdir}/cuda-complete" + if test -f "${workdir}/complete/cuda" ; then return ; fi + local local_fn="${tmpdir}/cuda.run" + + cache_fetched_package "${NVIDIA_CUDA_URL}" \ + "${pkg_bucket}/${CUDA_RUNFILE}" \ + "${local_fn}" + + execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" + rm -f "${local_fn}" + touch "${workdir}/complete/cuda" sync } @@ -808,12 +1574,11 @@ function install_cuda_toolkit() { if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} - sync elif is_rocky ; then # rocky9: cuda-11-[7,8], cuda-12-[1..6] execute_with_retries dnf -y -q install "${cudatk_package}" - sync fi + sync } function load_kernel_module() { @@ -830,57 +1595,120 @@ function load_kernel_module() { # TODO: if peermem is available, also modprobe nvidia-peermem } +function install_cuda(){ + if test -f "${workdir}/complete/cuda-repo" ; then return ; fi + + if ( ge_debian12 && is_src_os ) ; then + echo "installed with the driver on ${_shortname}" + return 0 + fi + + # The OS package distributions are unreliable + install_cuda_runfile + + # Includes CUDA packages + add_repo_cuda + + touch "${workdir}/complete/cuda-repo" +} + +function install_nvidia_container_toolkit() { + local container_runtime_default + if command -v docker ; then container_runtime_default='docker' + elif command -v containerd ; then container_runtime_default='containerd' + elif command -v crio ; then container_runtime_default='crio' + else container_runtime_default='' ; fi + CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}") + + if test -z "${CONTAINER_RUNTIME}" ; then return ; fi + + add_repo_nvidia_container_toolkit + if is_debuntu ; then + execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else + execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi + nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" + systemctl restart "${CONTAINER_RUNTIME}" +} + # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { + if test -f "${workdir}/complete/gpu-driver" ; then return ; fi + if ( ge_debian12 && is_src_os ) ; then add_nonfree_components - add_repo_nvidia_container_toolkit apt-get update -qq - #configure_dkms_certs apt-get -yq install \ - nvidia-container-toolkit \ - dkms \ - nvidia-open-kernel-dkms \ - nvidia-open-kernel-support \ - nvidia-smi \ - libglvnd0 \ - libcuda1 - #clear_dkms_key - elif ( le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ) ; then + dkms \ + nvidia-open-kernel-dkms \ + nvidia-open-kernel-support \ + nvidia-smi \ + libglvnd0 \ + libcuda1 + echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully" + return 0 + fi - install_nvidia_userspace_runfile + # OS driver packages do not produce reliable driver ; use runfile + install_nvidia_userspace_runfile - build_driver_from_github + build_driver_from_github - install_cuda_runfile - elif is_debuntu ; then - install_cuda_keyring_pkg + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + touch "${workdir}/complete/gpu-driver" +} - build_driver_from_packages +function install_ops_agent(){ + if test -f "${workdir}/complete/ops-agent" ; then return ; fi - install_cuda_toolkit - elif is_rocky ; then - add_repo_cuda + mkdir -p /opt/google + cd /opt/google + # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation + curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install - build_driver_from_packages + touch "${workdir}/complete/ops-agent" +} - install_cuda_toolkit - else - echo "Unsupported OS: '${OS_NAME}'" - exit 1 - fi - ldconfig - if is_src_os ; then - echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully" +# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics +function install_gpu_monitoring_agent() { + download_gpu_monitoring_agent + install_gpu_monitoring_agent_dependency + start_gpu_monitoring_agent_service +} + +function download_gpu_monitoring_agent(){ + if is_rocky ; then + execute_with_retries "dnf -y -q install git" else - echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + execute_with_retries "apt-get install git -y" fi + mkdir -p /opt/google + chmod 777 /opt/google + cd /opt/google + test -d compute-gpu-monitoring || \ + execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" +} + +function install_gpu_monitoring_agent_dependency(){ + cd /opt/google/compute-gpu-monitoring/linux + python3 -m venv venv + venv/bin/pip install wheel + venv/bin/pip install -Ur requirements.txt +} + +function start_gpu_monitoring_agent_service(){ + cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system + systemctl daemon-reload + systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_agent() { - if ! command -v pip; then - execute_with_retries "apt-get install -y -qq python-pip" + # Stackdriver GPU agent parameters +# local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' + local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics' + if ( ! command -v pip && is_debuntu ) ; then + execute_with_retries "apt-get install -y -qq python3-pip" fi local install_dir=/opt/gpu-utilization-agent mkdir -p "${install_dir}" @@ -890,7 +1718,13 @@ function install_gpu_agent() { "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" - execute_with_retries pip install -r "${install_dir}/requirements.txt" + local venv="${install_dir}/venv" + python3 -m venv "${venv}" +( + source "${venv}/bin/activate" + python3 -m pip install --upgrade pip + execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" +) sync # Generate GPU service. @@ -901,7 +1735,7 @@ Description=GPU Utilization Metric Agent [Service] Type=simple PIDFile=/run/gpu_agent.pid -ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"' +ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' User=root Group=root WorkingDirectory=/ @@ -916,75 +1750,50 @@ EOF systemctl --no-reload --now enable gpu-utilization-agent.service } -function set_hadoop_property() { - local -r config_file=$1 - local -r property=$2 - local -r value=$3 - "${bdcfg}" set_property \ - --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ - --name "${property}" --value "${value}" \ - --clobber -} - -function configure_yarn() { - if [[ -d "${HADOOP_CONF_DIR}" && ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} - -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - - # Fix local dirs access permissions - local yarn_local_dirs=() - - readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ - --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ - --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - - if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" - fi -} - function configure_gpu_exclusive_mode() { - # check if running spark 3, if not, enable GPU exclusive mode - local spark_version - spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) - if [[ ${spark_version} != 3.* ]]; then - # include exclusive mode on GPU - nvsmi -c EXCLUSIVE_PROCESS - fi + # only run this function when spark < 3.0 + if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi + # include exclusive mode on GPU + nvidia-smi -c EXCLUSIVE_PROCESS + clear_nvsmi_cache } function fetch_mig_scripts() { mkdir -p /usr/local/yarn-mig-scripts - sudo chmod 755 /usr/local/yarn-mig-scripts + chmod 755 /usr/local/yarn-mig-scripts wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - sudo chmod 755 /usr/local/yarn-mig-scripts/* + chmod 755 /usr/local/yarn-mig-scripts/* +} + +function install_spark_rapids() { + # Update SPARK RAPIDS config + local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 + local DEFAULT_XGBOOST_VERSION="1.7.6" + + # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu + local -r scala_ver="2.12" + + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.2" ]] ; then + DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + fi + + readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) + readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) + + local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' + + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" \ + -P /usr/lib/spark/jars/ } function configure_gpu_script() { @@ -1023,9 +1832,43 @@ EOF chmod a+rx "${gpus_resources_script}" local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" - if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then - echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}" - fi + + local executor_cores + executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" + local executor_memory + executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" + local task_cpus=2 + local gpu_amount + gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" + if ( version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ) ; then gpu_amount="0.5" ; fi + + cat >>"${spark_defaults_conf}" < "${nvsmi_query_xml}" +} + function nvsmi() { local nvsmi="/usr/bin/nvidia-smi" - if [[ "${nvsmi_works}" == "1" ]] ; then echo "nvidia-smi is working" >&2 + if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' elif [[ ! -f "${nvsmi}" ]] ; then echo "nvidia-smi not installed" >&2 ; return 0 elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 else nvsmi_works="1" ; fi @@ -1074,14 +1928,23 @@ function nvsmi() { "${nvsmi}" $* } -function install_dependencies() { +function install_build_dependencies() { + if test -f "${workdir}/complete/build-dependencies" ; then return ; fi + if is_debuntu ; then - execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen + if is_ubuntu22 && is_cuda12 ; then + # On ubuntu22, the default compiler does not build some kernel module versions + # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 + execute_with_retries apt-get install -y -qq gcc-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-12 + fi + elif is_rocky ; then - execute_with_retries dnf -y -q install pciutils gcc screen + execute_with_retries dnf -y -q install gcc local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" - local install_log="${tmpdir}/install.log" set +e eval "${dnf_cmd}" > "${install_log}" 2>&1 local retval="$?" @@ -1104,364 +1967,247 @@ function install_dependencies() { execute_with_retries "${dnf_cmd}" fi + touch "${workdir}/complete/build-dependencies" } -function main() { - # This configuration should be run on all nodes - # regardless if they have attached GPUs - configure_yarn - - # Detect NVIDIA GPU - if (lspci | grep -q NVIDIA); then - # if this is called without the MIG script then the drivers are not installed - migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" - if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts - fi - fi - fi - - # if mig is enabled drivers would have already been installed - if [[ $IS_MIG_ENABLED -eq 0 ]]; then - install_nvidia_gpu_driver +function prepare_gpu_env(){ + set +e + gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" + set -e + echo "gpu_count=[${gpu_count}]" + nvsmi_works="0" + nvsmi_query_xml="${tmpdir}/nvsmi.xml" + xmllint="/opt/conda/miniconda3/bin/xmllint" + NVIDIA_SMI_PATH='/usr/bin' + MIG_MAJOR_CAPS=0 + IS_MIG_ENABLED=0 + CUDNN_PKG_NAME="" + CUDNN8_PKG_NAME="" + CUDA_LOCAL_REPO_INSTALLED="0" - load_kernel_module + # Whether to install NVIDIA-provided or OS-provided GPU driver + GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') + readonly GPU_DRIVER_PROVIDER - if [[ -n ${CUDNN_VERSION} ]]; then - install_nvidia_nccl - install_nvidia_cudnn - fi - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - install_gpu_agent - echo 'GPU metrics agent successfully deployed.' - else - echo 'GPU metrics agent will not be installed.' - fi + # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver + INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') + readonly INSTALL_GPU_AGENT - # for some use cases, the kernel module needs to be removed before first use of nvidia-smi - for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" - done + # Verify SPARK compatability + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + readonly RAPIDS_RUNTIME - MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")" - if test -n "$(nvsmi -L)" ; then - # cache the result of the gpu query - ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') - echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt" - fi - NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")" - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - # enable MIG on every GPU - for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do - nvsmi -i "${GPU_ID}" --multi-instance-gpu 1 - done + set_cuda_version + set_driver_version + set_cuda_repo_shortname + set_nv_urls + set_cuda_runfile_url + set_cudnn_version + set_cudnn_tarball_url - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)" - fetch_mig_scripts - else - configure_gpu_exclusive_mode - fi - fi + if is_cuda11 ; then gcc_ver="11" + elif is_cuda12 ; then gcc_ver="12" ; fi +} - configure_yarn_nodemanager - configure_gpu_script - configure_gpu_isolation - elif [[ "${ROLE}" == "Master" ]]; then - configure_yarn_nodemanager - configure_gpu_script +# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades +# Users should run apt-mark unhold before they wish to upgrade these packages +function hold_nvidia_packages() { + apt-mark hold nvidia-* + apt-mark hold libnvidia-* + if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then + apt-mark hold xserver-xorg-video-nvidia* fi +} - # Restart YARN services if they are running already - if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then - systemctl restart hadoop-yarn-resourcemanager.service - fi - if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then - systemctl restart hadoop-yarn-nodemanager.service +function delete_mig_instances() ( + # delete all instances + set +e + nvidia-smi mig -dci + + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No compute instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac + + nvidia-smi mig -dgi + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No GPU instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac +) + +# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles +function configure_mig_cgi() { + delete_mig_instances + META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" + if test -n "${META_MIG_CGI_VALUE}"; then + nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C + else + # https://pci-ids.ucw.cz/v2.2/pci.ids + local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)" + if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then + # run the following command to list placement profiles + # nvidia-smi mig -lgipp + # + # This is the result when using H100 instances on 20241220 + # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 + # GPU 0 Profile ID 14 Placements: {0,2,4}:2 + # GPU 0 Profile ID 9 Placements: {0,4}:4 + # GPU 0 Profile ID 5 Placement : {0}:4 + # GPU 0 Profile ID 0 Placement : {0}:8 + + # For H100 3D controllers, consider profile 19, 7x1G instances + nvidia-smi mig -cgi 9,9 -C + elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then + # Dataproc only supports H100s right now ; split in 2 if not specified + # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances + nvidia-smi mig -cgi 9,9 -C + else + echo "unrecognized 3D controller" + fi fi + clear_nvsmi_cache } -function clean_up_sources_lists() { - # - # bigtop (primary) - # - local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" +function enable_mig() { + if test -f "${workdir}/complete/enable-mig" ; then return ; fi - if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then - region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" + # Start persistenced if it's not already running + if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi + for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do + # Write an ascii zero to the numa node indicator + echo "0" | dd of="${f}" status=none + done + time nvidia-smi --gpu-reset # 30s + nvidia-smi -mig 1 + clear_nvsmi_cache - local regional_bigtop_repo_uri - regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | - sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | - grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | - cut -d ' ' -f 2 | - head -1) + touch "${workdir}/complete/enable-mig" +} - if [[ "${regional_bigtop_repo_uri}" == */ ]]; then - local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" - else - local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" - fi +function enable_and_configure_mig() { + # default MIG to on when this script is used + META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") - local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" - rm -f "${bigtop_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ - "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" + if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi - sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" - sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" - fi + enable_mig - # - # adoptium - # - # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu - local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" - local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" - rm -f "${adoptium_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ - | gpg --dearmor -o "${adoptium_kr_path}" - echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ - > /etc/apt/sources.list.d/adoptium.list + xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' + query_nvsmi + mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" + if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs. Failing" ; exit 1 ; fi + if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured but NOT enabled. Failing" ; exit 1 ; fi - # - # docker - # - local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" - local docker_repo_file="/etc/apt/sources.list.d/docker.list" - local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" + echo "MIG is fully enabled" + configure_mig_cgi +} - rm -f "${docker_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ - | gpg --dearmor -o "${docker_kr_path}" - echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ - > ${docker_repo_file} +function setup_gpu_yarn() { + # This configuration should be run on all nodes + # regardless if they have attached GPUs + configure_yarn_resources - # - # google cloud + logging/monitoring - # - if ls /etc/apt/sources.list.d/google-cloud*.list ; then - rm -f /usr/share/keyrings/cloud.google.gpg - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - for list in google-cloud google-cloud-logging google-cloud-monitoring ; do - list_file="/etc/apt/sources.list.d/${list}.list" - if [[ -f "${list_file}" ]]; then - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" - fi - done + # When there is no GPU, but the installer is executing on a master node: + if [[ "${gpu_count}" == "0" ]] ; then + if [[ "${ROLE}" == "Master" ]]; then + configure_yarn_nodemanager + fi + return 0 fi - # - # cran-r - # - if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then - keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" - if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi - rm -f /usr/share/keyrings/cran-r.gpg - curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ - gpg --dearmor -o /usr/share/keyrings/cran-r.gpg - sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list - fi + # if this is called without the MIG script then the drivers are not installed + query_nvsmi + migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - # - # mysql - # - if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then - rm -f /usr/share/keyrings/mysql.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ - gpg --dearmor -o /usr/share/keyrings/mysql.gpg - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then + if (echo "${migquery_result}" | grep Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` + fetch_mig_scripts + fi + fi fi - if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi + # if mig is enabled drivers would have already been installed + if [[ $IS_MIG_ENABLED -eq 0 ]]; then + install_nvidia_gpu_driver + install_cuda + load_kernel_module + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent +# install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode + fi + install_nvidia_container_toolkit + configure_yarn_nodemanager_gpu + configure_gpu_script + configure_gpu_isolation } -function exit_handler() { - set +ex - echo "Exit handler invoked" - - # Purge private key material until next grant - clear_dkms_key - - # Clear pip cache - pip cache purge || echo "unable to purge pip cache" - - # If system memory was sufficient to mount memory-backed filesystems +function gpu_exit_handler() { if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to unset global pip cache" - - # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do - if grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ; then + for shmdir in /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then umount -f ${shmdir} fi done - - # restart services stopped during preparation stage - # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi +} - if is_debuntu ; then - # Clean up OS package cache - apt-get -y -qq clean - apt-get -y -qq autoremove - # re-hold systemd package - if ge_debian12 ; then - apt-mark hold systemd libsystemd0 ; fi - else - dnf clean all - fi - # print disk usage statistics for large components - if is_ubuntu ; then - du -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 | sort -h - elif is_debian ; then - du -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /usr/lib \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 | sort -h - else - du -hs \ - /var/lib/docker \ - /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ - /usr/lib64/google-cloud-sdk \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 - fi - - # Process disk usage logs from installation period - rm -f /run/keep-running-df - sync - sleep 5.01s - # compute maximum size of disk during installation - # Log file contains logs like the following (minus the preceeding #): -#Filesystem 1K-blocks Used Available Use% Mounted on -#/dev/vda2 7096908 2611344 4182932 39% / - df / | tee -a "/run/disk-usage.log" +function main() { + setup_gpu_yarn - perl -e '@siz=( sort { $a => $b } - map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; -print( " samples-taken: ", scalar @siz, $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" + echo "yarn setup complete" - echo "exit_handler has completed" + if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then + install_nvidia_nccl + install_nvidia_cudnn + fi - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - dd if=/dev/zero of=/zero - sync - sleep 3s - rm -f /zero + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + install_spark_rapids + configure_gpu_script + echo "RAPIDS initialized with Spark runtime" + elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then + # we are not currently tooled for installing dask in this action. + echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh" + else + echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" fi + echo "main complete" return 0 } -function set_proxy(){ - export METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy)" - export http_proxy="${METADATA_HTTP_PROXY}" - export https_proxy="${METADATA_HTTP_PROXY}" - export HTTP_PROXY="${METADATA_HTTP_PROXY}" - export HTTPS_PROXY="${METADATA_HTTP_PROXY}" - export no_proxy=metadata.google.internal,169.254.169.254 - export NO_PROXY=metadata.google.internal,169.254.169.254 -} - -function mount_ramdisk(){ - local free_mem - free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" - if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi - - # Write to a ramdisk instead of churning the persistent disk - - tmpdir="/mnt/shm" - mkdir -p "${tmpdir}" - mount -t tmpfs tmpfs "${tmpdir}" - - # Clear pip cache - # TODO: make this conditional on which OSs have pip without cache purge - pip cache purge || echo "unable to purge pip cache" - - # Download pip packages to tmpfs - pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" - - # Download OS packages to tmpfs - if is_debuntu ; then - mount -t tmpfs tmpfs /var/cache/apt/archives - else - mount -t tmpfs tmpfs /var/cache/dnf - fi +function exit_handler() { + gpu_exit_handler + common_exit_handler + return 0 } function prepare_to_install(){ - nvsmi_works="0" - readonly bdcfg="/usr/local/bin/bdconfig" - tmpdir=/tmp/ - if ! is_debuntu && ! is_rocky ; then - echo "Unsupported OS: '$(os_name)'" - exit 1 - fi - - repair_old_backports - - export DEBIAN_FRONTEND=noninteractive - + prepare_common_env + prepare_gpu_env trap exit_handler EXIT - mount_ramdisk - install_log="${tmpdir}/install.log" - - set_proxy - - if is_debuntu ; then - clean_up_sources_lists - apt-get update -qq - apt-get -y clean - sleep 5s - apt-get -y -qq autoremove - if ge_debian12 ; then - apt-mark unhold systemd libsystemd0 ; fi - else - dnf clean all - fi - - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e - time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero - ) fi - - configure_dkms_certs - - install_dependencies - - # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" - touch "/run/keep-running-df" - screen -d -m -US keep-running-df \ - bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" } prepare_to_install diff --git a/spark-rapids/mig.sh b/spark-rapids/mig.sh index 473513438..85300348d 100644 --- a/spark-rapids/mig.sh +++ b/spark-rapids/mig.sh @@ -13,7 +13,6 @@ # limitations under the License. # This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures. -# # This script should be specified in --metadata=startup-script-url= option and # --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it. # The script does a reboot to fully enable MIG and then configures the MIG device based on the @@ -22,2030 +21,370 @@ # It is assumed this script is used in conjuntion with install_gpu_driver.sh, which does the # YARN setup to fully utilize the MIG instances on YARN. # -# This initialization action is generated from -# initialization-actions/templates/spark-rapids/mig.sh.in -# -# Modifications made directly to the generated file will be lost when -# the template is re-evaluated - +# Much of this code is copied from install_gpu_driver.sh to do the driver and CUDA installation. +# It's copied in order to not affect the existing scripts when not using MIG. set -euxo pipefail -function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) -function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) -function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) - -function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; ) -function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) -function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) -function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) - -function define_os_comparison_functions() { - - readonly -A supported_os=( - ['debian']="10 11 12" - ['rocky']="8 9" - ['ubuntu']="18.04 20.04 22.04" - ) - - # dynamically define OS version test utility functions - if [[ "$(os_id)" == "rocky" ]]; - then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') - else _os_version="$(os_version)"; fi - for os_id_val in 'rocky' 'ubuntu' 'debian' ; do - eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" - - for osver in $(echo "${supported_os["${os_id_val}"]}") ; do - eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" - eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" - eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" - done - done -} - -define_os_comparison_functions - -function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) - -function os_vercat() ( set +x - if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' - elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' - else os_version ; fi ; ) - -function repair_old_backports { - if ! is_debuntu ; then return ; fi - # This script uses 'apt-get update' and is therefore potentially dependent on - # backports repositories which have been archived. In order to mitigate this - # problem, we will use archive.debian.org for the oldoldstable repo - - # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 - debdists="https://deb.debian.org/debian/dists" - oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); - oldstable=$( curl -s "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); - stable=$( curl -s "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); - - matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) - - for filename in "${matched_files[@]}"; do - # Fetch from archive.debian.org for ${oldoldstable}-backports - perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports } - {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}" - done -} - -function print_metadata_value() { - local readonly tmpfile=$(mktemp) - http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ - -s -o ${tmpfile} 2>/dev/null) - local readonly return_code=$? - # If the command completed successfully, print the metadata value to stdout. - if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then - cat ${tmpfile} - fi - rm -f ${tmpfile} - return ${return_code} -} - -function print_metadata_value_if_exists() { - local return_code=1 - local readonly url=$1 - print_metadata_value ${url} - return_code=$? - return ${return_code} -} - -# replicates /usr/share/google/get_metadata_value -function get_metadata_value() ( - set +x - local readonly varname=$1 - local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 - # Print the instance metadata value. - print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} - return_code=$? - # If the instance doesn't have the value, try the project. - if [[ ${return_code} != 0 ]]; then - print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} - return_code=$? - fi - - return ${return_code} -) - -function get_metadata_attribute() ( - set +x - local -r attribute_name="$1" - local -r default_value="${2:-}" - get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" -) +function get_metadata_attribute() { + local -r attribute_name=$1 + local -r default_value=$2 + /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" +} + +# Fetch Linux Family distro and Dataproc Image version +readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') +readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)" +DATAPROC_IMAGE_VERSION=$(/usr/share/google/get_metadata_value image|grep -Eo 'dataproc-[0-9]-[0-9]'|grep -Eo '[0-9]-[0-9]'|sed -e 's/-/./g') +echo "${DATAPROC_IMAGE_VERSION}" >> /usr/local/share/startup-mig-log + +# CUDA version and Driver version config +CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.2.2') #12.2.2 +NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '535.104.05') #535.104.05 +CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 + +# Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) +if [[ "${OS_NAME}" == "ubuntu" ]]; then + UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04 + UBUNTU_VERSION=${UBUNTU_VERSION%.*} + if [[ "${UBUNTU_VERSION}" == "18" ]]; then + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 + NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 + CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 + fi +fi -function execute_with_retries() ( - set +x - local -r cmd="$*" +SECURE_BOOT="disabled" +SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') - if [[ "$cmd" =~ "^apt-get install" ]] ; then - apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove - fi - for ((i = 0; i < 3; i++)); do - set -x - time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } - set +x - if [[ $retval == 0 ]] ; then return 0 ; fi +function execute_with_retries() { + local -r cmd=$1 + for ((i = 0; i < 10; i++)); do + if eval "$cmd"; then + return 0 + fi sleep 5 done return 1 -) - -function cache_fetched_package() { - local src_url="$1" - local gcs_fn="$2" - local local_fn="$3" - - if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then - time gcloud storage cp "${gcs_fn}" "${local_fn}" - else - time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ - gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) - fi } -function add_contrib_component() { - if ! is_debuntu ; then return ; fi - if ge_debian12 ; then - # Include in sources file components on which nvidia-kernel-open-dkms depends - local -r debian_sources="/etc/apt/sources.list.d/debian.sources" - local components="main contrib" - - sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" - elif is_debian ; then - sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list - fi -} +# Enables a systemd service on bootup to install new headers. +# This service recompiles kernel modules for Ubuntu and Debian, which are necessary for the functioning of nvidia-smi. +function setup_systemd_update_headers() { + cat </lib/systemd/system/install-headers.service +[Unit] +Description=Install Linux headers for the current kernel +After=network-online.target -function set_hadoop_property() { - local -r config_file=$1 - local -r property=$2 - local -r value=$3 - "${bdcfg}" set_property \ - --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ - --name "${property}" --value "${value}" \ - --clobber -} +[Service] +ExecStart=/bin/bash -c 'count=0; while [ \$count -lt 3 ]; do /usr/bin/apt-get install -y -q linux-headers-\$(/bin/uname -r) && break; count=\$((count+1)); sleep 5; done' +Type=oneshot +RemainAfterExit=yes -function configure_yarn_resources() { - if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts - if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' +[Install] +WantedBy=multi-user.target +EOF - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + # Reload systemd to recognize the new unit file + systemctl daemon-reload - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' + # Enable and start the service + systemctl enable --now install-headers.service } -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' +# Install NVIDIA GPU driver provided by NVIDIA +function install_nvidia_gpu_driver() { - # Fix local dirs access permissions - local yarn_local_dirs=() + ## common steps for all linux family distros + readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*} - readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ - --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ - --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') + ## installation steps based OS_NAME + if [[ ${OS_NAME} == "debian" ]]; then - if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" - fi -} + DEBIAN_VERSION=$(lsb_release -r|awk '{print $2}') # 10 or 11 + export DEBIAN_FRONTEND=noninteractive -function clean_up_sources_lists() { - # - # bigtop (primary) - # - local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" + execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then - region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" + readonly LOCAL_INSTALLER_DEB="cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb" + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb - local regional_bigtop_repo_uri - regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | - sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | - grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | - cut -d ' ' -f 2 | - head -1) + dpkg -i /tmp/local-installer.deb + cp /var/cuda-repo-debian${DEBIAN_VERSION}-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/ + add-apt-repository contrib + execute_with_retries "apt-get update" - if [[ "${regional_bigtop_repo_uri}" == */ ]]; then - local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" - else - local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" + if [[ ${DEBIAN_VERSION} == 10 ]]; then + apt remove -y libglvnd0 fi - local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" - rm -f "${bigtop_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ - "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" - - sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" - sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" - fi - - # - # adoptium - # - # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu - local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" - local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" - rm -f "${adoptium_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ - | gpg --dearmor -o "${adoptium_kr_path}" - echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ - > /etc/apt/sources.list.d/adoptium.list - - - # - # docker - # - local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" - local docker_repo_file="/etc/apt/sources.list.d/docker.list" - local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" - - rm -f "${docker_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ - | gpg --dearmor -o "${docker_kr_path}" - echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ - > ${docker_repo_file} - - # - # google cloud + logging/monitoring - # - if ls /etc/apt/sources.list.d/google-cloud*.list ; then - rm -f /usr/share/keyrings/cloud.google.gpg - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - for list in google-cloud google-cloud-logging google-cloud-monitoring ; do - list_file="/etc/apt/sources.list.d/${list}.list" - if [[ -f "${list_file}" ]]; then - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" - fi - done - fi - - # - # cran-r - # - if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then - keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" - if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi - rm -f /usr/share/keyrings/cran-r.gpg - curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ - gpg --dearmor -o /usr/share/keyrings/cran-r.gpg - sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list - fi - - # - # mysql - # - if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then - rm -f /usr/share/keyrings/mysql.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ - gpg --dearmor -o /usr/share/keyrings/mysql.gpg - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list - fi - - if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi + execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" + execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" -} - -function set_proxy(){ - METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" + # enable a systemd service that updates kernel headers after reboot + setup_systemd_update_headers + + elif [[ ${OS_NAME} == "ubuntu" ]]; then - if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi + UBUNTU_VERSION=$(lsb_release -r|awk '{print $2}') # 20.04 or 22.04 + UBUNTU_VERSION=${UBUNTU_VERSION%.*} # 20 or 22 - export METADATA_HTTP_PROXY - export http_proxy="${METADATA_HTTP_PROXY}" - export https_proxy="${METADATA_HTTP_PROXY}" - export HTTP_PROXY="${METADATA_HTTP_PROXY}" - export HTTPS_PROXY="${METADATA_HTTP_PROXY}" - no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" - local no_proxy_svc - for no_proxy_svc in compute secretmanager dns servicedirectory logging \ - bigquery composer pubsub bigquerydatatransfer dataflow \ - storage datafusion ; do - no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" - done + execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - export NO_PROXY="${no_proxy}" -} + readonly UBUNTU_REPO_CUDA_PIN="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-ubuntu${UBUNTU_VERSION}04.pin" + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 -function mount_ramdisk(){ - local free_mem - free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" - if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + readonly LOCAL_INSTALLER_DEB="cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb" + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" -o /tmp/local-installer.deb - # Write to a ramdisk instead of churning the persistent disk + dpkg -i /tmp/local-installer.deb + cp /var/cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/ + execute_with_retries "apt-get update" + + execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" + execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" - tmpdir="/mnt/shm" - mkdir -p "${tmpdir}" - mount -t tmpfs tmpfs "${tmpdir}" + # enable a systemd service that updates kernel headers after reboot + setup_systemd_update_headers - # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" + elif [[ ${OS_NAME} == "rocky" ]]; then - # Clear pip cache - # TODO: make this conditional on which OSs have pip without cache purge - pip cache purge || echo "unable to purge pip cache" + ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8.8 or 9.1 + ROCKY_VERSION=${ROCKY_VERSION%.*} # 8 or 9 - # Download pip packages to tmpfs - pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + readonly NVIDIA_ROCKY_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/rhel${ROCKY_VERSION}/x86_64/cuda-rhel${ROCKY_VERSION}.repo" + execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" + execute_with_retries "dnf clean all" + execute_with_retries "dnf -y -q module install nvidia-driver:${NVIDIA_DRIVER_VERSION_PREFIX}" + execute_with_retries "dnf -y -q install cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" + modprobe nvidia - # Download OS packages to tmpfs - if is_debuntu ; then - mount -t tmpfs tmpfs /var/cache/apt/archives else - mount -t tmpfs tmpfs /var/cache/dnf - fi -} - -function check_os() { - if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then - echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." - exit 1 - elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then - echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." - exit 1 - elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then - echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." - exit 1 - fi - - SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - readonly SPARK_VERSION - if version_lt "${SPARK_VERSION}" "3.1" || \ - version_ge "${SPARK_VERSION}" "4.0" ; then - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + echo "Unsupported OS: '${OS_NAME}'" exit 1 fi - - # Detect dataproc image version - if (! test -v DATAPROC_IMAGE_VERSION) ; then - if test -v DATAPROC_VERSION ; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" - else - if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" - elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" - elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" - else echo "Unknown dataproc image version" ; exit 1 ; fi - fi - fi -} - -# -# Generate repo file under /etc/apt/sources.list.d/ -# -function apt_add_repo() { - local -r repo_name="$1" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" - local -r include_src="${4:-yes}" - local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" - - echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" - if [[ "${include_src}" == "yes" ]] ; then - echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" - fi - - apt-get update -qq -} - -# -# Generate repo file under /etc/yum.repos.d/ -# -function dnf_add_repo() { - local -r repo_name="$1" - local -r repo_url="$3" # "http(s)://host/path/filename.repo" - local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" - - curl -s -L "${repo_url}" \ - | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ - | dd of="${repo_path}" status=progress -} - -# -# Install package signing key and add corresponding repository -# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html -# -# Keyrings default to -# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or -# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) -# -function os_add_repo() { - local -r repo_name="$1" - local -r signing_key_url="$2" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" - local kr_path - if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi - - mkdir -p "$(dirname "${kr_path}")" - - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ - | gpg --import --no-default-keyring --keyring "${kr_path}" - - if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" - else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi -} - - -readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" - -# Dataproc configurations -readonly HADOOP_CONF_DIR='/etc/hadoop/conf' -readonly HIVE_CONF_DIR='/etc/hive/conf' -readonly SPARK_CONF_DIR='/etc/spark/conf' - - -function set_support_matrix() { - # CUDA version and Driver version - # https://docs.nvidia.com/deploy/cuda-compatibility/ - # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html - # https://developer.nvidia.com/cuda-downloads - - # Minimum supported version for open kernel driver is 515.43.04 - # https://github.com/NVIDIA/open-gpu-kernel-modules/tags - # Rocky8: 12.0: 525.147.05 - local latest - latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" - readonly -A DRIVER_FOR_CUDA=( - ["11.7"]="515.65.01" ["11.8"]="525.147.05" - ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" - ) - readonly -A DRIVER_SUBVER=( - ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" - ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" - ) - # https://developer.nvidia.com/cudnn-downloads - if is_debuntu ; then - readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" - ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" - ) - elif is_rocky ; then - # rocky: - # 12.0: 8.8.1.3 - # 12.1: 8.9.3.28 - # 12.2: 8.9.7.29 - # 12.3: 9.0.0.312 - # 12.4: 9.1.1.17 - # 12.5: 9.2.1.18 - # 12.6: 9.5.1.17 - readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" - ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" - ) - fi - # https://developer.nvidia.com/nccl/nccl-download - # 12.2: 2.19.3, 12.5: 2.21.5 - readonly -A NCCL_FOR_CUDA=( - ["11.7"]="2.21.5" ["11.8"]="2.21.5" - ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" - ) - readonly -A CUDA_SUBVER=( - ["11.7"]="11.7.1" ["11.8"]="11.8.0" - ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" - ) -} - -set_support_matrix - -function set_cuda_version() { - local cuda_url - cuda_url=$(get_metadata_attribute 'cuda-url' '') - if [[ -n "${cuda_url}" ]] ; then - # if cuda-url metadata variable has been passed, extract default version from url - local CUDA_URL_VERSION - CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" - if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then - DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" - CUDA_FULL_VERSION="${CUDA_URL_VERSION}" - fi - fi - - if ( ! test -v DEFAULT_CUDA_VERSION ) ; then - DEFAULT_CUDA_VERSION='12.4' - fi - readonly DEFAULT_CUDA_VERSION - - CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") - if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then - CUDA_FULL_VERSION="${CUDA_VERSION}" - CUDA_VERSION="${CUDA_VERSION%.*}" - fi - readonly CUDA_VERSION - if ( ! test -v CUDA_FULL_VERSION ) ; then - CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} - fi - readonly CUDA_FULL_VERSION - + ldconfig + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" } -set_cuda_version - -function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) -function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) -function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; ) - -function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) -function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; ) -function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; ) - -function set_driver_version() { - local gpu_driver_url - gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '') - - local cuda_url - cuda_url=$(get_metadata_attribute 'cuda-url' '') - - local DEFAULT_DRIVER - # Take default from gpu-driver-url metadata value - if [[ -n "${gpu_driver_url}" ]] ; then - DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')" - if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi - # Take default from cuda-url metadata value as a backup - elif [[ -n "${cuda_url}" ]] ; then - local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')" - if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then - major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" - driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} - if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then - # use the version indicated by the cuda url as the default if it exists - DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" - elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then - # use the maximum sub-version available for the major version indicated in cuda url as the default - DEFAULT_DRIVER="${driver_max_maj_version}" - fi - fi - fi - - if ( ! test -v DEFAULT_DRIVER ) ; then - # If a default driver version has not been extracted, use the default for this version of CUDA - DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} - fi - - DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") - - readonly DRIVER_VERSION - readonly DRIVER="${DRIVER_VERSION%%.*}" - - export DRIVER_VERSION DRIVER - - gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then - echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" - exit 1 - fi +function enable_mig() { + nvidia-smi -mig 1 } -set_driver_version - -readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" -readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" - -# Parameters for NVIDIA-provided cuDNN library -readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} -CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") -function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) -function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) -# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} -if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" -elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then - # cuDNN v8 is not distribution for ubuntu20+, debian12 - CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" -elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then - # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 - CUDNN_VERSION="8.8.0.121" -fi -readonly CUDNN_VERSION - -readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} -readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) - -# Parameters for NVIDIA-provided Debian GPU driver -readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - -readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") - -USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" -readonly USERSPACE_FILENAME - -# Short name for urls -if is_ubuntu22 ; then - # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at - # https://developer.download.nvidia.com/compute/machine-learning/repos/ - # use packages from previous release until such time as nvidia - # release ubuntu2204 builds - - shortname="$(os_id)$(os_vercat)" - nccl_shortname="ubuntu2004" -elif ge_rocky9 ; then - # use packages from previous release until such time as nvidia - # release rhel9 builds - - shortname="rhel9" - nccl_shortname="rhel8" -elif is_rocky ; then - shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" - nccl_shortname="${shortname}" -else - shortname="$(os_id)$(os_vercat)" - nccl_shortname="${shortname}" -fi - -# Parameters for NVIDIA-provided package repositories -readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' -readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" - -# Parameters for NVIDIA-provided NCCL library -readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb" -NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") -readonly NCCL_REPO_URL -readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub - -function set_cuda_runfile_url() { - local MAX_DRIVER_VERSION - local MAX_CUDA_VERSION - - local MIN_OPEN_DRIVER_VER="515.48.07" - local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" - local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER - - if is_cuda12 ; then - if is_debian12 ; then - MIN_DRIVER_VERSION="545.23.06" - MIN_CUDA_VERSION="12.3.0" - elif is_debian10 ; then - MAX_DRIVER_VERSION="555.42.02" - MAX_CUDA_VERSION="12.5.0" - elif is_ubuntu18 ; then - MAX_DRIVER_VERSION="530.30.02" - MAX_CUDA_VERSION="12.1.1" - fi - elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then - if le_debian10 ; then - # cuda 11 is not supported for <= debian10 - MAX_CUDA_VERSION="0" - MAX_DRIVER_VERSION="0" - fi +function configure_mig_cgi() { + if (/usr/share/google/get_metadata_value attributes/MIG_CGI); then + META_MIG_CGI_VALUE=$(/usr/share/google/get_metadata_value attributes/MIG_CGI) + nvidia-smi mig -cgi $META_MIG_CGI_VALUE -C else - echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" - fi - - if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then - echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" - elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then - echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}" - fi - if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then - echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" - elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then - echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" - fi - - # driver version named in cuda runfile filename - # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) - readonly -A drv_for_cuda=( - ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" - ["11.8.0"]="520.61.05" - ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" - ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" - ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" - ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" - ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ - ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not - ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" - ) - - # Verify that the file with the indicated combination exists - local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]} - CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run" - local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" - local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" - - NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") - readonly NVIDIA_CUDA_URL - - CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" - readonly CUDA_RUNFILE - - if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then - echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" - exit 1 - fi - - if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then - echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" - elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then - echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" - elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then - echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" - elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then - echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" - fi -} - -set_cuda_runfile_url - -# Parameter for NVIDIA-provided Rocky Linux GPU driver -readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" - -CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" -CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" -if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then - # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" - if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then - # When cuDNN version is greater than or equal to 8.4.1.50 use this format - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" - fi - # Use legacy url format with one of the tarball name formats depending on version as above - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" -fi -if ( version_ge "${CUDA_VERSION}" "12.0" ); then - # Use modern url format When cuda version is greater than or equal to 12.0 - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" -fi -readonly CUDNN_TARBALL -readonly CUDNN_TARBALL_URL - -# Whether to install NVIDIA-provided or OS-provided GPU driver -GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') -readonly GPU_DRIVER_PROVIDER - -# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver -INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') -readonly INSTALL_GPU_AGENT - -NVIDIA_SMI_PATH='/usr/bin' -MIG_MAJOR_CAPS=0 -IS_MIG_ENABLED=0 - -CUDA_KEYRING_PKG_INSTALLED="0" -function install_cuda_keyring_pkg() { - if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi - local kr_ver=1.1 - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ - -o "${tmpdir}/cuda-keyring.deb" - dpkg -i "${tmpdir}/cuda-keyring.deb" - rm -f "${tmpdir}/cuda-keyring.deb" - CUDA_KEYRING_PKG_INSTALLED="1" -} - -function uninstall_cuda_keyring_pkg() { - apt-get purge -yq cuda-keyring - CUDA_KEYRING_PKG_INSTALLED="0" -} - -function install_local_cuda_repo() { - if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi - - if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi - CUDA_LOCAL_REPO_INSTALLED="1" - pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" - CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" - readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" - readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" - readonly DIST_KEYRING_DIR="/var/${pkgname}" - - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" - - dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" - rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" - cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ - - if is_ubuntu ; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ - -o /etc/apt/preferences.d/cuda-repository-pin-600 - fi - - touch "${workdir}/install-local-cuda-repo-complete" -} -function uninstall_local_cuda_repo(){ - apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" - rm -f "${workdir}/install-local-cuda-repo-complete" -} - -CUDNN_PKG_NAME="" -function install_local_cudnn_repo() { - if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi - pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" - CUDNN_PKG_NAME="${pkgname}" - local_deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" - - # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${local_deb_url}" -o "${tmpdir}/local-installer.deb" - - dpkg -i "${tmpdir}/local-installer.deb" - - rm -f "${tmpdir}/local-installer.deb" - - cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - - touch "${workdir}/install-local-cudnn-repo-complete" -} - -function uninstall_local_cudnn_repo() { - apt-get purge -yq "${CUDNN_PKG_NAME}" - rm -f "${workdir}/install-local-cudnn-repo-complete" -} - -CUDNN8_LOCAL_REPO_INSTALLED="0" -CUDNN8_PKG_NAME="" -function install_local_cudnn8_repo() { - if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi - - if is_ubuntu ; then cudnn8_shortname="ubuntu2004" - elif is_debian ; then cudnn8_shortname="debian11" - else return 0 ; fi - if is_cuda12 ; then CUDNN8_CUDA_VER=12.0 - elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8 - else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi - cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" - - pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" - CUDNN8_PKG_NAME="${pkgname}" - - deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_fn="${tmpdir}/${deb_fn}" - local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" - - # cache the cudnn package - cache_fetched_package "${local_deb_url}" \ - "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ - "${local_deb_fn}" - - local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" - # If we are using a ram disk, mount another where we will unpack the cudnn local installer - if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then - mkdir -p "${cudnn_path}" - mount -t tmpfs tmpfs "${cudnn_path}" - fi - - dpkg -i "${local_deb_fn}" - - rm -f "${local_deb_fn}" - - cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/install-local-cudnn8-repo-complete" -} - -function uninstall_local_cudnn8_repo() { - apt-get purge -yq "${CUDNN8_PKG_NAME}" - rm -f "${workdir}/install-local-cudnn8-repo-complete" -} - -function install_nvidia_nccl() { - if test -f "${workdir}/nccl-complete" ; then return ; fi - - if is_cuda11 && is_debian12 ; then - echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" - return - fi - - local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" - - # https://github.com/NVIDIA/nccl/blob/master/README.md - # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Fermi: SM_20, compute_30 - # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 - # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 - # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 - - # The following architectures are suppored by open kernel driver - # Volta: SM_70,SM_72, compute_70,compute_72 - # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 - - # The following architectures are supported by CUDA v11.8+ - # Ada: SM_89, compute_89 - # Hopper: SM_90,SM_90a compute_90,compute_90a - # Blackwell: SM_100, compute_100 - NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" - if version_ge "${CUDA_VERSION}" "11.8" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" - fi - if version_ge "${CUDA_VERSION}" "12.0" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" - fi - - mkdir -p "${workdir}" - pushd "${workdir}" - - test -d "${workdir}/nccl" || { - local tarball_fn="v${NCCL_VERSION}-1.tar.gz" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ - | tar xz - mv "nccl-${NCCL_VERSION}-1" nccl - } - - local build_path - if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else - build_path="nccl/build/pkg/rpm/x86_64" ; fi - - test -d "${workdir}/nccl/build" || { - local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" - - output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') - if echo "${output}" | grep -q "${gcs_tarball}" ; then - # cache hit - unpack from cache - echo "cache hit" - else - # build and cache - pushd nccl - # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install - install_build_dependencies - if is_debuntu ; then - # These packages are required to build .deb packages from source - execute_with_retries \ - apt-get install -y -qq build-essential devscripts debhelper fakeroot - export NVCC_GENCODE - execute_with_retries make -j$(nproc) pkg.debian.build - elif is_rocky ; then - # These packages are required to build .rpm packages from source - execute_with_retries \ - dnf -y -q install rpm-build rpmdevtools - export NVCC_GENCODE - execute_with_retries make -j$(nproc) pkg.redhat.build - fi - tar czvf "/${local_tarball}" "../${build_path}" - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - rm "${local_tarball}" - make clean - popd - fi - gcloud storage cat "${gcs_tarball}" | tar xz - } - - if is_debuntu ; then - dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb" - elif is_rocky ; then - rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm" + # Dataproc only supports A100's right now split in 2 if not specified + nvidia-smi mig -cgi 9,9 -C fi - - popd - touch "${workdir}/nccl-complete" } -function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) -function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) - -function install_nvidia_cudnn() { - if test -f "${workdir}/cudnn-complete" ; then return ; fi - local major_version - major_version="${CUDNN_VERSION%%.*}" - local cudnn_pkg_version - cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" - - if is_rocky ; then - if is_cudnn8 ; then - execute_with_retries dnf -y -q install \ - "libcudnn${major_version}" \ - "libcudnn${major_version}-devel" - sync - elif is_cudnn9 ; then - execute_with_retries dnf -y -q install \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" - sync - else - echo "Unsupported cudnn version: '${major_version}'" - fi - elif is_debuntu; then - if ge_debian12 && is_src_os ; then - apt-get -y install nvidia-cudnn - else - if is_cudnn8 ; then - install_local_cudnn8_repo - - apt-get update -qq - - execute_with_retries \ - apt-get -y install --no-install-recommends \ - "libcudnn8=${cudnn_pkg_version}" \ - "libcudnn8-dev=${cudnn_pkg_version}" - - uninstall_local_cudnn8_repo - sync - elif is_cudnn9 ; then - install_cuda_keyring_pkg - - apt-get update -qq - - execute_with_retries \ - apt-get -y install --no-install-recommends \ - "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" - sync - else - echo "Unsupported cudnn version: [${CUDNN_VERSION}]" - fi - fi +function upgrade_kernel() { + # Determine which kernel is installed + if [[ "${OS_NAME}" == "debian" ]]; then + CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( / Debian (\S+) / )'` + elif [[ "${OS_NAME}" == "ubuntu" ]]; then + CURRENT_KERNEL_VERSION=`cat /proc/version | perl -ne 'print( /^Linux version (\S+) / )'` + elif [[ ${OS_NAME} == rocky ]]; then + KERN_VER=$(yum info --installed kernel | awk '/^Version/ {print $3}') + KERN_REL=$(yum info --installed kernel | awk '/^Release/ {print $3}') + CURRENT_KERNEL_VERSION="${KERN_VER}-${KERN_REL}" else - echo "Unsupported OS: '${_shortname}'" - exit 1 - fi - - ldconfig - - echo "NVIDIA cuDNN successfully installed for ${_shortname}." - touch "${workdir}/cudnn-complete" -} - -function add_nonfree_components() { - if is_src_nvidia ; then return; fi - if ge_debian12 ; then - # Include in sources file components on which nvidia-open-kernel-dkms depends - local -r debian_sources="/etc/apt/sources.list.d/debian.sources" - local components="main contrib non-free non-free-firmware" - - sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" - elif is_debian ; then - sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list - fi -} - -function add_repo_nvidia_container_toolkit() { - local nvctk_root="https://nvidia.github.io/libnvidia-container" - local signing_key_url="${nvctk_root}/gpgkey" - local repo_data - - if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" - else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi - - os_add_repo nvidia-container-toolkit \ - "${signing_key_url}" \ - "${repo_data}" \ - "no" -} - -function add_repo_cuda() { - if is_debuntu ; then - install_cuda_keyring_pkg # 11.7+, 12.0+ - elif is_rocky ; then - execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" - fi -} - -function build_driver_from_github() { - # non-GPL driver will have been built on rocky8 - if is_rocky8 ; then return 0 ; fi - pushd "${workdir}" - - test -d "${workdir}/open-gpu-kernel-modules" || { - local tarball_fn="${DRIVER_VERSION}.tar.gz" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ - | tar xz - mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules - } - - local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" - test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { - local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local build_dir - if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] - then build_dir="${modulus_md5sum}" - else build_dir="unsigned" ; fi - - local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" - - if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then - echo "cache hit" - else - # build the kernel modules - pushd open-gpu-kernel-modules - install_build_dependencies - if is_cuda11 && is_ubuntu22 ; then - echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" - exit 1 - fi - execute_with_retries make -j$(nproc) modules \ - > kernel-open/build.log \ - 2> kernel-open/build_error.log - # Sign kernel modules - if [[ -n "${PSN}" ]]; then - for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do - "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ - "${mok_key}" \ - "${mok_der}" \ - "${module}" - done - fi - make modules_install \ - >> kernel-open/build.log \ - 2>> kernel-open/build_error.log - # Collect build logs and installed binaries - tar czvf "${local_tarball}" \ - "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ - $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - rm "${local_tarball}" - make clean - popd - fi - gcloud storage cat "${gcs_tarball}" | tar -C / -xzv - depmod -a - } - - popd -} - -function build_driver_from_packages() { - if is_debuntu ; then - if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then - local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else - local pkglist=("nvidia-driver-${DRIVER}-open") ; fi - if is_debian ; then - pkglist=( - "firmware-nvidia-gsp=${DRIVER_VERSION}-1" - "nvidia-smi=${DRIVER_VERSION}-1" - "nvidia-alternative=${DRIVER_VERSION}-1" - "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1" - "nvidia-kernel-support=${DRIVER_VERSION}-1" - "nvidia-modprobe=${DRIVER_VERSION}-1" - "libnvidia-ml1=${DRIVER_VERSION}-1" - ) - fi - add_contrib_component - apt-get update -qq - execute_with_retries apt-get install -y -qq --no-install-recommends dkms - #configure_dkms_certs - execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" - sync - - elif is_rocky ; then - #configure_dkms_certs - if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then - echo "nvidia-driver:${DRIVER}-dkms installed successfully" + echo "unsupported OS: ${OS_NAME}!" + exit -1 + fi + + # Get latest version available in repos + if [[ "${OS_NAME}" == "debian" ]]; then + apt-get -qq update + TARGET_VERSION=$(apt-cache show --no-all-versions linux-image-amd64 | awk '/^Version/ {print $2}') + elif [[ "${OS_NAME}" == "ubuntu" ]]; then + apt-get -qq update + LATEST_VERSION=$(apt-cache show --no-all-versions linux-image-gcp | awk '/^Version/ {print $2}') + TARGET_VERSION=`echo ${LATEST_VERSION} | perl -ne 'printf(q{%s-%s-gcp},/(\d+\.\d+\.\d+)\.(\d+)/)'` + elif [[ "${OS_NAME}" == "rocky" ]]; then + if yum info --available kernel ; then + KERN_VER=$(yum info --available kernel | awk '/^Version/ {print $3}') + KERN_REL=$(yum info --available kernel | awk '/^Release/ {print $3}') + TARGET_VERSION="${KERN_VER}-${KERN_REL}" else - execute_with_retries dnf -y -q module install 'nvidia-driver:latest' + TARGET_VERSION="${CURRENT_KERNEL_VERSION}" fi - sync - fi - #clear_dkms_key -} - -function install_nvidia_userspace_runfile() { - - # This .run file contains NV's OpenGL implementation as well as - # nvidia optimized implementations of the gtk+ 2,3 stack(s) not - # including glib (https://docs.gtk.org/glib/), and what appears to - # be a copy of the source from the kernel-open directory of for - # example DRIVER_VERSION=560.35.03 - # - # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz - # - # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run - # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. - if test -f "${workdir}/userspace-complete" ; then return ; fi - local local_fn="${tmpdir}/userspace.run" - - cache_fetched_package "${USERSPACE_URL}" \ - "${pkg_bucket}/${USERSPACE_FILENAME}" \ - "${local_fn}" - - local runfile_args - runfile_args="" - local cache_hit="0" - local local_tarball - - if is_rocky8 ; then - local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" - test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { - local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" - local_tarball="${workdir}/${build_tarball}" - local build_dir - if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] - then build_dir="${modulus_md5sum}" - else build_dir="unsigned" ; fi - - local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" - - if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then - cache_hit="1" - runfile_args="--no-kernel-modules" - echo "cache hit" - else - install_build_dependencies - - local signing_options - signing_options="" - if [[ -n "${PSN}" ]]; then - signing_options="--module-signing-hash sha256 \ - --module-signing-x509-hash sha256 \ - --module-signing-secret-key \"${mok_key}\" \ - --module-signing-public-key \"${mok_der}\" \ - --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ - " - fi - - runfile_args="--no-dkms ${signing_options}" - fi - } - else - runfile_args="--no-kernel-modules" fi - execute_with_retries bash "${local_fn}" -e -q \ - ${runfile_args} \ - --ui=none \ - --install-libglvnd \ - --tmpdir="${tmpdir}" + # Skip this script if we are already on the target version + if [[ "${CURRENT_KERNEL_VERSION}" == "${TARGET_VERSION}" ]]; then + echo "target kernel version [${TARGET_VERSION}] is installed" - if is_rocky8 ; then - if [[ "${cache_hit}" == "1" ]] ; then - gcloud storage cat "${gcs_tarball}" | tar -C / -xzv - depmod -a - else - tar czvf "${local_tarball}" \ - /var/log/nvidia-installer.log \ - $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - gcloud storage cp "${local_tarball}" "${gcs_tarball}" + # Reboot may have interrupted dpkg. Bring package system to a good state + if [[ "${OS_NAME}" == "debian" || "${OS_NAME}" == "ubuntu" ]]; then + dpkg --configure -a fi - fi - - rm -f "${local_fn}" - touch "${workdir}/userspace-complete" - sync -} - -function install_cuda_runfile() { - if test -f "${workdir}/cuda-complete" ; then return ; fi - local local_fn="${tmpdir}/cuda.run" - - cache_fetched_package "${NVIDIA_CUDA_URL}" \ - "${pkg_bucket}/${CUDA_RUNFILE}" \ - "${local_fn}" - - execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" - rm -f "${local_fn}" - touch "${workdir}/cuda-complete" - sync -} - -function install_cuda_toolkit() { - local cudatk_package=cuda-toolkit - if ge_debian12 && is_src_os ; then - cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1" - elif [[ -n "${CUDA_VERSION}" ]]; then - cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}" - fi - cuda_package="cuda=${CUDA_FULL_VERSION}-1" - readonly cudatk_package - if is_debuntu ; then -# if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi - execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} - elif is_rocky ; then - # rocky9: cuda-11-[7,8], cuda-12-[1..6] - execute_with_retries dnf -y -q install "${cudatk_package}" - fi - sync -} - -function load_kernel_module() { - # for some use cases, the kernel module needs to be removed before first use of nvidia-smi - for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" - done - - depmod -a - modprobe nvidia - for suffix in uvm modeset drm; do - modprobe "nvidia-${suffix}" - done - # TODO: if peermem is available, also modprobe nvidia-peermem -} - -function install_cuda(){ - if test -f "${workdir}/cuda-repo-complete" ; then return ; fi - - if ( ge_debian12 && is_src_os ) ; then - echo "installed with the driver on ${_shortname}" - return 0 - fi - - # The OS package distributions are unreliable - install_cuda_runfile - - # Includes CUDA packages - add_repo_cuda - - touch "${workdir}/cuda-repo-complete" -} - -function install_nvidia_container_toolkit() { - local container_runtime_default - if command -v docker ; then container_runtime_default='docker' - elif command -v containerd ; then container_runtime_default='containerd' - elif command -v crio ; then container_runtime_default='crio' - else container_runtime_default='' ; fi - CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}") - - if test -z "${CONTAINER_RUNTIME}" ; then return ; fi - - add_repo_nvidia_container_toolkit - if is_debuntu ; then - execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else - execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi - nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" - systemctl restart "${CONTAINER_RUNTIME}" -} - -# Install NVIDIA GPU driver provided by NVIDIA -function install_nvidia_gpu_driver() { - if test -f "${workdir}/gpu-driver-complete" ; then return ; fi - if ( ge_debian12 && is_src_os ) ; then - add_nonfree_components - apt-get update -qq - apt-get -yq install \ - dkms \ - nvidia-open-kernel-dkms \ - nvidia-open-kernel-support \ - nvidia-smi \ - libglvnd0 \ - libcuda1 - echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully" return 0 fi - # OS driver packages do not produce reliable driver ; use runfile - install_nvidia_userspace_runfile - - build_driver_from_github - - echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" - touch "${workdir}/gpu-driver-complete" -} - -function install_ops_agent(){ - if test -f "${workdir}/ops-agent-complete" ; then return ; fi - - mkdir -p /opt/google - cd /opt/google - # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation - curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh - execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install - - touch "${workdir}/ops-agent-complete" -} - -# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics -function install_gpu_agent() { - # Stackdriver GPU agent parameters -# local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' - local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics' - if ( ! command -v pip && is_debuntu ) ; then - execute_with_retries "apt-get install -y -qq python3-pip" - fi - local install_dir=/opt/gpu-utilization-agent - mkdir -p "${install_dir}" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ - | sed -e 's/-u --format=/--format=/' \ - | dd status=none of="${install_dir}/report_gpu_metrics.py" - local venv="${install_dir}/venv" - python3 -m venv "${venv}" -( - source "${venv}/bin/activate" - python3 -m pip install --upgrade pip - execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" -) - sync - - # Generate GPU service. - cat </lib/systemd/system/gpu-utilization-agent.service -[Unit] -Description=GPU Utilization Metric Agent - -[Service] -Type=simple -PIDFile=/run/gpu_agent.pid -ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' -User=root -Group=root -WorkingDirectory=/ -Restart=always - -[Install] -WantedBy=multi-user.target -EOF - # Reload systemd manager configuration - systemctl daemon-reload - # Enable gpu-utilization-agent service - systemctl --no-reload --now enable gpu-utilization-agent.service -} - -function configure_gpu_exclusive_mode() { - # check if running spark 3, if not, enable GPU exclusive mode - local spark_version - spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) - if [[ ${spark_version} != 3.* ]]; then - # include exclusive mode on GPU - nvidia-smi -c EXCLUSIVE_PROCESS - fi -} - -function fetch_mig_scripts() { - mkdir -p /usr/local/yarn-mig-scripts - sudo chmod 755 /usr/local/yarn-mig-scripts - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - sudo chmod 755 /usr/local/yarn-mig-scripts/* -} - -function configure_gpu_script() { - # Download GPU discovery script - local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' - mkdir -p ${spark_gpu_script_dir} - # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still - # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: - # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh - local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh" - cat > "${gpus_resources_script}" <<'EOF' -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') - -echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} -EOF - - chmod a+rx "${gpus_resources_script}" - - local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" - if version_ge "${SPARK_VERSION}" "3.0" ; then - local gpu_count - gpu_count="$(lspci | grep NVIDIA | wc -l)" - local executor_cores - executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" - local executor_memory - executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" - local task_cpus=2 - local gpu_amount - gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" - - cat >>"${spark_defaults_conf}" <> "${HADOOP_CONF_DIR}/container-executor.cfg" - printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" - printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" - else - printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg" + # Install the latest kernel + if [[ ${OS_NAME} == debian ]]; then + apt-get install -y linux-image-amd64 + elif [[ "${OS_NAME}" == "ubuntu" ]]; then + apt-get install -y linux-image-gcp + elif [[ "${OS_NAME}" == "rocky" ]]; then + dnf -y -q install kernel fi - # Configure a systemd unit to ensure that permissions are set on restart - cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<&2 ; return 0 - elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 - else nvsmi_works="1" ; fi - - if [[ "$1" == "-L" ]] ; then - local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" - if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" - else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi + for startup_script in ${STARTUP_SCRIPT} ${POST_HDFS_STARTUP_SCRIPT} ; do + sed -i -e 's:/usr/bin/env bash:/usr/bin/env bash\nexit 0:' ${startup_script} + done - return 0 - fi + cp /var/log/dataproc-initialization-script-0.log /var/log/dataproc-initialization-script-0.log.0 - "${nvsmi}" $* + systemctl reboot } -function install_build_dependencies() { - if test -f "${workdir}/build-dependencies-complete" ; then return ; fi - - if is_debuntu ; then - if is_ubuntu22 && is_cuda12 ; then - # On ubuntu22, the default compiler does not build some kernel module versions - # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 - execute_with_retries apt-get install -y -qq gcc-12 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 - update-alternatives --set gcc /usr/bin/gcc-12 +# Verify if compatible linux distros and secure boot options are used +function check_os_and_secure_boot() { + if [[ "${OS_NAME}" == "debian" ]]; then + DEBIAN_VERSION=$(lsb_release -r | awk '{print $2}') # 10 or 11 + if [[ "${DEBIAN_VERSION}" != "10" && "${DEBIAN_VERSION}" != "11" ]]; then + echo "Error: The Debian version (${DEBIAN_VERSION}) is not supported. Please use a compatible Debian version." + exit 1 fi - - elif is_rocky ; then - execute_with_retries dnf -y -q install gcc - - local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" - set +e - eval "${dnf_cmd}" > "${install_log}" 2>&1 - local retval="$?" - set -e - - if [[ "${retval}" == "0" ]] ; then return ; fi - - if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then - # this kernel-devel may have been migrated to the vault - local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" - local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" - dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ - "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" - )" + elif [[ "${OS_NAME}" == "ubuntu" ]]; then + UBUNTU_VERSION=$(lsb_release -r | awk '{print $2}') # 20.04 + UBUNTU_VERSION=${UBUNTU_VERSION%.*} + if [[ "${UBUNTU_VERSION}" != "18" && "${UBUNTU_VERSION}" != "20" && "${UBUNTU_VERSION}" != "22" ]]; then + echo "Error: The Ubuntu version (${UBUNTU_VERSION}) is not supported. Please use a compatible Ubuntu version." + exit 1 + fi + elif [[ "${OS_NAME}" == "rocky" ]]; then + ROCKY_VERSION=$(lsb_release -r | awk '{print $2}') # 8 or 9 + ROCKY_VERSION=${ROCKY_VERSION%.*} + if [[ "${ROCKY_VERSION}" != "8" && "${ROCKY_VERSION}" != "9" ]]; then + echo "Error: The Rocky Linux version (${ROCKY_VERSION}) is not supported. Please use a compatible Rocky Linux version." + exit 1 fi - - execute_with_retries "${dnf_cmd}" fi - touch "${workdir}/build-dependencies-complete" -} - -function install_dependencies() { - pkg_list="pciutils screen" - if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} - elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi -} -function prepare_gpu_env(){ - # Verify SPARK compatability - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - - readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 - nvsmi_works="0" - - if is_cuda11 ; then gcc_ver="11" - elif is_cuda12 ; then gcc_ver="12" ; fi -} - -# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades -# Users should run apt-mark unhold before they wish to upgrade these packages -function hold_nvidia_packages() { - apt-mark hold nvidia-* - apt-mark hold libnvidia-* - if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then - apt-mark hold xserver-xorg-video-nvidia* + if [[ "${SECURE_BOOT}" == "enabled" ]]; then + echo "Error: Secure Boot is enabled. Please disable Secure Boot while creating the cluster." + exit 1 fi } -function delete_mig_instances() ( - # delete all instances - set +e - nvidia-smi mig -dci - - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No compute instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac - - nvidia-smi mig -dgi - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No GPU instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac -) +# Detect dataproc image version from its various names +if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" +fi -# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles -function configure_mig_cgi() { - delete_mig_instances - META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" - if test -n "${META_MIG_CGI_VALUE}"; then - nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C - else - if lspci | grep -q H100 ; then - # run the following command to list placement profiles - # nvidia-smi mig -lgipp - # - # This is the result when using H100 instances on 20241220 - # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 - # GPU 0 Profile ID 14 Placements: {0,2,4}:2 - # GPU 0 Profile ID 9 Placements: {0,4}:4 - # GPU 0 Profile ID 5 Placement : {0}:4 - # GPU 0 Profile ID 0 Placement : {0}:8 +function remove_old_backports { + # This script uses 'apt-get update' and is therefore potentially dependent on + # backports repositories which have been archived. In order to mitigate this + # problem, we will remove any reference to backports repos older than oldstable - # For H100 3D controllers, use profile 19, 7x1G instances - nvidia-smi mig -cgi 19 -C - elif lspci | grep -q A100 ; then - # Dataproc only supports A100s right now split in 2 if not specified - # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances - nvidia-smi mig -cgi 9,9 -C - else - echo "unrecognized 3D controller" - fi + # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 + oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}'); + stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}'); + + matched_files="$(grep -rsil '\-backports' /etc/apt/sources.list*)" + if [[ -n "$matched_files" ]]; then + for filename in "$matched_files"; do + grep -e "$oldstable-backports" -e "$stable-backports" "$filename" || \ + sed -i -e 's/^.*-backports.*$//' "$filename" + done fi } -function enable_mig() { - nvidia-smi -mig 1 -} - - -function configure_dkms_certs() { - if test -v PSN && [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping"; - return 0 +function main() { + if [[ ${OS_NAME} == debian ]] && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then + remove_old_backports fi - mkdir -p "${CA_TMPDIR}" - - # If the private key exists, verify it - if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then - echo "Private key material exists" - - local expected_modulus_md5sum - expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) - if [[ -n "${expected_modulus_md5sum}" ]]; then - modulus_md5sum="${expected_modulus_md5sum}" - - # Verify that cert md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched rsa key" - fi - - # Verify that key md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched x509 cert" - fi + check_os_and_secure_boot + + if [[ "${OS_NAME}" == "rocky" ]]; then + if dnf list kernel-devel-$(uname -r) && dnf list kernel-headers-$(uname -r); then + echo "kernel devel and headers packages are available. Proceed without kernel upgrade." else - modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" + upgrade_kernel fi - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - - return + fi + + if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then + export DEBIAN_FRONTEND=noninteractive + execute_with_retries "apt-get update" + execute_with_retries "apt-get install -y -q pciutils" + elif [[ ${OS_NAME} == rocky ]] ; then + execute_with_retries "dnf -y -q install pciutils" fi - # Retrieve cloud secrets keys - local sig_priv_secret_name - sig_priv_secret_name="${PSN}" - local sig_pub_secret_name - sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" - local sig_secret_project - sig_secret_project="$(get_metadata_attribute secret_project)" - local sig_secret_version - sig_secret_version="$(get_metadata_attribute secret_version)" - - # If metadata values are not set, do not write mok keys - if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi - - # Write private material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_priv_secret_name}" \ - | dd status=none of="${CA_TMPDIR}/db.rsa" - - # Write public material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_pub_secret_name}" \ - | base64 --decode \ - | dd status=none of="${CA_TMPDIR}/db.der" - - local mok_directory="$(dirname "${mok_key}")" - mkdir -p "${mok_directory}" - - # symlink private key and copy public cert from volatile storage to DKMS directory - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - cp -f "${CA_TMPDIR}/db.der" "${mok_der}" - - modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" -} - -function clear_dkms_key { - if [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping" >&2 - return 0 - fi - rm -rf "${CA_TMPDIR}" "${mok_key}" -} - -function check_secure_boot() { - local SECURE_BOOT="disabled" - SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') - - PSN="$(get_metadata_attribute private_secret_name)" - readonly PSN - - if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then - echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." - exit 1 - elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then - echo "Secure boot is enabled, but no signing material provided." - echo "Please either disable secure boot or provide signing material as per" - echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" - return 1 - fi - - CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" - readonly CA_TMPDIR - - if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv - mok_der=/var/lib/shim-signed/mok/MOK.der - else mok_key=/var/lib/dkms/mok.key - mok_der=/var/lib/dkms/mok.pub ; fi - - configure_dkms_certs -} - - -function exit_handler() { - # Purge private key material until next grant - clear_dkms_key - - set +ex - echo "Exit handler invoked" - - # Clear pip cache - pip cache purge || echo "unable to purge pip cache" - - # If system memory was sufficient to mount memory-backed filesystems - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to unset global pip cache" - - # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do - if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then - umount -f ${shmdir} + # default MIG to on when this script is used + META_MIG_VALUE=1 + if (/usr/share/google/get_metadata_value attributes/ENABLE_MIG); then + META_MIG_VALUE=$(/usr/share/google/get_metadata_value attributes/ENABLE_MIG) + fi + + if (lspci | grep -q NVIDIA); then + if [[ $META_MIG_VALUE -ne 0 ]]; then + # if the first invocation, the NVIDIA drivers and tools are not installed + if [[ -f "/usr/bin/nvidia-smi" ]]; then + # check to see if we already enabled mig mode and rebooted so we don't end + # up in infinite reboot loop + NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l` + if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then + if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then + echo "MIG is enabled on all GPUs, configuring instances" + configure_mig_cgi + exit 0 + else + echo "GPUs present but MIG is not enabled" + fi + else + echo "More than 1 GPU with MIG configured differently between them" + fi fi - done - - # restart services stopped during preparation stage - # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' - fi - - if is_debuntu ; then - # Clean up OS package cache - apt-get -y -qq clean - apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove - # re-hold systemd package - if ge_debian12 ; then - apt-mark hold systemd libsystemd0 ; fi - hold_nvidia_packages - else - dnf clean all - fi - - # print disk usage statistics for large components - if is_ubuntu ; then - du -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 | sort -h - elif is_debian ; then - du -x -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /var/lib/{docker,mysql,} \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ - /usr/bin \ - /usr \ - /var \ - / 2>/dev/null | sort -h - else - du -hs \ - /var/lib/docker \ - /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ - /usr/lib64/google-cloud-sdk \ - /usr/lib \ - /opt/nvidia/* \ - /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 - fi - - # Process disk usage logs from installation period - rm -f /run/keep-running-df - sync - sleep 5.01s - # compute maximum size of disk during installation - # Log file contains logs like the following (minus the preceeding #): -#Filesystem 1K-blocks Used Available Use% Mounted on -#/dev/vda2 7096908 2611344 4182932 39% / - df / | tee -a "/run/disk-usage.log" - - perl -e '@siz=( sort { $a => $b } - map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; -print( " samples-taken: ", scalar @siz, $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" - - echo "exit_handler has completed" - - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - dd if=/dev/zero of=/zero - sync - sleep 3s - rm -f /zero - fi - - return 0 -} - -function prepare_to_install(){ - # Verify OS compatability and Secure boot state - check_os - check_secure_boot - - prepare_gpu_env - - OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" - readonly OS_NAME - - # node role - ROLE="$(get_metadata_attribute dataproc-role)" - readonly ROLE - - workdir=/opt/install-dpgce - tmpdir=/tmp/ - temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" - readonly temp_bucket - readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" - uname_r=$(uname -r) - readonly uname_r - readonly bdcfg="/usr/local/bin/bdconfig" - export DEBIAN_FRONTEND=noninteractive - - mkdir -p "${workdir}" - trap exit_handler EXIT - set_proxy - mount_ramdisk - - readonly install_log="${tmpdir}/install.log" - - if test -f "${workdir}/prepare-complete" ; then return ; fi - - repair_old_backports - - if is_debuntu ; then - clean_up_sources_lists - apt-get update -qq - apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove - if ge_debian12 ; then - apt-mark unhold systemd libsystemd0 ; fi - else - dnf clean all + fi fi + + # Detect NVIDIA GPU + if (lspci | grep -q NVIDIA); then + if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then + execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" + elif [[ ${OS_NAME} == rocky ]]; then + echo "kernel devel and headers not required on rocky. installing from binary" + fi - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e - time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero - ) fi - - install_dependencies - - # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" - touch "/run/keep-running-df" - screen -d -m -LUS keep-running-df \ - bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" - - touch "${workdir}/prepare-complete" -} - -function main() { - # default MIG to on when this script is used - META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") + install_nvidia_gpu_driver - if ! (lspci | grep -q NVIDIA) ; then return ; fi - if [[ $META_MIG_VALUE -ne 0 ]]; then - # if the first invocation, the NVIDIA drivers and tools are not installed - if [[ -f "/usr/bin/nvidia-smi" ]]; then - # check to see if we already enabled mig mode and rebooted so we don't end - # up in infinite reboot loop - mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)" - NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)" - if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then - if (echo "${mig_mode_current}" | grep Enabled); then - echo "MIG is enabled on all GPUs, configuring instances" + if [[ ${META_MIG_VALUE} -ne 0 ]]; then + enable_mig + NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l` + if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then + if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then + echo "MIG is fully enabled, we don't need to reboot" configure_mig_cgi - exit 0 else - echo "GPUs present but MIG is not enabled" + echo "MIG is configured on but NOT enabled, we need to reboot" + reboot fi else - echo "More than 1 GPU with MIG configured differently between them" + echo "MIG is NOT enabled all on GPUs, we need to reboot" + reboot fi + else + echo "Not enabling MIG" fi fi - - install_nvidia_gpu_driver - - if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi - - enable_mig - - mig_mode_current="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)" - - NUM_GPUS_WITH_DIFF_MIG_MODES="$(echo "${mig_mode_current}" | uniq | wc -l)" - if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -ne 1 ]] ; then echo "MIG is NOT enabled all on GPUs. Failing" ; exit 1 ; fi - if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured on but NOT enabled. Failing" ; exit 1 ; fi - - echo "MIG is fully enabled" - configure_mig_cgi } -prepare_to_install - main diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 6fdfbb78c..0b4aabd57 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -232,10 +232,12 @@ CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 # EXCEPTIONS # Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) -if is_ubuntu18 ; then - CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 - NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 - CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 +if [[ "${OS_NAME}" == "ubuntu" ]]; then + if is_ubuntu18 ; then + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 + NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 + CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 + fi fi # Verify Secure boot diff --git a/templates/common/util_functions b/templates/common/util_functions index 9f7075f0b..80ce5c09f 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -622,6 +622,14 @@ function common_exit_handler() { # Clear pip cache pip cache purge || echo "unable to purge pip cache" + # Restart YARN services if they are running already + for svc in resourcemanager nodemanager; do + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl stop "hadoop-yarn-${svc}.service" + systemctl start "hadoop-yarn-${svc}.service" + fi + done + # If system memory was sufficient to mount memory-backed filesystems if [[ "${tmpdir}" == "/mnt/shm" ]] ; then # remove the tmpfs pip cache-dir diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 26c4d02f9..7c8b47b32 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1011,6 +1011,7 @@ function configure_gpu_exclusive_mode() { if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi # include exclusive mode on GPU nvidia-smi -c EXCLUSIVE_PROCESS + clear_nvsmi_cache } function fetch_mig_scripts() { @@ -1154,6 +1155,17 @@ EOF systemctl start dataproc-cgroup-device-permissions } +function clear_nvsmi_cache() { + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then + rm "${nvsmi_query_xml}" + fi +} + +function query_nvsmi() { + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi + /usr/bin/nvidia-smi -q -x --dtd > "${nvsmi_query_xml}" +} + function nvsmi() { local nvsmi="/usr/bin/nvidia-smi" if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' @@ -1220,6 +1232,8 @@ function prepare_gpu_env(){ set -e echo "gpu_count=[${gpu_count}]" nvsmi_works="0" + nvsmi_query_xml="${tmpdir}/nvsmi.xml" + xmllint="/opt/conda/miniconda3/bin/xmllint" NVIDIA_SMI_PATH='/usr/bin' MIG_MAJOR_CAPS=0 IS_MIG_ENABLED=0 @@ -1304,20 +1318,52 @@ function configure_mig_cgi() { # GPU 0 Profile ID 5 Placement : {0}:4 # GPU 0 Profile ID 0 Placement : {0}:8 - # For H100 3D controllers, use profile 19, 7x1G instances - nvidia-smi mig -cgi 19 -C + # For H100 3D controllers, consider profile 19, 7x1G instances + nvidia-smi mig -cgi 9,9 -C elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then - # Dataproc only supports A100s right now split in 2 if not specified + # Dataproc only supports H100s right now ; split in 2 if not specified # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances nvidia-smi mig -cgi 9,9 -C else echo "unrecognized 3D controller" fi fi + clear_nvsmi_cache } function enable_mig() { + if test -f "${workdir}/complete/enable-mig" ; then return ; fi + + # Start persistenced if it's not already running + if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi + for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do + # Write an ascii zero to the numa node indicator + echo "0" | dd of="${f}" status=none + done + time nvidia-smi --gpu-reset # 30s nvidia-smi -mig 1 + clear_nvsmi_cache + + touch "${workdir}/complete/enable-mig" +} + +function enable_and_configure_mig() { + # default MIG to on when this script is used + META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") + + if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi + + enable_mig + + xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' + query_nvsmi + mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" + + if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs. Failing" ; exit 1 ; fi + if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured but NOT enabled. Failing" ; exit 1 ; fi + + echo "MIG is fully enabled" + configure_mig_cgi } function setup_gpu_yarn() { @@ -1334,8 +1380,8 @@ function setup_gpu_yarn() { fi # if this is called without the MIG script then the drivers are not installed - migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" - if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi + query_nvsmi + migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then From 0ac57a070220d56d58030d02643f954fb4822a85 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 12:02:47 -0800 Subject: [PATCH 058/130] return test suite to master --- spark-rapids/test_spark_rapids.py | 47 +++++++++++++++---------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 9b9481716..7af8e3154 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -20,10 +20,6 @@ class SparkRapidsTestCase(DataprocTestCase): def verify_spark_instance(self, name): self.assert_instance_command(name, "nvidia-smi") - def verify_pyspark(self, name): - # Verify that pyspark works - self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) - def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -62,6 +58,12 @@ def verify_spark_job_sql(self): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids(self, configuration, machine_suffixes, accelerator): + if self.getImageOs() == "rocky": + self.skipTest("Not supported for Rocky OS") + + if self.getImageVersion() <= pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in 2.0 and earlier images") + optional_components = None metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" @@ -70,10 +72,10 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): self.INIT_ACTIONS, optional_components=optional_components, metadata=metadata, - machine_type="n1-standard-32", + machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="50GB", + boot_disk_size="1024GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: @@ -86,6 +88,12 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): ("STANDARD", ["w-0"], GPU_T4)) def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): + if self.getImageOs() == "rocky": + self.skipTest("Not supported for Rocky OS") + + if self.getImageVersion() <= pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in 2.0 and earlier images") + optional_components = None metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" @@ -94,10 +102,10 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): self.INIT_ACTIONS, optional_components=optional_components, metadata=metadata, - machine_type="n1-standard-32", + machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="50GB", + boot_disk_size="1024GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: @@ -106,24 +114,15 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): # Only need to do this once self.verify_spark_job_sql() - @parameterized.parameters( - ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"), - ("STANDARD", ["w-0"], GPU_T4, "12.0.1", "525.147.05"), - ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"), - ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03") - ) + @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14")) def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version, driver_version): - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.1.1") \ - and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ - ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.1.1 not supported on older debian/ubuntu releases") + if self.getImageOs() == "rocky": + self.skipTest("Not supported for Rocky OS") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ - and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + if self.getImageVersion() <= pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in 2.0 and earlier images") metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK" ",cuda-version={0},driver-version={1}".format(cuda_version, driver_version)) @@ -132,10 +131,10 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, configuration, self.INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-32", + machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="50GB", + boot_disk_size="1024GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: From b3e5618112c06caf699546161ec460357d7678f7 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 12:10:08 -0800 Subject: [PATCH 059/130] do not run all tests ; also do not retry failures --- cloudbuild/presubmit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index eec7adb76..d9ae3c9bb 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -70,6 +70,7 @@ determine_tests_to_run() { changed_dir="${changed_dir%%/*}/" # Run all tests if common directories modified if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then + continue echo "All tests will be run: '${changed_dir}' was changed" TESTS_TO_RUN=(":DataprocInitActionsTestSuite") return 0 @@ -104,7 +105,6 @@ run_tests() { bazel test \ --jobs="${max_parallel_tests}" \ --local_test_jobs="${max_parallel_tests}" \ - --flaky_test_attempts=3 \ --action_env="INTERNAL_IP_SSH=true" \ --test_output="all" \ --noshow_progress \ From b4e99ee90874f8d51f85b9fc63336b4e689e7958 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 25 Dec 2024 19:33:19 -0800 Subject: [PATCH 060/130] expanding non-default version tests ; adding utility function to verify pyspark ; disk size correction --- spark-rapids/test_spark_rapids.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 7af8e3154..3c9b2a2d6 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -20,6 +20,10 @@ class SparkRapidsTestCase(DataprocTestCase): def verify_spark_instance(self, name): self.assert_instance_command(name, "nvidia-smi") + def verify_pyspark(self, name): + # Verify that pyspark works + self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -114,13 +118,22 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): # Only need to do this once self.verify_spark_job_sql() - @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14")) + @parameterized.parameters( + ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"), + ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"), + ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03") + ) def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version, driver_version): if self.getImageOs() == "rocky": self.skipTest("Not supported for Rocky OS") + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ + ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): + self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") From e4eab7b41ccd9b471cd0c51fc1a8143edeb3bf83 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 12:16:11 -0800 Subject: [PATCH 061/130] reverting to master --- spark-rapids/test_spark_rapids.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 3c9b2a2d6..7af8e3154 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -20,10 +20,6 @@ class SparkRapidsTestCase(DataprocTestCase): def verify_spark_instance(self, name): self.assert_instance_command(name, "nvidia-smi") - def verify_pyspark(self, name): - # Verify that pyspark works - self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) - def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -118,22 +114,13 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): # Only need to do this once self.verify_spark_job_sql() - @parameterized.parameters( - ("STANDARD", ["w-0"], GPU_T4, "11.8.0", "525.147.05"), - ("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"), - ("STANDARD", ["w-0"], GPU_T4, "12.6.2", "560.35.03") - ) + @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14")) def test_non_default_cuda_versions(self, configuration, machine_suffixes, accelerator, cuda_version, driver_version): if self.getImageOs() == "rocky": self.skipTest("Not supported for Rocky OS") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ - and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ - ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") - if self.getImageVersion() <= pkg_resources.parse_version("2.0"): self.skipTest("Not supported in 2.0 and earlier images") From 95b17ac10d260de329f0876d8b44deeae4527381 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 12:25:54 -0800 Subject: [PATCH 062/130] reverting test_spark-rapids.py to master --- spark-rapids/test_spark_rapids.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py index 7af8e3154..6e03f2d62 100644 --- a/spark-rapids/test_spark_rapids.py +++ b/spark-rapids/test_spark_rapids.py @@ -75,7 +75,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator): machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="1024GB", + boot_disk_size="50GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: @@ -105,7 +105,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator): machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="1024GB", + boot_disk_size="50GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: @@ -134,7 +134,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes, machine_type="n1-standard-4", master_accelerator=accelerator if configuration == "SINGLE" else None, worker_accelerator=accelerator, - boot_disk_size="1024GB", + boot_disk_size="50GB", timeout_in_minutes=30) for machine_suffix in machine_suffixes: From 212b9af4d1c39dfbc8c6b8947d88b8796c37bbf9 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 12:31:30 -0800 Subject: [PATCH 063/130] do not consider templates as changed files --- cloudbuild/presubmit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index d9ae3c9bb..2b2e978b0 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -49,7 +49,7 @@ initialize_git_repo() { determine_tests_to_run() { # Infer the files that changed mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD) - mapfile -t CHANGED_FILES < <(git diff origin/master --name-only) + mapfile -t CHANGED_FILES < <(git diff origin/master --name-only | grep -v template) echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}" echo "Changed files: ${CHANGED_FILES[*]}" From e9b9e5de59966924ab2cb8110f86e35da24b143a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 12:44:21 -0800 Subject: [PATCH 064/130] using nvsmi for some error protection --- gpu/install_gpu_driver.sh | 22 +++++++++++----------- templates/gpu/util_functions | 22 +++++++++++----------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 8d3d5aa84..da30fcfe8 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1899,17 +1899,6 @@ EOF systemctl start dataproc-cgroup-device-permissions } -function clear_nvsmi_cache() { - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then - rm "${nvsmi_query_xml}" - fi -} - -function query_nvsmi() { - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi - /usr/bin/nvidia-smi -q -x --dtd > "${nvsmi_query_xml}" -} - function nvsmi() { local nvsmi="/usr/bin/nvidia-smi" if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' @@ -1928,6 +1917,17 @@ function nvsmi() { "${nvsmi}" $* } +function clear_nvsmi_cache() { + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then + rm "${nvsmi_query_xml}" + fi +} + +function query_nvsmi() { + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi + nvsmi -q -x --dtd > "${nvsmi_query_xml}" +} + function install_build_dependencies() { if test -f "${workdir}/complete/build-dependencies" ; then return ; fi diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 7c8b47b32..328b89196 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1155,17 +1155,6 @@ EOF systemctl start dataproc-cgroup-device-permissions } -function clear_nvsmi_cache() { - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then - rm "${nvsmi_query_xml}" - fi -} - -function query_nvsmi() { - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi - /usr/bin/nvidia-smi -q -x --dtd > "${nvsmi_query_xml}" -} - function nvsmi() { local nvsmi="/usr/bin/nvidia-smi" if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' @@ -1184,6 +1173,17 @@ function nvsmi() { "${nvsmi}" $* } +function clear_nvsmi_cache() { + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then + rm "${nvsmi_query_xml}" + fi +} + +function query_nvsmi() { + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi + nvsmi -q -x --dtd > "${nvsmi_query_xml}" +} + function install_build_dependencies() { if test -f "${workdir}/complete/build-dependencies" ; then return ; fi From adf4312102c8c44b6a59b5a52057d9019a8d8fdd Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 16:41:45 -0800 Subject: [PATCH 065/130] corrected comments --- templates/spark-rapids/mig.sh.in | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in index 27da6ffd0..28a463602 100644 --- a/templates/spark-rapids/mig.sh.in +++ b/templates/spark-rapids/mig.sh.in @@ -1,15 +1,16 @@ #!/bin/bash # [% INSERT legal/license_header %] -# This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures. # -# This script should be specified in --metadata=startup-script-url= option and -# --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it. -# The script does a reboot to fully enable MIG and then configures the MIG device based on the -# user specified MIG_CGI profiles specified via: --metadata=^:^MIG_CGI='9,9'. If MIG_CGI -# is not specified it assumes it's using an A100 and configures 2 instances with profile id 9. -# It is assumed this script is used in conjuntion with install_gpu_driver.sh, which does the -# YARN setup to fully utilize the MIG instances on YARN. +# This script installs NVIDIA GPU drivers and enables MIG on Hopper +# GPU architectures. +# +# This script should be specified in --initialization-actions= option +# and --metadata=ENABLE_MIG can be used to enable or disable MIG. The +# default is to enable it. The script configures the MIG device based +# on the user specified MIG_CGI profiles specified via: +# --metadata=^:^MIG_CGI='9,9'. If MIG_CGI is not specified it assumes +# it's using an H100 and configures 2 instances with profile id 9. # [% PROCESS common/template_disclaimer %] From dfcd8b02eadc8a253d99fadc25020614ce314275 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 16:45:40 -0800 Subject: [PATCH 066/130] defining xpath variables as local --- gpu/install_gpu_driver.sh | 3 ++- templates/gpu/util_functions | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index da30fcfe8..7cb4a1817 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -2099,8 +2099,8 @@ function enable_and_configure_mig() { enable_mig - xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' query_nvsmi + local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs. Failing" ; exit 1 ; fi @@ -2125,6 +2125,7 @@ function setup_gpu_yarn() { # if this is called without the MIG script then the drivers are not installed query_nvsmi + local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 328b89196..1019a8f78 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1355,8 +1355,8 @@ function enable_and_configure_mig() { enable_mig - xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' query_nvsmi + local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs. Failing" ; exit 1 ; fi @@ -1381,6 +1381,7 @@ function setup_gpu_yarn() { # if this is called without the MIG script then the drivers are not installed query_nvsmi + local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" From 9bb4d6664a8f5db95b8cd0e69496ab2a973cee98 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 18:16:42 -0800 Subject: [PATCH 067/130] tested on 2.1-ubuntu20 --- gpu/install_gpu_driver.sh | 74 ++++++++++++++++++++------------- templates/common/util_functions | 4 ++ templates/gpu/util_functions | 70 ++++++++++++++++++------------- 3 files changed, 92 insertions(+), 56 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 7cb4a1817..76a6703ef 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -28,6 +28,10 @@ function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | x function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) +# For version (or real number) comparison +# if first argument is greater than or equal to, greater than, less than or equal to, or less than the second +# ( version_ge 2.0 2.1 ) evaluates to false +# ( version_ge 2.2 2.1 ) evaluates to true function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; ) function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) @@ -1754,7 +1758,7 @@ function configure_gpu_exclusive_mode() { # only run this function when spark < 3.0 if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi # include exclusive mode on GPU - nvidia-smi -c EXCLUSIVE_PROCESS + nvsmi -c EXCLUSIVE_PROCESS clear_nvsmi_cache } @@ -1769,7 +1773,7 @@ function fetch_mig_scripts() { function install_spark_rapids() { # Update SPARK RAPIDS config local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 - local DEFAULT_XGBOOST_VERSION="1.7.6" + local DEFAULT_XGBOOST_VERSION="2.0.2" # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" @@ -1785,15 +1789,22 @@ function install_spark_rapids() { local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" \ - -P /usr/lib/spark/jars/ + local jar_basename + + jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" + + jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" + + jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" + cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" } function configure_gpu_script() { @@ -1906,7 +1917,7 @@ function nvsmi() { elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 else nvsmi_works="1" ; fi - if [[ "$1" == "-L" ]] ; then + if test -v 1 && [[ "$1" == "-L" ]] ; then local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi @@ -1924,6 +1935,7 @@ function clear_nvsmi_cache() { } function query_nvsmi() { + if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi nvsmi -q -x --dtd > "${nvsmi_query_xml}" } @@ -1997,6 +2009,9 @@ function prepare_gpu_env(){ RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly RAPIDS_RUNTIME + # determine whether we have nvidia-smi installed and working + nvsmi + set_cuda_version set_driver_version set_cuda_repo_shortname @@ -2084,8 +2099,8 @@ function enable_mig() { # Write an ascii zero to the numa node indicator echo "0" | dd of="${f}" status=none done - time nvidia-smi --gpu-reset # 30s - nvidia-smi -mig 1 + time nvsmi --gpu-reset # 30s + nvsmi -mig 1 clear_nvsmi_cache touch "${workdir}/complete/enable-mig" @@ -2098,7 +2113,6 @@ function enable_and_configure_mig() { if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi enable_mig - query_nvsmi local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" @@ -2123,19 +2137,23 @@ function setup_gpu_yarn() { return 0 fi - # if this is called without the MIG script then the drivers are not installed - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts + if [[ "${nvsmi_works}" == "1" ]] ; then + # if this is called without the MIG script then the drivers are not installed + query_nvsmi + local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' + set +e + migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" + set -e + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" + + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then + if (echo "${migquery_result}" | grep Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` + fetch_mig_scripts + fi fi fi fi diff --git a/templates/common/util_functions b/templates/common/util_functions index 80ce5c09f..93b276a68 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -2,6 +2,10 @@ function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | x function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) +# For version (or real number) comparison +# if first argument is greater than or equal to, greater than, less than or equal to, or less than the second +# ( version_ge 2.0 2.1 ) evaluates to false +# ( version_ge 2.2 2.1 ) evaluates to true function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; ) function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 1019a8f78..dca97b316 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1010,7 +1010,7 @@ function configure_gpu_exclusive_mode() { # only run this function when spark < 3.0 if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi # include exclusive mode on GPU - nvidia-smi -c EXCLUSIVE_PROCESS + nvsmi -c EXCLUSIVE_PROCESS clear_nvsmi_cache } @@ -1025,7 +1025,7 @@ function fetch_mig_scripts() { function install_spark_rapids() { # Update SPARK RAPIDS config local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 - local DEFAULT_XGBOOST_VERSION="1.7.6" + local DEFAULT_XGBOOST_VERSION="2.0.2" # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" @@ -1041,15 +1041,22 @@ function install_spark_rapids() { local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" \ - -P /usr/lib/spark/jars/ + local jar_basename + + jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" + + jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" + + jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" + cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" } function configure_gpu_script() { @@ -1162,7 +1169,7 @@ function nvsmi() { elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 else nvsmi_works="1" ; fi - if [[ "$1" == "-L" ]] ; then + if test -v 1 && [[ "$1" == "-L" ]] ; then local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi @@ -1180,6 +1187,7 @@ function clear_nvsmi_cache() { } function query_nvsmi() { + if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi nvsmi -q -x --dtd > "${nvsmi_query_xml}" } @@ -1253,6 +1261,9 @@ function prepare_gpu_env(){ RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly RAPIDS_RUNTIME + # determine whether we have nvidia-smi installed and working + nvsmi + set_cuda_version set_driver_version set_cuda_repo_shortname @@ -1340,8 +1351,8 @@ function enable_mig() { # Write an ascii zero to the numa node indicator echo "0" | dd of="${f}" status=none done - time nvidia-smi --gpu-reset # 30s - nvidia-smi -mig 1 + time nvsmi --gpu-reset # 30s + nvsmi -mig 1 clear_nvsmi_cache touch "${workdir}/complete/enable-mig" @@ -1354,7 +1365,6 @@ function enable_and_configure_mig() { if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi enable_mig - query_nvsmi local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" @@ -1379,19 +1389,23 @@ function setup_gpu_yarn() { return 0 fi - # if this is called without the MIG script then the drivers are not installed - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts + if [[ "${nvsmi_works}" == "1" ]] ; then + # if this is called without the MIG script then the drivers are not installed + query_nvsmi + local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' + set +e + migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" + set -e + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" + + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then + if (echo "${migquery_result}" | grep Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` + fetch_mig_scripts + fi fi fi fi From 17f0fe86ec97fb16e6e980fd379a8503a35ca1f4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 18:33:10 -0800 Subject: [PATCH 068/130] using tests from https://github.com/GoogleCloudDataproc/initialization-actions/pull/1275 --- gpu/test_gpu.py | 327 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 242 insertions(+), 85 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index f8438915f..f260d5927 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -4,26 +4,77 @@ from absl.testing import absltest from absl.testing import parameterized +import unittest + from integration_tests.dataproc_test_case import DataprocTestCase +DEFAULT_TIMEOUT = 15 # minutes +DEFAULT_CUDA_VERSION = "12.4" class NvidiaGpuDriverTestCase(DataprocTestCase): COMPONENT = "gpu" INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] GPU_L4 = "type=nvidia-l4" GPU_T4 = "type=nvidia-tesla-t4" - GPU_V100 = "type=nvidia-tesla-v100" # not available in us-central1-a - GPU_A100 = "type=nvidia-tesla-a100" + GPU_V100 = "type=nvidia-tesla-v100" + GPU_A100 = "type=nvidia-tesla-a100,count=2" GPU_H100 = "type=nvidia-h100-80gb,count=8" + # Tests for PyTorch + TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py" + + # Tests for TensorFlow + TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py" + + def assert_instance_command(self, + instance, + cmd, + timeout_in_minutes=DEFAULT_TIMEOUT): + + retry_count = 5 + + ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format( + instance, self.cluster_zone, cmd) + + while retry_count > 0: + try: + ret_code, stdout, stderr = self.assert_command( ssh_cmd, timeout_in_minutes ) + return ret_code, stdout, stderr + except Exception as e: + print("An error occurred: ", e) + retry_count -= 1 + if retry_count > 0: + time.sleep(10) + continue + else: + raise + def verify_instance(self, name): # Verify that nvidia-smi works - time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience + import random + # Many failed nvidia-smi attempts have been caused by impatience and temporal collisions + time.sleep( 3 + random.randint(1, 30) ) self.assert_instance_command(name, "nvidia-smi", 1) - def verify_pyspark(self, name): - # Verify that pyspark works - self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + def verify_pytorch(self, name): + test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), + self.TORCH_TEST_SCRIPT_FILE_NAME) + self.upload_test_file(test_filename, name) + + verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( + self.TORCH_TEST_SCRIPT_FILE_NAME) + self.assert_instance_command(name, verify_cmd) + self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name) + + def verify_tensorflow(self, name): + test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), + self.TF_TEST_SCRIPT_FILE_NAME) + self.upload_test_file(test_filename, name) + + verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( + self.TF_TEST_SCRIPT_FILE_NAME) + self.assert_instance_command(name, verify_cmd) + self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name) def verify_mig_instance(self, name): self.assert_instance_command(name, @@ -41,6 +92,18 @@ def verify_instance_nvcc(self, name, cuda_version): self.assert_instance_command( name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) ) + def verify_instance_pyspark(self, name): + # Verify that pyspark works + self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + + def verify_instance_cuda_version(self, name, cuda_version): + self.assert_instance_command( + name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) ) + + def verify_instance_driver_version(self, name, driver_version): + self.assert_instance_command( + name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) ) + def verify_instance_spark(self): self.assert_dataproc_job( self.getClusterName(), @@ -56,6 +119,22 @@ def verify_instance_spark(self): + "spark.yarn.unmanagedAM.enabled=false" ) + def verify_driver_signature(self, name): + cert_path='/var/lib/dkms/mok.pub' + if self.getImageOs() == 'ubuntu': + cert_path='/var/lib/shim-signed/mok/MOK.der' + + cert_verification_cmd = """ +perl -Mv5.10 -e ' +my $cert = ( qx{openssl x509 -inform DER -in {} -text} + =~ /Serial Number:.*? +(.+?)\s*$/ms ); +my $kmod = ( qx{modinfo nvidia} + =~ /^sig_key:\s+(\S+)/ms ); +exit 1 unless $cert eq lc $kmod +' +""" + self.assert_instance_command( name, cert_verification_cmd.format(cert_path) ) + @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), # ("STANDARD", ["m"], GPU_T4, None, None), @@ -64,8 +143,14 @@ def verify_instance_spark(self): def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) + self.skipTest("known to fail") metadata = None if driver_provider is not None: @@ -73,17 +158,18 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-highmem-32", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, - boot_disk_size="50GB") + timeout_in_minutes=90, # This cluster is sized and timed correctly to build the driver and nccl + boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) - if ( self.getImageOs() != 'rocky' ) or ( configuration != 'SINGLE' ) or ( configuration == 'SINGLE' and self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1") ): - self.verify_pyspark(machine_name) + self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) + self.verify_instance_pyspark(machine_name) + self.verify_instance_spark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), @@ -91,13 +177,16 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("No need to regularly test not installing the agent") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - metadata = "install-gpu-agent=false" + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) + self.skipTest("known to fail") + if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) self.createCluster( @@ -107,22 +196,27 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) @parameterized.parameters( - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), + ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), # ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"), ) def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") + + if configuration == 'KERBEROS' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) + self.skipTest("known to fail") metadata = "install-gpu-agent=true" if driver_provider is not None: @@ -134,40 +228,46 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) @parameterized.parameters( -# ("SINGLE", ["m"], GPU_T4, None, "12.0"), - ("SINGLE", ["m"], GPU_T4, None, "11.8"), + ("SINGLE", ["m"], GPU_T4, None, "12.4"), +# ("SINGLE", ["m"], GPU_T4, None, "11.8"), ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), -# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), + ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") + if configuration == 'KERBEROS' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) + self.skipTest("known to fail") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) + self.skipTest("known to fail") + metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -177,12 +277,15 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB") + for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) self.verify_instance_nvcc(machine_name, cuda_version) + self.verify_instance_pyspark(machine_name) + self.verify_instance_spark() @parameterized.parameters( ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"), @@ -192,25 +295,23 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): - - self.skipTest("Test is known to fail. Skipping so that we can exercise others") - - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") - - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + # Operation [projects/.../regions/.../operations/...] failed: + # Invalid value for field 'resource.machineType': \ + # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \ + # 'machineTypes/a3-highgpu-8g'. \ + # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature.. + # ('This use case not thoroughly tested') + unittest.expectedFailure(self) + self.skipTest("known to fail") + + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version) @@ -222,7 +323,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", startup_script="gpu/mig.sh") @@ -236,12 +337,13 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ - and configuration == 'SINGLE': - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) + self.skipTest("known to fail") metadata = None if driver_provider is not None: @@ -255,7 +357,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, boot_disk_size="50GB", - timeout_in_minutes=30) + timeout_in_minutes=90) self.verify_instance_spark() @@ -270,26 +372,21 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator, worker_accelerator, cuda_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ - and configuration == 'SINGLE': - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty") - - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") - - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) + self.skipTest("known to fail") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -299,14 +396,74 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") + for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) + self.verify_instance_spark() + + @parameterized.parameters( +# ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''), +# ("STANDARD", ["m"], GPU_T4, None, "12.0"), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'), +# ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), + ) + def tests_driver_signing(self, configuration, machine_suffixes, + master_accelerator, worker_accelerator, + cuda_version, image_os, image_version): + + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ + and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + + if configuration == 'KERBEROS' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) + self.skipTest("known to fail") + + kvp_array=[] + import os + + if "private_secret_name" in os.environ: + for env_var in ['public_secret_name', 'private_secret_name', 'secret_project', 'secret_version' 'modulus_md5sum']: + kvp_array.append( "{}={}".format( env_var, os.environ[env_var] ) ) + + if kvp_array[0] == "public_secret_name=": + self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") + else: + self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") + + metadata = ",".join( kvp_array ) + + if self.getImageOs() != image_os: + self.skipTest("This test is only run on os {}".format(image_os)) + if self.getImageVersion() != image_version: + self.skipTest("This test is only run on Dataproc Image Version {}".format(image_os)) + + self.createCluster( + configuration, + self.INIT_ACTIONS, + machine_type="n1-highmem-8", + master_accelerator=master_accelerator, + worker_accelerator=worker_accelerator, + metadata=metadata, + timeout_in_minutes=90, + boot_disk_size="50GB", + scopes="https://www.googleapis.com/auth/monitoring.write") + for machine_suffix in machine_suffixes: + hostname="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(hostname) + self.verify_instance_gpu_agent(hostname) +# self.verify_driver_signature(hostname) self.verify_instance_spark() From f42a86d803f8469c93f07d8d4784e7a524f6ea46 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 2 Jan 2025 19:04:00 -0800 Subject: [PATCH 069/130] reducing resources for build cluster ; pause for gcloud --- gpu/install_gpu_driver.sh | 2 ++ gpu/test_gpu.py | 4 ++-- templates/common/util_functions | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 76a6703ef..70242aad9 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -155,6 +155,8 @@ function cache_fetched_package() { local gcs_fn="$2" local local_fn="$3" + while ! command -v gcloud ; do sleep 5s ; done + if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then time gcloud storage cp "${gcs_fn}" "${local_fn}" else diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index f260d5927..0f6550ad7 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -158,11 +158,11 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-32", + machine_type="n1-standard-32", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, # This cluster is sized and timed correctly to build the driver and nccl + timeout_in_minutes=90, # This cluster is sized and timed appropriately to build the kernel driver and nccl boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) diff --git a/templates/common/util_functions b/templates/common/util_functions index 93b276a68..6d58103a7 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -129,6 +129,8 @@ function cache_fetched_package() { local gcs_fn="$2" local local_fn="$3" + while ! command -v gcloud ; do sleep 5s ; done + if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then time gcloud storage cp "${gcs_fn}" "${local_fn}" else From 811ad03c2b32e8873cc7691768c80931a448a03e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 3 Jan 2025 18:25:59 -0800 Subject: [PATCH 070/130] exercising spark-rapids from this template --- templates/gpu/util_functions | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index dca97b316..0bc844e1f 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1024,14 +1024,14 @@ function fetch_mig_scripts() { function install_spark_rapids() { # Update SPARK RAPIDS config - local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 - local DEFAULT_XGBOOST_VERSION="2.0.2" + local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" - if [[ "${DATAPROC_IMAGE_VERSION}" == "2.2" ]] ; then - DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then + local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 fi readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) @@ -1086,6 +1086,7 @@ function configure_gpu_script() { # See the License for the specific language governing permissions and # limitations under the License. # +# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]} ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') @@ -1102,8 +1103,15 @@ EOF executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" local task_cpus=2 local gpu_amount - gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" - if ( version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ) ; then gpu_amount="0.5" ; fi + + # The current setting of spark.task.resource.gpu.amount (0.333) is + # not ideal to get the best performance from the RAPIDS Accelerator + # plugin. It's recommended to be 1/{executor core count} unless you + # have a special use case. +# gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" + gpu_amount="$(perl -e "print 1 / ${executor_cores}")" + +# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression cat >>"${spark_defaults_conf}" < Date: Fri, 3 Jan 2025 18:47:57 -0800 Subject: [PATCH 071/130] improved header documentation --- templates/spark-rapids/spark-rapids.sh.in | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index ac8ec5c3f..004080690 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -4,13 +4,16 @@ # [% PROCESS common/template_disclaimer %] # -# This script installs NVIDIA GPU drivers (version 550.135) along with -# CUDA 12.4. +# This script installs NVIDIA GPU drivers. +# +# Dataproc 2.0: Driver version 530.30.02, CUDA version 12.1.1, Rapids 23.08.2 +# Dataproc 2.1: Driver version 550.135, CUDA version 12.4.1, Rapids 24.08.1 +# Dataproc 2.2: Driver version 560.35.03, CUDA version 12.6.2, Rapids 24.08.1 # # Additionally, it installs the RAPIDS Spark plugin, configures Spark -# and YARN, installs an agent to collect GPU utilization metrics. The -# installer is compatible with Debian, Ubuntu, and Rocky Linux -# distributions. +# and YARN, and installs an agent to collect GPU utilization metrics. +# The installer is regularly exercised with Debian, Ubuntu, and Rocky +# Linux distributions. # # Note that the script is designed to work both when secure boot is # enabled with a custom image and when disabled during cluster From 992bd146a76e11afc065b938e6d2d365337897ad Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 3 Jan 2025 19:44:46 -0800 Subject: [PATCH 072/130] generated from templates in commit d5f7ffb7cf19852e48ce17c9ffae3640e7b19ca2 --- gpu/install_gpu_driver.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 70242aad9..59a592d30 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1774,14 +1774,14 @@ function fetch_mig_scripts() { function install_spark_rapids() { # Update SPARK RAPIDS config - local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 - local DEFAULT_XGBOOST_VERSION="2.0.2" + local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" - if [[ "${DATAPROC_IMAGE_VERSION}" == "2.2" ]] ; then - DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then + local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 fi readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) @@ -1836,6 +1836,7 @@ function configure_gpu_script() { # See the License for the specific language governing permissions and # limitations under the License. # +# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]} ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') @@ -1852,8 +1853,15 @@ EOF executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" local task_cpus=2 local gpu_amount - gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" - if ( version_ge "${gpu_amount}" "0.5" && version_lt "${gpu_amount}" "1.0" ) ; then gpu_amount="0.5" ; fi + + # The current setting of spark.task.resource.gpu.amount (0.333) is + # not ideal to get the best performance from the RAPIDS Accelerator + # plugin. It's recommended to be 1/{executor core count} unless you + # have a special use case. +# gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" + gpu_amount="$(perl -e "print 1 / ${executor_cores}")" + +# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression cat >>"${spark_defaults_conf}" < Date: Fri, 3 Jan 2025 21:17:07 -0800 Subject: [PATCH 073/130] replacing java spark tests with pyspark tests --- gpu/test_gpu.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 0f6550ad7..7eb1ac400 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -104,6 +104,15 @@ def verify_instance_driver_version(self, name, driver_version): self.assert_instance_command( name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) ) + def verify_pyspark(self, name): + self.assert_dataproc_job( + self.getClusterName(), + "pyspark", + """--properties="spark:spark.executor.resource.gpu.amount=1" \ + --properties="spark:spark.task.resource.gpu.amount=0.01" \ + 'gs://{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO) + ) + def verify_instance_spark(self): self.assert_dataproc_job( self.getClusterName(), @@ -169,7 +178,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, self.verify_instance(machine_name) self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) self.verify_instance_pyspark(machine_name) - self.verify_instance_spark() + self.verify_pyspark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), @@ -285,7 +294,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.verify_instance(machine_name) self.verify_instance_nvcc(machine_name, cuda_version) self.verify_instance_pyspark(machine_name) - self.verify_instance_spark() + self.verify_pyspark() @parameterized.parameters( ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"), @@ -359,7 +368,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, boot_disk_size="50GB", timeout_in_minutes=90) - self.verify_instance_spark() + self.verify_pyspark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, "11.8"), @@ -404,7 +413,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) self.verify_instance_gpu_agent(machine_name) - self.verify_instance_spark() + self.verify_pyspark() @parameterized.parameters( # ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''), @@ -465,7 +474,7 @@ def tests_driver_signing(self, configuration, machine_suffixes, self.verify_instance_gpu_agent(hostname) # self.verify_driver_signature(hostname) - self.verify_instance_spark() + self.verify_pyspark() if __name__ == "__main__": absltest.main() From 88ccfec26939e3cfca9e1347abfb74f5988d1ea2 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 3 Jan 2025 21:30:58 -0800 Subject: [PATCH 074/130] pyspark test code --- gpu/verify_pyspark.py | 46 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 gpu/verify_pyspark.py diff --git a/gpu/verify_pyspark.py b/gpu/verify_pyspark.py new file mode 100644 index 000000000..9cd0ca2c8 --- /dev/null +++ b/gpu/verify_pyspark.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# +# Copyright 2025 Google LLC and contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import matplotlib.pyplot as plt +import numpy as np + +from pyspark import SparkContext +from pyspark.sql import SparkSession +from pyspark import SparkConf, StorageLevel +from tqdm import tqdm +from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover +import pyspark.sql.functions as f +import nltk + +spark = SparkSession.builder.appName("spark-rapids").getOrCreate() + +#from utils import SimpleTimer, ResultsLogger, visualize_data + +conf = (SparkConf().setMaster("local[*]") + .setAppName("SparkVectorizer") + .set('spark.driver.memory', '300G') + .set('spark.driver.maxResultSize', '20G') + .set('spark.network.timeout', '7200s') + ) + +sc = SparkContext.getOrCreate(conf=conf) +sc.setLogLevel("FATAL") +spark = SparkSession(sc) +print(sc._conf.getAll()) # check context settings + +x = np.linspace(0, 3*np.pi, 500) +plt.plot(x, np.sin(x**2)) +plt.title('A simple chirp'); From 282ca0c86361b7f5a308a5ecb7314410f009b4d8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 3 Jan 2025 21:32:52 -0800 Subject: [PATCH 075/130] corrected function signature --- gpu/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 7eb1ac400..9f2f4c17e 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -104,7 +104,7 @@ def verify_instance_driver_version(self, name, driver_version): self.assert_instance_command( name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) ) - def verify_pyspark(self, name): + def verify_pyspark(self): self.assert_dataproc_job( self.getClusterName(), "pyspark", From d6e9809207aa5a8ed79316220a81c2ca6c054dc5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 3 Jan 2025 21:55:33 -0800 Subject: [PATCH 076/130] fixing order of operations for setting default cuda version ; removed excess cuda default logic ; too many gs:// ; testing 2.0-rocky8 instead of 2.1-rocky8 --- gpu/test_gpu.py | 16 ++++++++-------- templates/gpu/util_functions | 31 +++++++++---------------------- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 9f2f4c17e..0910c1942 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -110,7 +110,7 @@ def verify_pyspark(self): "pyspark", """--properties="spark:spark.executor.resource.gpu.amount=1" \ --properties="spark:spark.task.resource.gpu.amount=0.01" \ - 'gs://{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO) + '{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO) ) def verify_instance_spark(self): @@ -175,9 +175,9 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) - self.verify_instance_pyspark(machine_name) +# self.verify_instance(machine_name) +# self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) +# self.verify_instance_pyspark(machine_name) self.verify_pyspark() @parameterized.parameters( @@ -418,8 +418,8 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf @parameterized.parameters( # ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''), # ("STANDARD", ["m"], GPU_T4, None, "12.0"), -# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.1.1", 'rocky', '2.0'), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'), # ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'), # ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), @@ -470,8 +470,8 @@ def tests_driver_signing(self, configuration, machine_suffixes, scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: hostname="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(hostname) - self.verify_instance_gpu_agent(hostname) +# self.verify_instance(hostname) +# self.verify_instance_gpu_agent(hostname) # self.verify_driver_signature(hostname) self.verify_pyspark() diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 0bc844e1f..6409d3fb1 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -52,6 +52,15 @@ function set_support_matrix() { set_support_matrix function set_cuda_version() { + case "${DATAPROC_IMAGE_VERSION}" in + "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) + "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; + "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; + * ) + echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" + exit 1 + ;; + esac local cuda_url cuda_url=$(get_metadata_attribute 'cuda-url' '') if [[ -n "${cuda_url}" ]] ; then @@ -60,29 +69,8 @@ function set_cuda_version() { CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" - CUDA_FULL_VERSION="${CUDA_URL_VERSION}" fi fi - - if ( ! test -v DEFAULT_CUDA_VERSION ) ; then - DEFAULT_CUDA_VERSION='12.4.1' - fi - # EXCEPTIONS - # Change default CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) - case "${DATAPROC_IMAGE_VERSION}" in - "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; - "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; - "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; - * ) - echo "unrecognized Dataproc image version" - exit 1 - ;; - esac - - if le_ubuntu18 ; then - DEFAULT_CUDA_VERSION="12.1.1" - CUDA_VERSION_MAJOR="${DEFAULT_CUDA_VERSION%.*}" #12.1 - fi readonly DEFAULT_CUDA_VERSION CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") @@ -95,7 +83,6 @@ function set_cuda_version() { CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} fi readonly CUDA_FULL_VERSION - } function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) From e3df6f21a19900a9df0a8cd1520d0b18b2c79948 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 3 Jan 2025 22:09:39 -0800 Subject: [PATCH 077/130] including verify_pyspark.py in data list --- gpu/BUILD | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gpu/BUILD b/gpu/BUILD index b481c5b33..bd5500ccb 100644 --- a/gpu/BUILD +++ b/gpu/BUILD @@ -6,7 +6,11 @@ py_test( name = "test_gpu", size = "enormous", srcs = ["test_gpu.py"], - data = ["install_gpu_driver.sh", "mig.sh"], + data = [ + "install_gpu_driver.sh", + "verify_pyspark.py", + "mig.sh" + ], local = True, shard_count = 15, deps = [ From 89fe31b415f52971f1580a07aaf6ddb9b30ff036 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 3 Jan 2025 23:24:34 -0800 Subject: [PATCH 078/130] verifying with gcloud dataproc jobs submit pyspark instead of spark ; skipping all tests that use ssh --- gpu/test_gpu.py | 23 ++++++++++++----------- gpu/verify_pyspark.py | 1 - 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 0910c1942..19fb7fe81 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -108,8 +108,8 @@ def verify_pyspark(self): self.assert_dataproc_job( self.getClusterName(), "pyspark", - """--properties="spark:spark.executor.resource.gpu.amount=1" \ - --properties="spark:spark.task.resource.gpu.amount=0.01" \ + """--properties="spark.executor.resource.gpu.amount=1" \ + --properties="spark.task.resource.gpu.amount=0.01" \ '{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO) ) @@ -209,8 +209,8 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, boot_disk_size="50GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - +# self.verify_instance(machine_name) + self.verify_pyspark() @parameterized.parameters( ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), @@ -242,8 +242,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - self.verify_instance_gpu_agent(machine_name) +# self.verify_instance(machine_name) +# self.verify_instance_gpu_agent(machine_name) + self.verify_pyspark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, "12.4"), @@ -291,9 +292,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - self.verify_instance_nvcc(machine_name, cuda_version) - self.verify_instance_pyspark(machine_name) + #self.verify_instance(machine_name) + #self.verify_instance_nvcc(machine_name, cuda_version) + #self.verify_instance_pyspark(machine_name) self.verify_pyspark() @parameterized.parameters( @@ -411,8 +412,8 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - self.verify_instance_gpu_agent(machine_name) +# self.verify_instance(machine_name) +# self.verify_instance_gpu_agent(machine_name) self.verify_pyspark() @parameterized.parameters( diff --git a/gpu/verify_pyspark.py b/gpu/verify_pyspark.py index 9cd0ca2c8..9f2b18683 100644 --- a/gpu/verify_pyspark.py +++ b/gpu/verify_pyspark.py @@ -23,7 +23,6 @@ from tqdm import tqdm from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover import pyspark.sql.functions as f -import nltk spark = SparkSession.builder.appName("spark-rapids").getOrCreate() From e221ede81e85c68f79672fa3fb0b9f480d405ff8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 3 Jan 2025 23:39:33 -0800 Subject: [PATCH 079/130] re-enable ssh tests --- gpu/test_gpu.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 19fb7fe81..fda5785f3 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -175,9 +175,9 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) -# self.verify_instance(machine_name) -# self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) -# self.verify_instance_pyspark(machine_name) + self.verify_instance(machine_name) + self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) + self.verify_instance_pyspark(machine_name) self.verify_pyspark() @parameterized.parameters( @@ -209,7 +209,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, boot_disk_size="50GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) -# self.verify_instance(machine_name) + self.verify_instance(machine_name) self.verify_pyspark() @parameterized.parameters( ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), @@ -242,8 +242,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) -# self.verify_instance(machine_name) -# self.verify_instance_gpu_agent(machine_name) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) self.verify_pyspark() @parameterized.parameters( @@ -292,9 +292,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - #self.verify_instance(machine_name) - #self.verify_instance_nvcc(machine_name, cuda_version) - #self.verify_instance_pyspark(machine_name) + self.verify_instance(machine_name) + self.verify_instance_nvcc(machine_name, cuda_version) + self.verify_instance_pyspark(machine_name) self.verify_pyspark() @parameterized.parameters( @@ -412,8 +412,8 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) -# self.verify_instance(machine_name) -# self.verify_instance_gpu_agent(machine_name) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) self.verify_pyspark() @parameterized.parameters( @@ -471,9 +471,9 @@ def tests_driver_signing(self, configuration, machine_suffixes, scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: hostname="{}-{}".format(self.getClusterName(),machine_suffix) -# self.verify_instance(hostname) -# self.verify_instance_gpu_agent(hostname) -# self.verify_driver_signature(hostname) + self.verify_instance(hostname) + self.verify_instance_gpu_agent(hostname) + self.verify_driver_signature(hostname) self.verify_pyspark() From c9950a8e8071a4359a524016408e4ae8d517d6a8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 4 Jan 2025 00:16:16 -0800 Subject: [PATCH 080/130] refactored ssh command retry code into the base class --- gpu/test_gpu.py | 51 ------------------------- integration_tests/dataproc_test_case.py | 21 ++++++++-- 2 files changed, 17 insertions(+), 55 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index fda5785f3..ab2457ec2 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -16,39 +16,8 @@ class NvidiaGpuDriverTestCase(DataprocTestCase): INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] GPU_L4 = "type=nvidia-l4" GPU_T4 = "type=nvidia-tesla-t4" - GPU_V100 = "type=nvidia-tesla-v100" - GPU_A100 = "type=nvidia-tesla-a100,count=2" GPU_H100 = "type=nvidia-h100-80gb,count=8" - # Tests for PyTorch - TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py" - - # Tests for TensorFlow - TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py" - - def assert_instance_command(self, - instance, - cmd, - timeout_in_minutes=DEFAULT_TIMEOUT): - - retry_count = 5 - - ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format( - instance, self.cluster_zone, cmd) - - while retry_count > 0: - try: - ret_code, stdout, stderr = self.assert_command( ssh_cmd, timeout_in_minutes ) - return ret_code, stdout, stderr - except Exception as e: - print("An error occurred: ", e) - retry_count -= 1 - if retry_count > 0: - time.sleep(10) - continue - else: - raise - def verify_instance(self, name): # Verify that nvidia-smi works import random @@ -56,26 +25,6 @@ def verify_instance(self, name): time.sleep( 3 + random.randint(1, 30) ) self.assert_instance_command(name, "nvidia-smi", 1) - def verify_pytorch(self, name): - test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), - self.TORCH_TEST_SCRIPT_FILE_NAME) - self.upload_test_file(test_filename, name) - - verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( - self.TORCH_TEST_SCRIPT_FILE_NAME) - self.assert_instance_command(name, verify_cmd) - self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name) - - def verify_tensorflow(self, name): - test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), - self.TF_TEST_SCRIPT_FILE_NAME) - self.upload_test_file(test_filename, name) - - verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( - self.TF_TEST_SCRIPT_FILE_NAME) - self.assert_instance_command(name, verify_cmd) - self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name) - def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index 936718498..aed5bd7e8 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -286,11 +286,24 @@ def assert_instance_command(self, Raises: AssertionError: if command returned non-0 exit code. """ + retry_count = 5 - ret_code, stdout, stderr = self.assert_command( - 'gcloud compute ssh {} --zone={} --command="{}"'.format( - instance, self.cluster_zone, cmd), timeout_in_minutes) - return ret_code, stdout, stderr + ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format( + instance, self.cluster_zone, cmd) + + while retry_count > 0: + try: + ret_code, stdout, stderr = self.assert_command( + ssh_cmd, timeout_in_minutes ) + return ret_code, stdout, stderr + except Exception as e: + print("An error occurred: ", e) + retry_count -= 1 + if retry_count > 0: + time.sleep(10) + continue + else: + raise def assert_dataproc_job(self, cluster_name, From 8143d4cef910b78ccb394b17c1663f0e4c1d95a1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 4 Jan 2025 00:23:17 -0800 Subject: [PATCH 081/130] remembered the imports ; sleep a random period --- integration_tests/dataproc_test_case.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index aed5bd7e8..e487dd8c5 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -7,6 +7,8 @@ import string import subprocess import sys +import time +import random from threading import Timer import pkg_resources @@ -300,7 +302,7 @@ def assert_instance_command(self, print("An error occurred: ", e) retry_count -= 1 if retry_count > 0: - time.sleep(10) + time.sleep( 3 + random.randint(1, 10) ) continue else: raise From 834f7d5f128719b16762869e5ee396a9a4754193 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 4 Jan 2025 01:16:07 -0800 Subject: [PATCH 082/130] A100->H100 --- gpu/test_gpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index ab2457ec2..1f3328eaa 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -247,9 +247,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.verify_pyspark() @parameterized.parameters( - ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"), -# ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"), - ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"), + ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "11.8"), +# ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.0"), + ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.4"), ) def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -278,7 +278,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, configuration, self.INIT_ACTIONS, master_machine_type="a3-highgpu-8g", - worker_machine_type="a2-highgpu-2g", + worker_machine_type="a3-highgpu-8g", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, From 3d837955662b94899e793ec5cd069d50474affd6 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 4 Jan 2025 01:25:32 -0800 Subject: [PATCH 083/130] fixing whitespace for python --- integration_tests/dataproc_test_case.py | 35 +++++++++++++------------ 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index e487dd8c5..c8a33b8bd 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -288,24 +288,25 @@ def assert_instance_command(self, Raises: AssertionError: if command returned non-0 exit code. """ - retry_count = 5 - ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format( - instance, self.cluster_zone, cmd) - - while retry_count > 0: - try: - ret_code, stdout, stderr = self.assert_command( - ssh_cmd, timeout_in_minutes ) - return ret_code, stdout, stderr - except Exception as e: - print("An error occurred: ", e) - retry_count -= 1 - if retry_count > 0: - time.sleep( 3 + random.randint(1, 10) ) - continue - else: - raise + retry_count = 5 + + ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format( + instance, self.cluster_zone, cmd) + + while retry_count > 0: + try: + ret_code, stdout, stderr = self.assert_command( + ssh_cmd, timeout_in_minutes ) + return ret_code, stdout, stderr + except Exception as e: + print("An error occurred: ", e) + retry_count -= 1 + if retry_count > 0: + time.sleep( 3 + random.randint(1, 10) ) + continue + else: + raise def assert_dataproc_job(self, cluster_name, From 7718e5abefc6689f0dde35afe44e24f3e55e7876 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 4 Jan 2025 17:08:37 -0800 Subject: [PATCH 084/130] moved knox variables to common env ; renamed ambiguous variable name --- templates/common/util_functions | 11 + templates/dask/dask.sh.in | 75 +++++ templates/dask/util_functions | 502 ++++++++++++++++++++++++++++++++ templates/rapids/rapids.sh.in | 79 +++++ 4 files changed, 667 insertions(+) create mode 100644 templates/dask/dask.sh.in create mode 100644 templates/dask/util_functions create mode 100644 templates/rapids/rapids.sh.in diff --git a/templates/common/util_functions b/templates/common/util_functions index 6d58103a7..ac4809796 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -539,6 +539,12 @@ function check_secure_boot() { mok_der=/var/lib/dkms/mok.pub ; fi } +function restart_knox() { + systemctl stop knox + rm -rf "${KNOX_HOME}/data/deployments/*" + systemctl start knox +} + function install_dependencies() { test -f "${workdir}/complete/install-dependencies" && return 0 pkg_list="screen" @@ -578,6 +584,11 @@ function prepare_common_env() { readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive + # Knox config + readonly KNOX_HOME=/usr/lib/knox + readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0" + readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0" + mkdir -p "${workdir}/complete" set_proxy mount_ramdisk diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in new file mode 100644 index 000000000..84a279f0a --- /dev/null +++ b/templates/dask/dask.sh.in @@ -0,0 +1,75 @@ +#!/bin/bash +# +[% INSERT legal/license_header %] +# +[% PROCESS common/template_disclaimer %] +# +# This initialization action script will install Dask and other relevant +# libraries on a Dataproc cluster. This is supported for either "yarn" or +# "standalone" runtimes Please see dask.org and yarn.dask.org for more +# information. + +set -euxo pipefail + +[% INSERT common/util_functions %] + +[% INSERT gpu/util_functions %] + +[% INSERT dask/util_functions %] + +function main() { + # Install Dask + install_dask + + # In "standalone" mode, Dask relies on a systemd unit to launch. + # In "yarn" mode, it relies on a config.yaml file. + if [[ "${DASK_RUNTIME}" == "yarn" ]]; then + # Create Dask YARN config file + configure_dask_yarn + elif [[ "${DASK_RUNTIME}" == "standalone" ]]; then + # Create Dask service + install_systemd_dask_service + + if [[ "$(hostname -s)" == "${MASTER}" ]]; then + systemctl start "${DASK_SCHEDULER_SERVICE}" + systemctl status "${DASK_SCHEDULER_SERVICE}" + fi + + echo "Starting Dask 'standalone' cluster..." + if [[ "${enable_worker_service}" == "1" ]]; then + systemctl start "${DASK_WORKER_SERVICE}" + systemctl status "${DASK_WORKER_SERVICE}" + fi + + configure_knox_for_dask + + local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" + if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then + configure_fluentd_for_dask + fi + else + echo "Unsupported Dask Runtime: ${DASK_RUNTIME}" + exit 1 + fi + + echo "Dask for ${DASK_RUNTIME} successfully initialized." +} + +function exit_handler() { + gpu_exit_handler + common_exit_handler + return 0 +} + +function prepare_to_install(){ + prepare_common_env + prepare_gpu_env + conda_env="$(get_metadata_attribute conda-env || echo 'dask')" + readonly conda_env + prepare_dask_env + trap exit_handler EXIT +} + +prepare_to_install + +main diff --git a/templates/dask/util_functions b/templates/dask/util_functions new file mode 100644 index 000000000..47e10a7d3 --- /dev/null +++ b/templates/dask/util_functions @@ -0,0 +1,502 @@ +function configure_dask_yarn() { + readonly DASK_YARN_CONFIG_DIR=/etc/dask/ + readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml + # Minimal custom configuration is required for this + # setup. Please see https://yarn.dask.org/en/latest/quickstart.html#usage + # for information on tuning Dask-Yarn environments. + mkdir -p "${DASK_YARN_CONFIG_DIR}" + + local worker_class="dask.distributed.Nanny" + local gpu_count="0" + if command -v nvidia-smi ; then + gpu_count="1" + worker_class="dask_cuda.CUDAWorker" + fi + + cat <"${DASK_YARN_CONFIG_FILE}" +# Config file for Dask Yarn. +# +# These values are joined on top of the default config, found at +# https://yarn.dask.org/en/latest/configuration.html#default-configuration + +yarn: + environment: python://${DASK_CONDA_ENV}/bin/python + + worker: + count: 2 + gpus: ${gpu_count} + worker_class: ${worker_class} +EOF +} + +function install_systemd_dask_worker() { + echo "Installing systemd Dask Worker service..." + local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}" + + mkdir -p "${dask_worker_local_dir}" + + local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh" + + local compute_mode_cmd="" + if command -v nvidia-smi ; then compute_mode_cmd="nvidia-smi --compute-mode=DEFAULT" ; fi + local worker_name="dask-worker" + if test -f "${DASK_CONDA_ENV}/bin/dask-cuda-worker" ; then worker_name="dask-cuda-worker" ; fi + local worker="${DASK_CONDA_ENV}/bin/${worker_name}" + cat <"${DASK_WORKER_LAUNCHER}" +#!/bin/bash +LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" +${compute_mode_cmd} +echo "${worker_name} starting, logging to \${LOGFILE}" +${worker} "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 +EOF + + chmod 750 "${DASK_WORKER_LAUNCHER}" + + local -r dask_service_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service" + cat <"${dask_service_file}" +[Unit] +Description=Dask Worker Service +[Service] +Type=simple +Restart=on-failure +ExecStart=/bin/bash -c 'exec ${DASK_WORKER_LAUNCHER}' +[Install] +WantedBy=multi-user.target +EOF + chmod a+r "${dask_service_file}" + + systemctl daemon-reload + + # Enable the service + enable_systemd_dask_worker_service="0" + if [[ "${ROLE}" != "Master" ]]; then + enable_systemd_dask_worker_service="1" + else + # Enable service on single-node cluster (no workers) + local worker_count="$(get_metadata_attribute dataproc-worker-count)" + if [[ "${worker_count}" == "0" ]] && + [[ "$(get_metadata_attribute dask-cuda-worker-on-master 'true')" == "true" ]] && + [[ "$(get_metadata_attribute dask-worker-on-master 'true')" == "true" ]] ; then + enable_systemd_dask_worker_service="1" + fi + fi + readonly enable_systemd_dask_worker_service + + if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then + systemctl enable "${DASK_WORKER_SERVICE}" + systemctl restart "${DASK_WORKER_SERVICE}" + fi +} + +function install_systemd_dask_scheduler() { + # only run scheduler on primary master + if [[ "$(hostname -s)" != "${MASTER}" ]]; then return ; fi + echo "Installing systemd Dask Scheduler service..." + local -r dask_scheduler_local_dir="/tmp/${DASK_SCHEDULER_SERVICE}" + + mkdir -p "${dask_scheduler_local_dir}" + + local DASK_SCHEDULER_LAUNCHER="/usr/local/bin/${DASK_SCHEDULER_SERVICE}-launcher.sh" + + cat <"${DASK_SCHEDULER_LAUNCHER}" +#!/bin/bash +LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log" +echo "dask scheduler starting, logging to \${LOGFILE}" +${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1 +EOF + + chmod 750 "${DASK_SCHEDULER_LAUNCHER}" + + local -r dask_service_file="/usr/lib/systemd/system/${DASK_SCHEDULER_SERVICE}.service" + cat <"${dask_service_file}" +[Unit] +Description=Dask Scheduler Service +[Service] +Type=simple +Restart=on-failure +ExecStart=/bin/bash -c 'exec ${DASK_SCHEDULER_LAUNCHER}' +[Install] +WantedBy=multi-user.target +EOF + chmod a+r "${dask_service_file}" + + systemctl daemon-reload + + # Enable the service + systemctl enable "${DASK_SCHEDULER_SERVICE}" +} + +function install_systemd_dask_service() { + install_systemd_dask_scheduler + install_systemd_dask_worker +} + +function configure_knox_for_dask() { + if [[ ! -d "${KNOX_HOME}" ]]; then + echo "Skip configuring Knox rules for Dask" + return 0 + fi + + local DASK_UI_PORT=8787 + if [[ -f /etc/knox/conf/topologies/default.xml ]]; then + sed -i \ + "/<\/topology>/i DASK<\/role>http://localhost:${DASK_UI_PORT}<\/url><\/service> DASKWS<\/role>ws:\/\/${MASTER}:${DASK_UI_PORT}<\/url><\/service>" \ + /etc/knox/conf/topologies/default.xml + fi + + mkdir -p "${KNOX_DASK_DIR}" + + cat >"${KNOX_DASK_DIR}/service.xml" <<'EOF' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +EOF + + cat >"${KNOX_DASK_DIR}/rewrite.xml" <<'EOF' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +EOF + + mkdir -p "${KNOX_DASKWS_DIR}" + + cat >"${KNOX_DASKWS_DIR}/service.xml" <<'EOF' + + + + + + + + + + + + + + + + + + + +EOF + + cat >"${KNOX_DASKWS_DIR}/rewrite.xml" <<'EOF' + + + + + + + +EOF + + chown -R knox:knox "${KNOX_DASK_DIR}" "${KNOX_DASKWS_DIR}" + + # Do not restart knox during pre-init script run + if [[ -n "${ROLE}" ]]; then + restart_knox + fi +} + +function configure_fluentd_for_dask() { + if [[ "$(hostname -s)" == "${MASTER}" ]]; then + cat >/etc/google-fluentd/config.d/dataproc-dask.conf < + @type tail + path /var/log/dask-scheduler.log + pos_file /var/tmp/fluentd.dataproc.dask.scheduler.pos + read_from_head true + tag google.dataproc.dask-scheduler + + @type none + + + + + @type record_transformer + + filename dask-scheduler.log + + +EOF + fi + + if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then + cat >>/etc/google-fluentd/config.d/dataproc-dask.conf < + @type tail + path /var/log/dask-worker.log + pos_file /var/tmp/fluentd.dataproc.dask.worker.pos + read_from_head true + tag google.dataproc.dask-worker + + @type none + + + + + @type record_transformer + + filename dask-worker.log + + +EOF + fi + + systemctl restart google-fluentd +} + +function install_dask() { + if is_cuda12 ; then + local python_spec="python>=3.11" + local cuda_spec="cuda-version>=12,<13" + local dask_spec="dask>=2024.7" + elif is_cuda11 ; then + local python_spec="python>=3.9" + local cuda_spec="cuda-version>=11,<12.0a0" + local dask_spec="dask" + fi + + CONDA_PACKAGES=() + if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then + # Pin `distributed` and `dask` package versions to old release + # because `dask-yarn` 0.9 uses skein in a way which + # is not compatible with `distributed` package 2022.2 and newer: + # https://github.com/dask/dask-yarn/issues/155 + + dask_spec="dask<2022.2" + python_spec="python>=3.7,<3.8.0a0" + if is_ubuntu18 ; then + # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic + CONDA_PACKAGES+=("fiona<1.8.22") + fi + CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2") + fi + + CONDA_PACKAGES+=( + "${cuda_spec}" + "${dask_spec}" + "dask-bigquery" + "dask-ml" + "dask-sql" + ) + + # Install dask + mamba="/opt/conda/miniconda3/bin/mamba" + conda="/opt/conda/miniconda3/bin/conda" + + ( set +e + local is_installed=0 + for installer in "${mamba}" "${conda}" ; do + test -d "${DASK_CONDA_ENV}" || \ + time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \ + -c 'conda-forge' -c 'nvidia' \ + ${CONDA_PACKAGES[*]} \ + "${python_spec}" \ + > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + sync + if [[ "$retval" == "0" ]] ; then + is_installed="1" + break + fi + "${conda}" config --set channel_priority flexible + done + if [[ "${is_installed}" == "0" ]]; then + echo "failed to install dask" + return 1 + fi + ) +} + +function install_dask_rapids() { + if is_cuda12 ; then + local python_spec="python>=3.11" + local cuda_spec="cuda-version>=12,<13" + local dask_spec="dask>=2024.7" + local numba_spec="numba" + elif is_cuda11 ; then + local python_spec="python>=3.9" + local cuda_spec="cuda-version>=11,<12.0a0" + local dask_spec="dask" + local numba_spec="numba" + fi + + rapids_spec="rapids>=${RAPIDS_VERSION}" + CONDA_PACKAGES=() + if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then + # Pin `distributed` and `dask` package versions to old release + # because `dask-yarn` 0.9 uses skein in a way which + # is not compatible with `distributed` package 2022.2 and newer: + # https://github.com/dask/dask-yarn/issues/155 + + dask_spec="dask<2022.2" + python_spec="python>=3.7,<3.8.0a0" + rapids_spec="rapids<=24.05" + if is_ubuntu18 ; then + # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic + CONDA_PACKAGES+=("fiona<1.8.22") + fi + CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2") + fi + + CONDA_PACKAGES+=( + "${cuda_spec}" + "${rapids_spec}" + "${dask_spec}" + "dask-bigquery" + "dask-ml" + "dask-sql" + "cudf" + "${numba_spec}" + ) + + # Install cuda, rapids, dask + mamba="/opt/conda/miniconda3/bin/mamba" + conda="/opt/conda/miniconda3/bin/conda" + + ( set +e + local is_installed="0" + for installer in "${mamba}" "${conda}" ; do + test -d "${DASK_CONDA_ENV}" || \ + time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \ + -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ + ${CONDA_PACKAGES[*]} \ + "${python_spec}" \ + > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + sync + if [[ "$retval" == "0" ]] ; then + is_installed="1" + break + fi + "${conda}" config --set channel_priority flexible + done + if [[ "${is_installed}" == "0" ]]; then + echo "failed to install dask" + return 1 + fi + ) +} + +function prepare_dask_env() { + # Dask config + DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')" + readonly DASK_RUNTIME + readonly DASK_SERVICE=dask-cluster + readonly DASK_WORKER_SERVICE=dask-worker + readonly DASK_SCHEDULER_SERVICE=dask-scheduler + readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/${conda_env}" +} + +function prepare_dask_rapids_env(){ + prepare_dask_env + # RAPIDS config + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') + readonly RAPIDS_RUNTIME + + local DEFAULT_DASK_RAPIDS_VERSION="24.08" + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then + DEFAULT_DASK_RAPIDS_VERSION="23.08" # Final release to support spark 3.1.3 + fi + readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) +} + + +function dask_exit_handler() { + echo "no exit handler for dask" +} diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in new file mode 100644 index 000000000..9c74f5f3f --- /dev/null +++ b/templates/rapids/rapids.sh.in @@ -0,0 +1,79 @@ +#!/bin/bash +# +[% INSERT legal/license_header %] +# +[% PROCESS common/template_disclaimer %] +# +# This initialization action script will install rapids on a Dataproc +# cluster. + +set -euxo pipefail + +[% INSERT common/util_functions %] + +[% INSERT gpu/util_functions %] + +[% INSERT dask/util_functions %] + +function main() { + # Install Dask with RAPIDS + install_dask_rapids + + # In "standalone" mode, Dask relies on a systemd unit to launch. + # In "yarn" mode, it relies a config.yaml file. + if [[ "${DASK_RUNTIME}" == "yarn" ]]; then + # Create cuda accelerated Dask YARN config file + configure_dask_yarn + else + # Create Dask service + install_systemd_dask_service + + if [[ "$(hostname -s)" == "${MASTER}" ]]; then + systemctl start "${DASK_SCHEDULER_SERVICE}" + systemctl status "${DASK_SCHEDULER_SERVICE}" + fi + + echo "Starting Dask 'standalone' cluster..." + if [[ "${enable_worker_service}" == "1" ]]; then + systemctl start "${DASK_WORKER_SERVICE}" + systemctl status "${DASK_WORKER_SERVICE}" + fi + + configure_knox_for_dask + + local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" + if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then + configure_fluentd_for_dask + fi + fi + + echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized." + if [[ "${ROLE}" == "Master" ]]; then + systemctl restart hadoop-yarn-resourcemanager.service + # Restart NodeManager on Master as well if this is a single-node-cluster. + if systemctl list-units | grep hadoop-yarn-nodemanager; then + systemctl restart hadoop-yarn-nodemanager.service + fi + else + systemctl restart hadoop-yarn-nodemanager.service + fi +} + +function exit_handler() { + gpu_exit_handler + common_exit_handler + return 0 +} + +function prepare_to_install(){ + prepare_common_env + prepare_gpu_env + conda_env="$(get_metadata_attribute conda-env || echo 'dask-rapids')" + readonly conda_env + prepare_dask_rapids_env + trap exit_handler EXIT +} + +prepare_to_install + +main From aded30b112fe39504f1a86b7896b6b893ed7b794 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 4 Jan 2025 17:20:17 -0800 Subject: [PATCH 085/130] remove gpu related code from dask action --- templates/dask/dask.sh.in | 2 - templates/dask/util_functions | 12 +- templates/rapids/util_functions | 0 templates/spark-rapids/spark-rapids.sh | 807 +++++++++++++++++++++++++ 4 files changed, 809 insertions(+), 12 deletions(-) create mode 100644 templates/rapids/util_functions create mode 100644 templates/spark-rapids/spark-rapids.sh diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index 84a279f0a..fd14be4a7 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -13,8 +13,6 @@ set -euxo pipefail [% INSERT common/util_functions %] -[% INSERT gpu/util_functions %] - [% INSERT dask/util_functions %] function main() { diff --git a/templates/dask/util_functions b/templates/dask/util_functions index 47e10a7d3..b9377b785 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -343,15 +343,8 @@ EOF } function install_dask() { - if is_cuda12 ; then - local python_spec="python>=3.11" - local cuda_spec="cuda-version>=12,<13" - local dask_spec="dask>=2024.7" - elif is_cuda11 ; then - local python_spec="python>=3.9" - local cuda_spec="cuda-version>=11,<12.0a0" - local dask_spec="dask" - fi + local python_spec="python>=3.11" + local dask_spec="dask>=2024.7" CONDA_PACKAGES=() if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then @@ -370,7 +363,6 @@ function install_dask() { fi CONDA_PACKAGES+=( - "${cuda_spec}" "${dask_spec}" "dask-bigquery" "dask-ml" diff --git a/templates/rapids/util_functions b/templates/rapids/util_functions new file mode 100644 index 000000000..e69de29bb diff --git a/templates/spark-rapids/spark-rapids.sh b/templates/spark-rapids/spark-rapids.sh new file mode 100644 index 000000000..c03bf80ef --- /dev/null +++ b/templates/spark-rapids/spark-rapids.sh @@ -0,0 +1,807 @@ +#!/bin/bash +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script installs NVIDIA GPU drivers (version 535.104.05) along with CUDA 12.2. +# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only +# Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions. +# Note that the script is designed to work when secure boot is disabled during cluster creation. +# It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu. + +set -euxo pipefail + +function os_id() { + grep '^ID=' /etc/os-release | cut -d= -f2 | xargs +} + +function os_version() { + grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs +} + +function is_debian() { + [[ "$(os_id)" == 'debian' ]] +} + +function is_debian10() { + is_debian && [[ "$(os_version)" == '10'* ]] +} + +function is_debian11() { + is_debian && [[ "$(os_version)" == '11'* ]] +} + +function is_debian12() { + is_debian && [[ "$(os_version)" == '12'* ]] +} + +function is_ubuntu() { + [[ "$(os_id)" == 'ubuntu' ]] +} + +function is_ubuntu18() { + is_ubuntu && [[ "$(os_version)" == '18.04'* ]] +} + +function is_ubuntu20() { + is_ubuntu && [[ "$(os_version)" == '20.04'* ]] +} + +function is_ubuntu22() { + is_ubuntu && [[ "$(os_version)" == '22.04'* ]] +} + +function is_rocky() { + [[ "$(os_id)" == 'rocky' ]] +} + +function is_rocky8() { + is_rocky && [[ "$(os_version)" == '8'* ]] +} + +function is_rocky9() { + is_rocky && [[ "$(os_version)" == '9'* ]] +} + +function os_vercat() { + if is_ubuntu ; then + os_version | sed -e 's/[^0-9]//g' + elif is_rocky ; then + os_version | sed -e 's/[^0-9].*$//g' + else + os_version + fi +} + +function get_metadata_attribute() { + local -r attribute_name=$1 + local -r default_value="${2:-}" + /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" +} + +CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" +PSN="$(get_metadata_attribute private_secret_name)" +readonly PSN +function configure_dkms_certs() { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping"; + return 0 + fi + + mkdir -p "${CA_TMPDIR}" + + # If the private key exists, verify it + if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then + echo "Private key material exists" + + local expected_modulus_md5sum + expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum) + if [[ -n "${expected_modulus_md5sum}" ]]; then + modulus_md5sum="${expected_modulus_md5sum}" + else + modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3" + fi + + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched rsa key modulus" + fi + ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key + + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched x509 cert modulus" + fi + + return + fi + + + # Retrieve cloud secrets keys + local sig_priv_secret_name + sig_priv_secret_name="${PSN}" + local sig_pub_secret_name + sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" + local sig_secret_project + sig_secret_project="$(get_metadata_attribute secret_project)" + local sig_secret_version + sig_secret_version="$(get_metadata_attribute secret_version)" + + # If metadata values are not set, do not write mok keys + if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi + + # Write private material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_priv_secret_name}" \ + | dd status=none of="${CA_TMPDIR}/db.rsa" + + # Write public material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_pub_secret_name}" \ + | base64 --decode \ + | dd status=none of="${CA_TMPDIR}/db.der" + + # symlink private key and copy public cert from volatile storage for DKMS + if is_ubuntu ; then + mkdir -p /var/lib/shim-signed/mok + ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv + cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der + else + mkdir -p /var/lib/dkms/ + ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key + cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub + fi +} + +function clear_dkms_key { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping" >2 + return 0 + fi + echo "WARN -- PURGING SIGNING MATERIAL -- WARN" >2 + echo "future dkms runs will not use correct signing key" >2 + rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv +} + +function add_contrib_components() { + if ! is_debian ; then + return + fi + if is_debian12 ; then + # Include in sources file components on which nvidia-open-kernel-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list + fi +} + +# Short name for nvidia urls +if is_rocky ; then + shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" +else + shortname="$(os_id)$(os_vercat)" +fi +readonly shortname + +# Detect dataproc image version from its various names +if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" +fi + +# Fetch Linux Family distro and Dataproc Image version +readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') + +# Fetch SPARK config +readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) +if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then + readonly DEFAULT_XGBOOST_VERSION="1.7.6" + readonly SPARK_VERSION="3.0" +else + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 +fi + +# Update SPARK RAPIDS config +readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0" +readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) +readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) + +# Fetch instance roles and runtime +readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) +readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) +readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + +# CUDA version and Driver version config +CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.4.1') #12.2.2 +NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '550.54.15') #535.104.05 +CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 + +# EXCEPTIONS +# Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) +if [[ "${OS_NAME}" == "ubuntu" ]]; then + if is_ubuntu18 ; then + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 + NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 + CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 + fi +fi + +# Verify Secure boot +SECURE_BOOT="disabled" +SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') + +# Stackdriver GPU agent parameters +# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver +INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') +readonly INSTALL_GPU_AGENT + +# Dataproc configurations +readonly HADOOP_CONF_DIR='/etc/hadoop/conf' +readonly HIVE_CONF_DIR='/etc/hive/conf' +readonly SPARK_CONF_DIR='/etc/spark/conf' + +NVIDIA_SMI_PATH='/usr/bin' +MIG_MAJOR_CAPS=0 +IS_MIG_ENABLED=0 + +function execute_with_retries() { + local -r cmd=$1 + for ((i = 0; i < 10; i++)); do + if time eval "$cmd"; then + return 0 + fi + sleep 5 + done + return 1 +} + +function install_spark_rapids() { + local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' + + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ + -P /usr/lib/spark/jars/ +} + +function configure_spark() { + if [[ "${SPARK_VERSION}" == "3"* ]]; then + cat >>${SPARK_CONF_DIR}/spark-defaults.conf <>${SPARK_CONF_DIR}/spark-defaults.conf </lib/systemd/system/install-headers.service +[Unit] +Description=Install Linux headers for the current kernel +After=network-online.target + +[Service] +ExecStart=/bin/bash -c 'count=0; while [ \$count -lt 3 ]; do /usr/bin/apt-get install -y -q linux-headers-\$(/bin/uname -r) && break; count=\$((count+1)); sleep 5; done' +Type=oneshot +RemainAfterExit=yes + +[Install] +WantedBy=multi-user.target +EOF + + # Reload systemd to recognize the new unit file + systemctl daemon-reload + + # Enable and start the service + systemctl enable --now install-headers.service +} + +readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' +readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" + +# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades +# Users should run apt-mark unhold before they wish to upgrade these packages +function hold_nvidia_packages() { + apt-mark hold nvidia-* + apt-mark hold libnvidia-* + if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then + apt-mark hold xserver-xorg-video-nvidia* + fi +} + +# Install NVIDIA GPU driver provided by NVIDIA +function install_nvidia_gpu_driver() { + + ## common steps for all linux family distros + readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*} + + ## For Debian & Ubuntu + readonly LOCAL_INSTALLER_DEB="cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb" + readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" + readonly DIST_KEYRING_DIR="/var/cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local" + + ## installation steps based OS + if is_debian ; then + + export DEBIAN_FRONTEND=noninteractive + + execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" + + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb + + dpkg -i /tmp/local-installer.deb + rm /tmp/local-installer.deb + cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ + + add_contrib_components + + execute_with_retries "apt-get update" + + ## EXCEPTION + if is_debian10 ; then + apt-get remove -y libglvnd0 + apt-get install -y ca-certificates-java + fi + + configure_dkms_certs + execute_with_retries "apt-get install -y -q nvidia-kernel-open-dkms" + clear_dkms_key + execute_with_retries \ + "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" + execute_with_retries \ + "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" + + modprobe nvidia + + # enable a systemd service that updates kernel headers after reboot + setup_systemd_update_headers + # prevent auto upgrading nvidia packages + hold_nvidia_packages + + elif is_ubuntu ; then + + execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" + + # Ubuntu 18.04 is not supported by new style NV debs; install from .run files + github + if is_ubuntu18 ; then + + # fetch .run file + curl -o driver.run \ + "https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run" + # Install all but kernel driver + bash driver.run --no-kernel-modules --silent --install-libglvnd + rm driver.run + + WORKDIR=/opt/install-nvidia-driver + mkdir -p "${WORKDIR}" + pushd $_ + # Fetch open souce kernel module with corresponding tag + test -d open-gpu-kernel-modules || \ + git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \ + --branch "${NVIDIA_DRIVER_VERSION}" --single-branch + cd ${WORKDIR}/open-gpu-kernel-modules + # + # build kernel modules + # + make -j$(nproc) modules \ + > /var/log/open-gpu-kernel-modules-build.log \ + 2> /var/log/open-gpu-kernel-modules-build_error.log + configure_dkms_certs + # sign + for module in $(find kernel-open -name '*.ko'); do + /lib/modules/$(uname -r)/build/scripts/sign-file sha256 \ + "${CA_TMPDIR}/db.rsa" \ + "${CA_TMPDIR}/db.der" \ + "${module}" + done + clear_dkms_key + # install + make modules_install \ + >> /var/log/open-gpu-kernel-modules-build.log \ + 2>> /var/log/open-gpu-kernel-modules-build_error.log + depmod -a + modprobe nvidia + popd + + # + # Install CUDA + # + cuda_runfile="cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \ + -o cuda.run + time bash cuda.run --silent --toolkit --no-opengl-libs + rm cuda.run + else + # Install from repo provided by NV + readonly UBUNTU_REPO_CUDA_PIN="${NVIDIA_REPO_URL}/cuda-${shortname}.pin" + + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 + + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb + + dpkg -i /tmp/local-installer.deb + rm /tmp/local-installer.deb + cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ + execute_with_retries "apt-get update" + + execute_with_retries "apt-get install -y -q --no-install-recommends dkms" + configure_dkms_certs + for pkg in "nvidia-driver-${NVIDIA_DRIVER_VERSION_PREFIX}-open" \ + "cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" \ + "cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" ; do + execute_with_retries "apt-get install -y -q --no-install-recommends ${pkg}" + done + clear_dkms_key + + modprobe nvidia + fi + + + # enable a systemd service that updates kernel headers after reboot + setup_systemd_update_headers + # prevent auto upgrading nvidia packages + hold_nvidia_packages + + elif is_rocky ; then + + # Ensure the Correct Kernel Development Packages are Installed + execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*" + execute_with_retries "dnf -y -q install pciutils kernel-devel gcc" + + readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" + execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" + execute_with_retries "dnf clean all" + configure_dkms_certs + execute_with_retries "dnf -y -q module install nvidia-driver:latest-dkms" + clear_dkms_key + execute_with_retries "dnf -y -q install cuda-toolkit" + modprobe nvidia + + else + echo "Unsupported OS: '${OS_NAME}'" + exit 1 + fi + ldconfig + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" +} + +# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics +function install_gpu_agent() { + download_agent + install_agent_dependency + start_agent_service +} + +function download_agent(){ + if [[ ${OS_NAME} == rocky ]]; then + execute_with_retries "dnf -y -q install git" + else + execute_with_retries "apt-get install git -y" + fi + mkdir -p /opt/google + chmod 777 /opt/google + cd /opt/google + test -d compute-gpu-monitoring || \ + execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" +} + +function install_agent_dependency(){ + cd /opt/google/compute-gpu-monitoring/linux + python3 -m venv venv + venv/bin/pip install wheel + venv/bin/pip install -Ur requirements.txt +} + +function start_agent_service(){ + cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system + systemctl daemon-reload + systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service +} + +function set_hadoop_property() { + local -r config_file=$1 + local -r property=$2 + local -r value=$3 + /usr/local/bin/bdconfig set_property \ + --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ + --name "${property}" --value "${value}" \ + --clobber +} + +function configure_yarn() { + if [[ ! -f ${HADOOP_CONF_DIR}/resource-types.xml ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' +} + +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' + +} + +function configure_gpu_exclusive_mode() { + # check if running spark 3, if not, enable GPU exclusive mode + local spark_version + spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) + if [[ ${spark_version} != 3.* ]]; then + # include exclusive mode on GPU + nvidia-smi -c EXCLUSIVE_PROCESS + fi +} + +function fetch_mig_scripts() { + mkdir -p /usr/local/yarn-mig-scripts + chmod 755 /usr/local/yarn-mig-scripts + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh + chmod 755 /usr/local/yarn-mig-scripts/* +} + +function configure_gpu_script() { + # Download GPU discovery script + local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' + mkdir -p ${spark_gpu_script_dir} + # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still + # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: + # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh + echo ' +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +NUM_MIG_DEVICES=$(nvidia-smi -L | grep MIG | wc -l) +ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | sed -e '\'':a'\'' -e '\''N'\'' -e'\''$!ba'\'' -e '\''s/\n/","/g'\'') +if [ $NUM_MIG_DEVICES -gt 0 ]; then + MIG_INDEX=$(( $NUM_MIG_DEVICES - 1 )) + ADDRS=$(seq -s '\''","'\'' 0 $MIG_INDEX) +fi +echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]} +' > ${spark_gpu_script_dir}/getGpusResources.sh + + chmod a+rwx -R ${spark_gpu_script_dir} +} + +function configure_gpu_isolation() { + # enable GPU isolation + sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg" + if [[ $IS_MIG_ENABLED -ne 0 ]]; then + # configure the container-executor.cfg to have major caps + printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg" + printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + else + printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HDOOP_CONF_DIR}/container-executor.cfg" + fi + + # Configure a systemd unit to ensure that permissions are set on restart + cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service< Date: Sat, 4 Jan 2025 18:02:23 -0800 Subject: [PATCH 086/130] changing failure to warning --- templates/common/util_functions | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index ac4809796..b4ef14440 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -521,13 +521,13 @@ function check_secure_boot() { readonly PSN if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then - echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." - exit 1 + echo "Error: Secure Boot is not supported on Debian before image 2.2. Consider disabling Secure Boot while creating the cluster." + return elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then echo "Secure boot is enabled, but no signing material provided." - echo "Please either disable secure boot or provide signing material as per" + echo "Consider either disabling secure boot or provide signing material as per" echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" - return 1 + return fi CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" From 0439a8df0ad203579923dcbd485a31f2bb05752b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 4 Jan 2025 18:11:09 -0800 Subject: [PATCH 087/130] removing more gpu stuff from dask --- templates/dask/dask.sh.in | 2 -- 1 file changed, 2 deletions(-) diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index fd14be4a7..20fd39619 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -54,14 +54,12 @@ function main() { } function exit_handler() { - gpu_exit_handler common_exit_handler return 0 } function prepare_to_install(){ prepare_common_env - prepare_gpu_env conda_env="$(get_metadata_attribute conda-env || echo 'dask')" readonly conda_env prepare_dask_env From 0251358290ff06d0654f7b8090c5a9346b6b2ef5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 14:47:13 -0800 Subject: [PATCH 088/130] moved MASTER global variable to common/util_functions --- templates/common/util_functions | 4 ++++ templates/spark-rapids/spark-rapids.sh.in | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index b4ef14440..accd93d3b 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -574,6 +574,10 @@ function prepare_common_env() { ROLE="$(get_metadata_attribute dataproc-role)" readonly ROLE + # master node + MASTER="$(get_metadata_attribute dataproc-master)" + readonly MASTER + workdir=/opt/install-dpgce tmpdir=/tmp/ temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 004080690..dc3ce3b36 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -60,9 +60,6 @@ function prepare_to_install(){ prepare_common_env prepare_gpu_env trap exit_handler EXIT - - # Fetch instance roles and runtime - readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) } prepare_to_install From a643c9ab56c13d6613ca997f974cb35f9353a0f1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 15:01:59 -0800 Subject: [PATCH 089/130] correct variable name --- templates/dask/dask.sh.in | 2 +- templates/rapids/rapids.sh.in | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index 20fd39619..d006ef388 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -34,7 +34,7 @@ function main() { fi echo "Starting Dask 'standalone' cluster..." - if [[ "${enable_worker_service}" == "1" ]]; then + if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then systemctl start "${DASK_WORKER_SERVICE}" systemctl status "${DASK_WORKER_SERVICE}" fi diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in index 9c74f5f3f..8c7d85529 100644 --- a/templates/rapids/rapids.sh.in +++ b/templates/rapids/rapids.sh.in @@ -34,7 +34,7 @@ function main() { fi echo "Starting Dask 'standalone' cluster..." - if [[ "${enable_worker_service}" == "1" ]]; then + if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then systemctl start "${DASK_WORKER_SERVICE}" systemctl status "${DASK_WORKER_SERVICE}" fi From d6867d99369e0a0e0afd48a67fd15df9b1858237 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 15:36:19 -0800 Subject: [PATCH 090/130] moved hold_nvidia_packages out of common environment prepare into gpu env prep ; removed accidental inclusion of original spark-rapids.sh --- templates/common/util_functions | 1 - templates/gpu/util_functions | 3 + templates/spark-rapids/spark-rapids.sh | 807 ------------------------- 3 files changed, 3 insertions(+), 808 deletions(-) delete mode 100644 templates/spark-rapids/spark-rapids.sh diff --git a/templates/common/util_functions b/templates/common/util_functions index accd93d3b..ba66d2d55 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -674,7 +674,6 @@ function common_exit_handler() { # re-hold systemd package if ge_debian12 ; then apt-mark hold systemd libsystemd0 ; fi - hold_nvidia_packages else dnf clean all fi diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 6409d3fb1..46b49ef36 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1274,6 +1274,8 @@ function prepare_gpu_env(){ # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades # Users should run apt-mark unhold before they wish to upgrade these packages function hold_nvidia_packages() { + if ! is_debuntu ; then return ; fi + apt-mark hold nvidia-* apt-mark hold libnvidia-* if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then @@ -1436,4 +1438,5 @@ function gpu_exit_handler() { fi done fi + hold_nvidia_packages } diff --git a/templates/spark-rapids/spark-rapids.sh b/templates/spark-rapids/spark-rapids.sh deleted file mode 100644 index c03bf80ef..000000000 --- a/templates/spark-rapids/spark-rapids.sh +++ /dev/null @@ -1,807 +0,0 @@ -#!/bin/bash -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS-IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script installs NVIDIA GPU drivers (version 535.104.05) along with CUDA 12.2. -# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only -# Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions. -# Note that the script is designed to work when secure boot is disabled during cluster creation. -# It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu. - -set -euxo pipefail - -function os_id() { - grep '^ID=' /etc/os-release | cut -d= -f2 | xargs -} - -function os_version() { - grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs -} - -function is_debian() { - [[ "$(os_id)" == 'debian' ]] -} - -function is_debian10() { - is_debian && [[ "$(os_version)" == '10'* ]] -} - -function is_debian11() { - is_debian && [[ "$(os_version)" == '11'* ]] -} - -function is_debian12() { - is_debian && [[ "$(os_version)" == '12'* ]] -} - -function is_ubuntu() { - [[ "$(os_id)" == 'ubuntu' ]] -} - -function is_ubuntu18() { - is_ubuntu && [[ "$(os_version)" == '18.04'* ]] -} - -function is_ubuntu20() { - is_ubuntu && [[ "$(os_version)" == '20.04'* ]] -} - -function is_ubuntu22() { - is_ubuntu && [[ "$(os_version)" == '22.04'* ]] -} - -function is_rocky() { - [[ "$(os_id)" == 'rocky' ]] -} - -function is_rocky8() { - is_rocky && [[ "$(os_version)" == '8'* ]] -} - -function is_rocky9() { - is_rocky && [[ "$(os_version)" == '9'* ]] -} - -function os_vercat() { - if is_ubuntu ; then - os_version | sed -e 's/[^0-9]//g' - elif is_rocky ; then - os_version | sed -e 's/[^0-9].*$//g' - else - os_version - fi -} - -function get_metadata_attribute() { - local -r attribute_name=$1 - local -r default_value="${2:-}" - /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" -} - -CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" -PSN="$(get_metadata_attribute private_secret_name)" -readonly PSN -function configure_dkms_certs() { - if [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping"; - return 0 - fi - - mkdir -p "${CA_TMPDIR}" - - # If the private key exists, verify it - if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then - echo "Private key material exists" - - local expected_modulus_md5sum - expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum) - if [[ -n "${expected_modulus_md5sum}" ]]; then - modulus_md5sum="${expected_modulus_md5sum}" - else - modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3" - fi - - # Verify that cert md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched rsa key modulus" - fi - ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key - - # Verify that key md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched x509 cert modulus" - fi - - return - fi - - - # Retrieve cloud secrets keys - local sig_priv_secret_name - sig_priv_secret_name="${PSN}" - local sig_pub_secret_name - sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" - local sig_secret_project - sig_secret_project="$(get_metadata_attribute secret_project)" - local sig_secret_version - sig_secret_version="$(get_metadata_attribute secret_version)" - - # If metadata values are not set, do not write mok keys - if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi - - # Write private material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_priv_secret_name}" \ - | dd status=none of="${CA_TMPDIR}/db.rsa" - - # Write public material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_pub_secret_name}" \ - | base64 --decode \ - | dd status=none of="${CA_TMPDIR}/db.der" - - # symlink private key and copy public cert from volatile storage for DKMS - if is_ubuntu ; then - mkdir -p /var/lib/shim-signed/mok - ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv - cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der - else - mkdir -p /var/lib/dkms/ - ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key - cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub - fi -} - -function clear_dkms_key { - if [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping" >2 - return 0 - fi - echo "WARN -- PURGING SIGNING MATERIAL -- WARN" >2 - echo "future dkms runs will not use correct signing key" >2 - rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv -} - -function add_contrib_components() { - if ! is_debian ; then - return - fi - if is_debian12 ; then - # Include in sources file components on which nvidia-open-kernel-dkms depends - local -r debian_sources="/etc/apt/sources.list.d/debian.sources" - local components="main contrib" - - sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" - elif is_debian ; then - sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list - fi -} - -# Short name for nvidia urls -if is_rocky ; then - shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" -else - shortname="$(os_id)$(os_vercat)" -fi -readonly shortname - -# Detect dataproc image version from its various names -if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" -fi - -# Fetch Linux Family distro and Dataproc Image version -readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') - -# Fetch SPARK config -readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then - readonly DEFAULT_XGBOOST_VERSION="1.7.6" - readonly SPARK_VERSION="3.0" -else - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." - exit 1 -fi - -# Update SPARK RAPIDS config -readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0" -readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) -readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) - -# Fetch instance roles and runtime -readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) -readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master) -readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - -# CUDA version and Driver version config -CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.4.1') #12.2.2 -NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '550.54.15') #535.104.05 -CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 - -# EXCEPTIONS -# Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) -if [[ "${OS_NAME}" == "ubuntu" ]]; then - if is_ubuntu18 ; then - CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 - NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 - CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 - fi -fi - -# Verify Secure boot -SECURE_BOOT="disabled" -SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') - -# Stackdriver GPU agent parameters -# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver -INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') -readonly INSTALL_GPU_AGENT - -# Dataproc configurations -readonly HADOOP_CONF_DIR='/etc/hadoop/conf' -readonly HIVE_CONF_DIR='/etc/hive/conf' -readonly SPARK_CONF_DIR='/etc/spark/conf' - -NVIDIA_SMI_PATH='/usr/bin' -MIG_MAJOR_CAPS=0 -IS_MIG_ENABLED=0 - -function execute_with_retries() { - local -r cmd=$1 - for ((i = 0; i < 10; i++)); do - if time eval "$cmd"; then - return 0 - fi - sleep 5 - done - return 1 -} - -function install_spark_rapids() { - local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' - local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ - -P /usr/lib/spark/jars/ -} - -function configure_spark() { - if [[ "${SPARK_VERSION}" == "3"* ]]; then - cat >>${SPARK_CONF_DIR}/spark-defaults.conf <>${SPARK_CONF_DIR}/spark-defaults.conf </lib/systemd/system/install-headers.service -[Unit] -Description=Install Linux headers for the current kernel -After=network-online.target - -[Service] -ExecStart=/bin/bash -c 'count=0; while [ \$count -lt 3 ]; do /usr/bin/apt-get install -y -q linux-headers-\$(/bin/uname -r) && break; count=\$((count+1)); sleep 5; done' -Type=oneshot -RemainAfterExit=yes - -[Install] -WantedBy=multi-user.target -EOF - - # Reload systemd to recognize the new unit file - systemctl daemon-reload - - # Enable and start the service - systemctl enable --now install-headers.service -} - -readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' -readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" - -# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades -# Users should run apt-mark unhold before they wish to upgrade these packages -function hold_nvidia_packages() { - apt-mark hold nvidia-* - apt-mark hold libnvidia-* - if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then - apt-mark hold xserver-xorg-video-nvidia* - fi -} - -# Install NVIDIA GPU driver provided by NVIDIA -function install_nvidia_gpu_driver() { - - ## common steps for all linux family distros - readonly NVIDIA_DRIVER_VERSION_PREFIX=${NVIDIA_DRIVER_VERSION%%.*} - - ## For Debian & Ubuntu - readonly LOCAL_INSTALLER_DEB="cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local_${CUDA_VERSION}-${NVIDIA_DRIVER_VERSION}-1_amd64.deb" - readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" - readonly DIST_KEYRING_DIR="/var/cuda-repo-${shortname}-${CUDA_VERSION_MAJOR//./-}-local" - - ## installation steps based OS - if is_debian ; then - - export DEBIAN_FRONTEND=noninteractive - - execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb - - dpkg -i /tmp/local-installer.deb - rm /tmp/local-installer.deb - cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ - - add_contrib_components - - execute_with_retries "apt-get update" - - ## EXCEPTION - if is_debian10 ; then - apt-get remove -y libglvnd0 - apt-get install -y ca-certificates-java - fi - - configure_dkms_certs - execute_with_retries "apt-get install -y -q nvidia-kernel-open-dkms" - clear_dkms_key - execute_with_retries \ - "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" - execute_with_retries \ - "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" - - modprobe nvidia - - # enable a systemd service that updates kernel headers after reboot - setup_systemd_update_headers - # prevent auto upgrading nvidia packages - hold_nvidia_packages - - elif is_ubuntu ; then - - execute_with_retries "apt-get install -y -q 'linux-headers-$(uname -r)'" - - # Ubuntu 18.04 is not supported by new style NV debs; install from .run files + github - if is_ubuntu18 ; then - - # fetch .run file - curl -o driver.run \ - "https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run" - # Install all but kernel driver - bash driver.run --no-kernel-modules --silent --install-libglvnd - rm driver.run - - WORKDIR=/opt/install-nvidia-driver - mkdir -p "${WORKDIR}" - pushd $_ - # Fetch open souce kernel module with corresponding tag - test -d open-gpu-kernel-modules || \ - git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \ - --branch "${NVIDIA_DRIVER_VERSION}" --single-branch - cd ${WORKDIR}/open-gpu-kernel-modules - # - # build kernel modules - # - make -j$(nproc) modules \ - > /var/log/open-gpu-kernel-modules-build.log \ - 2> /var/log/open-gpu-kernel-modules-build_error.log - configure_dkms_certs - # sign - for module in $(find kernel-open -name '*.ko'); do - /lib/modules/$(uname -r)/build/scripts/sign-file sha256 \ - "${CA_TMPDIR}/db.rsa" \ - "${CA_TMPDIR}/db.der" \ - "${module}" - done - clear_dkms_key - # install - make modules_install \ - >> /var/log/open-gpu-kernel-modules-build.log \ - 2>> /var/log/open-gpu-kernel-modules-build_error.log - depmod -a - modprobe nvidia - popd - - # - # Install CUDA - # - cuda_runfile="cuda_${CUDA_VERSION}_${NVIDIA_DRIVER_VERSION}_linux.run" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \ - -o cuda.run - time bash cuda.run --silent --toolkit --no-opengl-libs - rm cuda.run - else - # Install from repo provided by NV - readonly UBUNTU_REPO_CUDA_PIN="${NVIDIA_REPO_URL}/cuda-${shortname}.pin" - - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 - - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${LOCAL_DEB_URL}" -o /tmp/local-installer.deb - - dpkg -i /tmp/local-installer.deb - rm /tmp/local-installer.deb - cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ - execute_with_retries "apt-get update" - - execute_with_retries "apt-get install -y -q --no-install-recommends dkms" - configure_dkms_certs - for pkg in "nvidia-driver-${NVIDIA_DRIVER_VERSION_PREFIX}-open" \ - "cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}" \ - "cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" ; do - execute_with_retries "apt-get install -y -q --no-install-recommends ${pkg}" - done - clear_dkms_key - - modprobe nvidia - fi - - - # enable a systemd service that updates kernel headers after reboot - setup_systemd_update_headers - # prevent auto upgrading nvidia packages - hold_nvidia_packages - - elif is_rocky ; then - - # Ensure the Correct Kernel Development Packages are Installed - execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*" - execute_with_retries "dnf -y -q install pciutils kernel-devel gcc" - - readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" - execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" - execute_with_retries "dnf clean all" - configure_dkms_certs - execute_with_retries "dnf -y -q module install nvidia-driver:latest-dkms" - clear_dkms_key - execute_with_retries "dnf -y -q install cuda-toolkit" - modprobe nvidia - - else - echo "Unsupported OS: '${OS_NAME}'" - exit 1 - fi - ldconfig - echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" -} - -# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics -function install_gpu_agent() { - download_agent - install_agent_dependency - start_agent_service -} - -function download_agent(){ - if [[ ${OS_NAME} == rocky ]]; then - execute_with_retries "dnf -y -q install git" - else - execute_with_retries "apt-get install git -y" - fi - mkdir -p /opt/google - chmod 777 /opt/google - cd /opt/google - test -d compute-gpu-monitoring || \ - execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" -} - -function install_agent_dependency(){ - cd /opt/google/compute-gpu-monitoring/linux - python3 -m venv venv - venv/bin/pip install wheel - venv/bin/pip install -Ur requirements.txt -} - -function start_agent_service(){ - cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system - systemctl daemon-reload - systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service -} - -function set_hadoop_property() { - local -r config_file=$1 - local -r property=$2 - local -r value=$3 - /usr/local/bin/bdconfig set_property \ - --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ - --name "${property}" --value "${value}" \ - --clobber -} - -function configure_yarn() { - if [[ ! -f ${HADOOP_CONF_DIR}/resource-types.xml ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} - -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - -} - -function configure_gpu_exclusive_mode() { - # check if running spark 3, if not, enable GPU exclusive mode - local spark_version - spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) - if [[ ${spark_version} != 3.* ]]; then - # include exclusive mode on GPU - nvidia-smi -c EXCLUSIVE_PROCESS - fi -} - -function fetch_mig_scripts() { - mkdir -p /usr/local/yarn-mig-scripts - chmod 755 /usr/local/yarn-mig-scripts - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - chmod 755 /usr/local/yarn-mig-scripts/* -} - -function configure_gpu_script() { - # Download GPU discovery script - local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' - mkdir -p ${spark_gpu_script_dir} - # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still - # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: - # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh - echo ' -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -NUM_MIG_DEVICES=$(nvidia-smi -L | grep MIG | wc -l) -ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | sed -e '\'':a'\'' -e '\''N'\'' -e'\''$!ba'\'' -e '\''s/\n/","/g'\'') -if [ $NUM_MIG_DEVICES -gt 0 ]; then - MIG_INDEX=$(( $NUM_MIG_DEVICES - 1 )) - ADDRS=$(seq -s '\''","'\'' 0 $MIG_INDEX) -fi -echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]} -' > ${spark_gpu_script_dir}/getGpusResources.sh - - chmod a+rwx -R ${spark_gpu_script_dir} -} - -function configure_gpu_isolation() { - # enable GPU isolation - sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg" - if [[ $IS_MIG_ENABLED -ne 0 ]]; then - # configure the container-executor.cfg to have major caps - printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg" - printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" - printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" - else - printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HDOOP_CONF_DIR}/container-executor.cfg" - fi - - # Configure a systemd unit to ensure that permissions are set on restart - cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service< Date: Sun, 5 Jan 2025 16:29:55 -0800 Subject: [PATCH 091/130] added comments and timing collection --- templates/dask/dask.sh.in | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index d006ef388..2e23c9d6d 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -28,17 +28,21 @@ function main() { # Create Dask service install_systemd_dask_service + # only run scheduler on primary master if [[ "$(hostname -s)" == "${MASTER}" ]]; then - systemctl start "${DASK_SCHEDULER_SERVICE}" + date + time systemctl start "${DASK_SCHEDULER_SERVICE}" systemctl status "${DASK_SCHEDULER_SERVICE}" fi echo "Starting Dask 'standalone' cluster..." if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then - systemctl start "${DASK_WORKER_SERVICE}" + date + time systemctl start "${DASK_WORKER_SERVICE}" systemctl status "${DASK_WORKER_SERVICE}" fi + date configure_knox_for_dask local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" From 1ab3f8d5f860cb7daa451831207bb8b18eca1887 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 16:30:38 -0800 Subject: [PATCH 092/130] no need to consider unsupported dataproc < 2.0 image versions ; reducing instance type a little --- dask/test_dask.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dask/test_dask.py b/dask/test_dask.py index 440493511..1126d7d80 100644 --- a/dask/test_dask.py +++ b/dask/test_dask.py @@ -56,16 +56,13 @@ def _run_dask_test_script(self, name, script): ) def test_dask(self, configuration, instances, runtime): - if self.getImageVersion() < pkg_resources.parse_version("2.0"): - self.skipTest("Not supported in pre-2.0 images") - metadata = None if runtime: metadata = "dask-runtime={}".format(runtime) self.createCluster(configuration, self.INIT_ACTIONS, - machine_type='n1-standard-16', + machine_type='n1-highmem-8', metadata=metadata, timeout_in_minutes=20) From 598b6907b4d622212c6bb432eb33aee9d3812c0b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 16:31:18 -0800 Subject: [PATCH 093/130] using "dask-scheduler" instead of "dask scheduler" --- templates/dask/util_functions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/dask/util_functions b/templates/dask/util_functions index b9377b785..1b459a546 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -102,7 +102,7 @@ function install_systemd_dask_scheduler() { #!/bin/bash LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log" echo "dask scheduler starting, logging to \${LOGFILE}" -${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1 +${DASK_CONDA_ENV}/bin/dask-scheduler >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_SCHEDULER_LAUNCHER}" From 81c7d28b06330d321558b42aac734fc29fbc9b3f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 16:51:25 -0800 Subject: [PATCH 094/130] wait for dask scheduler before starting worker --- templates/dask/dask.sh.in | 7 +++++++ templates/dask/util_functions | 6 +++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index 2e23c9d6d..d619f9f88 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -38,6 +38,13 @@ function main() { echo "Starting Dask 'standalone' cluster..." if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then date + # Pause while scheduler comes online + retries=30 + while ! nc -vz cluster-1718310842-m 8786 ; do + sleep 3s + ((retries--) + if [[ "${retries}" == "0" ]]; then echo "dask scheduler unreachable" ; exit 1 ; fi + fi time systemctl start "${DASK_WORKER_SERVICE}" systemctl status "${DASK_WORKER_SERVICE}" fi diff --git a/templates/dask/util_functions b/templates/dask/util_functions index 1b459a546..f7fe507d1 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -39,15 +39,15 @@ function install_systemd_dask_worker() { local compute_mode_cmd="" if command -v nvidia-smi ; then compute_mode_cmd="nvidia-smi --compute-mode=DEFAULT" ; fi - local worker_name="dask-worker" - if test -f "${DASK_CONDA_ENV}/bin/dask-cuda-worker" ; then worker_name="dask-cuda-worker" ; fi + local worker_name="dask worker" + if test -f "${DASK_CONDA_ENV}/bin/dask-cuda-worker" ; then worker_name="dask-cuda worker" ; fi local worker="${DASK_CONDA_ENV}/bin/${worker_name}" cat <"${DASK_WORKER_LAUNCHER}" #!/bin/bash LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" ${compute_mode_cmd} echo "${worker_name} starting, logging to \${LOGFILE}" -${worker} "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 +${worker} --local-directory="${dask_worker_local_dir}" --memory-limit=auto "${MASTER}:8786" >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_WORKER_LAUNCHER}" From 48906e1862105a36bd803292f1748189b47f5d26 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 16:54:00 -0800 Subject: [PATCH 095/130] using variable instead of my own cluster master name --- templates/dask/dask.sh.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index d619f9f88..9d82c5063 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -40,7 +40,7 @@ function main() { date # Pause while scheduler comes online retries=30 - while ! nc -vz cluster-1718310842-m 8786 ; do + while ! nc -vz "${MASTER}" 8786 ; do sleep 3s ((retries--) if [[ "${retries}" == "0" ]]; then echo "dask scheduler unreachable" ; exit 1 ; fi From 510e5202972b11ee43cdece00925abc1513c9734 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 17:25:21 -0800 Subject: [PATCH 096/130] corrected syntax errors ; dump log on service failure --- templates/dask/dask.sh.in | 14 +++++++++++--- templates/dask/util_functions | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index 9d82c5063..b64482aeb 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -32,6 +32,10 @@ function main() { if [[ "$(hostname -s)" == "${MASTER}" ]]; then date time systemctl start "${DASK_SCHEDULER_SERVICE}" + local substate_val="$(systemctl show ${DASK_SCHEDULER_SERVICE} -p SubState --value)" + if [[ "${substate_val}" != 'running' ]] ; then + cat "/var/log/${DASK_SCHEDULER_SERVICE}.log" + fi systemctl status "${DASK_SCHEDULER_SERVICE}" fi @@ -42,10 +46,14 @@ function main() { retries=30 while ! nc -vz "${MASTER}" 8786 ; do sleep 3s - ((retries--) - if [[ "${retries}" == "0" ]]; then echo "dask scheduler unreachable" ; exit 1 ; fi - fi + ((retries--)) + if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi + done time systemctl start "${DASK_WORKER_SERVICE}" + local substate_val="$(systemctl show ${DASK_WORKER_SERVICE} -p SubState --value)" + if [[ "${substate_val}" != 'running' ]] ; then + cat "/var/log/${DASK_WORKER_SERVICE}.log" + fi systemctl status "${DASK_WORKER_SERVICE}" fi diff --git a/templates/dask/util_functions b/templates/dask/util_functions index f7fe507d1..fca23a74b 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -102,7 +102,7 @@ function install_systemd_dask_scheduler() { #!/bin/bash LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log" echo "dask scheduler starting, logging to \${LOGFILE}" -${DASK_CONDA_ENV}/bin/dask-scheduler >> "\${LOGFILE}" 2>&1 +${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_SCHEDULER_LAUNCHER}" From 9e9f87266f95d9eb8e2da23fcf67f8c1b6eb5a8e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 18:19:38 -0800 Subject: [PATCH 097/130] refactored some common code ; setting default value for metadata attribute correctly --- templates/dask/dask.sh.in | 33 ++------------------------------- templates/dask/util_functions | 35 ++++++++++++++++++++++++++++++++++- templates/rapids/rapids.sh.in | 16 +++------------- 3 files changed, 39 insertions(+), 45 deletions(-) diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index b64482aeb..dd2932042 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -27,37 +27,8 @@ function main() { elif [[ "${DASK_RUNTIME}" == "standalone" ]]; then # Create Dask service install_systemd_dask_service + start_systemd_dask_service - # only run scheduler on primary master - if [[ "$(hostname -s)" == "${MASTER}" ]]; then - date - time systemctl start "${DASK_SCHEDULER_SERVICE}" - local substate_val="$(systemctl show ${DASK_SCHEDULER_SERVICE} -p SubState --value)" - if [[ "${substate_val}" != 'running' ]] ; then - cat "/var/log/${DASK_SCHEDULER_SERVICE}.log" - fi - systemctl status "${DASK_SCHEDULER_SERVICE}" - fi - - echo "Starting Dask 'standalone' cluster..." - if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then - date - # Pause while scheduler comes online - retries=30 - while ! nc -vz "${MASTER}" 8786 ; do - sleep 3s - ((retries--)) - if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi - done - time systemctl start "${DASK_WORKER_SERVICE}" - local substate_val="$(systemctl show ${DASK_WORKER_SERVICE} -p SubState --value)" - if [[ "${substate_val}" != 'running' ]] ; then - cat "/var/log/${DASK_WORKER_SERVICE}.log" - fi - systemctl status "${DASK_WORKER_SERVICE}" - fi - - date configure_knox_for_dask local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" @@ -79,7 +50,7 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env - conda_env="$(get_metadata_attribute conda-env || echo 'dask')" + conda_env="$(get_metadata_attribute conda-env 'dask')" readonly conda_env prepare_dask_env trap exit_handler EXIT diff --git a/templates/dask/util_functions b/templates/dask/util_functions index fca23a74b..54066b984 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -40,7 +40,7 @@ function install_systemd_dask_worker() { local compute_mode_cmd="" if command -v nvidia-smi ; then compute_mode_cmd="nvidia-smi --compute-mode=DEFAULT" ; fi local worker_name="dask worker" - if test -f "${DASK_CONDA_ENV}/bin/dask-cuda-worker" ; then worker_name="dask-cuda worker" ; fi + if test -f "${DASK_CONDA_ENV}/bin/dask-cuda" ; then worker_name="dask-cuda worker" ; fi local worker="${DASK_CONDA_ENV}/bin/${worker_name}" cat <"${DASK_WORKER_LAUNCHER}" #!/bin/bash @@ -131,6 +131,39 @@ function install_systemd_dask_service() { install_systemd_dask_worker } +function start_systemd_dask_service() { + # only run scheduler on primary master + if [[ "$(hostname -s)" == "${MASTER}" ]]; then + date + time systemctl start "${DASK_SCHEDULER_SERVICE}" + local substate_val="$(systemctl show ${DASK_SCHEDULER_SERVICE} -p SubState --value)" + if [[ "${substate_val}" != 'running' ]] ; then + cat "/var/log/${DASK_SCHEDULER_SERVICE}.log" + fi + systemctl status "${DASK_SCHEDULER_SERVICE}" + fi + + echo "Starting Dask 'standalone' cluster..." + if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then + date + # Pause while scheduler comes online + retries=30 + while ! nc -vz "${MASTER}" 8786 ; do + sleep 3s + ((retries--)) + if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi + done + time systemctl start "${DASK_WORKER_SERVICE}" + local substate_val="$(systemctl show ${DASK_WORKER_SERVICE} -p SubState --value)" + if [[ "${substate_val}" != 'running' ]] ; then + cat "/var/log/${DASK_WORKER_SERVICE}.log" + fi + systemctl status "${DASK_WORKER_SERVICE}" + fi + + date +} + function configure_knox_for_dask() { if [[ ! -d "${KNOX_HOME}" ]]; then echo "Skip configuring Knox rules for Dask" diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in index 8c7d85529..a63f44b3b 100644 --- a/templates/rapids/rapids.sh.in +++ b/templates/rapids/rapids.sh.in @@ -27,21 +27,11 @@ function main() { else # Create Dask service install_systemd_dask_service - - if [[ "$(hostname -s)" == "${MASTER}" ]]; then - systemctl start "${DASK_SCHEDULER_SERVICE}" - systemctl status "${DASK_SCHEDULER_SERVICE}" - fi - - echo "Starting Dask 'standalone' cluster..." - if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then - systemctl start "${DASK_WORKER_SERVICE}" - systemctl status "${DASK_WORKER_SERVICE}" - fi + start_systemd_dask_service configure_knox_for_dask - local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" + local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging 'false')" if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then configure_fluentd_for_dask fi @@ -68,7 +58,7 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env prepare_gpu_env - conda_env="$(get_metadata_attribute conda-env || echo 'dask-rapids')" + conda_env="$(get_metadata_attribute conda-env 'dask-rapids')" readonly conda_env prepare_dask_rapids_env trap exit_handler EXIT From 7480a23b0ac4a72303163466b5f501dab0043b61 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 22:26:38 -0800 Subject: [PATCH 098/130] added new function is_ramdisk ; keeping conda cache in its own directory ; same for pip cache ; refactored pip setup and teardown --- templates/common/util_functions | 66 ++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index ba66d2d55..a52001db3 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -319,6 +319,20 @@ function set_proxy(){ export NO_PROXY="${no_proxy}" } +function is_ramdisk() { + if [[ "${1:-}" == "-f" ]] ; then unset IS_RAMDISK ; fi + if ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "true" ) ; then return 0 + elif ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "false" ) ; then return 1 ; fi + + if ( test -d /mnt/shm && grep -q /mnt/shm /proc/mounts ) ; then + IS_RAMDISK="true" + return 0 + else + IS_RAMDISK="false" + return 1 + fi +} + function mount_ramdisk(){ local free_mem free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" @@ -327,18 +341,11 @@ function mount_ramdisk(){ # Write to a ramdisk instead of churning the persistent disk tmpdir="/mnt/shm" - mkdir -p "${tmpdir}" + mkdir -p "${tmpdir}/pkgs_dirs" mount -t tmpfs tmpfs "${tmpdir}" # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" - - # Clear pip cache - # TODO: make this conditional on which OSs have pip without cache purge - pip cache purge || echo "unable to purge pip cache" - - # Download pip packages to tmpfs - pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}/pkgs_dirs" # Download OS packages to tmpfs if is_debuntu ; then @@ -346,6 +353,7 @@ function mount_ramdisk(){ else mount -t tmpfs tmpfs /var/cache/dnf fi + is_ramdisk -f } function check_os() { @@ -553,6 +561,21 @@ function install_dependencies() { touch "${workdir}/complete/install-dependencies" } +function prepare_pip_env() { + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge + test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv" + source "${tmpdir}/python-venv/bin/activate" + + pip cache purge || echo "unable to purge pip cache" + if is_ramdisk ; then + # Download pip packages to tmpfs + mkdir -p "${tmpdir}/cache-dir" + pip config set global.cache-dir "${tmpdir}/cache-dir" || echo "unable to set global.cache-dir" + fi +} + + function prepare_common_env() { define_os_comparison_functions @@ -590,8 +613,6 @@ function prepare_common_env() { # Knox config readonly KNOX_HOME=/usr/lib/knox - readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0" - readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0" mkdir -p "${workdir}/complete" set_proxy @@ -636,13 +657,17 @@ function prepare_common_env() { touch "${workdir}/complete/prepare.common" } +function pip_exit_handler() { + if is_ramdisk ; then + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + fi +} + function common_exit_handler() { set +ex echo "Exit handler invoked" - # Clear pip cache - pip cache purge || echo "unable to purge pip cache" - # Restart YARN services if they are running already for svc in resourcemanager nodemanager; do if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then @@ -653,9 +678,6 @@ function common_exit_handler() { # If system memory was sufficient to mount memory-backed filesystems if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to unset global pip cache" - # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then @@ -678,6 +700,7 @@ function common_exit_handler() { dnf clean all fi + # When creating image, print disk usage statistics, zero unused disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then # print disk usage statistics for large components if is_ubuntu ; then @@ -719,11 +742,12 @@ function common_exit_handler() { '@siz=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting}; print( " samples-taken: ", scalar @siz, $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" + "starting-disk-used: $starting", $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" # zero free disk space From 6b73d22f1fc64b7cf9e80c6f115d1964d9283b59 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 22:28:13 -0800 Subject: [PATCH 099/130] calling functions from refactored pip setup/teardown --- templates/dask/dask.sh.in | 2 ++ templates/gpu/install_gpu_driver.sh.in | 2 ++ templates/spark-rapids/spark-rapids.sh.in | 2 ++ 3 files changed, 6 insertions(+) diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index dd2932042..b0279160f 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -44,12 +44,14 @@ function main() { } function exit_handler() { + pip_exit_handler common_exit_handler return 0 } function prepare_to_install(){ prepare_common_env + prepare_pip_env conda_env="$(get_metadata_attribute conda-env 'dask')" readonly conda_env prepare_dask_env diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index ffdda45e4..0e27f1086 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -39,12 +39,14 @@ function main() { function exit_handler() { gpu_exit_handler + pip_exit_handler common_exit_handler return 0 } function prepare_to_install(){ prepare_common_env + prepare_pip_env prepare_gpu_env trap exit_handler EXIT } diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index dc3ce3b36..1467fedf9 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -52,12 +52,14 @@ function main() { function exit_handler() { gpu_exit_handler + pip_exit_handler common_exit_handler return 0 } function prepare_to_install(){ prepare_common_env + prepare_pip_env prepare_gpu_env trap exit_handler EXIT } From 2e45a7553fe45029ec8816995f1f04a9ca0d0a1a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 22:29:21 -0800 Subject: [PATCH 100/130] moved knox dask config to templates/dask/util_functions --- templates/dask/util_functions | 3 +++ 1 file changed, 3 insertions(+) diff --git a/templates/dask/util_functions b/templates/dask/util_functions index 54066b984..a2863ec8b 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -506,6 +506,9 @@ function prepare_dask_env() { readonly DASK_WORKER_SERVICE=dask-worker readonly DASK_SCHEDULER_SERVICE=dask-scheduler readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/${conda_env}" + # Knox dask config + readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0" + readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0" } function prepare_dask_rapids_env(){ From 33fdd38029f0379ced2ad2a53d1b3ff94a809ace Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 22:29:51 -0800 Subject: [PATCH 101/130] added copyright to templates/legal/license_header --- templates/legal/license_header | 2 ++ 1 file changed, 2 insertions(+) diff --git a/templates/legal/license_header b/templates/legal/license_header index 4c05ecc74..0230ca951 100644 --- a/templates/legal/license_header +++ b/templates/legal/license_header @@ -1,3 +1,5 @@ +# Copyright 2015 Google LLC and contributors +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at From 4f974c5aaada2809bf80fb86fcbc440428e41900 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 22:30:53 -0800 Subject: [PATCH 102/130] latest generated action --- gpu/install_gpu_driver.sh | 124 ++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 46 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 59a592d30..91ad4ede0 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1,5 +1,7 @@ #!/bin/bash # +# Copyright 2015 Google LLC and contributors +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -345,6 +347,20 @@ function set_proxy(){ export NO_PROXY="${no_proxy}" } +function is_ramdisk() { + if [[ "${1:-}" == "-f" ]] ; then unset IS_RAMDISK ; fi + if ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "true" ) ; then return 0 + elif ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "false" ) ; then return 1 ; fi + + if ( test -d /mnt/shm && grep -q /mnt/shm /proc/mounts ) ; then + IS_RAMDISK="true" + return 0 + else + IS_RAMDISK="false" + return 1 + fi +} + function mount_ramdisk(){ local free_mem free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" @@ -353,18 +369,11 @@ function mount_ramdisk(){ # Write to a ramdisk instead of churning the persistent disk tmpdir="/mnt/shm" - mkdir -p "${tmpdir}" + mkdir -p "${tmpdir}/pkgs_dirs" mount -t tmpfs tmpfs "${tmpdir}" # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" - - # Clear pip cache - # TODO: make this conditional on which OSs have pip without cache purge - pip cache purge || echo "unable to purge pip cache" - - # Download pip packages to tmpfs - pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}/pkgs_dirs" # Download OS packages to tmpfs if is_debuntu ; then @@ -372,6 +381,7 @@ function mount_ramdisk(){ else mount -t tmpfs tmpfs /var/cache/dnf fi + is_ramdisk -f } function check_os() { @@ -547,13 +557,13 @@ function check_secure_boot() { readonly PSN if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then - echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." - exit 1 + echo "Error: Secure Boot is not supported on Debian before image 2.2. Consider disabling Secure Boot while creating the cluster." + return elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then echo "Secure boot is enabled, but no signing material provided." - echo "Please either disable secure boot or provide signing material as per" + echo "Consider either disabling secure boot or provide signing material as per" echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" - return 1 + return fi CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" @@ -565,6 +575,12 @@ function check_secure_boot() { mok_der=/var/lib/dkms/mok.pub ; fi } +function restart_knox() { + systemctl stop knox + rm -rf "${KNOX_HOME}/data/deployments/*" + systemctl start knox +} + function install_dependencies() { test -f "${workdir}/complete/install-dependencies" && return 0 pkg_list="screen" @@ -573,6 +589,21 @@ function install_dependencies() { touch "${workdir}/complete/install-dependencies" } +function prepare_pip_env() { + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge + test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv" + source "${tmpdir}/python-venv/bin/activate" + + pip cache purge || echo "unable to purge pip cache" + if is_ramdisk ; then + # Download pip packages to tmpfs + mkdir -p "${tmpdir}/cache-dir" + pip config set global.cache-dir "${tmpdir}/cache-dir" || echo "unable to set global.cache-dir" + fi +} + + function prepare_common_env() { define_os_comparison_functions @@ -594,6 +625,10 @@ function prepare_common_env() { ROLE="$(get_metadata_attribute dataproc-role)" readonly ROLE + # master node + MASTER="$(get_metadata_attribute dataproc-master)" + readonly MASTER + workdir=/opt/install-dpgce tmpdir=/tmp/ temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" @@ -604,6 +639,9 @@ function prepare_common_env() { readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive + # Knox config + readonly KNOX_HOME=/usr/lib/knox + mkdir -p "${workdir}/complete" set_proxy mount_ramdisk @@ -647,13 +685,17 @@ function prepare_common_env() { touch "${workdir}/complete/prepare.common" } +function pip_exit_handler() { + if is_ramdisk ; then + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + fi +} + function common_exit_handler() { set +ex echo "Exit handler invoked" - # Clear pip cache - pip cache purge || echo "unable to purge pip cache" - # Restart YARN services if they are running already for svc in resourcemanager nodemanager; do if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then @@ -664,9 +706,6 @@ function common_exit_handler() { # If system memory was sufficient to mount memory-backed filesystems if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to unset global pip cache" - # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then @@ -685,11 +724,11 @@ function common_exit_handler() { # re-hold systemd package if ge_debian12 ; then apt-mark hold systemd libsystemd0 ; fi - hold_nvidia_packages else dnf clean all fi + # When creating image, print disk usage statistics, zero unused disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then # print disk usage statistics for large components if is_ubuntu ; then @@ -731,11 +770,12 @@ function common_exit_handler() { '@siz=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting}; print( " samples-taken: ", scalar @siz, $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" + "starting-disk-used: $starting", $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" # zero free disk space @@ -802,6 +842,15 @@ function set_support_matrix() { set_support_matrix function set_cuda_version() { + case "${DATAPROC_IMAGE_VERSION}" in + "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) + "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; + "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; + * ) + echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" + exit 1 + ;; + esac local cuda_url cuda_url=$(get_metadata_attribute 'cuda-url' '') if [[ -n "${cuda_url}" ]] ; then @@ -810,29 +859,8 @@ function set_cuda_version() { CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" - CUDA_FULL_VERSION="${CUDA_URL_VERSION}" fi fi - - if ( ! test -v DEFAULT_CUDA_VERSION ) ; then - DEFAULT_CUDA_VERSION='12.4.1' - fi - # EXCEPTIONS - # Change default CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) - case "${DATAPROC_IMAGE_VERSION}" in - "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; - "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; - "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; - * ) - echo "unrecognized Dataproc image version" - exit 1 - ;; - esac - - if le_ubuntu18 ; then - DEFAULT_CUDA_VERSION="12.1.1" - CUDA_VERSION_MAJOR="${DEFAULT_CUDA_VERSION%.*}" #12.1 - fi readonly DEFAULT_CUDA_VERSION CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") @@ -845,7 +873,6 @@ function set_cuda_version() { CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} fi readonly CUDA_FULL_VERSION - } function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) @@ -2037,6 +2064,8 @@ function prepare_gpu_env(){ # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades # Users should run apt-mark unhold before they wish to upgrade these packages function hold_nvidia_packages() { + if ! is_debuntu ; then return ; fi + apt-mark hold nvidia-* apt-mark hold libnvidia-* if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then @@ -2199,6 +2228,7 @@ function gpu_exit_handler() { fi done fi + hold_nvidia_packages } @@ -2229,12 +2259,14 @@ function main() { function exit_handler() { gpu_exit_handler + pip_exit_handler common_exit_handler return 0 } function prepare_to_install(){ prepare_common_env + prepare_pip_env prepare_gpu_env trap exit_handler EXIT } From 75d8e321bda73d723616fc531138121f9573425d Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 5 Jan 2025 23:53:47 -0800 Subject: [PATCH 103/130] removed redundant template disclaimer --- templates/spark-rapids/spark-rapids.sh.in | 1 - 1 file changed, 1 deletion(-) diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 1467fedf9..56603252b 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -22,7 +22,6 @@ # For details see # github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot # -[% PROCESS common/template_disclaimer %] set -euxo pipefail From 34fce25def2b95b8542ac15fd79b7df0c93ca62b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 6 Jan 2025 02:01:40 -0800 Subject: [PATCH 104/130] setup and tear-down for actions which work with conda --- templates/common/util_functions | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/templates/common/util_functions b/templates/common/util_functions index a52001db3..e27a1f9d5 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -575,6 +575,16 @@ function prepare_pip_env() { fi } +function prepare_conda_env() { + CONDA=/opt/conda/miniconda3/bin/conda + touch ~/.condarc + cp ~/.condarc ~/.condarc.default + if is_ramdisk ; then + # Download conda packages to tmpfs + mkdir -p "${tmpdir}/conda_cache" + ${CONDA} config --add pkgs_dirs "${tmpdir}/conda_cache" + fi +} function prepare_common_env() { define_os_comparison_functions @@ -664,6 +674,10 @@ function pip_exit_handler() { fi } +function conda_exit_handler() { + mv ~/.condarc.default ~/.condarc +} + function common_exit_handler() { set +ex echo "Exit handler invoked" From bbe062e8328941acff8f3209a839fc3fedf39016 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 6 Jan 2025 02:08:11 -0800 Subject: [PATCH 105/130] * refactored common conda installer functionality from dask.sh.in and rapids.sh.in into a function install_conda_packages * removed redundant yarn service restarts in rapids.sh.in * added conda prep and exit handlers --- templates/dask/util_functions | 80 ++++++++++++++++++++--------------- templates/gpu/util_functions | 2 +- templates/rapids/rapids.sh.in | 13 +----- 3 files changed, 50 insertions(+), 45 deletions(-) diff --git a/templates/dask/util_functions b/templates/dask/util_functions index a2863ec8b..5705c4a78 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -377,10 +377,14 @@ EOF function install_dask() { local python_spec="python>=3.11" - local dask_spec="dask>=2024.7" + local dask_version="2024.12.1" + local dask_spec="dask>=${dask_version}" + local cache_key_name="dask-${dask_version}" CONDA_PACKAGES=() if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then + dask_yarn_version="0.9" + cache_key_name="dask-yarn-${dask_yarn_version}" # Pin `distributed` and `dask` package versions to old release # because `dask-yarn` 0.9 uses skein in a way which # is not compatible with `distributed` package 2022.2 and newer: @@ -392,7 +396,7 @@ function install_dask() { # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic CONDA_PACKAGES+=("fiona<1.8.22") fi - CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2") + CONDA_PACKAGES+=('dask-yarn=${dask_yarn_version}' "distributed<2022.2") fi CONDA_PACKAGES+=( @@ -402,49 +406,30 @@ function install_dask() { "dask-sql" ) - # Install dask - mamba="/opt/conda/miniconda3/bin/mamba" - conda="/opt/conda/miniconda3/bin/conda" - - ( set +e - local is_installed=0 - for installer in "${mamba}" "${conda}" ; do - test -d "${DASK_CONDA_ENV}" || \ - time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \ - -c 'conda-forge' -c 'nvidia' \ - ${CONDA_PACKAGES[*]} \ - "${python_spec}" \ - > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } - sync - if [[ "$retval" == "0" ]] ; then - is_installed="1" - break - fi - "${conda}" config --set channel_priority flexible - done - if [[ "${is_installed}" == "0" ]]; then - echo "failed to install dask" - return 1 - fi - ) + unset CONDA_CHANNEL_ARGS + local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}}" + install_conda_packages "${cache_key}" } function install_dask_rapids() { + local numba_spec="numba" + local dask_version="2024.12.1" + local dask_spec="dask>=${dask_version}" + if is_cuda12 ; then local python_spec="python>=3.11" local cuda_spec="cuda-version>=12,<13" - local dask_spec="dask>=2024.7" - local numba_spec="numba" elif is_cuda11 ; then local python_spec="python>=3.9" local cuda_spec="cuda-version>=11,<12.0a0" - local dask_spec="dask" - local numba_spec="numba" fi rapids_spec="rapids>=${RAPIDS_VERSION}" CONDA_PACKAGES=() + local cache_key_name="dask-rapids-${RAPIDS_VERSION}" if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then + local rapids_version="24.05" + cache_key_name="dask-rapids-yarn-${rapids_version}" # Pin `distributed` and `dask` package versions to old release # because `dask-yarn` 0.9 uses skein in a way which # is not compatible with `distributed` package 2022.2 and newer: @@ -452,7 +437,7 @@ function install_dask_rapids() { dask_spec="dask<2022.2" python_spec="python>=3.7,<3.8.0a0" - rapids_spec="rapids<=24.05" + rapids_spec="rapids<=${rapids_version}" if is_ubuntu18 ; then # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic CONDA_PACKAGES+=("fiona<1.8.22") @@ -471,6 +456,31 @@ function install_dask_rapids() { "${numba_spec}" ) + CONDA_CHANNEL_ARGS="-c 'conda-forge' -c 'nvidia' -c 'rapidsai'" + + local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}}" + install_conda_packages "${cache_key}" +} + +# The bash array CONDA_PACKAGES must contain a set of package +# specifications before calling this function + +# The bash string CONDA_CHANNEL_ARGS may contain arguments to specify +# conda channels. Default is "-c 'conda-forge'" + +function install_conda_packages() { + local cache_key="${1}" + + local build_tarball="${cache_key}.tar.gz" + local gcs_tarball="${pkg_bucket}/conda/${cache_key%%_*}/${build_tarball}" + local local_tarball="${tmpdir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + echo "cache hit" + gcloud storage cat "${gcs_tarball}" | tar -C / -xz + return 0 + fi + # Install cuda, rapids, dask mamba="/opt/conda/miniconda3/bin/mamba" conda="/opt/conda/miniconda3/bin/conda" @@ -480,17 +490,21 @@ function install_dask_rapids() { for installer in "${mamba}" "${conda}" ; do test -d "${DASK_CONDA_ENV}" || \ time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \ - -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ + "${CONDA_CHANNEL_ARGS:- -c 'conda-forge'}" \ ${CONDA_PACKAGES[*]} \ "${python_spec}" \ > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } sync if [[ "$retval" == "0" ]] ; then is_installed="1" + tar czf "${local_tarball}" "${DASK_CONDA_ENV}" + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" break fi "${conda}" config --set channel_priority flexible done + if [[ "${is_installed}" == "0" ]]; then echo "failed to install dask" return 1 diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 46b49ef36..e86a2ff66 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -779,7 +779,7 @@ function install_nvidia_userspace_runfile() { depmod -a else clear_dkms_key - tar czvf "${local_tarball}" \ + tar czf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in index a63f44b3b..7ca6c410c 100644 --- a/templates/rapids/rapids.sh.in +++ b/templates/rapids/rapids.sh.in @@ -36,27 +36,18 @@ function main() { configure_fluentd_for_dask fi fi - - echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized." - if [[ "${ROLE}" == "Master" ]]; then - systemctl restart hadoop-yarn-resourcemanager.service - # Restart NodeManager on Master as well if this is a single-node-cluster. - if systemctl list-units | grep hadoop-yarn-nodemanager; then - systemctl restart hadoop-yarn-nodemanager.service - fi - else - systemctl restart hadoop-yarn-nodemanager.service - fi } function exit_handler() { gpu_exit_handler + conda_exit_handler common_exit_handler return 0 } function prepare_to_install(){ prepare_common_env + prepare_conda_env prepare_gpu_env conda_env="$(get_metadata_attribute conda-env 'dask-rapids')" readonly conda_env From 10f16983a721bcaf5df7ce934a2d74b83da0e0ad Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 6 Jan 2025 10:50:11 -0800 Subject: [PATCH 106/130] tested rapids.sh init action with dataproc-repro --- templates/dask/dask.sh.in | 4 +++- templates/dask/util_functions | 13 +++++-------- templates/gpu/install_gpu_driver.sh.in | 2 ++ templates/gpu/util_functions | 7 ++----- templates/rapids/rapids.sh.in | 7 +++++-- templates/spark-rapids/spark-rapids.sh.in | 4 ++++ 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index b0279160f..8e6d2d7d4 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -51,7 +51,9 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env - prepare_pip_env + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') + readonly RAPIDS_RUNTIME + prepare_conda_env conda_env="$(get_metadata_attribute conda-env 'dask')" readonly conda_env prepare_dask_env diff --git a/templates/dask/util_functions b/templates/dask/util_functions index 5705c4a78..c9dc71b96 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -407,13 +407,13 @@ function install_dask() { ) unset CONDA_CHANNEL_ARGS - local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}}" + local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}" install_conda_packages "${cache_key}" } function install_dask_rapids() { local numba_spec="numba" - local dask_version="2024.12.1" + local dask_version="2024.7" local dask_spec="dask>=${dask_version}" if is_cuda12 ; then @@ -456,9 +456,9 @@ function install_dask_rapids() { "${numba_spec}" ) - CONDA_CHANNEL_ARGS="-c 'conda-forge' -c 'nvidia' -c 'rapidsai'" + CONDA_CHANNEL_ARGS="-c conda-forge -c nvidia -c rapidsai" - local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}}" + local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}" install_conda_packages "${cache_key}" } @@ -490,7 +490,7 @@ function install_conda_packages() { for installer in "${mamba}" "${conda}" ; do test -d "${DASK_CONDA_ENV}" || \ time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \ - "${CONDA_CHANNEL_ARGS:- -c 'conda-forge'}" \ + ${CONDA_CHANNEL_ARGS:- -c 'conda-forge'} \ ${CONDA_PACKAGES[*]} \ "${python_spec}" \ > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } @@ -527,9 +527,6 @@ function prepare_dask_env() { function prepare_dask_rapids_env(){ prepare_dask_env - # RAPIDS config - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') - readonly RAPIDS_RUNTIME local DEFAULT_DASK_RAPIDS_VERSION="24.08" if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 0e27f1086..57f4e640c 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -46,6 +46,8 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + readonly RAPIDS_RUNTIME prepare_pip_env prepare_gpu_env trap exit_handler EXIT diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index e86a2ff66..fb3e8fa4b 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -145,7 +145,8 @@ function set_cudnn_version() { readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" # Parameters for NVIDIA-provided cuDNN library - readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} + DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} + readonly DEFAULT_CUDNN_VERSION CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then @@ -1252,10 +1253,6 @@ function prepare_gpu_env(){ INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') readonly INSTALL_GPU_AGENT - # Verify SPARK compatability - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - readonly RAPIDS_RUNTIME - # determine whether we have nvidia-smi installed and working nvsmi diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in index 7ca6c410c..e6b973b45 100644 --- a/templates/rapids/rapids.sh.in +++ b/templates/rapids/rapids.sh.in @@ -47,11 +47,14 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env - prepare_conda_env - prepare_gpu_env + # Verify SPARK compatability + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') + readonly RAPIDS_RUNTIME conda_env="$(get_metadata_attribute conda-env 'dask-rapids')" readonly conda_env prepare_dask_rapids_env + prepare_conda_env + prepare_gpu_env trap exit_handler EXIT } diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 56603252b..29bc83824 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -58,6 +58,10 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env + # Verify SPARK compatability + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + readonly RAPIDS_RUNTIME + prepare_pip_env prepare_gpu_env trap exit_handler EXIT From 8a4cbd94d71ab20151b522d150d6a3a137101185 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 6 Jan 2025 15:00:55 -0800 Subject: [PATCH 107/130] templates/dask/dask.sh.in, templates/dask/util_functions, templates/gpu/install_gpu_driver.sh.in, templates/gpu/util_functions, templates/rapids/rapids.sh.in, templates/spark-rapids/spark-rapids.sh.in: * cleaned up definition of RAPIDS_RUNTIME ; default to SPARK and use DASK only for dask-rapids templates/dask/util_functions, templates/gpu/util_functions, templates/common/util_functions: * added utility functions to check whether a phase has been complete, mark a phase complete and mark a phase as incomplete templates/dask/util_functions: * conda environment is now archived from the environment directory rather than from / templates/rapids/rapids.sh.in: * Now executing gpu installer logic before installing dask-rapids * now exiting if rapids runtime is not DASK --- templates/common/util_functions | 24 +++++++-- templates/dask/dask.sh.in | 2 - templates/dask/util_functions | 24 +++++++-- templates/gpu/install_gpu_driver.sh.in | 2 - templates/gpu/util_functions | 66 ++++++++++++++--------- templates/rapids/rapids.sh.in | 17 ++++-- templates/spark-rapids/spark-rapids.sh.in | 4 -- 7 files changed, 95 insertions(+), 44 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index e27a1f9d5..351e20fad 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -553,12 +553,28 @@ function restart_knox() { systemctl start knox } +function is_complete() { + phase="$1" + test -f "${workdir}/complete/${phase}" +} + +function mark_complete() { + phase="$1" + touch "${workdir}/complete/${phase}" +} + +function mark_incomplete() { + phase="$1" + rm -f "${workdir}/complete/${phase}" +} + function install_dependencies() { - test -f "${workdir}/complete/install-dependencies" && return 0 + is_complete install-dependencies && return 0 + pkg_list="screen" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi - touch "${workdir}/complete/install-dependencies" + mark_complete install-dependencies } function prepare_pip_env() { @@ -630,7 +646,7 @@ function prepare_common_env() { readonly install_log="${tmpdir}/install.log" - if test -f "${workdir}/complete/prepare.common" ; then return ; fi + is_complete prepare.common && return repair_old_backports @@ -664,7 +680,7 @@ function prepare_common_env() { bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" fi - touch "${workdir}/complete/prepare.common" + mark_complete prepare.common } function pip_exit_handler() { diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index 8e6d2d7d4..cafc2df89 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -51,8 +51,6 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') - readonly RAPIDS_RUNTIME prepare_conda_env conda_env="$(get_metadata_attribute conda-env 'dask')" readonly conda_env diff --git a/templates/dask/util_functions b/templates/dask/util_functions index c9dc71b96..d1aee00b4 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -376,6 +376,8 @@ EOF } function install_dask() { + is_complete install.dask && return + local python_spec="python>=3.11" local dask_version="2024.12.1" local dask_spec="dask>=${dask_version}" @@ -409,9 +411,13 @@ function install_dask() { unset CONDA_CHANNEL_ARGS local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}" install_conda_packages "${cache_key}" + + mark_complete install.dask } function install_dask_rapids() { + if ( is_complete install.dask-rapids && test -d "${DASK_CONDA_ENV}" ) ; then return ; fi + local numba_spec="numba" local dask_version="2024.7" local dask_spec="dask>=${dask_version}" @@ -460,6 +466,8 @@ function install_dask_rapids() { local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}" install_conda_packages "${cache_key}" + + mark_complete install.dask-rapids } # The bash array CONDA_PACKAGES must contain a set of package @@ -477,7 +485,8 @@ function install_conda_packages() { if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then echo "cache hit" - gcloud storage cat "${gcs_tarball}" | tar -C / -xz + mkdir -p "${DASK_CONDA_ENV}" + time ( gcloud storage cat "${gcs_tarball}" | tar -C "${DASK_CONDA_ENV}" -xz ) return 0 fi @@ -497,9 +506,13 @@ function install_conda_packages() { sync if [[ "$retval" == "0" ]] ; then is_installed="1" - tar czf "${local_tarball}" "${DASK_CONDA_ENV}" - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - rm "${local_tarball}" + pushd "${DASK_CONDA_ENV}" + time ( + tar czf "${local_tarball}" . + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + ) + popd break fi "${conda}" config --set channel_priority flexible @@ -528,6 +541,9 @@ function prepare_dask_env() { function prepare_dask_rapids_env(){ prepare_dask_env + # Default rapids runtime + readonly DEFAULT_RAPIDS_RUNTIME='DASK' + local DEFAULT_DASK_RAPIDS_VERSION="24.08" if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then DEFAULT_DASK_RAPIDS_VERSION="23.08" # Final release to support spark 3.1.3 diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 57f4e640c..0e27f1086 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -46,8 +46,6 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - readonly RAPIDS_RUNTIME prepare_pip_env prepare_gpu_env trap exit_handler EXIT diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index fb3e8fa4b..61d6bf478 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -306,7 +306,7 @@ function uninstall_cuda_keyring_pkg() { } function install_local_cuda_repo() { - if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi + is_complete install-local-cuda-repo && return if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi CUDA_LOCAL_REPO_INSTALLED="1" @@ -329,7 +329,7 @@ function install_local_cuda_repo() { -o /etc/apt/preferences.d/cuda-repository-pin-600 fi - touch "${workdir}/complete/install-local-cuda-repo" + mark_complete install-local-cuda-repo } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" @@ -337,7 +337,8 @@ function uninstall_local_cuda_repo(){ } function install_local_cudnn_repo() { - if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi + is_complete install-local-cudnn-repo && return + pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" @@ -353,7 +354,7 @@ function install_local_cudnn_repo() { cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/complete/install-local-cudnn-repo" + mark_complete install-local-cudnn-repo } function uninstall_local_cudnn_repo() { @@ -362,7 +363,7 @@ function uninstall_local_cudnn_repo() { } function install_local_cudnn8_repo() { - if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi + is_complete install-local-cudnn8-repo && return if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" @@ -396,19 +397,19 @@ function install_local_cudnn8_repo() { rm -f "${local_deb_fn}" cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/complete/install-local-cudnn8-repo" + mark_complete install-local-cudnn8-repo } function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cudnn8-repo" + mark_incomplete install-local-cudnn8-repo } function install_nvidia_nccl() { readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) - if test -f "${workdir}/complete/nccl" ; then return ; fi + is_complete nccl && return if is_cuda11 && is_debian12 ; then echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" @@ -499,14 +500,15 @@ function install_nvidia_nccl() { fi popd - touch "${workdir}/complete/nccl" + mark_complete nccl } function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { - if test -f "${workdir}/complete/cudnn" ; then return ; fi + is_complete cudnn && return + local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version @@ -565,7 +567,7 @@ function install_nvidia_cudnn() { ldconfig echo "NVIDIA cuDNN successfully installed for ${_shortname}." - touch "${workdir}/complete/cudnn" + mark_complete cudnn } function add_nonfree_components() { @@ -722,7 +724,8 @@ function install_nvidia_userspace_runfile() { # # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. - if test -f "${workdir}/complete/userspace" ; then return ; fi + is_complete userspace && return + local local_fn="${tmpdir}/userspace.run" cache_fetched_package "${USERSPACE_URL}" \ @@ -788,12 +791,13 @@ function install_nvidia_userspace_runfile() { fi rm -f "${local_fn}" - touch "${workdir}/complete/userspace" + mark_complete userspace sync } function install_cuda_runfile() { - if test -f "${workdir}/complete/cuda" ; then return ; fi + is_complete cuda && return + local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ @@ -802,7 +806,7 @@ function install_cuda_runfile() { execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" rm -f "${local_fn}" - touch "${workdir}/complete/cuda" + mark_complete cuda sync } @@ -840,7 +844,7 @@ function load_kernel_module() { } function install_cuda(){ - if test -f "${workdir}/complete/cuda-repo" ; then return ; fi + is_complete cuda-repo && return if ( ge_debian12 && is_src_os ) ; then echo "installed with the driver on ${_shortname}" @@ -853,10 +857,12 @@ function install_cuda(){ # Includes CUDA packages add_repo_cuda - touch "${workdir}/complete/cuda-repo" + mark_complete cuda-repo } function install_nvidia_container_toolkit() { + is_complete install-nvtk && return + local container_runtime_default if command -v docker ; then container_runtime_default='docker' elif command -v containerd ; then container_runtime_default='containerd' @@ -872,11 +878,13 @@ function install_nvidia_container_toolkit() { execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" systemctl restart "${CONTAINER_RUNTIME}" + + mark_complete install-nvtk } # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if test -f "${workdir}/complete/gpu-driver" ; then return ; fi + is_complete gpu-driver && return if ( ge_debian12 && is_src_os ) ; then add_nonfree_components @@ -898,11 +906,11 @@ function install_nvidia_gpu_driver() { build_driver_from_github echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" - touch "${workdir}/complete/gpu-driver" + mark_complete gpu-driver } function install_ops_agent(){ - if test -f "${workdir}/complete/ops-agent" ; then return ; fi + is_complete ops-agent && return mkdir -p /opt/google cd /opt/google @@ -910,7 +918,7 @@ function install_ops_agent(){ curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install - touch "${workdir}/complete/ops-agent" + is_complete ops-agent } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics @@ -1189,7 +1197,7 @@ function query_nvsmi() { } function install_build_dependencies() { - if test -f "${workdir}/complete/build-dependencies" ; then return ; fi + is_complete build-dependencies && return if is_debuntu ; then if is_ubuntu22 && is_cuda12 ; then @@ -1227,7 +1235,7 @@ function install_build_dependencies() { execute_with_retries "${dnf_cmd}" fi - touch "${workdir}/complete/build-dependencies" + mark_complete build-dependencies } function prepare_gpu_env(){ @@ -1245,6 +1253,14 @@ function prepare_gpu_env(){ CUDNN8_PKG_NAME="" CUDA_LOCAL_REPO_INSTALLED="0" + if ! test -v DEFAULT_RAPIDS_RUNTIME ; then + readonly DEFAULT_RAPIDS_RUNTIME='SPARK' + fi + + # Verify SPARK compatability + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}") + readonly RAPIDS_RUNTIME + # Whether to install NVIDIA-provided or OS-provided GPU driver GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') readonly GPU_DRIVER_PROVIDER @@ -1337,7 +1353,7 @@ function configure_mig_cgi() { } function enable_mig() { - if test -f "${workdir}/complete/enable-mig" ; then return ; fi + is_complete enable-mig && return # Start persistenced if it's not already running if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi @@ -1349,7 +1365,7 @@ function enable_mig() { nvsmi -mig 1 clear_nvsmi_cache - touch "${workdir}/complete/enable-mig" + mark_complete enable-mig } function enable_and_configure_mig() { diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in index e6b973b45..75f4c7605 100644 --- a/templates/rapids/rapids.sh.in +++ b/templates/rapids/rapids.sh.in @@ -16,6 +16,20 @@ set -euxo pipefail [% INSERT dask/util_functions %] function main() { + setup_gpu_yarn + + echo "yarn setup complete" + + if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then + install_nvidia_nccl + install_nvidia_cudnn + fi + + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + echo "RAPIDS recognizes SPARK runtime - currently supported using gpu/install_gpu_driver.sh or spark-rapids/spark-rapids.sh" + exit 1 + fi + # Install Dask with RAPIDS install_dask_rapids @@ -47,9 +61,6 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env - # Verify SPARK compatability - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK') - readonly RAPIDS_RUNTIME conda_env="$(get_metadata_attribute conda-env 'dask-rapids')" readonly conda_env prepare_dask_rapids_env diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 29bc83824..56603252b 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -58,10 +58,6 @@ function exit_handler() { function prepare_to_install(){ prepare_common_env - # Verify SPARK compatability - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - readonly RAPIDS_RUNTIME - prepare_pip_env prepare_gpu_env trap exit_handler EXIT From b01b8675f06d76aee5ce72cba8766a225c147fcc Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 6 Jan 2025 17:48:39 -0800 Subject: [PATCH 108/130] refactor yarn functions into their own template --- templates/common/util_functions | 4 +- templates/common/yarn_functions | 69 ++++++++++++++++ templates/gpu/install_gpu_driver.sh.in | 17 ++++ templates/gpu/mig_functions | 97 +++++++++++++++++++++++ templates/gpu/util_functions | 11 ++- templates/rapids/rapids.sh.in | 2 + templates/spark-rapids/mig.sh.in | 49 +++++++++++- templates/spark-rapids/spark-rapids.sh.in | 22 ++++- 8 files changed, 261 insertions(+), 10 deletions(-) create mode 100644 templates/common/yarn_functions create mode 100644 templates/gpu/mig_functions diff --git a/templates/common/util_functions b/templates/common/util_functions index 351e20fad..dfd2cfdf1 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -580,8 +580,8 @@ function install_dependencies() { function prepare_pip_env() { # Clear pip cache # TODO: make this conditional on which OSs have pip without cache purge - test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv" - source "${tmpdir}/python-venv/bin/activate" + test -d "${workdir}/python-venv" || python3 -m venv "${workdir}/python-venv" + source "${workdir}/python-venv/bin/activate" pip cache purge || echo "unable to purge pip cache" if is_ramdisk ; then diff --git a/templates/common/yarn_functions b/templates/common/yarn_functions new file mode 100644 index 000000000..8e38c7b0a --- /dev/null +++ b/templates/common/yarn_functions @@ -0,0 +1,69 @@ +function configure_yarn_resources() { + if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' +} + +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' + + # Fix local dirs access permissions + local yarn_local_dirs=() + + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') + + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi +} + +function setup_gpu_yarn() { + # This configuration should be run on all nodes + # regardless if they have attached GPUs + configure_yarn_resources + + # When there is no GPU, but the installer is executing on a master node: + if [[ "${gpu_count}" == "0" ]] ; then + if [[ "${ROLE}" == "Master" ]]; then + configure_yarn_nodemanager + fi + return 0 + fi + + install_nvidia_container_toolkit + configure_yarn_nodemanager_gpu + configure_gpu_script + configure_gpu_isolation +} + +function yarn_exit_handler() { + # Restart YARN services if they are running already + for svc in resourcemanager nodemanager; do + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl stop "hadoop-yarn-${svc}.service" + systemctl start "hadoop-yarn-${svc}.service" + fi + done + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' +} diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 0e27f1086..dcbd8c15e 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -10,9 +10,25 @@ set -euxo pipefail [% INSERT common/util_functions %] +[% INSERT common/yarn_functions %] + [% INSERT gpu/util_functions %] function main() { + install_nvidia_gpu_driver + install_cuda + load_kernel_module + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent +# install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode + setup_gpu_yarn echo "yarn setup complete" @@ -40,6 +56,7 @@ function main() { function exit_handler() { gpu_exit_handler pip_exit_handler + yarn_exit_handler common_exit_handler return 0 } diff --git a/templates/gpu/mig_functions b/templates/gpu/mig_functions new file mode 100644 index 000000000..233b2d02c --- /dev/null +++ b/templates/gpu/mig_functions @@ -0,0 +1,97 @@ +function fetch_mig_scripts() { + mkdir -p /usr/local/yarn-mig-scripts + chmod 755 /usr/local/yarn-mig-scripts + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh + chmod 755 /usr/local/yarn-mig-scripts/* +} + +function delete_mig_instances() ( + # delete all instances + set +e + nvidia-smi mig -dci + + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No compute instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac + + nvidia-smi mig -dgi + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No GPU instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac +) + +# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles +function configure_mig_cgi() { + delete_mig_instances + META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" + if test -n "${META_MIG_CGI_VALUE}"; then + nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C + else + # https://pci-ids.ucw.cz/v2.2/pci.ids + local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)" + if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then + # run the following command to list placement profiles + # nvidia-smi mig -lgipp + # + # This is the result when using H100 instances on 20241220 + # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 + # GPU 0 Profile ID 14 Placements: {0,2,4}:2 + # GPU 0 Profile ID 9 Placements: {0,4}:4 + # GPU 0 Profile ID 5 Placement : {0}:4 + # GPU 0 Profile ID 0 Placement : {0}:8 + + # For H100 3D controllers, consider profile 19, 7x1G instances + nvidia-smi mig -cgi 9,9 -C + elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then + # Dataproc only supports H100s right now ; split in 2 if not specified + # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances + nvidia-smi mig -cgi 9,9 -C + else + echo "unrecognized 3D controller" + fi + fi + clear_nvsmi_cache +} + +function enable_mig() { + is_complete enable-mig && return + + # Start persistenced if it's not already running + if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi + for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do + # Write an ascii zero to the numa node indicator + echo "0" | dd of="${f}" status=none + done + time nvsmi --gpu-reset # 30s + nvsmi -mig 1 + clear_nvsmi_cache + + mark_complete enable-mig +} + +function enable_and_configure_mig() { + # default MIG to on when this script is used + META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") + + if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi + + enable_mig + query_nvsmi + local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' + mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" + + if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs. Failing" ; exit 1 ; fi + if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured but NOT enabled. Failing" ; exit 1 ; fi + + echo "MIG is fully enabled" + configure_mig_cgi +} diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 61d6bf478..e8aa1a8d5 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -943,9 +943,12 @@ function download_gpu_monitoring_agent(){ function install_gpu_monitoring_agent_dependency(){ cd /opt/google/compute-gpu-monitoring/linux - python3 -m venv venv - venv/bin/pip install wheel - venv/bin/pip install -Ur requirements.txt + /usr/bin/python3 -m venv venv + ( + source venv/bin/activate + pip install wheel + pip install -Ur requirements.txt + ) } function start_gpu_monitoring_agent_service(){ @@ -971,7 +974,7 @@ function install_gpu_agent() { | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" - python3 -m venv "${venv}" + /usr/bin/python3 -m venv "${venv}" ( source "${venv}/bin/activate" python3 -m pip install --upgrade pip diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in index 75f4c7605..4e46ab1d3 100644 --- a/templates/rapids/rapids.sh.in +++ b/templates/rapids/rapids.sh.in @@ -54,6 +54,7 @@ function main() { function exit_handler() { gpu_exit_handler + pip_exit_handler conda_exit_handler common_exit_handler return 0 @@ -65,6 +66,7 @@ function prepare_to_install(){ readonly conda_env prepare_dask_rapids_env prepare_conda_env + prepare_pip_env prepare_gpu_env trap exit_handler EXIT } diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in index 28a463602..99b494c4f 100644 --- a/templates/spark-rapids/mig.sh.in +++ b/templates/spark-rapids/mig.sh.in @@ -14,13 +14,55 @@ # [% PROCESS common/template_disclaimer %] -set -euxo pipefail - [% INSERT common/util_functions %] +[% INSERT common/yarn_functions %] + +[% INSERT gpu/mig_functions %] + [% INSERT gpu/util_functions %] +set -euxo pipefail + function main() { + if [[ "${nvsmi_works}" == "1" ]] ; then + # if this is called without the MIG script then the drivers are not installed + query_nvsmi + local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' + set +e + migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" + set -e + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" + + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then + if (echo "${migquery_result}" | grep Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` + fetch_mig_scripts + fi + fi + fi + fi + + # if mig is enabled drivers would have already been installed + if [[ $IS_MIG_ENABLED -eq 0 ]]; then + install_nvidia_gpu_driver + install_cuda + load_kernel_module + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent +# install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode + fi + setup_gpu_yarn echo "yarn setup complete" @@ -33,12 +75,15 @@ function main() { function exit_handler() { gpu_exit_handler + pip_exit_handler + yarn_exit_handler common_exit_handler return 0 } function prepare_to_install(){ prepare_common_env + prepare_pip_env prepare_gpu_env trap exit_handler EXIT } diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 56603252b..0bfc0b331 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -27,9 +27,25 @@ set -euxo pipefail [% INSERT common/util_functions %] +[% INSERT common/yarn_functions %] + [% INSERT gpu/util_functions %] function main() { + install_nvidia_gpu_driver + install_cuda + load_kernel_module + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then +# install_gpu_agent + install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode + setup_gpu_yarn echo "yarn setup complete" @@ -39,10 +55,11 @@ function main() { configure_gpu_script echo "RAPIDS initialized with Spark runtime" elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then - # we are not currently tooled for installing dask in this action. - echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh" + echo "This action only installs spark-rapids" + exit 1 else echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" + exit 1 fi echo "main complete" @@ -52,6 +69,7 @@ function main() { function exit_handler() { gpu_exit_handler pip_exit_handler + yarn_exit_handler common_exit_handler return 0 } From c6c09db27d71a0affe97665fabf9786d7b215ed8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 6 Jan 2025 17:49:24 -0800 Subject: [PATCH 109/130] refactor mig functions into their own template --- templates/common/util_functions | 61 ++----------- templates/dask/util_functions | 12 +-- templates/gpu/util_functions | 155 -------------------------------- 3 files changed, 10 insertions(+), 218 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index dfd2cfdf1..4d9f983a4 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -162,45 +162,6 @@ function set_hadoop_property() { --clobber } -function configure_yarn_resources() { - if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts - if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} - -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - - # Fix local dirs access permissions - local yarn_local_dirs=() - - readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ - --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ - --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - - if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" - fi -} - function clean_up_sources_lists() { # # bigtop (primary) @@ -664,17 +625,18 @@ function prepare_common_env() { dnf clean all fi - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + # When creating a disk image: + if [[ -n "$(get_metadata_attribute creating-image "")" ]]; then + df / > "/run/disk-usage.log" - ( set +e + # zero free disk space + ( set +e time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero ) install_dependencies # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" touch "/run/keep-running-df" screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" @@ -698,25 +660,14 @@ function common_exit_handler() { set +ex echo "Exit handler invoked" - # Restart YARN services if they are running already - for svc in resourcemanager nodemanager; do - if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then - systemctl stop "hadoop-yarn-${svc}.service" - systemctl start "hadoop-yarn-${svc}.service" - fi - done - # If system memory was sufficient to mount memory-backed filesystems - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + if is_ramdisk ; then # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then umount -f ${shmdir} fi done - - # restart services stopped during preparation stage - # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi if is_debuntu ; then diff --git a/templates/dask/util_functions b/templates/dask/util_functions index d1aee00b4..5a1f7e201 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -74,9 +74,10 @@ EOF else # Enable service on single-node cluster (no workers) local worker_count="$(get_metadata_attribute dataproc-worker-count)" - if [[ "${worker_count}" == "0" ]] && - [[ "$(get_metadata_attribute dask-cuda-worker-on-master 'true')" == "true" ]] && - [[ "$(get_metadata_attribute dask-worker-on-master 'true')" == "true" ]] ; then + if ( [[ "${worker_count}" == "0" ]] || + ( [[ "$(get_metadata_attribute dask-cuda-worker-on-master 'true')" == "true" ]] && + [[ "$(get_metadata_attribute dask-worker-on-master 'true')" == "true" ]] ) + ) ; then enable_systemd_dask_worker_service="1" fi fi @@ -550,8 +551,3 @@ function prepare_dask_rapids_env(){ fi readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION}) } - - -function dask_exit_handler() { - echo "no exit handler for dask" -} diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index e8aa1a8d5..4834adb33 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1013,14 +1013,6 @@ function configure_gpu_exclusive_mode() { clear_nvsmi_cache } -function fetch_mig_scripts() { - mkdir -p /usr/local/yarn-mig-scripts - chmod 755 /usr/local/yarn-mig-scripts - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - chmod 755 /usr/local/yarn-mig-scripts/* -} - function install_spark_rapids() { # Update SPARK RAPIDS config local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" @@ -1299,153 +1291,6 @@ function hold_nvidia_packages() { fi } -function delete_mig_instances() ( - # delete all instances - set +e - nvidia-smi mig -dci - - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No compute instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac - - nvidia-smi mig -dgi - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No GPU instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac -) - -# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles -function configure_mig_cgi() { - delete_mig_instances - META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" - if test -n "${META_MIG_CGI_VALUE}"; then - nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C - else - # https://pci-ids.ucw.cz/v2.2/pci.ids - local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)" - if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then - # run the following command to list placement profiles - # nvidia-smi mig -lgipp - # - # This is the result when using H100 instances on 20241220 - # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 - # GPU 0 Profile ID 14 Placements: {0,2,4}:2 - # GPU 0 Profile ID 9 Placements: {0,4}:4 - # GPU 0 Profile ID 5 Placement : {0}:4 - # GPU 0 Profile ID 0 Placement : {0}:8 - - # For H100 3D controllers, consider profile 19, 7x1G instances - nvidia-smi mig -cgi 9,9 -C - elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then - # Dataproc only supports H100s right now ; split in 2 if not specified - # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances - nvidia-smi mig -cgi 9,9 -C - else - echo "unrecognized 3D controller" - fi - fi - clear_nvsmi_cache -} - -function enable_mig() { - is_complete enable-mig && return - - # Start persistenced if it's not already running - if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi - for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do - # Write an ascii zero to the numa node indicator - echo "0" | dd of="${f}" status=none - done - time nvsmi --gpu-reset # 30s - nvsmi -mig 1 - clear_nvsmi_cache - - mark_complete enable-mig -} - -function enable_and_configure_mig() { - # default MIG to on when this script is used - META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") - - if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi - - enable_mig - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" - - if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs. Failing" ; exit 1 ; fi - if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured but NOT enabled. Failing" ; exit 1 ; fi - - echo "MIG is fully enabled" - configure_mig_cgi -} - -function setup_gpu_yarn() { - # This configuration should be run on all nodes - # regardless if they have attached GPUs - configure_yarn_resources - - # When there is no GPU, but the installer is executing on a master node: - if [[ "${gpu_count}" == "0" ]] ; then - if [[ "${ROLE}" == "Master" ]]; then - configure_yarn_nodemanager - fi - return 0 - fi - - if [[ "${nvsmi_works}" == "1" ]] ; then - # if this is called without the MIG script then the drivers are not installed - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - set +e - migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" - set -e - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts - fi - fi - fi - fi - - # if mig is enabled drivers would have already been installed - if [[ $IS_MIG_ENABLED -eq 0 ]]; then - install_nvidia_gpu_driver - install_cuda - load_kernel_module - - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - install_gpu_agent -# install_gpu_monitoring_agent - echo 'GPU metrics agent successfully deployed.' - else - echo 'GPU metrics agent has not been installed.' - fi - configure_gpu_exclusive_mode - fi - - install_nvidia_container_toolkit - configure_yarn_nodemanager_gpu - configure_gpu_script - configure_gpu_isolation -} - function gpu_exit_handler() { if [[ "${tmpdir}" == "/mnt/shm" ]] ; then for shmdir in /var/cudnn-local ; do From 88f9f7f70370697555fc413e1cbc49e7a4f99507 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 7 Jan 2025 15:16:47 -0800 Subject: [PATCH 110/130] state before gpu rebranch --- cloudbuild/presubmit.sh | 5 + templates/common/install_functions | 53 + templates/common/yarn_functions | 33 - templates/dask/dask.sh.in | 2 +- templates/dask/util_functions | 18 +- templates/gpu/install_functions | 947 ++++++++++++++++++ templates/gpu/install_gpu_driver.sh.in | 14 +- templates/gpu/spark_functions | 36 + templates/gpu/util_functions | 1093 +-------------------- templates/gpu/yarn_functions | 145 +++ templates/rapids/rapids.sh.in | 15 +- templates/spark-rapids/spark-rapids.sh.in | 13 +- 12 files changed, 1220 insertions(+), 1154 deletions(-) create mode 100644 templates/common/install_functions create mode 100644 templates/gpu/install_functions create mode 100644 templates/gpu/spark_functions create mode 100644 templates/gpu/yarn_functions diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index 2b2e978b0..1ec0d5756 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -50,6 +50,11 @@ determine_tests_to_run() { # Infer the files that changed mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD) mapfile -t CHANGED_FILES < <(git diff origin/master --name-only | grep -v template) + for tt in $(git diff origin/master --name-only | grep 'templates/.*/.*\.sh\.in'); do + local genfile=`perl -e "print( q{${tt}} =~ m:templates/(.*?.sh).in: )"` + perl templates/generate-action.pl "${genfile}" > "${genfile}" + CHANGED_FILES+=("${genfile}") + done echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}" echo "Changed files: ${CHANGED_FILES[*]}" diff --git a/templates/common/install_functions b/templates/common/install_functions new file mode 100644 index 000000000..f731feed6 --- /dev/null +++ b/templates/common/install_functions @@ -0,0 +1,53 @@ +# +# Generate repo file under /etc/apt/sources.list.d/ +# +function apt_add_repo() { + local -r repo_name="$1" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local -r include_src="${4:-yes}" + local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" + + echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" + if [[ "${include_src}" == "yes" ]] ; then + echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + fi + + apt-get update -qq +} + +# +# Generate repo file under /etc/yum.repos.d/ +# +function dnf_add_repo() { + local -r repo_name="$1" + local -r repo_url="$3" # "http(s)://host/path/filename.repo" + local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" + + curl -s -L "${repo_url}" \ + | dd of="${repo_path}" status=progress +# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ +} + +# +# Keyrings default to +# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or +# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) +# +function os_add_repo() { + local -r repo_name="$1" + local -r signing_key_url="$2" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local kr_path + if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi + + mkdir -p "$(dirname "${kr_path}")" + + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ + | gpg --import --no-default-keyring --keyring "${kr_path}" + + if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" + else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi +} diff --git a/templates/common/yarn_functions b/templates/common/yarn_functions index 8e38c7b0a..6e556f975 100644 --- a/templates/common/yarn_functions +++ b/templates/common/yarn_functions @@ -1,17 +1,3 @@ -function configure_yarn_resources() { - if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts - if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} - # This configuration should be applied only if GPU is attached to the node function configure_yarn_nodemanager() { set_hadoop_property 'yarn-site.xml' \ @@ -37,25 +23,6 @@ function configure_yarn_nodemanager() { fi } -function setup_gpu_yarn() { - # This configuration should be run on all nodes - # regardless if they have attached GPUs - configure_yarn_resources - - # When there is no GPU, but the installer is executing on a master node: - if [[ "${gpu_count}" == "0" ]] ; then - if [[ "${ROLE}" == "Master" ]]; then - configure_yarn_nodemanager - fi - return 0 - fi - - install_nvidia_container_toolkit - configure_yarn_nodemanager_gpu - configure_gpu_script - configure_gpu_isolation -} - function yarn_exit_handler() { # Restart YARN services if they are running already for svc in resourcemanager nodemanager; do diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in index cafc2df89..2f8450dd6 100644 --- a/templates/dask/dask.sh.in +++ b/templates/dask/dask.sh.in @@ -31,7 +31,7 @@ function main() { configure_knox_for_dask - local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" + local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging 'false')" if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then configure_fluentd_for_dask fi diff --git a/templates/dask/util_functions b/templates/dask/util_functions index 5a1f7e201..d67da1fc1 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -423,15 +423,16 @@ function install_dask_rapids() { local dask_version="2024.7" local dask_spec="dask>=${dask_version}" - if is_cuda12 ; then - local python_spec="python>=3.11" - local cuda_spec="cuda-version>=12,<13" - elif is_cuda11 ; then - local python_spec="python>=3.9" - local cuda_spec="cuda-version>=11,<12.0a0" + local python_spec="python>=3.11" + local cuda_spec="cuda-version>=12,<13" + local cudart_spec="cuda-cudart" + if is_cuda11 ; then + python_spec="python>=3.9" + cuda_spec="cuda-version>=11,<12.0a0" + cudart_spec="cudatoolkit" fi - rapids_spec="rapids>=${RAPIDS_VERSION}" + local rapids_spec="rapids>=${RAPIDS_VERSION}" CONDA_PACKAGES=() local cache_key_name="dask-rapids-${RAPIDS_VERSION}" if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then @@ -443,7 +444,7 @@ function install_dask_rapids() { # https://github.com/dask/dask-yarn/issues/155 dask_spec="dask<2022.2" - python_spec="python>=3.7,<3.8.0a0" + python_spec="python>=3.9" rapids_spec="rapids<=${rapids_version}" if is_ubuntu18 ; then # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic @@ -454,6 +455,7 @@ function install_dask_rapids() { CONDA_PACKAGES+=( "${cuda_spec}" + "${cudart_spec}" "${rapids_spec}" "${dask_spec}" "dask-bigquery" diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions new file mode 100644 index 000000000..2ea8ca4d2 --- /dev/null +++ b/templates/gpu/install_functions @@ -0,0 +1,947 @@ +function set_cudnn_version() { + readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" + readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" + + # Parameters for NVIDIA-provided cuDNN library + DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} + readonly DEFAULT_CUDNN_VERSION + CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") + # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} + if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then + CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" + elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then + # cuDNN v8 is not distribution for ubuntu20+, debian12 + CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" + elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then + # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 + CUDNN_VERSION="8.8.0.121" + fi + readonly CUDNN_VERSION +} + + +function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) +function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) + +function set_cuda_repo_shortname() { +# Short name for urls +# https://developer.download.nvidia.com/compute/cuda/repos/${shortname} + if is_rocky ; then + shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" + else + shortname="$(os_id)$(os_vercat)" + fi +} + +function set_nv_urls() { + # Parameters for NVIDIA-provided package repositories + readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' + readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" + + # Parameter for NVIDIA-provided Rocky Linux GPU driver + readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" +} + +function set_cuda_runfile_url() { + local MAX_DRIVER_VERSION + local MAX_CUDA_VERSION + + local MIN_OPEN_DRIVER_VER="515.48.07" + local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" + local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER + + if is_cuda12 ; then + if is_debian12 ; then + MIN_DRIVER_VERSION="545.23.06" + MIN_CUDA_VERSION="12.3.0" + elif is_debian10 ; then + MAX_DRIVER_VERSION="555.42.02" + MAX_CUDA_VERSION="12.5.0" + elif is_ubuntu18 ; then + MAX_DRIVER_VERSION="530.30.02" + MAX_CUDA_VERSION="12.1.1" + fi + elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + if le_debian10 ; then + # cuda 11 is not supported for <= debian10 + MAX_CUDA_VERSION="0" + MAX_DRIVER_VERSION="0" + fi + else + echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + fi + + if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then + echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + fi + if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then + echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" + elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then + echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" + fi + + # driver version named in cuda runfile filename + # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) + readonly -A drv_for_cuda=( + ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" + ["11.8.0"]="520.61.05" + ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" + ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" + ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" + ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" + ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ + ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not + ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" + ) + + # Verify that the file with the indicated combination exists + local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]} + CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run" + local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" + local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" + + NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") + readonly NVIDIA_CUDA_URL + + CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" + readonly CUDA_RUNFILE + + if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" + exit 1 + fi + + if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then + echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" + elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then + echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then + echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then + echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" + fi +} + +function set_cudnn_tarball_url() { +CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" +CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" +if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then + # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" + if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then + # When cuDNN version is greater than or equal to 8.4.1.50 use this format + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" + fi + # Use legacy url format with one of the tarball name formats depending on version as above + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" +fi +if ( version_ge "${CUDA_VERSION}" "12.0" ); then + # Use modern url format When cuda version is greater than or equal to 12.0 + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" +fi +readonly CUDNN_TARBALL +readonly CUDNN_TARBALL_URL +} + +function install_cuda_keyring_pkg() { + if ( test -v CUDA_KEYRING_PKG_INSTALLED && + [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi + local kr_ver=1.1 + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ + -o "${tmpdir}/cuda-keyring.deb" + dpkg -i "${tmpdir}/cuda-keyring.deb" + rm -f "${tmpdir}/cuda-keyring.deb" + CUDA_KEYRING_PKG_INSTALLED="1" +} + +function uninstall_cuda_keyring_pkg() { + apt-get purge -yq cuda-keyring + CUDA_KEYRING_PKG_INSTALLED="0" +} + +function install_local_cuda_repo() { + is_complete install-local-cuda-repo && return + + if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi + CUDA_LOCAL_REPO_INSTALLED="1" + pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" + CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" + readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" + readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" + readonly DIST_KEYRING_DIR="/var/${pkgname}" + + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" + + dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" + rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" + cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ + + if is_ubuntu ; then + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ + -o /etc/apt/preferences.d/cuda-repository-pin-600 + fi + + mark_complete install-local-cuda-repo +} +function uninstall_local_cuda_repo(){ + apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" + rm -f "${workdir}/complete/install-local-cuda-repo" +} + +function install_local_cudnn_repo() { + is_complete install-local-cudnn-repo && return + + pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" + CUDNN_PKG_NAME="${pkgname}" + local_deb_fn="${pkgname}_1.0-1_amd64.deb" + local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" + + # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${local_deb_url}" -o "${tmpdir}/local-installer.deb" + + dpkg -i "${tmpdir}/local-installer.deb" + + rm -f "${tmpdir}/local-installer.deb" + + cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings + + mark_complete install-local-cudnn-repo +} + +function uninstall_local_cudnn_repo() { + apt-get purge -yq "${CUDNN_PKG_NAME}" + rm -f "${workdir}/complete/install-local-cudnn-repo" +} + +function install_local_cudnn8_repo() { + is_complete install-local-cudnn8-repo && return + + if is_ubuntu ; then cudnn8_shortname="ubuntu2004" + elif is_debian ; then cudnn8_shortname="debian11" + else return 0 ; fi + if is_cuda12 ; then CUDNN8_CUDA_VER=12.0 + elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8 + else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi + cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" + + pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" + CUDNN8_PKG_NAME="${pkgname}" + + deb_fn="${pkgname}_1.0-1_amd64.deb" + local_deb_fn="${tmpdir}/${deb_fn}" + local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" + + # cache the cudnn package + cache_fetched_package "${local_deb_url}" \ + "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ + "${local_deb_fn}" + + local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" + # If we are using a ram disk, mount another where we will unpack the cudnn local installer + if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then + mkdir -p "${cudnn_path}" + mount -t tmpfs tmpfs "${cudnn_path}" + fi + + dpkg -i "${local_deb_fn}" + + rm -f "${local_deb_fn}" + + cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings + mark_complete install-local-cudnn8-repo +} + +function uninstall_local_cudnn8_repo() { + apt-get purge -yq "${CUDNN8_PKG_NAME}" + mark_incomplete install-local-cudnn8-repo +} + +function install_nvidia_nccl() { + readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} + readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) + + is_complete nccl && return + + if is_cuda11 && is_debian12 ; then + echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" + return + fi + + local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" + + # https://github.com/NVIDIA/nccl/blob/master/README.md + # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Fermi: SM_20, compute_30 + # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 + # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 + # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 + + # The following architectures are suppored by open kernel driver + # Volta: SM_70,SM_72, compute_70,compute_72 + # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 + + # The following architectures are supported by CUDA v11.8+ + # Ada: SM_89, compute_89 + # Hopper: SM_90,SM_90a compute_90,compute_90a + # Blackwell: SM_100, compute_100 + NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" + if version_ge "${CUDA_VERSION}" "11.8" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" + fi + if version_ge "${CUDA_VERSION}" "12.0" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" + fi + + mkdir -p "${workdir}" + pushd "${workdir}" + + test -d "${workdir}/nccl" || { + local tarball_fn="v${NCCL_VERSION}-1.tar.gz" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ + | tar xz + mv "nccl-${NCCL_VERSION}-1" nccl + } + + local build_path + if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else + build_path="nccl/build/pkg/rpm/x86_64" ; fi + + test -d "${workdir}/nccl/build" || { + local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" + + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + if echo "${output}" | grep -q "${gcs_tarball}" ; then + # cache hit - unpack from cache + echo "cache hit" + else + # build and cache + pushd nccl + # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install + install_build_dependencies + if is_debuntu ; then + # These packages are required to build .deb packages from source + execute_with_retries \ + apt-get install -y -qq build-essential devscripts debhelper fakeroot + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.debian.build + elif is_rocky ; then + # These packages are required to build .rpm packages from source + execute_with_retries \ + dnf -y -q install rpm-build rpmdevtools + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.redhat.build + fi + tar czvf "/${local_tarball}" "../${build_path}" + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd + fi + gcloud storage cat "${gcs_tarball}" | tar xz + } + + if is_debuntu ; then + dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb" + elif is_rocky ; then + rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm" + fi + + popd + mark_complete nccl +} + +function install_nvidia_cudnn() { + is_complete cudnn && return + + local major_version + major_version="${CUDNN_VERSION%%.*}" + local cudnn_pkg_version + cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" + + if is_rocky ; then + if is_cudnn8 ; then + execute_with_retries dnf -y -q install \ + "libcudnn${major_version}" \ + "libcudnn${major_version}-devel" + sync + elif is_cudnn9 ; then + execute_with_retries dnf -y -q install \ + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" + sync + else + echo "Unsupported cudnn version: '${major_version}'" + fi + elif is_debuntu; then + if ge_debian12 && is_src_os ; then + apt-get -y install nvidia-cudnn + else + if is_cudnn8 ; then + install_local_cudnn8_repo + + apt-get update -qq + + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn8=${cudnn_pkg_version}" \ + "libcudnn8-dev=${cudnn_pkg_version}" + + uninstall_local_cudnn8_repo + sync + elif is_cudnn9 ; then + install_cuda_keyring_pkg + + apt-get update -qq + + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" + sync + else + echo "Unsupported cudnn version: [${CUDNN_VERSION}]" + fi + fi + else + echo "Unsupported OS: '${_shortname}'" + exit 1 + fi + + ldconfig + + echo "NVIDIA cuDNN successfully installed for ${_shortname}." + mark_complete cudnn +} + +function add_nonfree_components() { + if is_src_nvidia ; then return; fi + if ge_debian12 ; then + # Include in sources file components on which nvidia-open-kernel-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib non-free non-free-firmware" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list + fi +} + +# +# Install package signing key and add corresponding repository +# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html +function add_repo_nvidia_container_toolkit() { + local nvctk_root="https://nvidia.github.io/libnvidia-container" + local signing_key_url="${nvctk_root}/gpgkey" + local repo_data + + if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" + else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi + + os_add_repo nvidia-container-toolkit \ + "${signing_key_url}" \ + "${repo_data}" \ + "no" +} + +function add_repo_cuda() { + if is_debuntu ; then + install_cuda_keyring_pkg # 11.7+, 12.0+ + elif is_rocky ; then + execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" + fi +} + +function build_driver_from_github() { + # non-GPL driver will have been built on rocky8 + if is_rocky8 ; then return 0 ; fi + pushd "${workdir}" + + test -d "${workdir}/open-gpu-kernel-modules" || { + local tarball_fn="${DRIVER_VERSION}.tar.gz" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ + | tar xz + mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules + } + + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local def_dir="${modulus_md5sum:-unsigned}" + local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") + + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + echo "cache hit" + else + # build the kernel modules + pushd open-gpu-kernel-modules + install_build_dependencies + if ( is_cuda11 && is_ubuntu22 ) ; then + echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" + exit 1 + fi + execute_with_retries make -j$(nproc) modules \ + > kernel-open/build.log \ + 2> kernel-open/build_error.log + # Sign kernel modules + if [[ -n "${PSN}" ]]; then + configure_dkms_certs + for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do + "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ + "${mok_key}" \ + "${mok_der}" \ + "${module}" + done + clear_dkms_key + fi + make modules_install \ + >> kernel-open/build.log \ + 2>> kernel-open/build_error.log + # Collect build logs and installed binaries + tar czvf "${local_tarball}" \ + "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd + fi + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a + } + + popd +} + +function build_driver_from_packages() { + if is_debuntu ; then + if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then + local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else + local pkglist=("nvidia-driver-${DRIVER}-open") ; fi + if is_debian ; then + pkglist=( + "firmware-nvidia-gsp=${DRIVER_VERSION}-1" + "nvidia-smi=${DRIVER_VERSION}-1" + "nvidia-alternative=${DRIVER_VERSION}-1" + "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1" + "nvidia-kernel-support=${DRIVER_VERSION}-1" + "nvidia-modprobe=${DRIVER_VERSION}-1" + "libnvidia-ml1=${DRIVER_VERSION}-1" + ) + fi + add_contrib_component + apt-get update -qq + execute_with_retries apt-get install -y -qq --no-install-recommends dkms + execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" + sync + + elif is_rocky ; then + if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then + echo "nvidia-driver:${DRIVER}-dkms installed successfully" + else + execute_with_retries dnf -y -q module install 'nvidia-driver:latest' + fi + sync + fi +} + +function install_nvidia_userspace_runfile() { + # Parameters for NVIDIA-provided Debian GPU driver + readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + + readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") + + USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" + readonly USERSPACE_FILENAME + + # This .run file contains NV's OpenGL implementation as well as + # nvidia optimized implementations of the gtk+ 2,3 stack(s) not + # including glib (https://docs.gtk.org/glib/), and what appears to + # be a copy of the source from the kernel-open directory of for + # example DRIVER_VERSION=560.35.03 + # + # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz + # + # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run + # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. + is_complete userspace && return + + local local_fn="${tmpdir}/userspace.run" + + cache_fetched_package "${USERSPACE_URL}" \ + "${pkg_bucket}/${USERSPACE_FILENAME}" \ + "${local_fn}" + + local runfile_args + runfile_args="" + local cache_hit="0" + local local_tarball + + if is_rocky8 ; then + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local_tarball="${workdir}/${build_tarball}" + local def_dir="${modulus_md5sum:-unsigned}" + local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") + + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + cache_hit="1" + runfile_args="--no-kernel-modules" + echo "cache hit" + else + install_build_dependencies + configure_dkms_certs + local signing_options + signing_options="" + if [[ -n "${PSN}" ]]; then + signing_options="--module-signing-hash sha256 \ + --module-signing-x509-hash sha256 \ + --module-signing-secret-key \"${mok_key}\" \ + --module-signing-public-key \"${mok_der}\" \ + --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ + " + fi + runfile_args="--no-dkms ${signing_options}" + fi + } + else + runfile_args="--no-kernel-modules" + fi + + execute_with_retries bash "${local_fn}" -e -q \ + ${runfile_args} \ + --ui=none \ + --install-libglvnd \ + --tmpdir="${tmpdir}" + + if is_rocky8 ; then + if [[ "${cache_hit}" == "1" ]] ; then + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a + else + clear_dkms_key + tar czf "${local_tarball}" \ + /var/log/nvidia-installer.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + fi + fi + + rm -f "${local_fn}" + mark_complete userspace + sync +} + +function install_cuda_runfile() { + is_complete cuda && return + + local local_fn="${tmpdir}/cuda.run" + + cache_fetched_package "${NVIDIA_CUDA_URL}" \ + "${pkg_bucket}/${CUDA_RUNFILE}" \ + "${local_fn}" + + execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" + rm -f "${local_fn}" + mark_complete cuda + sync +} + +function install_cuda_toolkit() { + local cudatk_package=cuda-toolkit + if ge_debian12 && is_src_os ; then + cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1" + elif [[ -n "${CUDA_VERSION}" ]]; then + cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}" + fi + cuda_package="cuda=${CUDA_FULL_VERSION}-1" + readonly cudatk_package + if is_debuntu ; then +# if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi + execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} + elif is_rocky ; then + # rocky9: cuda-11-[7,8], cuda-12-[1..6] + execute_with_retries dnf -y -q install "${cudatk_package}" + fi + sync +} + +function load_kernel_module() { + # for some use cases, the kernel module needs to be removed before first use of nvidia-smi + for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do + rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + done + + depmod -a + modprobe nvidia + for suffix in uvm modeset drm; do + modprobe "nvidia-${suffix}" + done + # TODO: if peermem is available, also modprobe nvidia-peermem +} + +function install_cuda(){ + is_complete cuda-repo && return + + if ( ge_debian12 && is_src_os ) ; then + echo "installed with the driver on ${_shortname}" + return 0 + fi + + # The OS package distributions are unreliable + install_cuda_runfile + + # Includes CUDA packages + add_repo_cuda + + mark_complete cuda-repo +} + +function install_nvidia_container_toolkit() { + is_complete install-nvtk && return + + local container_runtime_default + if command -v docker ; then container_runtime_default='docker' + elif command -v containerd ; then container_runtime_default='containerd' + elif command -v crio ; then container_runtime_default='crio' + else container_runtime_default='' ; fi + CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}") + + if test -z "${CONTAINER_RUNTIME}" ; then return ; fi + + add_repo_nvidia_container_toolkit + if is_debuntu ; then + execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else + execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi + nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" + systemctl restart "${CONTAINER_RUNTIME}" + + mark_complete install-nvtk +} + +# Install NVIDIA GPU driver provided by NVIDIA +function install_nvidia_gpu_driver() { + is_complete gpu-driver && return + + if ( ge_debian12 && is_src_os ) ; then + add_nonfree_components + apt-get update -qq + apt-get -yq install \ + dkms \ + nvidia-open-kernel-dkms \ + nvidia-open-kernel-support \ + nvidia-smi \ + libglvnd0 \ + libcuda1 + echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully" + return 0 + fi + + # OS driver packages do not produce reliable driver ; use runfile + install_nvidia_userspace_runfile + + build_driver_from_github + + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + mark_complete gpu-driver +} + +function install_ops_agent(){ + is_complete ops-agent && return + + mkdir -p /opt/google + cd /opt/google + # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation + curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install + + is_complete ops-agent +} + +# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics +function install_gpu_monitoring_agent() { + download_gpu_monitoring_agent + install_gpu_monitoring_agent_dependency + start_gpu_monitoring_agent_service +} + +function download_gpu_monitoring_agent(){ + if is_rocky ; then + execute_with_retries "dnf -y -q install git" + else + execute_with_retries "apt-get install git -y" + fi + mkdir -p /opt/google + chmod 777 /opt/google + cd /opt/google + test -d compute-gpu-monitoring || \ + execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" +} + +function install_gpu_monitoring_agent_dependency(){ + cd /opt/google/compute-gpu-monitoring/linux + /usr/bin/python3 -m venv venv + ( + source venv/bin/activate + pip install wheel + pip install -Ur requirements.txt + ) +} + +function start_gpu_monitoring_agent_service(){ + cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system + systemctl daemon-reload + systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service +} + +# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics +function install_gpu_agent() { + # Stackdriver GPU agent parameters +# local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' + local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics' + if ( ! command -v pip && is_debuntu ) ; then + execute_with_retries "apt-get install -y -qq python3-pip" + fi + local install_dir=/opt/gpu-utilization-agent + mkdir -p "${install_dir}" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ + | sed -e 's/-u --format=/--format=/' \ + | dd status=none of="${install_dir}/report_gpu_metrics.py" + local venv="${install_dir}/venv" + /usr/bin/python3 -m venv "${venv}" +( + source "${venv}/bin/activate" + python3 -m pip install --upgrade pip + execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" +) + sync + + # Generate GPU service. + cat </lib/systemd/system/gpu-utilization-agent.service +[Unit] +Description=GPU Utilization Metric Agent + +[Service] +Type=simple +PIDFile=/run/gpu_agent.pid +ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' +User=root +Group=root +WorkingDirectory=/ +Restart=always + +[Install] +WantedBy=multi-user.target +EOF + # Reload systemd manager configuration + systemctl daemon-reload + # Enable gpu-utilization-agent service + systemctl --no-reload --now enable gpu-utilization-agent.service +} + +function configure_gpu_exclusive_mode() { + # only run this function when spark < 3.0 + if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi + # include exclusive mode on GPU + nvsmi -c EXCLUSIVE_PROCESS + clear_nvsmi_cache +} + +function install_build_dependencies() { + is_complete build-dependencies && return + + if is_debuntu ; then + if is_ubuntu22 && is_cuda12 ; then + # On ubuntu22, the default compiler does not build some kernel module versions + # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 + execute_with_retries apt-get install -y -qq gcc-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-12 + fi + + elif is_rocky ; then + execute_with_retries dnf -y -q install gcc + + local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" + set +e + eval "${dnf_cmd}" > "${install_log}" 2>&1 + local retval="$?" + set -e + + if [[ "${retval}" == "0" ]] ; then return ; fi + + if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then + # this kernel-devel may have been migrated to the vault + local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" + local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" + dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ + "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" + )" + fi + + execute_with_retries "${dnf_cmd}" + fi + mark_complete build-dependencies +} + +function install_gpu_driver_and_cuda() { + install_nvidia_gpu_driver + install_cuda + load_kernel_module +} + +function prepare_gpu_install_env() { + # Whether to install NVIDIA-provided or OS-provided GPU driver + GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') + readonly GPU_DRIVER_PROVIDER + + # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver + INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') + readonly INSTALL_GPU_AGENT + + set_cuda_repo_shortname + set_nv_urls + set_cuda_runfile_url + set_cudnn_version + set_cudnn_tarball_url + + if is_cuda11 ; then gcc_ver="11" + elif is_cuda12 ; then gcc_ver="12" ; fi +} + +function gpu_install_exit_handler() { + if is_ramdisk ; then + for shmdir in /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + done + fi + hold_nvidia_packages +} \ No newline at end of file diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index dcbd8c15e..001ef7acc 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -10,14 +10,18 @@ set -euxo pipefail [% INSERT common/util_functions %] -[% INSERT common/yarn_functions %] +[% INSERT common/install_functions %] [% INSERT gpu/util_functions %] +[% INSERT gpu/install_functions %] + +[% INCLUDE gpu/yarn_functions %] + +[% INSERT gpu/spark_functions %] + function main() { - install_nvidia_gpu_driver - install_cuda - load_kernel_module + install_gpu_driver_and_cuda #Install GPU metrics collection in Stackdriver if needed if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then @@ -54,6 +58,7 @@ function main() { } function exit_handler() { + gpu_install_exit_handler gpu_exit_handler pip_exit_handler yarn_exit_handler @@ -65,6 +70,7 @@ function prepare_to_install(){ prepare_common_env prepare_pip_env prepare_gpu_env + prepare_gpu_install_env trap exit_handler EXIT } diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions new file mode 100644 index 000000000..5da2530d4 --- /dev/null +++ b/templates/gpu/spark_functions @@ -0,0 +1,36 @@ +function install_spark_rapids() { + # Update SPARK RAPIDS config + local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 + + # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu + local -r scala_ver="2.12" + + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then + local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 + fi + + readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) + readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) + + local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' + + local jar_basename + + jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" + + jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" + + jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" + cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" +} diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 4834adb33..48473d13b 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -49,8 +49,6 @@ function set_support_matrix() { ) } -set_support_matrix - function set_cuda_version() { case "${DATAPROC_IMAGE_VERSION}" in "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) @@ -140,1027 +138,9 @@ function set_driver_version() { fi } -function set_cudnn_version() { - readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" - readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" - - # Parameters for NVIDIA-provided cuDNN library - DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} - readonly DEFAULT_CUDNN_VERSION - CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") - # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} - if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" - elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then - # cuDNN v8 is not distribution for ubuntu20+, debian12 - CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" - elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then - # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 - CUDNN_VERSION="8.8.0.121" - fi - readonly CUDNN_VERSION -} - - -function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) -function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) - -function set_cuda_repo_shortname() { -# Short name for urls -# https://developer.download.nvidia.com/compute/cuda/repos/${shortname} - if is_rocky ; then - shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" - else - shortname="$(os_id)$(os_vercat)" - fi -} - -function set_nv_urls() { - # Parameters for NVIDIA-provided package repositories - readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' - readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" - - # Parameter for NVIDIA-provided Rocky Linux GPU driver - readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" -} - -function set_cuda_runfile_url() { - local MAX_DRIVER_VERSION - local MAX_CUDA_VERSION - - local MIN_OPEN_DRIVER_VER="515.48.07" - local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" - local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER - - if is_cuda12 ; then - if is_debian12 ; then - MIN_DRIVER_VERSION="545.23.06" - MIN_CUDA_VERSION="12.3.0" - elif is_debian10 ; then - MAX_DRIVER_VERSION="555.42.02" - MAX_CUDA_VERSION="12.5.0" - elif is_ubuntu18 ; then - MAX_DRIVER_VERSION="530.30.02" - MAX_CUDA_VERSION="12.1.1" - fi - elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then - if le_debian10 ; then - # cuda 11 is not supported for <= debian10 - MAX_CUDA_VERSION="0" - MAX_DRIVER_VERSION="0" - fi - else - echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" - fi - - if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then - echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" - elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then - echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}" - fi - if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then - echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" - elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then - echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" - fi - - # driver version named in cuda runfile filename - # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) - readonly -A drv_for_cuda=( - ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" - ["11.8.0"]="520.61.05" - ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" - ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" - ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" - ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" - ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ - ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not - ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" - ) - - # Verify that the file with the indicated combination exists - local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]} - CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run" - local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" - local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" - - NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") - readonly NVIDIA_CUDA_URL - - CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" - readonly CUDA_RUNFILE - - if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then - echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" - exit 1 - fi - - if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then - echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" - elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then - echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" - elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then - echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" - elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then - echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" - fi -} - -function set_cudnn_tarball_url() { -CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" -CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" -if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then - # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" - if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then - # When cuDNN version is greater than or equal to 8.4.1.50 use this format - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" - fi - # Use legacy url format with one of the tarball name formats depending on version as above - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" -fi -if ( version_ge "${CUDA_VERSION}" "12.0" ); then - # Use modern url format When cuda version is greater than or equal to 12.0 - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" -fi -readonly CUDNN_TARBALL -readonly CUDNN_TARBALL_URL -} - -function install_cuda_keyring_pkg() { - if ( test -v CUDA_KEYRING_PKG_INSTALLED && - [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi - local kr_ver=1.1 - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ - -o "${tmpdir}/cuda-keyring.deb" - dpkg -i "${tmpdir}/cuda-keyring.deb" - rm -f "${tmpdir}/cuda-keyring.deb" - CUDA_KEYRING_PKG_INSTALLED="1" -} - -function uninstall_cuda_keyring_pkg() { - apt-get purge -yq cuda-keyring - CUDA_KEYRING_PKG_INSTALLED="0" -} - -function install_local_cuda_repo() { - is_complete install-local-cuda-repo && return - - if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi - CUDA_LOCAL_REPO_INSTALLED="1" - pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" - CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" - readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" - readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" - readonly DIST_KEYRING_DIR="/var/${pkgname}" - - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" - - dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" - rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" - cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ - - if is_ubuntu ; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ - -o /etc/apt/preferences.d/cuda-repository-pin-600 - fi - - mark_complete install-local-cuda-repo -} -function uninstall_local_cuda_repo(){ - apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cuda-repo" -} - -function install_local_cudnn_repo() { - is_complete install-local-cudnn-repo && return - - pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" - CUDNN_PKG_NAME="${pkgname}" - local_deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" - - # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${local_deb_url}" -o "${tmpdir}/local-installer.deb" - - dpkg -i "${tmpdir}/local-installer.deb" - - rm -f "${tmpdir}/local-installer.deb" - - cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - - mark_complete install-local-cudnn-repo -} - -function uninstall_local_cudnn_repo() { - apt-get purge -yq "${CUDNN_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cudnn-repo" -} - -function install_local_cudnn8_repo() { - is_complete install-local-cudnn8-repo && return - - if is_ubuntu ; then cudnn8_shortname="ubuntu2004" - elif is_debian ; then cudnn8_shortname="debian11" - else return 0 ; fi - if is_cuda12 ; then CUDNN8_CUDA_VER=12.0 - elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8 - else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi - cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" - - pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" - CUDNN8_PKG_NAME="${pkgname}" - - deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_fn="${tmpdir}/${deb_fn}" - local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" - - # cache the cudnn package - cache_fetched_package "${local_deb_url}" \ - "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ - "${local_deb_fn}" - - local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" - # If we are using a ram disk, mount another where we will unpack the cudnn local installer - if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then - mkdir -p "${cudnn_path}" - mount -t tmpfs tmpfs "${cudnn_path}" - fi - - dpkg -i "${local_deb_fn}" - - rm -f "${local_deb_fn}" - - cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings - mark_complete install-local-cudnn8-repo -} - -function uninstall_local_cudnn8_repo() { - apt-get purge -yq "${CUDNN8_PKG_NAME}" - mark_incomplete install-local-cudnn8-repo -} - -function install_nvidia_nccl() { - readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} - readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) - - is_complete nccl && return - - if is_cuda11 && is_debian12 ; then - echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" - return - fi - - local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" - - # https://github.com/NVIDIA/nccl/blob/master/README.md - # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Fermi: SM_20, compute_30 - # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 - # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 - # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 - - # The following architectures are suppored by open kernel driver - # Volta: SM_70,SM_72, compute_70,compute_72 - # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 - - # The following architectures are supported by CUDA v11.8+ - # Ada: SM_89, compute_89 - # Hopper: SM_90,SM_90a compute_90,compute_90a - # Blackwell: SM_100, compute_100 - NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" - if version_ge "${CUDA_VERSION}" "11.8" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" - fi - if version_ge "${CUDA_VERSION}" "12.0" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" - fi - - mkdir -p "${workdir}" - pushd "${workdir}" - - test -d "${workdir}/nccl" || { - local tarball_fn="v${NCCL_VERSION}-1.tar.gz" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ - | tar xz - mv "nccl-${NCCL_VERSION}-1" nccl - } - - local build_path - if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else - build_path="nccl/build/pkg/rpm/x86_64" ; fi - - test -d "${workdir}/nccl/build" || { - local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" - - output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') - if echo "${output}" | grep -q "${gcs_tarball}" ; then - # cache hit - unpack from cache - echo "cache hit" - else - # build and cache - pushd nccl - # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install - install_build_dependencies - if is_debuntu ; then - # These packages are required to build .deb packages from source - execute_with_retries \ - apt-get install -y -qq build-essential devscripts debhelper fakeroot - export NVCC_GENCODE - execute_with_retries make -j$(nproc) pkg.debian.build - elif is_rocky ; then - # These packages are required to build .rpm packages from source - execute_with_retries \ - dnf -y -q install rpm-build rpmdevtools - export NVCC_GENCODE - execute_with_retries make -j$(nproc) pkg.redhat.build - fi - tar czvf "/${local_tarball}" "../${build_path}" - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - rm "${local_tarball}" - make clean - popd - fi - gcloud storage cat "${gcs_tarball}" | tar xz - } - - if is_debuntu ; then - dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb" - elif is_rocky ; then - rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm" - fi - - popd - mark_complete nccl -} - function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) -function install_nvidia_cudnn() { - is_complete cudnn && return - - local major_version - major_version="${CUDNN_VERSION%%.*}" - local cudnn_pkg_version - cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" - - if is_rocky ; then - if is_cudnn8 ; then - execute_with_retries dnf -y -q install \ - "libcudnn${major_version}" \ - "libcudnn${major_version}-devel" - sync - elif is_cudnn9 ; then - execute_with_retries dnf -y -q install \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" - sync - else - echo "Unsupported cudnn version: '${major_version}'" - fi - elif is_debuntu; then - if ge_debian12 && is_src_os ; then - apt-get -y install nvidia-cudnn - else - if is_cudnn8 ; then - install_local_cudnn8_repo - - apt-get update -qq - - execute_with_retries \ - apt-get -y install --no-install-recommends \ - "libcudnn8=${cudnn_pkg_version}" \ - "libcudnn8-dev=${cudnn_pkg_version}" - - uninstall_local_cudnn8_repo - sync - elif is_cudnn9 ; then - install_cuda_keyring_pkg - - apt-get update -qq - - execute_with_retries \ - apt-get -y install --no-install-recommends \ - "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" - sync - else - echo "Unsupported cudnn version: [${CUDNN_VERSION}]" - fi - fi - else - echo "Unsupported OS: '${_shortname}'" - exit 1 - fi - - ldconfig - - echo "NVIDIA cuDNN successfully installed for ${_shortname}." - mark_complete cudnn -} - -function add_nonfree_components() { - if is_src_nvidia ; then return; fi - if ge_debian12 ; then - # Include in sources file components on which nvidia-open-kernel-dkms depends - local -r debian_sources="/etc/apt/sources.list.d/debian.sources" - local components="main contrib non-free non-free-firmware" - - sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" - elif is_debian ; then - sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list - fi -} - -# -# Install package signing key and add corresponding repository -# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html -function add_repo_nvidia_container_toolkit() { - local nvctk_root="https://nvidia.github.io/libnvidia-container" - local signing_key_url="${nvctk_root}/gpgkey" - local repo_data - - if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" - else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi - - os_add_repo nvidia-container-toolkit \ - "${signing_key_url}" \ - "${repo_data}" \ - "no" -} - -function add_repo_cuda() { - if is_debuntu ; then - install_cuda_keyring_pkg # 11.7+, 12.0+ - elif is_rocky ; then - execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" - fi -} - -function build_driver_from_github() { - # non-GPL driver will have been built on rocky8 - if is_rocky8 ; then return 0 ; fi - pushd "${workdir}" - - test -d "${workdir}/open-gpu-kernel-modules" || { - local tarball_fn="${DRIVER_VERSION}.tar.gz" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ - | tar xz - mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules - } - - local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" - test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { - local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local def_dir="${modulus_md5sum:-unsigned}" - local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") - - local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" - - if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then - echo "cache hit" - else - # build the kernel modules - pushd open-gpu-kernel-modules - install_build_dependencies - if ( is_cuda11 && is_ubuntu22 ) ; then - echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" - exit 1 - fi - execute_with_retries make -j$(nproc) modules \ - > kernel-open/build.log \ - 2> kernel-open/build_error.log - # Sign kernel modules - if [[ -n "${PSN}" ]]; then - configure_dkms_certs - for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do - "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ - "${mok_key}" \ - "${mok_der}" \ - "${module}" - done - clear_dkms_key - fi - make modules_install \ - >> kernel-open/build.log \ - 2>> kernel-open/build_error.log - # Collect build logs and installed binaries - tar czvf "${local_tarball}" \ - "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ - $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - rm "${local_tarball}" - make clean - popd - fi - gcloud storage cat "${gcs_tarball}" | tar -C / -xzv - depmod -a - } - - popd -} - -function build_driver_from_packages() { - if is_debuntu ; then - if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then - local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else - local pkglist=("nvidia-driver-${DRIVER}-open") ; fi - if is_debian ; then - pkglist=( - "firmware-nvidia-gsp=${DRIVER_VERSION}-1" - "nvidia-smi=${DRIVER_VERSION}-1" - "nvidia-alternative=${DRIVER_VERSION}-1" - "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1" - "nvidia-kernel-support=${DRIVER_VERSION}-1" - "nvidia-modprobe=${DRIVER_VERSION}-1" - "libnvidia-ml1=${DRIVER_VERSION}-1" - ) - fi - add_contrib_component - apt-get update -qq - execute_with_retries apt-get install -y -qq --no-install-recommends dkms - execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" - sync - - elif is_rocky ; then - if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then - echo "nvidia-driver:${DRIVER}-dkms installed successfully" - else - execute_with_retries dnf -y -q module install 'nvidia-driver:latest' - fi - sync - fi -} - -function install_nvidia_userspace_runfile() { - # Parameters for NVIDIA-provided Debian GPU driver - readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - - readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") - - USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" - readonly USERSPACE_FILENAME - - # This .run file contains NV's OpenGL implementation as well as - # nvidia optimized implementations of the gtk+ 2,3 stack(s) not - # including glib (https://docs.gtk.org/glib/), and what appears to - # be a copy of the source from the kernel-open directory of for - # example DRIVER_VERSION=560.35.03 - # - # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz - # - # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run - # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. - is_complete userspace && return - - local local_fn="${tmpdir}/userspace.run" - - cache_fetched_package "${USERSPACE_URL}" \ - "${pkg_bucket}/${USERSPACE_FILENAME}" \ - "${local_fn}" - - local runfile_args - runfile_args="" - local cache_hit="0" - local local_tarball - - if is_rocky8 ; then - local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" - test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { - local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" - local_tarball="${workdir}/${build_tarball}" - local def_dir="${modulus_md5sum:-unsigned}" - local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") - - local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" - - if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then - cache_hit="1" - runfile_args="--no-kernel-modules" - echo "cache hit" - else - install_build_dependencies - configure_dkms_certs - local signing_options - signing_options="" - if [[ -n "${PSN}" ]]; then - signing_options="--module-signing-hash sha256 \ - --module-signing-x509-hash sha256 \ - --module-signing-secret-key \"${mok_key}\" \ - --module-signing-public-key \"${mok_der}\" \ - --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ - " - fi - runfile_args="--no-dkms ${signing_options}" - fi - } - else - runfile_args="--no-kernel-modules" - fi - - execute_with_retries bash "${local_fn}" -e -q \ - ${runfile_args} \ - --ui=none \ - --install-libglvnd \ - --tmpdir="${tmpdir}" - - if is_rocky8 ; then - if [[ "${cache_hit}" == "1" ]] ; then - gcloud storage cat "${gcs_tarball}" | tar -C / -xzv - depmod -a - else - clear_dkms_key - tar czf "${local_tarball}" \ - /var/log/nvidia-installer.log \ - $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - fi - fi - - rm -f "${local_fn}" - mark_complete userspace - sync -} - -function install_cuda_runfile() { - is_complete cuda && return - - local local_fn="${tmpdir}/cuda.run" - - cache_fetched_package "${NVIDIA_CUDA_URL}" \ - "${pkg_bucket}/${CUDA_RUNFILE}" \ - "${local_fn}" - - execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" - rm -f "${local_fn}" - mark_complete cuda - sync -} - -function install_cuda_toolkit() { - local cudatk_package=cuda-toolkit - if ge_debian12 && is_src_os ; then - cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1" - elif [[ -n "${CUDA_VERSION}" ]]; then - cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}" - fi - cuda_package="cuda=${CUDA_FULL_VERSION}-1" - readonly cudatk_package - if is_debuntu ; then -# if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi - execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} - elif is_rocky ; then - # rocky9: cuda-11-[7,8], cuda-12-[1..6] - execute_with_retries dnf -y -q install "${cudatk_package}" - fi - sync -} - -function load_kernel_module() { - # for some use cases, the kernel module needs to be removed before first use of nvidia-smi - for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" - done - - depmod -a - modprobe nvidia - for suffix in uvm modeset drm; do - modprobe "nvidia-${suffix}" - done - # TODO: if peermem is available, also modprobe nvidia-peermem -} - -function install_cuda(){ - is_complete cuda-repo && return - - if ( ge_debian12 && is_src_os ) ; then - echo "installed with the driver on ${_shortname}" - return 0 - fi - - # The OS package distributions are unreliable - install_cuda_runfile - - # Includes CUDA packages - add_repo_cuda - - mark_complete cuda-repo -} - -function install_nvidia_container_toolkit() { - is_complete install-nvtk && return - - local container_runtime_default - if command -v docker ; then container_runtime_default='docker' - elif command -v containerd ; then container_runtime_default='containerd' - elif command -v crio ; then container_runtime_default='crio' - else container_runtime_default='' ; fi - CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}") - - if test -z "${CONTAINER_RUNTIME}" ; then return ; fi - - add_repo_nvidia_container_toolkit - if is_debuntu ; then - execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else - execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi - nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" - systemctl restart "${CONTAINER_RUNTIME}" - - mark_complete install-nvtk -} - -# Install NVIDIA GPU driver provided by NVIDIA -function install_nvidia_gpu_driver() { - is_complete gpu-driver && return - - if ( ge_debian12 && is_src_os ) ; then - add_nonfree_components - apt-get update -qq - apt-get -yq install \ - dkms \ - nvidia-open-kernel-dkms \ - nvidia-open-kernel-support \ - nvidia-smi \ - libglvnd0 \ - libcuda1 - echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully" - return 0 - fi - - # OS driver packages do not produce reliable driver ; use runfile - install_nvidia_userspace_runfile - - build_driver_from_github - - echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" - mark_complete gpu-driver -} - -function install_ops_agent(){ - is_complete ops-agent && return - - mkdir -p /opt/google - cd /opt/google - # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation - curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh - execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install - - is_complete ops-agent -} - -# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics -function install_gpu_monitoring_agent() { - download_gpu_monitoring_agent - install_gpu_monitoring_agent_dependency - start_gpu_monitoring_agent_service -} - -function download_gpu_monitoring_agent(){ - if is_rocky ; then - execute_with_retries "dnf -y -q install git" - else - execute_with_retries "apt-get install git -y" - fi - mkdir -p /opt/google - chmod 777 /opt/google - cd /opt/google - test -d compute-gpu-monitoring || \ - execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" -} - -function install_gpu_monitoring_agent_dependency(){ - cd /opt/google/compute-gpu-monitoring/linux - /usr/bin/python3 -m venv venv - ( - source venv/bin/activate - pip install wheel - pip install -Ur requirements.txt - ) -} - -function start_gpu_monitoring_agent_service(){ - cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system - systemctl daemon-reload - systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service -} - -# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics -function install_gpu_agent() { - # Stackdriver GPU agent parameters -# local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' - local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics' - if ( ! command -v pip && is_debuntu ) ; then - execute_with_retries "apt-get install -y -qq python3-pip" - fi - local install_dir=/opt/gpu-utilization-agent - mkdir -p "${install_dir}" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ - | sed -e 's/-u --format=/--format=/' \ - | dd status=none of="${install_dir}/report_gpu_metrics.py" - local venv="${install_dir}/venv" - /usr/bin/python3 -m venv "${venv}" -( - source "${venv}/bin/activate" - python3 -m pip install --upgrade pip - execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" -) - sync - - # Generate GPU service. - cat </lib/systemd/system/gpu-utilization-agent.service -[Unit] -Description=GPU Utilization Metric Agent - -[Service] -Type=simple -PIDFile=/run/gpu_agent.pid -ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' -User=root -Group=root -WorkingDirectory=/ -Restart=always - -[Install] -WantedBy=multi-user.target -EOF - # Reload systemd manager configuration - systemctl daemon-reload - # Enable gpu-utilization-agent service - systemctl --no-reload --now enable gpu-utilization-agent.service -} - -function configure_gpu_exclusive_mode() { - # only run this function when spark < 3.0 - if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi - # include exclusive mode on GPU - nvsmi -c EXCLUSIVE_PROCESS - clear_nvsmi_cache -} - -function install_spark_rapids() { - # Update SPARK RAPIDS config - local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" - local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 - - # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu - local -r scala_ver="2.12" - - if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then - local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 - fi - - readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) - readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) - - local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' - local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - - local jar_basename - - jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" - cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" - - jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" - cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" - - jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" - cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ - "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" -} - -function configure_gpu_script() { - # Download GPU discovery script - local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' - mkdir -p ${spark_gpu_script_dir} - # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still - # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: - # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh - local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh" - cat > "${gpus_resources_script}" <<'EOF' -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]} - -ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') - -echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} -EOF - - chmod a+rx "${gpus_resources_script}" - - local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" - - local executor_cores - executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" - local executor_memory - executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" - local task_cpus=2 - local gpu_amount - - # The current setting of spark.task.resource.gpu.amount (0.333) is - # not ideal to get the best performance from the RAPIDS Accelerator - # plugin. It's recommended to be 1/{executor core count} unless you - # have a special use case. -# gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" - gpu_amount="$(perl -e "print 1 / ${executor_cores}")" - -# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression - - cat >>"${spark_defaults_conf}" <> "${HADOOP_CONF_DIR}/container-executor.cfg" - printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" - printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" - else - printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg" - fi - - # Configure a systemd unit to ensure that permissions are set on restart - cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service< "${nvsmi_query_xml}" } -function install_build_dependencies() { - is_complete build-dependencies && return - - if is_debuntu ; then - if is_ubuntu22 && is_cuda12 ; then - # On ubuntu22, the default compiler does not build some kernel module versions - # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 - execute_with_retries apt-get install -y -qq gcc-12 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 - update-alternatives --set gcc /usr/bin/gcc-12 - fi - - elif is_rocky ; then - execute_with_retries dnf -y -q install gcc - - local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" - set +e - eval "${dnf_cmd}" > "${install_log}" 2>&1 - local retval="$?" - set -e - - if [[ "${retval}" == "0" ]] ; then return ; fi - - if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then - # this kernel-devel may have been migrated to the vault - local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" - local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" - dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ - "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" - )" - fi +function prepare_gpu_env(){ + set_support_matrix - execute_with_retries "${dnf_cmd}" - fi - mark_complete build-dependencies -} + set_cuda_version + set_driver_version -function prepare_gpu_env(){ set +e gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" set -e @@ -1256,27 +199,8 @@ function prepare_gpu_env(){ RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}") readonly RAPIDS_RUNTIME - # Whether to install NVIDIA-provided or OS-provided GPU driver - GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') - readonly GPU_DRIVER_PROVIDER - - # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver - INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') - readonly INSTALL_GPU_AGENT - # determine whether we have nvidia-smi installed and working nvsmi - - set_cuda_version - set_driver_version - set_cuda_repo_shortname - set_nv_urls - set_cuda_runfile_url - set_cudnn_version - set_cudnn_tarball_url - - if is_cuda11 ; then gcc_ver="11" - elif is_cuda12 ; then gcc_ver="12" ; fi } # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades @@ -1292,12 +216,5 @@ function hold_nvidia_packages() { } function gpu_exit_handler() { - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - for shmdir in /var/cudnn-local ; do - if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then - umount -f ${shmdir} - fi - done - fi - hold_nvidia_packages + echo "no operations in gpu exit handler" } diff --git a/templates/gpu/yarn_functions b/templates/gpu/yarn_functions new file mode 100644 index 000000000..5b8455c19 --- /dev/null +++ b/templates/gpu/yarn_functions @@ -0,0 +1,145 @@ +[% INSERT common/yarn_functions %] + +function configure_yarn_gpu_resources() { + if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' +} + +function configure_gpu_script() { + # Download GPU discovery script + local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' + mkdir -p ${spark_gpu_script_dir} + # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still + # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: + # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh + local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh" + cat > "${gpus_resources_script}" <<'EOF' +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]} + +ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') + +echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} +EOF + + chmod a+rx "${gpus_resources_script}" + + local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" + + local executor_cores + executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" + local executor_memory + executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" + local task_cpus=2 + local gpu_amount + + # The current setting of spark.task.resource.gpu.amount (0.333) is + # not ideal to get the best performance from the RAPIDS Accelerator + # plugin. It's recommended to be 1/{executor core count} unless you + # have a special use case. +# gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" + gpu_amount="$(perl -e "print 1 / ${executor_cores}")" + +# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression + + cat >>"${spark_defaults_conf}" <> "${HADOOP_CONF_DIR}/container-executor.cfg" + printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" + else + printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg" + fi + + # Configure a systemd unit to ensure that permissions are set on restart + cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service< Date: Tue, 7 Jan 2025 18:50:33 -0800 Subject: [PATCH 111/130] templates/common/util_functions: * increased minimum memory threshold for ram disk * moved apt_add_repo and friends to common/install_functions templates/dask/util_functions: * validating conda tarball before caching to gcs templates/generate-action.pl: * improved usage documentation a little templates/gpu/install_functions * using /opt/conda/miniconda3/bin/python3 instead of /usr/bin/ for venv pre-install --- templates/common/util_functions | 56 +-------------------------------- templates/dask/util_functions | 4 +-- templates/generate-action.pl | 14 ++++++++- templates/gpu/install_functions | 4 +-- 4 files changed, 18 insertions(+), 60 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 4d9f983a4..336af37f8 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -297,7 +297,7 @@ function is_ramdisk() { function mount_ramdisk(){ local free_mem free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" - if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi # Write to a ramdisk instead of churning the persistent disk @@ -350,60 +350,6 @@ function check_os() { fi } -# -# Generate repo file under /etc/apt/sources.list.d/ -# -function apt_add_repo() { - local -r repo_name="$1" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" - local -r include_src="${4:-yes}" - local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" - - echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" - if [[ "${include_src}" == "yes" ]] ; then - echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" - fi - - apt-get update -qq -} - -# -# Generate repo file under /etc/yum.repos.d/ -# -function dnf_add_repo() { - local -r repo_name="$1" - local -r repo_url="$3" # "http(s)://host/path/filename.repo" - local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" - - curl -s -L "${repo_url}" \ - | dd of="${repo_path}" status=progress -# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ -} - -# -# Keyrings default to -# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or -# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) -# -function os_add_repo() { - local -r repo_name="$1" - local -r signing_key_url="$2" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" - local kr_path - if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi - - mkdir -p "$(dirname "${kr_path}")" - - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ - | gpg --import --no-default-keyring --keyring "${kr_path}" - - if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" - else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi -} - function configure_dkms_certs() { if test -v PSN && [[ -z "${PSN}" ]]; then echo "No signing secret provided. skipping"; diff --git a/templates/dask/util_functions b/templates/dask/util_functions index d67da1fc1..ce6964e94 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -510,8 +510,8 @@ function install_conda_packages() { if [[ "$retval" == "0" ]] ; then is_installed="1" pushd "${DASK_CONDA_ENV}" - time ( - tar czf "${local_tarball}" . + time ( set -e + tar czf "${local_tarball}" . && tar tzf "${local_tarball}" gcloud storage cp "${local_tarball}" "${gcs_tarball}" rm "${local_tarball}" ) diff --git a/templates/generate-action.pl b/templates/generate-action.pl index 7cc954a67..950bd15fe 100644 --- a/templates/generate-action.pl +++ b/templates/generate-action.pl @@ -10,7 +10,19 @@ my $action = $ARGV[0]; my $v = { template_path => "${action}.in" }; -sub usage{ die "Usage: $0 " } +sub usage{ + # TODO: use File::Find to list the available actions for the user + my $message = < argument is the destination action name, not the source. +EOF + print STDERR $message; + die "Usage:$/$0 " +} usage unless( $action && -f "$ENV{PWD}/templates/$v->{template_path}" ); diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions index 2ea8ca4d2..f6aa9fcf9 100644 --- a/templates/gpu/install_functions +++ b/templates/gpu/install_functions @@ -798,7 +798,7 @@ function download_gpu_monitoring_agent(){ function install_gpu_monitoring_agent_dependency(){ cd /opt/google/compute-gpu-monitoring/linux - /usr/bin/python3 -m venv venv + /opt/conda/miniconda3/bin/python3 -m venv venv ( source venv/bin/activate pip install wheel @@ -829,7 +829,7 @@ function install_gpu_agent() { | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" - /usr/bin/python3 -m venv "${venv}" + /opt/conda/miniconda3/bin/python3 -m venv "${venv}" ( source "${venv}/bin/activate" python3 -m pip install --upgrade pip From d45e16bc8474f5c745985dd65f76c31ac11046a5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 8 Jan 2025 22:10:42 -0800 Subject: [PATCH 112/130] templates/dask/util_functions: * increase wait time for scheduler to come online * reduce noise from tar -t templates/gpu/yarn_functions, templates/gpu/install_functions: * protect many functions from running without attached accelerator templates/gpu/install_gpu_driver.sh.in * set +e in exit handler templates/gpu/spark_functions: * re-factor new function into this template templates/spark-rapids/spark-rapids.sh.in * removed redundant call to configure_gpu_script * set +e in exit handler --- templates/dask/util_functions | 7 ++++--- templates/gpu/install_functions | 11 +++++++++-- templates/gpu/install_gpu_driver.sh.in | 1 + templates/gpu/spark_functions | 7 +++++++ templates/gpu/yarn_functions | 7 ++++++- templates/spark-rapids/spark-rapids.sh.in | 2 +- 6 files changed, 28 insertions(+), 7 deletions(-) diff --git a/templates/dask/util_functions b/templates/dask/util_functions index ce6964e94..afcaadb58 100644 --- a/templates/dask/util_functions +++ b/templates/dask/util_functions @@ -150,7 +150,8 @@ function start_systemd_dask_service() { # Pause while scheduler comes online retries=30 while ! nc -vz "${MASTER}" 8786 ; do - sleep 3s + date + sleep 7s ((retries--)) if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi done @@ -399,7 +400,7 @@ function install_dask() { # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic CONDA_PACKAGES+=("fiona<1.8.22") fi - CONDA_PACKAGES+=('dask-yarn=${dask_yarn_version}' "distributed<2022.2") + CONDA_PACKAGES+=("dask-yarn=${dask_yarn_version}" "distributed<2022.2") fi CONDA_PACKAGES+=( @@ -511,7 +512,7 @@ function install_conda_packages() { is_installed="1" pushd "${DASK_CONDA_ENV}" time ( set -e - tar czf "${local_tarball}" . && tar tzf "${local_tarball}" + tar czf "${local_tarball}" . && tar tzf "${local_tarball}" > /dev/null gcloud storage cp "${local_tarball}" "${gcs_tarball}" rm "${local_tarball}" ) diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions index f6aa9fcf9..746eb79bb 100644 --- a/templates/gpu/install_functions +++ b/templates/gpu/install_functions @@ -685,9 +685,12 @@ function install_cuda_toolkit() { } function load_kernel_module() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + ( set +e + rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + ) done depmod -a @@ -700,6 +703,7 @@ function load_kernel_module() { function install_cuda(){ is_complete cuda-repo && return + if [[ "${gpu_count}" == "0" ]] ; then return ; fi if ( ge_debian12 && is_src_os ) ; then echo "installed with the driver on ${_shortname}" @@ -740,6 +744,7 @@ function install_nvidia_container_toolkit() { # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { is_complete gpu-driver && return + if [[ "${gpu_count}" == "0" ]] ; then return ; fi if ( ge_debian12 && is_src_os ) ; then add_nonfree_components @@ -778,6 +783,7 @@ function install_ops_agent(){ # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_monitoring_agent() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi download_gpu_monitoring_agent install_gpu_monitoring_agent_dependency start_gpu_monitoring_agent_service @@ -861,6 +867,7 @@ EOF } function configure_gpu_exclusive_mode() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # only run this function when spark < 3.0 if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi # include exclusive mode on GPU @@ -944,4 +951,4 @@ function gpu_install_exit_handler() { done fi hold_nvidia_packages -} \ No newline at end of file +} diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index 001ef7acc..a7c4d353f 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -58,6 +58,7 @@ function main() { } function exit_handler() { + set +e gpu_install_exit_handler gpu_exit_handler pip_exit_handler diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions index 5da2530d4..fa29330de 100644 --- a/templates/gpu/spark_functions +++ b/templates/gpu/spark_functions @@ -1,3 +1,10 @@ +function download_spark_jar() { + local -r url=$1 + local -r jar_name=${url##*/} + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${url}" -o "${SPARK_JARS_DIR}/${jar_name}" +} + function install_spark_rapids() { # Update SPARK RAPIDS config local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" diff --git a/templates/gpu/yarn_functions b/templates/gpu/yarn_functions index 5b8455c19..d9040b1d6 100644 --- a/templates/gpu/yarn_functions +++ b/templates/gpu/yarn_functions @@ -15,6 +15,7 @@ function configure_yarn_gpu_resources() { } function configure_gpu_script() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # Download GPU discovery script local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' mkdir -p ${spark_gpu_script_dir} @@ -89,6 +90,7 @@ EOF } function configure_yarn_nodemanager_gpu() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' @@ -98,6 +100,7 @@ function configure_yarn_nodemanager_gpu() { } function configure_gpu_isolation() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # enable GPU isolation sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg" if [[ $IS_MIG_ENABLED -ne 0 ]]; then @@ -140,6 +143,8 @@ function setup_gpu_yarn() { install_nvidia_container_toolkit configure_yarn_nodemanager_gpu - configure_gpu_script + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + configure_gpu_script + fi configure_gpu_isolation } diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in index 2435bb732..16e67aba1 100644 --- a/templates/spark-rapids/spark-rapids.sh.in +++ b/templates/spark-rapids/spark-rapids.sh.in @@ -51,7 +51,6 @@ function main() { if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then install_spark_rapids - configure_gpu_script echo "RAPIDS initialized with Spark runtime" elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then echo "This action only installs spark-rapids" @@ -66,6 +65,7 @@ function main() { } function exit_handler() { + set +e gpu_install_exit_handler gpu_exit_handler pip_exit_handler From a7b47071d55780060449c9cb65e168bf230ab9a5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 8 Jan 2025 23:11:50 -0800 Subject: [PATCH 113/130] refactored spark variable definition and reduced excess lines by bulking the readonly operations --- templates/common/util_functions | 59 +++++++++------------------------ templates/gpu/spark_functions | 29 ++++++++++++++++ 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 336af37f8..9a6407a7b 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -329,25 +329,6 @@ function check_os() { exit 1 fi - SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - readonly SPARK_VERSION - if version_lt "${SPARK_VERSION}" "3.1" || \ - version_ge "${SPARK_VERSION}" "4.0" ; then - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." - exit 1 - fi - - # Detect dataproc image version - if (! test -v DATAPROC_IMAGE_VERSION) ; then - if test -v DATAPROC_VERSION ; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" - else - if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" - elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" - elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" - else echo "Unknown dataproc image version" ; exit 1 ; fi - fi - fi } function configure_dkms_certs() { @@ -510,42 +491,30 @@ function prepare_conda_env() { } function prepare_common_env() { - define_os_comparison_functions - # Verify OS compatability and Secure boot state check_os check_secure_boot - readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" - - # Dataproc configurations - readonly HADOOP_CONF_DIR='/etc/hadoop/conf' - readonly HIVE_CONF_DIR='/etc/hive/conf' - readonly SPARK_CONF_DIR='/etc/spark/conf' - + # read-only configuration variables + _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" + HADOOP_CONF_DIR='/etc/hadoop/conf' + HIVE_CONF_DIR='/etc/hive/conf' OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" - readonly OS_NAME - - # node role ROLE="$(get_metadata_attribute dataproc-role)" - readonly ROLE - - # master node MASTER="$(get_metadata_attribute dataproc-master)" - readonly MASTER - workdir=/opt/install-dpgce - tmpdir=/tmp/ temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" - readonly temp_bucket - readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" + pkg_bucket="gs://${temp_bucket}/dpgce-packages" uname_r=$(uname -r) - readonly uname_r - readonly bdcfg="/usr/local/bin/bdconfig" - export DEBIAN_FRONTEND=noninteractive + bdcfg="/usr/local/bin/bdconfig" + KNOX_HOME=/usr/lib/knox - # Knox config - readonly KNOX_HOME=/usr/lib/knox + readonly HADOOP_CONF_DIR HIVE_CONF_DIR OS_NAME ROLE MASTER workdir + readonly temp_bucket pkg_bucket uname_r bdconfig KNOX_HOME + + tmpdir=/tmp/ + + export DEBIAN_FRONTEND=noninteractive mkdir -p "${workdir}/complete" set_proxy @@ -685,3 +654,5 @@ print( " samples-taken: ", scalar @siz, $/, fi echo "exit_handler has completed" } + +define_os_comparison_functions diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions index fa29330de..25a99221e 100644 --- a/templates/gpu/spark_functions +++ b/templates/gpu/spark_functions @@ -41,3 +41,32 @@ function install_spark_rapids() { "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ "/usr/lib/spark/jars/${jar_basename}" } + +function prepare_spark_env() { + SPARK_NLP_VERSION="3.2.1" # Must include subminor version here + SPARK_JARS_DIR=/usr/lib/spark/jars + SPARK_CONF_DIR='/etc/spark/conf' + SPARK_BIGQUERY_VERSION="$(get_metadata_attribute spark-bigquery-connector-version "${DEFAULT_SPARK_BIGQUERY_VERSION:-0.22.0}")" + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" + + readonly SPARK_VERSION SPARK_BIGQUERY_VERSION SPARK_CONF_DIR SPARK_JARS_DIR SPARK_NLP_VERSION + + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 + fi + + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi + +} From 35ca7043d7205a63cf618988770ee58d8f2dd3c4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 8 Jan 2025 23:26:22 -0800 Subject: [PATCH 114/130] development on these scripts will happen in the spark-rapids-template-20241225 branch --- templates/spark-rapids/mig.sh.in | 93 ----------------------- templates/spark-rapids/spark-rapids.sh.in | 87 --------------------- 2 files changed, 180 deletions(-) delete mode 100644 templates/spark-rapids/mig.sh.in delete mode 100644 templates/spark-rapids/spark-rapids.sh.in diff --git a/templates/spark-rapids/mig.sh.in b/templates/spark-rapids/mig.sh.in deleted file mode 100644 index 99b494c4f..000000000 --- a/templates/spark-rapids/mig.sh.in +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -# -[% INSERT legal/license_header %] -# -# This script installs NVIDIA GPU drivers and enables MIG on Hopper -# GPU architectures. -# -# This script should be specified in --initialization-actions= option -# and --metadata=ENABLE_MIG can be used to enable or disable MIG. The -# default is to enable it. The script configures the MIG device based -# on the user specified MIG_CGI profiles specified via: -# --metadata=^:^MIG_CGI='9,9'. If MIG_CGI is not specified it assumes -# it's using an H100 and configures 2 instances with profile id 9. -# -[% PROCESS common/template_disclaimer %] - -[% INSERT common/util_functions %] - -[% INSERT common/yarn_functions %] - -[% INSERT gpu/mig_functions %] - -[% INSERT gpu/util_functions %] - -set -euxo pipefail - -function main() { - if [[ "${nvsmi_works}" == "1" ]] ; then - # if this is called without the MIG script then the drivers are not installed - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - set +e - migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" - set -e - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts - fi - fi - fi - fi - - # if mig is enabled drivers would have already been installed - if [[ $IS_MIG_ENABLED -eq 0 ]]; then - install_nvidia_gpu_driver - install_cuda - load_kernel_module - - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - install_gpu_agent -# install_gpu_monitoring_agent - echo 'GPU metrics agent successfully deployed.' - else - echo 'GPU metrics agent has not been installed.' - fi - configure_gpu_exclusive_mode - fi - - setup_gpu_yarn - - echo "yarn setup complete" - - enable_and_configure_mig - - echo "main complete" - return 0 -} - -function exit_handler() { - gpu_exit_handler - pip_exit_handler - yarn_exit_handler - common_exit_handler - return 0 -} - -function prepare_to_install(){ - prepare_common_env - prepare_pip_env - prepare_gpu_env - trap exit_handler EXIT -} - -prepare_to_install - -main diff --git a/templates/spark-rapids/spark-rapids.sh.in b/templates/spark-rapids/spark-rapids.sh.in deleted file mode 100644 index 16e67aba1..000000000 --- a/templates/spark-rapids/spark-rapids.sh.in +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash -# -[% INSERT legal/license_header %] -# -[% PROCESS common/template_disclaimer %] -# -# This script installs NVIDIA GPU drivers. -# -# Dataproc 2.0: Driver version 530.30.02, CUDA version 12.1.1, Rapids 23.08.2 -# Dataproc 2.1: Driver version 550.135, CUDA version 12.4.1, Rapids 24.08.1 -# Dataproc 2.2: Driver version 560.35.03, CUDA version 12.6.2, Rapids 24.08.1 -# -# Additionally, it installs the RAPIDS Spark plugin, configures Spark -# and YARN, and installs an agent to collect GPU utilization metrics. -# The installer is regularly exercised with Debian, Ubuntu, and Rocky -# Linux distributions. -# -# Note that the script is designed to work both when secure boot is -# enabled with a custom image and when disabled during cluster -# creation. -# -# For details see -# github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot -# - -set -euxo pipefail - -[% INSERT common/util_functions %] -[% INSERT common/install_functions %] -[% INSERT gpu/util_functions %] -[% INSERT gpu/install_functions %] -[% INCLUDE gpu/yarn_functions %] -[% INSERT gpu/spark_functions %] - -function main() { - install_gpu_driver_and_cuda - - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then -# install_gpu_agent - install_gpu_monitoring_agent - echo 'GPU metrics agent successfully deployed.' - else - echo 'GPU metrics agent has not been installed.' - fi - configure_gpu_exclusive_mode - - setup_gpu_yarn - - echo "yarn setup complete" - - if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then - install_spark_rapids - echo "RAPIDS initialized with Spark runtime" - elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then - echo "This action only installs spark-rapids" - exit 1 - else - echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" - exit 1 - fi - - echo "main complete" - return 0 -} - -function exit_handler() { - set +e - gpu_install_exit_handler - gpu_exit_handler - pip_exit_handler - yarn_exit_handler - common_exit_handler - return 0 -} - -function prepare_to_install(){ - prepare_common_env - prepare_pip_env - prepare_gpu_env - prepare_gpu_install_env - trap exit_handler EXIT -} - -prepare_to_install - -main From 43232b25d25b99dcfbb3a3e5e2933c02a239fd2d Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 12:00:42 -0800 Subject: [PATCH 115/130] revert dask/ to master --- dask/test_dask.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dask/test_dask.py b/dask/test_dask.py index 1126d7d80..440493511 100644 --- a/dask/test_dask.py +++ b/dask/test_dask.py @@ -56,13 +56,16 @@ def _run_dask_test_script(self, name, script): ) def test_dask(self, configuration, instances, runtime): + if self.getImageVersion() < pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in pre-2.0 images") + metadata = None if runtime: metadata = "dask-runtime={}".format(runtime) self.createCluster(configuration, self.INIT_ACTIONS, - machine_type='n1-highmem-8', + machine_type='n1-standard-16', metadata=metadata, timeout_in_minutes=20) From 4b6e520812ac77597e3c5833175fc7a80da04062 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 8 Jan 2025 00:46:00 -0800 Subject: [PATCH 116/130] moving that .in suffix to the correct variable --- templates/generate-action.pl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/templates/generate-action.pl b/templates/generate-action.pl index 950bd15fe..690acb409 100644 --- a/templates/generate-action.pl +++ b/templates/generate-action.pl @@ -8,7 +8,10 @@ use strict; my $action = $ARGV[0]; -my $v = { template_path => "${action}.in" }; +my $v = { + template_path => "${action}", + IA_VERSION => "${IA_VERSION}", +}; sub usage{ # TODO: use File::Find to list the available actions for the user @@ -24,7 +27,7 @@ sub usage{ die "Usage:$/$0 " } -usage unless( $action && -f "$ENV{PWD}/templates/$v->{template_path}" ); +usage unless( $action && -f "$ENV{PWD}/templates/$v->{template_path}.in" ); my $tt = Template->new( { INCLUDE_PATH => "$ENV{PWD}/templates", @@ -33,4 +36,4 @@ sub usage{ }) || die "$Template::ERROR$/"; -$tt->process($v->{template_path}) or die( $tt->error(), "\n" ); +$tt->process("$v->{template_path}.in") or die( $tt->error(), "\n" ); From 4a024e0548d48e126bb65bf53ca0835e26d94f37 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 12:27:52 -0800 Subject: [PATCH 117/130] reverted to master ; changes ended up in gpu-template-20250107 --- gpu/BUILD | 6 +- gpu/install_gpu_driver.sh | 2685 +++++++++++++------------------------ gpu/test_gpu.py | 302 ++--- gpu/verify_pyspark.py | 45 - 4 files changed, 1033 insertions(+), 2005 deletions(-) delete mode 100644 gpu/verify_pyspark.py diff --git a/gpu/BUILD b/gpu/BUILD index bd5500ccb..b481c5b33 100644 --- a/gpu/BUILD +++ b/gpu/BUILD @@ -6,11 +6,7 @@ py_test( name = "test_gpu", size = "enormous", srcs = ["test_gpu.py"], - data = [ - "install_gpu_driver.sh", - "verify_pyspark.py", - "mig.sh" - ], + data = ["install_gpu_driver.sh", "mig.sh"], local = True, shard_count = 15, deps = [ diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 91ad4ede0..25efb2a49 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1,7 +1,5 @@ #!/bin/bash # -# Copyright 2015 Google LLC and contributors -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,14 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -# -# This initialization action is generated from -# initialization-actions/templates/gpu/install_gpu_driver.sh.in -# -# Modifications made directly to the generated file will be lost when -# the template is re-evaluated - # # This script installs NVIDIA GPU drivers and collects GPU utilization metrics. @@ -30,38 +20,32 @@ function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | x function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) -# For version (or real number) comparison -# if first argument is greater than or equal to, greater than, less than or equal to, or less than the second -# ( version_ge 2.0 2.1 ) evaluates to false -# ( version_ge 2.2 2.1 ) evaluates to true function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; ) function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) -function define_os_comparison_functions() { - - readonly -A supported_os=( - ['debian']="10 11 12" - ['rocky']="8 9" - ['ubuntu']="18.04 20.04 22.04" - ) - - # dynamically define OS version test utility functions - if [[ "$(os_id)" == "rocky" ]]; - then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') - else _os_version="$(os_version)"; fi - for os_id_val in 'rocky' 'ubuntu' 'debian' ; do - eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" - - for osver in $(echo "${supported_os["${os_id_val}"]}") ; do - eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" - eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" - eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" - done +readonly -A supported_os=( + ['debian']="10 11 12" + ['rocky']="8 9" + ['ubuntu']="18.04 20.04 22.04" +) + +# dynamically define OS version test utility functions +if [[ "$(os_id)" == "rocky" ]]; +then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') +else _os_version="$(os_version)"; fi +for os_id_val in 'rocky' 'ubuntu' 'debian' ; do + eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" + + for osver in $(echo "${supported_os["${os_id_val}"]}") ; do + eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" + eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" + eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" done - eval "function is_debuntu() ( set +x ; is_debian || is_ubuntu ; )" -} +done + +function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) function os_vercat() ( set +x if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' @@ -69,7 +53,7 @@ function os_vercat() ( set +x else os_version ; fi ; ) function repair_old_backports { - if ! is_debuntu ; then return ; fi + if ge_debian12 || ! is_debuntu ; then return ; fi # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this # problem, we will use archive.debian.org for the oldoldstable repo @@ -110,7 +94,6 @@ function print_metadata_value_if_exists() { return ${return_code} } -# replicates /usr/share/google/get_metadata_value function get_metadata_value() ( set +x local readonly varname=$1 @@ -134,13 +117,226 @@ function get_metadata_attribute() ( get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" ) +OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +readonly OS_NAME + +# node role +ROLE="$(get_metadata_attribute dataproc-role)" +readonly ROLE + +# CUDA version and Driver version +# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html +# https://developer.nvidia.com/cuda-downloads +# Rocky8: 12.0: 525.147.05 +readonly -A DRIVER_FOR_CUDA=( + ["11.8"]="560.35.03" + ["12.0"]="525.60.13" ["12.4"]="560.35.03" ["12.6"]="560.35.03" +) +# https://developer.nvidia.com/cudnn-downloads +if is_debuntu ; then +readonly -A CUDNN_FOR_CUDA=( + ["11.8"]="9.5.1.17" + ["12.0"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.6"]="9.5.1.17" +) +elif is_rocky ; then +# rocky: +# 12.0: 8.8.1.3 +# 12.1: 8.9.3.28 +# 12.2: 8.9.7.29 +# 12.3: 9.0.0.312 +# 12.4: 9.1.1.17 +# 12.5: 9.2.1.18 +# 12.6: 9.5.1.17 +readonly -A CUDNN_FOR_CUDA=( + ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.4"]="9.1.1.17" ["12.6"]="9.5.1.17" +) +fi +# https://developer.nvidia.com/nccl/nccl-download +# 12.2: 2.19.3, 12.5: 2.21.5 +readonly -A NCCL_FOR_CUDA=( + ["11.8"]="2.15.5" + ["12.0"]="2.16.5" ["12.4"]="2.23.4" ["12.6"]="2.23.4" +) +readonly -A CUDA_SUBVER=( + ["11.8"]="11.8.0" + ["12.0"]="12.0.0" ["12.4"]="12.4.1" ["12.6"]="12.6.2" +) + +RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') +readonly DEFAULT_CUDA_VERSION='12.4' +CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") +if ( ( ge_debian12 || ge_rocky9 ) && version_le "${CUDA_VERSION%%.*}" "11" ) ; then + # CUDA 11 no longer supported on debian12 - 2024-11-22, rocky9 - 2024-11-27 + CUDA_VERSION="${DEFAULT_CUDA_VERSION}" +fi + +if ( version_ge "${CUDA_VERSION}" "12" && (le_debian11 || le_ubuntu18) ) ; then + # Only CUDA 12.0 supported on older debuntu + CUDA_VERSION="12.0" +fi +readonly CUDA_VERSION +readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}" + +function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) +function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) +function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; ) + +function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) +function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; ) +function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; ) + +DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}" +if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then + DEFAULT_DRIVER="560.28.03" ; fi +if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03" ; fi +if ( is_rocky && le_cuda11 ) ; then DEFAULT_DRIVER="525.147.05" ; fi +if ( is_ubuntu20 && le_cuda11 ) ; then DEFAULT_DRIVER="535.183.06" ; fi +if ( is_rocky9 && ge_cuda12 ) ; then DEFAULT_DRIVER="565.57.01" ; fi +DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") + +readonly DRIVER_VERSION +readonly DRIVER=${DRIVER_VERSION%%.*} + +readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" +readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" + +# Parameters for NVIDIA-provided cuDNN library +readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} +CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") +function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) +function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) +# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} +if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then + CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" +elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then + # cuDNN v8 is not distribution for ubuntu20+, debian12 + CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" +elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then + # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 + CUDNN_VERSION="8.8.0.121" +fi +readonly CUDNN_VERSION + +readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} +readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) + +# Parameters for NVIDIA-provided Debian GPU driver +readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + +readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") + +# Short name for urls +if is_ubuntu22 ; then + # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at + # https://developer.download.nvidia.com/compute/machine-learning/repos/ + # use packages from previous release until such time as nvidia + # release ubuntu2204 builds + + nccl_shortname="ubuntu2004" + shortname="$(os_id)$(os_vercat)" +elif ge_rocky9 ; then + # use packages from previous release until such time as nvidia + # release rhel9 builds + + nccl_shortname="rhel8" + shortname="rhel9" +elif is_rocky ; then + shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" + nccl_shortname="${shortname}" +else + shortname="$(os_id)$(os_vercat)" + nccl_shortname="${shortname}" +fi + +# Parameters for NVIDIA-provided package repositories +readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' +readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" + +# Parameters for NVIDIA-provided NCCL library +readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb" +NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") +readonly NCCL_REPO_URL +readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub + +function set_cuda_runfile_url() { + local RUNFILE_DRIVER_VERSION="${DRIVER_VERSION}" + local RUNFILE_CUDA_VERSION="${CUDA_FULL_VERSION}" + + if ge_cuda12 ; then + if ( le_debian11 || le_ubuntu18 ) ; then + RUNFILE_DRIVER_VERSION="525.60.13" + RUNFILE_CUDA_VERSION="12.0.0" + elif ( le_rocky8 && version_le "${DATAPROC_IMAGE_VERSION}" "2.0" ) ; then + RUNFILE_DRIVER_VERSION="525.147.05" + RUNFILE_CUDA_VERSION="12.0.0" + fi + else + RUNFILE_DRIVER_VERSION="520.61.05" + RUNFILE_CUDA_VERSION="11.8.0" + fi + + readonly RUNFILE_FILENAME="cuda_${RUNFILE_CUDA_VERSION}_${RUNFILE_DRIVER_VERSION}_linux.run" + CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${RUNFILE_CUDA_VERSION}" + DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${RUNFILE_FILENAME}" + readonly DEFAULT_NVIDIA_CUDA_URL + + NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") + readonly NVIDIA_CUDA_URL +} + +set_cuda_runfile_url + +# Parameter for NVIDIA-provided Rocky Linux GPU driver +readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" + +CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" +CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" +if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then + # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" + if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then + # When cuDNN version is greater than or equal to 8.4.1.50 use this format + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" + fi + # Use legacy url format with one of the tarball name formats depending on version as above + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" +fi +if ( version_ge "${CUDA_VERSION}" "12.0" ); then + # Use modern url format When cuda version is greater than or equal to 12.0 + CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" +fi +readonly CUDNN_TARBALL +readonly CUDNN_TARBALL_URL + +# Whether to install NVIDIA-provided or OS-provided GPU driver +GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') +readonly GPU_DRIVER_PROVIDER + +# Stackdriver GPU agent parameters +readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' +# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver +INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') +readonly INSTALL_GPU_AGENT + +# Dataproc configurations +readonly HADOOP_CONF_DIR='/etc/hadoop/conf' +readonly HIVE_CONF_DIR='/etc/hive/conf' +readonly SPARK_CONF_DIR='/etc/spark/conf' + +NVIDIA_SMI_PATH='/usr/bin' +MIG_MAJOR_CAPS=0 +IS_MIG_ENABLED=0 + function execute_with_retries() ( set +x local -r cmd="$*" if [[ "$cmd" =~ "^apt-get install" ]] ; then apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove + apt-get -y autoremove fi for ((i = 0; i < 3; i++)); do set -x @@ -152,1154 +348,154 @@ function execute_with_retries() ( return 1 ) -function cache_fetched_package() { - local src_url="$1" - local gcs_fn="$2" - local local_fn="$3" +CUDA_KEYRING_PKG_INSTALLED="0" +function install_cuda_keyring_pkg() { + if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi + local kr_ver=1.1 + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ + -o "${tmpdir}/cuda-keyring.deb" + dpkg -i "${tmpdir}/cuda-keyring.deb" + rm -f "${tmpdir}/cuda-keyring.deb" + CUDA_KEYRING_PKG_INSTALLED="1" +} + +function uninstall_cuda_keyring_pkg() { + apt-get purge -yq cuda-keyring + CUDA_KEYRING_PKG_INSTALLED="0" +} + +CUDA_LOCAL_REPO_INSTALLED="0" +function install_local_cuda_repo() { + if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi + CUDA_LOCAL_REPO_INSTALLED="1" + pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" + CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" + readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" + readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" + readonly DIST_KEYRING_DIR="/var/${pkgname}" - while ! command -v gcloud ; do sleep 5s ; done + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" - if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then - time gcloud storage cp "${gcs_fn}" "${local_fn}" - else - time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ - gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) + dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" + rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" + cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ + + if is_ubuntu ; then + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ + -o /etc/apt/preferences.d/cuda-repository-pin-600 fi } +function uninstall_local_cuda_repo(){ + apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" + CUDA_LOCAL_REPO_INSTALLED="0" +} -function add_contrib_component() { - if ! is_debuntu ; then return ; fi - if ge_debian12 ; then - # Include in sources file components on which nvidia-kernel-open-dkms depends - local -r debian_sources="/etc/apt/sources.list.d/debian.sources" - local components="main contrib" +CUDNN_LOCAL_REPO_INSTALLED="0" +CUDNN_PKG_NAME="" +function install_local_cudnn_repo() { + if [[ "${CUDNN_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi + pkgname="cudnn-local-repo-${shortname}-${CUDNN}" + CUDNN_PKG_NAME="${pkgname}" + local_deb_fn="${pkgname}_1.0-1_amd64.deb" + local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN}/local_installers/${local_deb_fn}" - sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" - elif is_debian ; then - sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list - fi + # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${local_deb_url}" -o "${tmpdir}/local-installer.deb" + + dpkg -i "${tmpdir}/local-installer.deb" + + rm -f "${tmpdir}/local-installer.deb" + + cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings + + CUDNN_LOCAL_REPO_INSTALLED="1" } -function set_hadoop_property() { - local -r config_file=$1 - local -r property=$2 - local -r value=$3 - "${bdcfg}" set_property \ - --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ - --name "${property}" --value "${value}" \ - --clobber +function uninstall_local_cudnn_repo() { + apt-get purge -yq "${CUDNN_PKG_NAME}" + CUDNN_LOCAL_REPO_INSTALLED="0" } -function configure_yarn_resources() { - if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts - if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' +CUDNN8_LOCAL_REPO_INSTALLED="0" +CUDNN8_PKG_NAME="" +function install_local_cudnn8_repo() { + if [[ "${CUDNN8_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi + if is_ubuntu ; then cudnn8_shortname="ubuntu2004" + elif is_debian ; then cudnn8_shortname="debian11" + else return 0 ; fi + if is_cuda12 ; then CUDNN8_CUDA_VER=12.0 + elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8 + else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi + cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" + CUDNN8_PKG_NAME="${pkgname}" - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} + deb_fn="${pkgname}_1.0-1_amd64.deb" + local_deb_fn="${tmpdir}/${deb_fn}" + local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" + curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + "${local_deb_url}" -o "${local_deb_fn}" -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' + dpkg -i "${local_deb_fn}" - # Fix local dirs access permissions - local yarn_local_dirs=() + rm -f "${local_deb_fn}" - readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ - --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ - --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') + cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings + CUDNN8_LOCAL_REPO_INSTALLED="1" +} - if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" - fi +function uninstall_local_cudnn8_repo() { + apt-get purge -yq "${CUDNN8_PKG_NAME}" + CUDNN8_LOCAL_REPO_INSTALLED="0" } -function clean_up_sources_lists() { - # - # bigtop (primary) - # - local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" +function install_nvidia_nccl() { + local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" - if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then - region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" + if is_rocky ; then + execute_with_retries \ + dnf -y -q install \ + "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" + sync + elif is_ubuntu ; then + install_cuda_keyring_pkg - local regional_bigtop_repo_uri - regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | - sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | - grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | - cut -d ' ' -f 2 | - head -1) + apt-get update -qq - if [[ "${regional_bigtop_repo_uri}" == */ ]]; then - local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" + if is_ubuntu18 ; then + execute_with_retries \ + apt-get install -q -y \ + libnccl2 libnccl-dev + sync else - local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" + execute_with_retries \ + apt-get install -q -y \ + "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" + sync fi - - local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" - rm -f "${bigtop_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ - "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" - - sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" - sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + else + echo "Unsupported OS: '${OS_NAME}'" + # NB: this tarball is 10GB in size, but can be used to install NCCL on non-ubuntu systems + # wget https://developer.download.nvidia.com/hpc-sdk/24.7/nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz + # tar xpzf nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz + # nvhpc_2024_247_Linux_x86_64_cuda_multi/install + return fi +} - # - # adoptium - # - # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu - local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" - local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" - rm -f "${adoptium_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ - | gpg --dearmor -o "${adoptium_kr_path}" - echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ - > /etc/apt/sources.list.d/adoptium.list +function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) +function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) - - # - # docker - # - local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" - local docker_repo_file="/etc/apt/sources.list.d/docker.list" - local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" - - rm -f "${docker_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ - | gpg --dearmor -o "${docker_kr_path}" - echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ - > ${docker_repo_file} - - # - # google cloud + logging/monitoring - # - if ls /etc/apt/sources.list.d/google-cloud*.list ; then - rm -f /usr/share/keyrings/cloud.google.gpg - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - for list in google-cloud google-cloud-logging google-cloud-monitoring ; do - list_file="/etc/apt/sources.list.d/${list}.list" - if [[ -f "${list_file}" ]]; then - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" - fi - done - fi - - # - # cran-r - # - if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then - keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" - if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi - rm -f /usr/share/keyrings/cran-r.gpg - curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ - gpg --dearmor -o /usr/share/keyrings/cran-r.gpg - sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list - fi - - # - # mysql - # - if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then - rm -f /usr/share/keyrings/mysql.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ - gpg --dearmor -o /usr/share/keyrings/mysql.gpg - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list - fi - - if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi - -} - -function set_proxy(){ - METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" - - if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi - - export METADATA_HTTP_PROXY - export http_proxy="${METADATA_HTTP_PROXY}" - export https_proxy="${METADATA_HTTP_PROXY}" - export HTTP_PROXY="${METADATA_HTTP_PROXY}" - export HTTPS_PROXY="${METADATA_HTTP_PROXY}" - no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" - local no_proxy_svc - for no_proxy_svc in compute secretmanager dns servicedirectory logging \ - bigquery composer pubsub bigquerydatatransfer dataflow \ - storage datafusion ; do - no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" - done - - export NO_PROXY="${no_proxy}" -} - -function is_ramdisk() { - if [[ "${1:-}" == "-f" ]] ; then unset IS_RAMDISK ; fi - if ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "true" ) ; then return 0 - elif ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "false" ) ; then return 1 ; fi - - if ( test -d /mnt/shm && grep -q /mnt/shm /proc/mounts ) ; then - IS_RAMDISK="true" - return 0 - else - IS_RAMDISK="false" - return 1 - fi -} - -function mount_ramdisk(){ - local free_mem - free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" - if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi - - # Write to a ramdisk instead of churning the persistent disk - - tmpdir="/mnt/shm" - mkdir -p "${tmpdir}/pkgs_dirs" - mount -t tmpfs tmpfs "${tmpdir}" - - # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}/pkgs_dirs" - - # Download OS packages to tmpfs - if is_debuntu ; then - mount -t tmpfs tmpfs /var/cache/apt/archives - else - mount -t tmpfs tmpfs /var/cache/dnf - fi - is_ramdisk -f -} - -function check_os() { - if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then - echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." - exit 1 - elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then - echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." - exit 1 - elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then - echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." - exit 1 - fi - - SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - readonly SPARK_VERSION - if version_lt "${SPARK_VERSION}" "3.1" || \ - version_ge "${SPARK_VERSION}" "4.0" ; then - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." - exit 1 - fi - - # Detect dataproc image version - if (! test -v DATAPROC_IMAGE_VERSION) ; then - if test -v DATAPROC_VERSION ; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" - else - if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" - elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" - elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" - else echo "Unknown dataproc image version" ; exit 1 ; fi - fi - fi -} - -# -# Generate repo file under /etc/apt/sources.list.d/ -# -function apt_add_repo() { - local -r repo_name="$1" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" - local -r include_src="${4:-yes}" - local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" - - echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" - if [[ "${include_src}" == "yes" ]] ; then - echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" - fi - - apt-get update -qq -} - -# -# Generate repo file under /etc/yum.repos.d/ -# -function dnf_add_repo() { - local -r repo_name="$1" - local -r repo_url="$3" # "http(s)://host/path/filename.repo" - local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" - - curl -s -L "${repo_url}" \ - | dd of="${repo_path}" status=progress -# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ -} - -# -# Keyrings default to -# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or -# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) -# -function os_add_repo() { - local -r repo_name="$1" - local -r signing_key_url="$2" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" - local kr_path - if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi - - mkdir -p "$(dirname "${kr_path}")" - - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ - | gpg --import --no-default-keyring --keyring "${kr_path}" - - if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" - else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi -} - -function configure_dkms_certs() { - if test -v PSN && [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping"; - return 0 - fi - - mkdir -p "${CA_TMPDIR}" - - # If the private key exists, verify it - if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then - echo "Private key material exists" - - local expected_modulus_md5sum - expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) - if [[ -n "${expected_modulus_md5sum}" ]]; then - modulus_md5sum="${expected_modulus_md5sum}" - - # Verify that cert md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched rsa key" - fi - - # Verify that key md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched x509 cert" - fi - else - modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" - fi - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - - return - fi - - # Retrieve cloud secrets keys - local sig_priv_secret_name - sig_priv_secret_name="${PSN}" - local sig_pub_secret_name - sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" - local sig_secret_project - sig_secret_project="$(get_metadata_attribute secret_project)" - local sig_secret_version - sig_secret_version="$(get_metadata_attribute secret_version)" - - # If metadata values are not set, do not write mok keys - if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi - - # Write private material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_priv_secret_name}" \ - | dd status=none of="${CA_TMPDIR}/db.rsa" - - # Write public material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_pub_secret_name}" \ - | base64 --decode \ - | dd status=none of="${CA_TMPDIR}/db.der" - - local mok_directory="$(dirname "${mok_key}")" - mkdir -p "${mok_directory}" - - # symlink private key and copy public cert from volatile storage to DKMS directory - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - cp -f "${CA_TMPDIR}/db.der" "${mok_der}" - - modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" -} - -function clear_dkms_key { - if [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping" >&2 - return 0 - fi - rm -rf "${CA_TMPDIR}" "${mok_key}" -} - -function check_secure_boot() { - local SECURE_BOOT="disabled" - SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') - - PSN="$(get_metadata_attribute private_secret_name)" - readonly PSN - - if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then - echo "Error: Secure Boot is not supported on Debian before image 2.2. Consider disabling Secure Boot while creating the cluster." - return - elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then - echo "Secure boot is enabled, but no signing material provided." - echo "Consider either disabling secure boot or provide signing material as per" - echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" - return - fi - - CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" - readonly CA_TMPDIR - - if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv - mok_der=/var/lib/shim-signed/mok/MOK.der - else mok_key=/var/lib/dkms/mok.key - mok_der=/var/lib/dkms/mok.pub ; fi -} - -function restart_knox() { - systemctl stop knox - rm -rf "${KNOX_HOME}/data/deployments/*" - systemctl start knox -} - -function install_dependencies() { - test -f "${workdir}/complete/install-dependencies" && return 0 - pkg_list="screen" - if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} - elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi - touch "${workdir}/complete/install-dependencies" -} - -function prepare_pip_env() { - # Clear pip cache - # TODO: make this conditional on which OSs have pip without cache purge - test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv" - source "${tmpdir}/python-venv/bin/activate" - - pip cache purge || echo "unable to purge pip cache" - if is_ramdisk ; then - # Download pip packages to tmpfs - mkdir -p "${tmpdir}/cache-dir" - pip config set global.cache-dir "${tmpdir}/cache-dir" || echo "unable to set global.cache-dir" - fi -} - - -function prepare_common_env() { - define_os_comparison_functions - - # Verify OS compatability and Secure boot state - check_os - check_secure_boot - - readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" - - # Dataproc configurations - readonly HADOOP_CONF_DIR='/etc/hadoop/conf' - readonly HIVE_CONF_DIR='/etc/hive/conf' - readonly SPARK_CONF_DIR='/etc/spark/conf' - - OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" - readonly OS_NAME - - # node role - ROLE="$(get_metadata_attribute dataproc-role)" - readonly ROLE - - # master node - MASTER="$(get_metadata_attribute dataproc-master)" - readonly MASTER - - workdir=/opt/install-dpgce - tmpdir=/tmp/ - temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" - readonly temp_bucket - readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" - uname_r=$(uname -r) - readonly uname_r - readonly bdcfg="/usr/local/bin/bdconfig" - export DEBIAN_FRONTEND=noninteractive - - # Knox config - readonly KNOX_HOME=/usr/lib/knox - - mkdir -p "${workdir}/complete" - set_proxy - mount_ramdisk - - readonly install_log="${tmpdir}/install.log" - - if test -f "${workdir}/complete/prepare.common" ; then return ; fi - - repair_old_backports - - if is_debuntu ; then - clean_up_sources_lists - apt-get update -qq - apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove - if ge_debian12 ; then - apt-mark unhold systemd libsystemd0 ; fi - if is_ubuntu ; then - while ! command -v gcloud ; do sleep 5s ; done - fi - else - dnf clean all - fi - - # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - - ( set +e - time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero - ) - - install_dependencies - - # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" - touch "/run/keep-running-df" - screen -d -m -LUS keep-running-df \ - bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" - fi - - touch "${workdir}/complete/prepare.common" -} - -function pip_exit_handler() { - if is_ramdisk ; then - # remove the tmpfs pip cache-dir - pip config unset global.cache-dir || echo "unable to unset global pip cache" - fi -} - -function common_exit_handler() { - set +ex - echo "Exit handler invoked" - - # Restart YARN services if they are running already - for svc in resourcemanager nodemanager; do - if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then - systemctl stop "hadoop-yarn-${svc}.service" - systemctl start "hadoop-yarn-${svc}.service" - fi - done - - # If system memory was sufficient to mount memory-backed filesystems - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do - if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then - umount -f ${shmdir} - fi - done - - # restart services stopped during preparation stage - # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' - fi - - if is_debuntu ; then - # Clean up OS package cache - apt-get -y -qq clean - apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove - # re-hold systemd package - if ge_debian12 ; then - apt-mark hold systemd libsystemd0 ; fi - else - dnf clean all - fi - - # When creating image, print disk usage statistics, zero unused disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - # print disk usage statistics for large components - if is_ubuntu ; then - du -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ - /usr/lib \ - /opt/nvidia/* \ - /opt/conda/miniconda3 | sort -h - elif is_debian ; then - du -x -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \ - /var/lib/{docker,mysql,} \ - /opt/nvidia/* \ - /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ - /usr/bin \ - /usr \ - /var \ - / 2>/dev/null | sort -h - else - du -hs \ - /var/lib/docker \ - /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \ - /usr/lib64/google-cloud-sdk \ - /opt/nvidia/* \ - /opt/conda/miniconda3 - fi - - # Process disk usage logs from installation period - rm -f /run/keep-running-df - sync - sleep 5.01s - # compute maximum size of disk during installation - # Log file contains logs like the following (minus the preceeding #): -#Filesystem 1K-blocks Used Available Use% Mounted on -#/dev/vda2 7096908 2611344 4182932 39% / - df / | tee -a "/run/disk-usage.log" - - perl -e \ - '@siz=( sort { $a => $b } - map { (split)[2] =~ /^(\d+)/ } - grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting}; -print( " samples-taken: ", scalar @siz, $/, - "starting-disk-used: $starting", $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" - - - # zero free disk space - dd if=/dev/zero of=/zero - sync - sleep 3s - rm -f /zero - fi - echo "exit_handler has completed" -} - - -function set_support_matrix() { - # CUDA version and Driver version - # https://docs.nvidia.com/deploy/cuda-compatibility/ - # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html - # https://developer.nvidia.com/cuda-downloads - - # Minimum supported version for open kernel driver is 515.43.04 - # https://github.com/NVIDIA/open-gpu-kernel-modules/tags - # Rocky8: 12.0: 525.147.05 - local latest - latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" - readonly -A DRIVER_FOR_CUDA=( - ["11.7"]="515.65.01" ["11.8"]="525.147.05" - ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" - ) - readonly -A DRIVER_SUBVER=( - ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" - ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" - ) - # https://developer.nvidia.com/cudnn-downloads - if is_debuntu ; then - readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" - ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" - ) - elif is_rocky ; then - # rocky: - # 12.0: 8.8.1.3 - # 12.1: 8.9.3.28 - # 12.2: 8.9.7.29 - # 12.3: 9.0.0.312 - # 12.4: 9.1.1.17 - # 12.5: 9.2.1.18 - # 12.6: 9.5.1.17 - readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" - ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" - ) - fi - # https://developer.nvidia.com/nccl/nccl-download - # 12.2: 2.19.3, 12.5: 2.21.5 - readonly -A NCCL_FOR_CUDA=( - ["11.7"]="2.21.5" ["11.8"]="2.21.5" - ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" - ) - readonly -A CUDA_SUBVER=( - ["11.7"]="11.7.1" ["11.8"]="11.8.0" - ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" - ) -} - -set_support_matrix - -function set_cuda_version() { - case "${DATAPROC_IMAGE_VERSION}" in - "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) - "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; - "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; - * ) - echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" - exit 1 - ;; - esac - local cuda_url - cuda_url=$(get_metadata_attribute 'cuda-url' '') - if [[ -n "${cuda_url}" ]] ; then - # if cuda-url metadata variable has been passed, extract default version from url - local CUDA_URL_VERSION - CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" - if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then - DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" - fi - fi - readonly DEFAULT_CUDA_VERSION - - CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") - if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then - CUDA_FULL_VERSION="${CUDA_VERSION}" - CUDA_VERSION="${CUDA_VERSION%.*}" - fi - readonly CUDA_VERSION - if ( ! test -v CUDA_FULL_VERSION ) ; then - CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} - fi - readonly CUDA_FULL_VERSION -} - -function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) -function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) -function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; ) - -function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) -function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; ) -function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; ) - -function set_driver_version() { - local gpu_driver_url - gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '') - - local cuda_url - cuda_url=$(get_metadata_attribute 'cuda-url' '') - - local DEFAULT_DRIVER - # Take default from gpu-driver-url metadata value - if [[ -n "${gpu_driver_url}" ]] ; then - DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')" - if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi - # Take default from cuda-url metadata value as a backup - elif [[ -n "${cuda_url}" ]] ; then - local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')" - if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then - major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" - driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} - if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then - # use the version indicated by the cuda url as the default if it exists - DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" - elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then - # use the maximum sub-version available for the major version indicated in cuda url as the default - DEFAULT_DRIVER="${driver_max_maj_version}" - fi - fi - fi - - if ( ! test -v DEFAULT_DRIVER ) ; then - # If a default driver version has not been extracted, use the default for this version of CUDA - DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} - fi - - DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") - - readonly DRIVER_VERSION - readonly DRIVER="${DRIVER_VERSION%%.*}" - - export DRIVER_VERSION DRIVER - - gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then - echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" - exit 1 - fi -} - -function set_cudnn_version() { - readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" - readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" - - # Parameters for NVIDIA-provided cuDNN library - readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} - CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") - # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} - if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" - elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then - # cuDNN v8 is not distribution for ubuntu20+, debian12 - CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" - elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then - # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 - CUDNN_VERSION="8.8.0.121" - fi - readonly CUDNN_VERSION -} - - -function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) -function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) - -function set_cuda_repo_shortname() { -# Short name for urls -# https://developer.download.nvidia.com/compute/cuda/repos/${shortname} - if is_rocky ; then - shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" - else - shortname="$(os_id)$(os_vercat)" - fi -} - -function set_nv_urls() { - # Parameters for NVIDIA-provided package repositories - readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' - readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" - - # Parameter for NVIDIA-provided Rocky Linux GPU driver - readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" -} - -function set_cuda_runfile_url() { - local MAX_DRIVER_VERSION - local MAX_CUDA_VERSION - - local MIN_OPEN_DRIVER_VER="515.48.07" - local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" - local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER - - if is_cuda12 ; then - if is_debian12 ; then - MIN_DRIVER_VERSION="545.23.06" - MIN_CUDA_VERSION="12.3.0" - elif is_debian10 ; then - MAX_DRIVER_VERSION="555.42.02" - MAX_CUDA_VERSION="12.5.0" - elif is_ubuntu18 ; then - MAX_DRIVER_VERSION="530.30.02" - MAX_CUDA_VERSION="12.1.1" - fi - elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then - if le_debian10 ; then - # cuda 11 is not supported for <= debian10 - MAX_CUDA_VERSION="0" - MAX_DRIVER_VERSION="0" - fi - else - echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" - fi - - if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then - echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" - elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then - echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}" - fi - if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then - echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" - elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then - echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" - fi - - # driver version named in cuda runfile filename - # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) - readonly -A drv_for_cuda=( - ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" - ["11.8.0"]="520.61.05" - ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" - ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" - ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" - ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" - ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ - ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not - ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" - ) - - # Verify that the file with the indicated combination exists - local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]} - CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run" - local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" - local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" - - NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") - readonly NVIDIA_CUDA_URL - - CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" - readonly CUDA_RUNFILE - - if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then - echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" - exit 1 - fi - - if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then - echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" - elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then - echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" - elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then - echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" - elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then - echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" - fi -} - -function set_cudnn_tarball_url() { -CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" -CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" -if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then - # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" - if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then - # When cuDNN version is greater than or equal to 8.4.1.50 use this format - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" - fi - # Use legacy url format with one of the tarball name formats depending on version as above - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" -fi -if ( version_ge "${CUDA_VERSION}" "12.0" ); then - # Use modern url format When cuda version is greater than or equal to 12.0 - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" -fi -readonly CUDNN_TARBALL -readonly CUDNN_TARBALL_URL -} - -function install_cuda_keyring_pkg() { - if ( test -v CUDA_KEYRING_PKG_INSTALLED && - [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi - local kr_ver=1.1 - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ - -o "${tmpdir}/cuda-keyring.deb" - dpkg -i "${tmpdir}/cuda-keyring.deb" - rm -f "${tmpdir}/cuda-keyring.deb" - CUDA_KEYRING_PKG_INSTALLED="1" -} - -function uninstall_cuda_keyring_pkg() { - apt-get purge -yq cuda-keyring - CUDA_KEYRING_PKG_INSTALLED="0" -} - -function install_local_cuda_repo() { - if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi - - if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi - CUDA_LOCAL_REPO_INSTALLED="1" - pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" - CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" - readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" - readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" - readonly DIST_KEYRING_DIR="/var/${pkgname}" - - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" - - dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" - rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" - cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ - - if is_ubuntu ; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ - -o /etc/apt/preferences.d/cuda-repository-pin-600 - fi - - touch "${workdir}/complete/install-local-cuda-repo" -} -function uninstall_local_cuda_repo(){ - apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cuda-repo" -} - -function install_local_cudnn_repo() { - if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi - pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" - CUDNN_PKG_NAME="${pkgname}" - local_deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" - - # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${local_deb_url}" -o "${tmpdir}/local-installer.deb" - - dpkg -i "${tmpdir}/local-installer.deb" - - rm -f "${tmpdir}/local-installer.deb" - - cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - - touch "${workdir}/complete/install-local-cudnn-repo" -} - -function uninstall_local_cudnn_repo() { - apt-get purge -yq "${CUDNN_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cudnn-repo" -} - -function install_local_cudnn8_repo() { - if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi - - if is_ubuntu ; then cudnn8_shortname="ubuntu2004" - elif is_debian ; then cudnn8_shortname="debian11" - else return 0 ; fi - if is_cuda12 ; then CUDNN8_CUDA_VER=12.0 - elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8 - else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi - cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" - - pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" - CUDNN8_PKG_NAME="${pkgname}" - - deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_fn="${tmpdir}/${deb_fn}" - local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" - - # cache the cudnn package - cache_fetched_package "${local_deb_url}" \ - "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ - "${local_deb_fn}" - - local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" - # If we are using a ram disk, mount another where we will unpack the cudnn local installer - if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then - mkdir -p "${cudnn_path}" - mount -t tmpfs tmpfs "${cudnn_path}" - fi - - dpkg -i "${local_deb_fn}" - - rm -f "${local_deb_fn}" - - cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/complete/install-local-cudnn8-repo" -} - -function uninstall_local_cudnn8_repo() { - apt-get purge -yq "${CUDNN8_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cudnn8-repo" -} - -function install_nvidia_nccl() { - readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} - readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) - - if test -f "${workdir}/complete/nccl" ; then return ; fi - - if is_cuda11 && is_debian12 ; then - echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" - return - fi - - local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" - - # https://github.com/NVIDIA/nccl/blob/master/README.md - # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Fermi: SM_20, compute_30 - # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 - # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 - # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 - - # The following architectures are suppored by open kernel driver - # Volta: SM_70,SM_72, compute_70,compute_72 - # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 - - # The following architectures are supported by CUDA v11.8+ - # Ada: SM_89, compute_89 - # Hopper: SM_90,SM_90a compute_90,compute_90a - # Blackwell: SM_100, compute_100 - NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" - if version_ge "${CUDA_VERSION}" "11.8" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" - fi - if version_ge "${CUDA_VERSION}" "12.0" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" - fi - - mkdir -p "${workdir}" - pushd "${workdir}" - - test -d "${workdir}/nccl" || { - local tarball_fn="v${NCCL_VERSION}-1.tar.gz" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ - | tar xz - mv "nccl-${NCCL_VERSION}-1" nccl - } - - local build_path - if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else - build_path="nccl/build/pkg/rpm/x86_64" ; fi - - test -d "${workdir}/nccl/build" || { - local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" - - output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') - if echo "${output}" | grep -q "${gcs_tarball}" ; then - # cache hit - unpack from cache - echo "cache hit" - else - # build and cache - pushd nccl - # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install - install_build_dependencies - if is_debuntu ; then - # These packages are required to build .deb packages from source - execute_with_retries \ - apt-get install -y -qq build-essential devscripts debhelper fakeroot - export NVCC_GENCODE - execute_with_retries make -j$(nproc) pkg.debian.build - elif is_rocky ; then - # These packages are required to build .rpm packages from source - execute_with_retries \ - dnf -y -q install rpm-build rpmdevtools - export NVCC_GENCODE - execute_with_retries make -j$(nproc) pkg.redhat.build - fi - tar czvf "/${local_tarball}" "../${build_path}" - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - rm "${local_tarball}" - make clean - popd - fi - gcloud storage cat "${gcs_tarball}" | tar xz - } - - if is_debuntu ; then - dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb" - elif is_rocky ; then - rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm" - fi - - popd - touch "${workdir}/complete/nccl" -} - -function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) -function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) - -function install_nvidia_cudnn() { - if test -f "${workdir}/complete/cudnn" ; then return ; fi - local major_version - major_version="${CUDNN_VERSION%%.*}" - local cudnn_pkg_version - cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" +function install_nvidia_cudnn() { + local major_version + major_version="${CUDNN_VERSION%%.*}" + local cudnn_pkg_version + cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" if is_rocky ; then if is_cudnn8 ; then @@ -1319,6 +515,7 @@ function install_nvidia_cudnn() { if ge_debian12 && is_src_os ; then apt-get -y install nvidia-cudnn else + local CUDNN="${CUDNN_VERSION%.*}" if is_cudnn8 ; then install_local_cudnn8_repo @@ -1328,8 +525,6 @@ function install_nvidia_cudnn() { apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" - - uninstall_local_cudnn8_repo sync elif is_cudnn9 ; then install_cuda_keyring_pkg @@ -1346,15 +541,118 @@ function install_nvidia_cudnn() { echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi fi + elif is_ubuntu ; then + local -a packages + packages=( + "libcudnn${major_version}=${cudnn_pkg_version}" + "libcudnn${major_version}-dev=${cudnn_pkg_version}") + execute_with_retries \ + apt-get install -q -y --no-install-recommends "${packages[*]}" + sync else - echo "Unsupported OS: '${_shortname}'" + echo "Unsupported OS: '${OS_NAME}'" exit 1 fi ldconfig - echo "NVIDIA cuDNN successfully installed for ${_shortname}." - touch "${workdir}/complete/cudnn" + echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." +} + +CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" +PSN="$(get_metadata_attribute private_secret_name)" +readonly PSN +function configure_dkms_certs() { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping"; + return 0 + fi + + mkdir -p "${CA_TMPDIR}" + + # If the private key exists, verify it + if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then + echo "Private key material exists" + + local expected_modulus_md5sum + expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum) + if [[ -n "${expected_modulus_md5sum}" ]]; then + modulus_md5sum="${expected_modulus_md5sum}" + else + modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3" + fi + + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched rsa key modulus" + fi + ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key + + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched x509 cert modulus" + fi + + return + fi + + + # Retrieve cloud secrets keys + local sig_priv_secret_name + sig_priv_secret_name="${PSN}" + local sig_pub_secret_name + sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" + local sig_secret_project + sig_secret_project="$(get_metadata_attribute secret_project)" + local sig_secret_version + sig_secret_version="$(get_metadata_attribute secret_version)" + + # If metadata values are not set, do not write mok keys + if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi + + # Write private material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_priv_secret_name}" \ + | dd status=none of="${CA_TMPDIR}/db.rsa" + + # Write public material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_pub_secret_name}" \ + | base64 --decode \ + | dd status=none of="${CA_TMPDIR}/db.der" + + # symlink private key and copy public cert from volatile storage for DKMS + if is_ubuntu ; then + mkdir -p /var/lib/shim-signed/mok + ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv + cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der + else + mkdir -p /var/lib/dkms/ + ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key + cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub + fi +} + +function clear_dkms_key { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping" >&2 + return 0 + fi + rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv +} + +function add_contrib_component() { + if ge_debian12 ; then + # Include in sources file components on which nvidia-kernel-open-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list + fi } function add_nonfree_components() { @@ -1370,93 +668,76 @@ function add_nonfree_components() { fi } -# -# Install package signing key and add corresponding repository -# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html function add_repo_nvidia_container_toolkit() { - local nvctk_root="https://nvidia.github.io/libnvidia-container" - local signing_key_url="${nvctk_root}/gpgkey" - local repo_data - - if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" - else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi + if is_debuntu ; then + local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list + # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html + test -f "${kr_path}" || + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o "${kr_path}" - os_add_repo nvidia-container-toolkit \ - "${signing_key_url}" \ - "${repo_data}" \ - "no" + test -f "${sources_list_path}" || + curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \ + | tee "${sources_list_path}" + fi } function add_repo_cuda() { if is_debuntu ; then - install_cuda_keyring_pkg # 11.7+, 12.0+ + local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg + local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" + echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ + | sudo tee "${sources_list_path}" + curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \ + -o "${kr_path}" elif is_rocky ; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" + execute_with_retries "dnf clean all" fi } +readonly uname_r=$(uname -r) function build_driver_from_github() { - # non-GPL driver will have been built on rocky8 - if is_rocky8 ; then return 0 ; fi + if is_ubuntu ; then + mok_key=/var/lib/shim-signed/mok/MOK.priv + mok_der=/var/lib/shim-signed/mok/MOK.der + else + mok_key=/var/lib/dkms/mok.key + mok_der=/var/lib/dkms/mok.pub + fi + workdir=/opt/install-nvidia-driver + mkdir -p "${workdir}" pushd "${workdir}" - test -d "${workdir}/open-gpu-kernel-modules" || { - local tarball_fn="${DRIVER_VERSION}.tar.gz" + tarball_fn="${DRIVER_VERSION}.tar.gz" curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ | tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules } + cd open-gpu-kernel-modules - local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" - test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { - local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local def_dir="${modulus_md5sum:-unsigned}" - local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") - - local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + time make -j$(nproc) modules \ + > /var/log/open-gpu-kernel-modules-build.log \ + 2> /var/log/open-gpu-kernel-modules-build_error.log + sync - if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then - echo "cache hit" - else - # build the kernel modules - pushd open-gpu-kernel-modules - install_build_dependencies - if ( is_cuda11 && is_ubuntu22 ) ; then - echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" - exit 1 - fi - execute_with_retries make -j$(nproc) modules \ - > kernel-open/build.log \ - 2> kernel-open/build_error.log - # Sign kernel modules - if [[ -n "${PSN}" ]]; then - configure_dkms_certs - for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do - "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ - "${mok_key}" \ - "${mok_der}" \ - "${module}" - done - clear_dkms_key - fi - make modules_install \ - >> kernel-open/build.log \ - 2>> kernel-open/build_error.log - # Collect build logs and installed binaries - tar czvf "${local_tarball}" \ - "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ - $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - rm "${local_tarball}" - make clean - popd - fi - gcloud storage cat "${gcs_tarball}" | tar -C / -xzv - depmod -a - } + if [[ -n "${PSN}" ]]; then + #configure_dkms_certs + for module in $(find kernel-open -name '*.ko'); do + "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ + "${mok_key}" \ + "${mok_der}" \ + "${module}" + done + #clear_dkms_key + fi + make modules_install \ + >> /var/log/open-gpu-kernel-modules-build.log \ + 2>> /var/log/open-gpu-kernel-modules-build_error.log popd } @@ -1479,10 +760,12 @@ function build_driver_from_packages() { add_contrib_component apt-get update -qq execute_with_retries apt-get install -y -qq --no-install-recommends dkms + #configure_dkms_certs execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" sync elif is_rocky ; then + #configure_dkms_certs if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else @@ -1490,108 +773,26 @@ function build_driver_from_packages() { fi sync fi + #clear_dkms_key } function install_nvidia_userspace_runfile() { - # Parameters for NVIDIA-provided Debian GPU driver - readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - - readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") - - USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" - readonly USERSPACE_FILENAME - - # This .run file contains NV's OpenGL implementation as well as - # nvidia optimized implementations of the gtk+ 2,3 stack(s) not - # including glib (https://docs.gtk.org/glib/), and what appears to - # be a copy of the source from the kernel-open directory of for - # example DRIVER_VERSION=560.35.03 - # - # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz - # - # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run - # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. - if test -f "${workdir}/complete/userspace" ; then return ; fi - local local_fn="${tmpdir}/userspace.run" - - cache_fetched_package "${USERSPACE_URL}" \ - "${pkg_bucket}/${USERSPACE_FILENAME}" \ - "${local_fn}" - - local runfile_args - runfile_args="" - local cache_hit="0" - local local_tarball - - if is_rocky8 ; then - local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" - test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { - local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" - local_tarball="${workdir}/${build_tarball}" - local def_dir="${modulus_md5sum:-unsigned}" - local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") - - local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" - - if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then - cache_hit="1" - runfile_args="--no-kernel-modules" - echo "cache hit" - else - install_build_dependencies - configure_dkms_certs - local signing_options - signing_options="" - if [[ -n "${PSN}" ]]; then - signing_options="--module-signing-hash sha256 \ - --module-signing-x509-hash sha256 \ - --module-signing-secret-key \"${mok_key}\" \ - --module-signing-public-key \"${mok_der}\" \ - --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ - " - fi - runfile_args="--no-dkms ${signing_options}" - fi - } - else - runfile_args="--no-kernel-modules" - fi - - execute_with_retries bash "${local_fn}" -e -q \ - ${runfile_args} \ - --ui=none \ - --install-libglvnd \ - --tmpdir="${tmpdir}" - - if is_rocky8 ; then - if [[ "${cache_hit}" == "1" ]] ; then - gcloud storage cat "${gcs_tarball}" | tar -C / -xzv - depmod -a - else - clear_dkms_key - tar czvf "${local_tarball}" \ - /var/log/nvidia-installer.log \ - $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - fi - fi - - rm -f "${local_fn}" - touch "${workdir}/complete/userspace" + if test -f "${tmpdir}/userspace-complete" ; then return ; fi + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${USERSPACE_URL}" -o "${tmpdir}/userspace.run" + execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}" + rm -f "${tmpdir}/userspace.run" + touch "${tmpdir}/userspace-complete" sync } function install_cuda_runfile() { - if test -f "${workdir}/complete/cuda" ; then return ; fi - local local_fn="${tmpdir}/cuda.run" - - cache_fetched_package "${NVIDIA_CUDA_URL}" \ - "${pkg_bucket}/${CUDA_RUNFILE}" \ - "${local_fn}" - - execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" - rm -f "${local_fn}" - touch "${workdir}/complete/cuda" + if test -f "${tmpdir}/cuda-complete" ; then return ; fi + time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run" + execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}" + rm -f "${tmpdir}/cuda.run" + touch "${tmpdir}/cuda-complete" sync } @@ -1607,11 +808,12 @@ function install_cuda_toolkit() { if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} + sync elif is_rocky ; then # rocky9: cuda-11-[7,8], cuda-12-[1..6] execute_with_retries dnf -y -q install "${cudatk_package}" + sync fi - sync } function load_kernel_module() { @@ -1628,120 +830,57 @@ function load_kernel_module() { # TODO: if peermem is available, also modprobe nvidia-peermem } -function install_cuda(){ - if test -f "${workdir}/complete/cuda-repo" ; then return ; fi - - if ( ge_debian12 && is_src_os ) ; then - echo "installed with the driver on ${_shortname}" - return 0 - fi - - # The OS package distributions are unreliable - install_cuda_runfile - - # Includes CUDA packages - add_repo_cuda - - touch "${workdir}/complete/cuda-repo" -} - -function install_nvidia_container_toolkit() { - local container_runtime_default - if command -v docker ; then container_runtime_default='docker' - elif command -v containerd ; then container_runtime_default='containerd' - elif command -v crio ; then container_runtime_default='crio' - else container_runtime_default='' ; fi - CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}") - - if test -z "${CONTAINER_RUNTIME}" ; then return ; fi - - add_repo_nvidia_container_toolkit - if is_debuntu ; then - execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else - execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi - nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" - systemctl restart "${CONTAINER_RUNTIME}" -} - # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if test -f "${workdir}/complete/gpu-driver" ; then return ; fi - if ( ge_debian12 && is_src_os ) ; then add_nonfree_components + add_repo_nvidia_container_toolkit apt-get update -qq + #configure_dkms_certs apt-get -yq install \ - dkms \ - nvidia-open-kernel-dkms \ - nvidia-open-kernel-support \ - nvidia-smi \ - libglvnd0 \ - libcuda1 - echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully" - return 0 - fi + nvidia-container-toolkit \ + dkms \ + nvidia-open-kernel-dkms \ + nvidia-open-kernel-support \ + nvidia-smi \ + libglvnd0 \ + libcuda1 + #clear_dkms_key + elif ( le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ) ; then - # OS driver packages do not produce reliable driver ; use runfile - install_nvidia_userspace_runfile + install_nvidia_userspace_runfile - build_driver_from_github - - echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" - touch "${workdir}/complete/gpu-driver" -} + build_driver_from_github -function install_ops_agent(){ - if test -f "${workdir}/complete/ops-agent" ; then return ; fi + install_cuda_runfile + elif is_debuntu ; then + install_cuda_keyring_pkg - mkdir -p /opt/google - cd /opt/google - # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation - curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh - execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install + build_driver_from_packages - touch "${workdir}/complete/ops-agent" -} + install_cuda_toolkit + elif is_rocky ; then + add_repo_cuda -# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics -function install_gpu_monitoring_agent() { - download_gpu_monitoring_agent - install_gpu_monitoring_agent_dependency - start_gpu_monitoring_agent_service -} + build_driver_from_packages -function download_gpu_monitoring_agent(){ - if is_rocky ; then - execute_with_retries "dnf -y -q install git" + install_cuda_toolkit else - execute_with_retries "apt-get install git -y" + echo "Unsupported OS: '${OS_NAME}'" + exit 1 + fi + ldconfig + if is_src_os ; then + echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully" + else + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" fi - mkdir -p /opt/google - chmod 777 /opt/google - cd /opt/google - test -d compute-gpu-monitoring || \ - execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git" -} - -function install_gpu_monitoring_agent_dependency(){ - cd /opt/google/compute-gpu-monitoring/linux - python3 -m venv venv - venv/bin/pip install wheel - venv/bin/pip install -Ur requirements.txt -} - -function start_gpu_monitoring_agent_service(){ - cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system - systemctl daemon-reload - systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_agent() { - # Stackdriver GPU agent parameters -# local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' - local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics' - if ( ! command -v pip && is_debuntu ) ; then - execute_with_retries "apt-get install -y -qq python3-pip" + if ! command -v pip; then + execute_with_retries "apt-get install -y -qq python-pip" fi local install_dir=/opt/gpu-utilization-agent mkdir -p "${install_dir}" @@ -1751,13 +890,7 @@ function install_gpu_agent() { "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" - local venv="${install_dir}/venv" - python3 -m venv "${venv}" -( - source "${venv}/bin/activate" - python3 -m pip install --upgrade pip - execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" -) + execute_with_retries pip install -r "${install_dir}/requirements.txt" sync # Generate GPU service. @@ -1768,7 +901,7 @@ Description=GPU Utilization Metric Agent [Service] Type=simple PIDFile=/run/gpu_agent.pid -ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' +ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"' User=root Group=root WorkingDirectory=/ @@ -1783,57 +916,75 @@ EOF systemctl --no-reload --now enable gpu-utilization-agent.service } -function configure_gpu_exclusive_mode() { - # only run this function when spark < 3.0 - if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi - # include exclusive mode on GPU - nvsmi -c EXCLUSIVE_PROCESS - clear_nvsmi_cache -} - -function fetch_mig_scripts() { - mkdir -p /usr/local/yarn-mig-scripts - chmod 755 /usr/local/yarn-mig-scripts - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - chmod 755 /usr/local/yarn-mig-scripts/* +function set_hadoop_property() { + local -r config_file=$1 + local -r property=$2 + local -r value=$3 + "${bdcfg}" set_property \ + --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ + --name "${property}" --value "${value}" \ + --clobber } -function install_spark_rapids() { - # Update SPARK RAPIDS config - local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" - local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 +function configure_yarn() { + if [[ -d "${HADOOP_CONF_DIR}" && ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu - local -r scala_ver="2.12" + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then - local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 - fi + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' +} - readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) - readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' - local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' + # Fix local dirs access permissions + local yarn_local_dirs=() - local jar_basename + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" - cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi +} - jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" - cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" +function configure_gpu_exclusive_mode() { + # check if running spark 3, if not, enable GPU exclusive mode + local spark_version + spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) + if [[ ${spark_version} != 3.* ]]; then + # include exclusive mode on GPU + nvsmi -c EXCLUSIVE_PROCESS + fi +} - jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" - cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ - "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" +function fetch_mig_scripts() { + mkdir -p /usr/local/yarn-mig-scripts + sudo chmod 755 /usr/local/yarn-mig-scripts + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi + wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh + sudo chmod 755 /usr/local/yarn-mig-scripts/* } function configure_gpu_script() { @@ -1863,7 +1014,6 @@ function configure_gpu_script() { # See the License for the specific language governing permissions and # limitations under the License. # -# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]} ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') @@ -1872,51 +1022,10 @@ EOF chmod a+rx "${gpus_resources_script}" - local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" - - local executor_cores - executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" - local executor_memory - executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" - local task_cpus=2 - local gpu_amount - - # The current setting of spark.task.resource.gpu.amount (0.333) is - # not ideal to get the best performance from the RAPIDS Accelerator - # plugin. It's recommended to be 1/{executor core count} unless you - # have a special use case. -# gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" - gpu_amount="$(perl -e "print 1 / ${executor_cores}")" - -# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression - - cat >>"${spark_defaults_conf}" <> "${spark_defaults_conf}" + fi } function configure_gpu_isolation() { @@ -1949,12 +1058,12 @@ EOF function nvsmi() { local nvsmi="/usr/bin/nvidia-smi" - if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' + if [[ "${nvsmi_works}" == "1" ]] ; then echo "nvidia-smi is working" >&2 elif [[ ! -f "${nvsmi}" ]] ; then echo "nvidia-smi not installed" >&2 ; return 0 elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 else nvsmi_works="1" ; fi - if test -v 1 && [[ "$1" == "-L" ]] ; then + if [[ "$1" == "-L" ]] ; then local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi @@ -1965,35 +1074,14 @@ function nvsmi() { "${nvsmi}" $* } -function clear_nvsmi_cache() { - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then - rm "${nvsmi_query_xml}" - fi -} - -function query_nvsmi() { - if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi - nvsmi -q -x --dtd > "${nvsmi_query_xml}" -} - -function install_build_dependencies() { - if test -f "${workdir}/complete/build-dependencies" ; then return ; fi - +function install_dependencies() { if is_debuntu ; then - if is_ubuntu22 && is_cuda12 ; then - # On ubuntu22, the default compiler does not build some kernel module versions - # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 - execute_with_retries apt-get install -y -qq gcc-12 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 - update-alternatives --set gcc /usr/bin/gcc-12 - fi - + execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen elif is_rocky ; then - execute_with_retries dnf -y -q install gcc + execute_with_retries dnf -y -q install pciutils gcc screen local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" + local install_log="${tmpdir}/install.log" set +e eval "${dnf_cmd}" > "${install_log}" 2>&1 local retval="$?" @@ -2016,259 +1104,364 @@ function install_build_dependencies() { execute_with_retries "${dnf_cmd}" fi - touch "${workdir}/complete/build-dependencies" } -function prepare_gpu_env(){ - set +e - gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" - set -e - echo "gpu_count=[${gpu_count}]" - nvsmi_works="0" - nvsmi_query_xml="${tmpdir}/nvsmi.xml" - xmllint="/opt/conda/miniconda3/bin/xmllint" - NVIDIA_SMI_PATH='/usr/bin' - MIG_MAJOR_CAPS=0 - IS_MIG_ENABLED=0 - CUDNN_PKG_NAME="" - CUDNN8_PKG_NAME="" - CUDA_LOCAL_REPO_INSTALLED="0" +function main() { + # This configuration should be run on all nodes + # regardless if they have attached GPUs + configure_yarn - # Whether to install NVIDIA-provided or OS-provided GPU driver - GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') - readonly GPU_DRIVER_PROVIDER + # Detect NVIDIA GPU + if (lspci | grep -q NVIDIA); then + # if this is called without the MIG script then the drivers are not installed + migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" + if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver - INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') - readonly INSTALL_GPU_AGENT + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then + if (echo "${migquery_result}" | grep Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` + fetch_mig_scripts + fi + fi + fi - # Verify SPARK compatability - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - readonly RAPIDS_RUNTIME + # if mig is enabled drivers would have already been installed + if [[ $IS_MIG_ENABLED -eq 0 ]]; then + install_nvidia_gpu_driver - # determine whether we have nvidia-smi installed and working - nvsmi + load_kernel_module - set_cuda_version - set_driver_version - set_cuda_repo_shortname - set_nv_urls - set_cuda_runfile_url - set_cudnn_version - set_cudnn_tarball_url + if [[ -n ${CUDNN_VERSION} ]]; then + install_nvidia_nccl + install_nvidia_cudnn + fi + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent will not be installed.' + fi - if is_cuda11 ; then gcc_ver="11" - elif is_cuda12 ; then gcc_ver="12" ; fi -} + # for some use cases, the kernel module needs to be removed before first use of nvidia-smi + for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do + rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + done -# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades -# Users should run apt-mark unhold before they wish to upgrade these packages -function hold_nvidia_packages() { - if ! is_debuntu ; then return ; fi + MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")" + if test -n "$(nvsmi -L)" ; then + # cache the result of the gpu query + ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') + echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt" + fi + NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")" + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + # enable MIG on every GPU + for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do + nvsmi -i "${GPU_ID}" --multi-instance-gpu 1 + done + + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' + MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)" + fetch_mig_scripts + else + configure_gpu_exclusive_mode + fi + fi - apt-mark hold nvidia-* - apt-mark hold libnvidia-* - if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then - apt-mark hold xserver-xorg-video-nvidia* + configure_yarn_nodemanager + configure_gpu_script + configure_gpu_isolation + elif [[ "${ROLE}" == "Master" ]]; then + configure_yarn_nodemanager + configure_gpu_script fi -} - -function delete_mig_instances() ( - # delete all instances - set +e - nvidia-smi mig -dci - - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No compute instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac - - nvidia-smi mig -dgi - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No GPU instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac -) -# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles -function configure_mig_cgi() { - delete_mig_instances - META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" - if test -n "${META_MIG_CGI_VALUE}"; then - nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C - else - # https://pci-ids.ucw.cz/v2.2/pci.ids - local pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)" - if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then - # run the following command to list placement profiles - # nvidia-smi mig -lgipp - # - # This is the result when using H100 instances on 20241220 - # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 - # GPU 0 Profile ID 14 Placements: {0,2,4}:2 - # GPU 0 Profile ID 9 Placements: {0,4}:4 - # GPU 0 Profile ID 5 Placement : {0}:4 - # GPU 0 Profile ID 0 Placement : {0}:8 - - # For H100 3D controllers, consider profile 19, 7x1G instances - nvidia-smi mig -cgi 9,9 -C - elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then - # Dataproc only supports H100s right now ; split in 2 if not specified - # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances - nvidia-smi mig -cgi 9,9 -C - else - echo "unrecognized 3D controller" - fi + # Restart YARN services if they are running already + if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then + systemctl restart hadoop-yarn-resourcemanager.service + fi + if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then + systemctl restart hadoop-yarn-nodemanager.service fi - clear_nvsmi_cache } -function enable_mig() { - if test -f "${workdir}/complete/enable-mig" ; then return ; fi - - # Start persistenced if it's not already running - if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi - for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do - # Write an ascii zero to the numa node indicator - echo "0" | dd of="${f}" status=none - done - time nvsmi --gpu-reset # 30s - nvsmi -mig 1 - clear_nvsmi_cache +function clean_up_sources_lists() { + # + # bigtop (primary) + # + local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" - touch "${workdir}/complete/enable-mig" -} + if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then + region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" -function enable_and_configure_mig() { - # default MIG to on when this script is used - META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") + local regional_bigtop_repo_uri + regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | + sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | + grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | + cut -d ' ' -f 2 | + head -1) - if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi + if [[ "${regional_bigtop_repo_uri}" == */ ]]; then + local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" + else + local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" + fi - enable_mig - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" + local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" + rm -f "${bigtop_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ + "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" - if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs. Failing" ; exit 1 ; fi - if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured but NOT enabled. Failing" ; exit 1 ; fi + sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + fi - echo "MIG is fully enabled" - configure_mig_cgi -} + # + # adoptium + # + # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu + local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" + local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" + rm -f "${adoptium_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ + | gpg --dearmor -o "${adoptium_kr_path}" + echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ + > /etc/apt/sources.list.d/adoptium.list -function setup_gpu_yarn() { - # This configuration should be run on all nodes - # regardless if they have attached GPUs - configure_yarn_resources - # When there is no GPU, but the installer is executing on a master node: - if [[ "${gpu_count}" == "0" ]] ; then - if [[ "${ROLE}" == "Master" ]]; then - configure_yarn_nodemanager - fi - return 0 - fi + # + # docker + # + local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" + local docker_repo_file="/etc/apt/sources.list.d/docker.list" + local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" - if [[ "${nvsmi_works}" == "1" ]] ; then - # if this is called without the MIG script then the drivers are not installed - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - set +e - migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" - set -e - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" + rm -f "${docker_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ + | gpg --dearmor -o "${docker_kr_path}" + echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ + > ${docker_repo_file} - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts - fi + # + # google cloud + logging/monitoring + # + if ls /etc/apt/sources.list.d/google-cloud*.list ; then + rm -f /usr/share/keyrings/cloud.google.gpg + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg + for list in google-cloud google-cloud-logging google-cloud-monitoring ; do + list_file="/etc/apt/sources.list.d/${list}.list" + if [[ -f "${list_file}" ]]; then + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" fi - fi + done fi - # if mig is enabled drivers would have already been installed - if [[ $IS_MIG_ENABLED -eq 0 ]]; then - install_nvidia_gpu_driver - install_cuda - load_kernel_module - - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - install_gpu_agent -# install_gpu_monitoring_agent - echo 'GPU metrics agent successfully deployed.' - else - echo 'GPU metrics agent has not been installed.' - fi - configure_gpu_exclusive_mode + # + # cran-r + # + if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" + if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi + rm -f /usr/share/keyrings/cran-r.gpg + curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ + gpg --dearmor -o /usr/share/keyrings/cran-r.gpg + sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list + fi + + # + # mysql + # + if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then + rm -f /usr/share/keyrings/mysql.gpg + curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ + gpg --dearmor -o /usr/share/keyrings/mysql.gpg + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list fi - install_nvidia_container_toolkit - configure_yarn_nodemanager_gpu - configure_gpu_script - configure_gpu_isolation + if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi + } -function gpu_exit_handler() { +function exit_handler() { + set +ex + echo "Exit handler invoked" + + # Purge private key material until next grant + clear_dkms_key + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # If system memory was sufficient to mount memory-backed filesystems if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - for shmdir in /var/cudnn-local ; do - if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do + if grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ; then umount -f ${shmdir} fi done + + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi - hold_nvidia_packages -} + if is_debuntu ; then + # Clean up OS package cache + apt-get -y -qq clean + apt-get -y -qq autoremove + # re-hold systemd package + if ge_debian12 ; then + apt-mark hold systemd libsystemd0 ; fi + else + dnf clean all + fi -function main() { - setup_gpu_yarn + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 | sort -h + elif is_debian ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 | sort -h + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib64/google-cloud-sdk \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + fi + + # Process disk usage logs from installation period + rm -f /run/keep-running-df + sync + sleep 5.01s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem 1K-blocks Used Available Use% Mounted on +#/dev/vda2 7096908 2611344 4182932 39% / + df / | tee -a "/run/disk-usage.log" - echo "yarn setup complete" + perl -e '@siz=( sort { $a => $b } + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } ); +$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +print( " samples-taken: ", scalar @siz, $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" - if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then - install_nvidia_nccl - install_nvidia_cudnn - fi + echo "exit_handler has completed" - if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then - install_spark_rapids - configure_gpu_script - echo "RAPIDS initialized with Spark runtime" - elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then - # we are not currently tooled for installing dask in this action. - echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh" - else - echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + dd if=/dev/zero of=/zero + sync + sleep 3s + rm -f /zero fi - echo "main complete" return 0 } -function exit_handler() { - gpu_exit_handler - pip_exit_handler - common_exit_handler - return 0 +function set_proxy(){ + export METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy)" + export http_proxy="${METADATA_HTTP_PROXY}" + export https_proxy="${METADATA_HTTP_PROXY}" + export HTTP_PROXY="${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="${METADATA_HTTP_PROXY}" + export no_proxy=metadata.google.internal,169.254.169.254 + export NO_PROXY=metadata.google.internal,169.254.169.254 +} + +function mount_ramdisk(){ + local free_mem + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + + # Write to a ramdisk instead of churning the persistent disk + + tmpdir="/mnt/shm" + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" + + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge + pip cache purge || echo "unable to purge pip cache" + + # Download pip packages to tmpfs + pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives + else + mount -t tmpfs tmpfs /var/cache/dnf + fi } function prepare_to_install(){ - prepare_common_env - prepare_pip_env - prepare_gpu_env + nvsmi_works="0" + readonly bdcfg="/usr/local/bin/bdconfig" + tmpdir=/tmp/ + if ! is_debuntu && ! is_rocky ; then + echo "Unsupported OS: '$(os_name)'" + exit 1 + fi + + repair_old_backports + + export DEBIAN_FRONTEND=noninteractive + trap exit_handler EXIT + mount_ramdisk + install_log="${tmpdir}/install.log" + + set_proxy + + if is_debuntu ; then + clean_up_sources_lists + apt-get update -qq + apt-get -y clean + sleep 5s + apt-get -y -qq autoremove + if ge_debian12 ; then + apt-mark unhold systemd libsystemd0 ; fi + else + dnf clean all + fi + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e + time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero + ) fi + + configure_dkms_certs + + install_dependencies + + # Monitor disk usage in a screen session + df / > "/run/disk-usage.log" + touch "/run/keep-running-df" + screen -d -m -US keep-running-df \ + bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" } prepare_to_install diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 1f3328eaa..f8438915f 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -4,27 +4,27 @@ from absl.testing import absltest from absl.testing import parameterized -import unittest - from integration_tests.dataproc_test_case import DataprocTestCase -DEFAULT_TIMEOUT = 15 # minutes -DEFAULT_CUDA_VERSION = "12.4" class NvidiaGpuDriverTestCase(DataprocTestCase): COMPONENT = "gpu" INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] GPU_L4 = "type=nvidia-l4" GPU_T4 = "type=nvidia-tesla-t4" + GPU_V100 = "type=nvidia-tesla-v100" # not available in us-central1-a + GPU_A100 = "type=nvidia-tesla-a100" GPU_H100 = "type=nvidia-h100-80gb,count=8" def verify_instance(self, name): # Verify that nvidia-smi works - import random - # Many failed nvidia-smi attempts have been caused by impatience and temporal collisions - time.sleep( 3 + random.randint(1, 30) ) + time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience self.assert_instance_command(name, "nvidia-smi", 1) + def verify_pyspark(self, name): + # Verify that pyspark works + self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -41,27 +41,6 @@ def verify_instance_nvcc(self, name, cuda_version): self.assert_instance_command( name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) ) - def verify_instance_pyspark(self, name): - # Verify that pyspark works - self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) - - def verify_instance_cuda_version(self, name, cuda_version): - self.assert_instance_command( - name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) ) - - def verify_instance_driver_version(self, name, driver_version): - self.assert_instance_command( - name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) ) - - def verify_pyspark(self): - self.assert_dataproc_job( - self.getClusterName(), - "pyspark", - """--properties="spark.executor.resource.gpu.amount=1" \ - --properties="spark.task.resource.gpu.amount=0.01" \ - '{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO) - ) - def verify_instance_spark(self): self.assert_dataproc_job( self.getClusterName(), @@ -77,22 +56,6 @@ def verify_instance_spark(self): + "spark.yarn.unmanagedAM.enabled=false" ) - def verify_driver_signature(self, name): - cert_path='/var/lib/dkms/mok.pub' - if self.getImageOs() == 'ubuntu': - cert_path='/var/lib/shim-signed/mok/MOK.der' - - cert_verification_cmd = """ -perl -Mv5.10 -e ' -my $cert = ( qx{openssl x509 -inform DER -in {} -text} - =~ /Serial Number:.*? +(.+?)\s*$/ms ); -my $kmod = ( qx{modinfo nvidia} - =~ /^sig_key:\s+(\S+)/ms ); -exit 1 unless $cert eq lc $kmod -' -""" - self.assert_instance_command( name, cert_verification_cmd.format(cert_path) ) - @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), # ("STANDARD", ["m"], GPU_T4, None, None), @@ -101,14 +64,8 @@ def verify_driver_signature(self, name): def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") - - if configuration == 'SINGLE' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) - self.skipTest("known to fail") + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") metadata = None if driver_provider is not None: @@ -116,18 +73,17 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-standard-32", + machine_type="n1-highmem-8", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, # This cluster is sized and timed appropriately to build the kernel driver and nccl - boot_disk_size="60GB") + timeout_in_minutes=90, + boot_disk_size="50GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) - self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) - self.verify_instance_pyspark(machine_name) - self.verify_pyspark() + if ( self.getImageOs() != 'rocky' ) or ( configuration != 'SINGLE' ) or ( configuration == 'SINGLE' and self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1") ): + self.verify_pyspark(machine_name) @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), @@ -135,16 +91,13 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): + self.skipTest("No need to regularly test not installing the agent") - metadata = "install-gpu-agent=false" - if configuration == 'SINGLE' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) - self.skipTest("known to fail") + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + metadata = "install-gpu-agent=false" if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) self.createCluster( @@ -154,27 +107,22 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, + timeout_in_minutes=30, boot_disk_size="50GB") for machine_suffix in machine_suffixes: - machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - self.verify_pyspark() + self.verify_instance("{}-{}".format(self.getClusterName(), + machine_suffix)) + @parameterized.parameters( - ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), # ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"), ) def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") - - if configuration == 'KERBEROS' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # ('KERBEROS fails with image version <= 2.1') - unittest.expectedFailure(self) - self.skipTest("known to fail") + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") metadata = "install-gpu-agent=true" if driver_provider is not None: @@ -186,47 +134,40 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, + timeout_in_minutes=30, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: - machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - self.verify_instance_gpu_agent(machine_name) - self.verify_pyspark() + self.verify_instance("{}-{}".format(self.getClusterName(), + machine_suffix)) + self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), + machine_suffix)) @parameterized.parameters( - ("SINGLE", ["m"], GPU_T4, None, "12.4"), -# ("SINGLE", ["m"], GPU_T4, None, "11.8"), +# ("SINGLE", ["m"], GPU_T4, None, "12.0"), + ("SINGLE", ["m"], GPU_T4, None, "11.8"), ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), - ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if configuration == 'KERBEROS' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # ('KERBEROS fails with image version <= 2.1') - unittest.expectedFailure(self) - self.skipTest("known to fail") + if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ + and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): + self.skipTest("CUDA == 12.0 not supported on debian 12") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ + and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) - - if configuration == 'SINGLE' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) - self.skipTest("known to fail") - + self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -236,41 +177,40 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, + timeout_in_minutes=30, boot_disk_size="50GB") - for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) self.verify_instance_nvcc(machine_name, cuda_version) - self.verify_instance_pyspark(machine_name) - self.verify_pyspark() @parameterized.parameters( - ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "11.8"), -# ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.0"), - ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.4"), + ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"), +# ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"), + ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"), ) def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): - # Operation [projects/.../regions/.../operations/...] failed: - # Invalid value for field 'resource.machineType': \ - # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \ - # 'machineTypes/a3-highgpu-8g'. \ - # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature.. - # ('This use case not thoroughly tested') - unittest.expectedFailure(self) - self.skipTest("known to fail") - - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ + + self.skipTest("Test is known to fail. Skipping so that we can exercise others") + + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + + if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ + and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): + self.skipTest("CUDA == 12.0 not supported on debian 12") + + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ + and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version) @@ -278,11 +218,11 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, configuration, self.INIT_ACTIONS, master_machine_type="a3-highgpu-8g", - worker_machine_type="a3-highgpu-8g", + worker_machine_type="a2-highgpu-2g", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, + timeout_in_minutes=30, boot_disk_size="50GB", startup_script="gpu/mig.sh") @@ -296,13 +236,12 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if configuration == 'SINGLE' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) - self.skipTest("known to fail") + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ + and configuration == 'SINGLE': + self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") metadata = None if driver_provider is not None: @@ -316,9 +255,9 @@ def test_gpu_allocation(self, configuration, master_accelerator, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, boot_disk_size="50GB", - timeout_in_minutes=90) + timeout_in_minutes=30) - self.verify_pyspark() + self.verify_instance_spark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, "11.8"), @@ -331,83 +270,28 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator, worker_accelerator, cuda_version): - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ - and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ - ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ - and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ + and configuration == 'SINGLE': + self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty") - if configuration == 'SINGLE' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) - self.skipTest("known to fail") + if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ + and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): + self.skipTest("CUDA == 12.0 not supported on debian 12") - metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) - self.createCluster( - configuration, - self.INIT_ACTIONS, - machine_type="n1-highmem-8", - master_accelerator=master_accelerator, - worker_accelerator=worker_accelerator, - metadata=metadata, - timeout_in_minutes=90, - boot_disk_size="50GB", - scopes="https://www.googleapis.com/auth/monitoring.write") - - for machine_suffix in machine_suffixes: - machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - self.verify_instance_gpu_agent(machine_name) - self.verify_pyspark() - - @parameterized.parameters( -# ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''), -# ("STANDARD", ["m"], GPU_T4, None, "12.0"), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.1.1", 'rocky', '2.0'), -# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'), -# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'), -# ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'), -# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), -# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), - ) - def tests_driver_signing(self, configuration, machine_suffixes, - master_accelerator, worker_accelerator, - cuda_version, image_os, image_version): + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ + ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): + self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ + and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) - - if configuration == 'KERBEROS' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # ('KERBEROS fails with image version <= 2.1') - unittest.expectedFailure(self) - self.skipTest("known to fail") - - kvp_array=[] - import os - - if "private_secret_name" in os.environ: - for env_var in ['public_secret_name', 'private_secret_name', 'secret_project', 'secret_version' 'modulus_md5sum']: - kvp_array.append( "{}={}".format( env_var, os.environ[env_var] ) ) - - if kvp_array[0] == "public_secret_name=": - self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") - else: - self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") - - metadata = ",".join( kvp_array ) - - if self.getImageOs() != image_os: - self.skipTest("This test is only run on os {}".format(image_os)) - if self.getImageVersion() != image_version: - self.skipTest("This test is only run on Dataproc Image Version {}".format(image_os)) + self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( configuration, self.INIT_ACTIONS, @@ -415,16 +299,16 @@ def tests_driver_signing(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, + timeout_in_minutes=30, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: - hostname="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(hostname) - self.verify_instance_gpu_agent(hostname) - self.verify_driver_signature(hostname) + self.verify_instance("{}-{}".format(self.getClusterName(), + machine_suffix)) + self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), + machine_suffix)) - self.verify_pyspark() + self.verify_instance_spark() if __name__ == "__main__": absltest.main() diff --git a/gpu/verify_pyspark.py b/gpu/verify_pyspark.py deleted file mode 100644 index 9f2b18683..000000000 --- a/gpu/verify_pyspark.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2025 Google LLC and contributors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS-IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import matplotlib.pyplot as plt -import numpy as np - -from pyspark import SparkContext -from pyspark.sql import SparkSession -from pyspark import SparkConf, StorageLevel -from tqdm import tqdm -from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover -import pyspark.sql.functions as f - -spark = SparkSession.builder.appName("spark-rapids").getOrCreate() - -#from utils import SimpleTimer, ResultsLogger, visualize_data - -conf = (SparkConf().setMaster("local[*]") - .setAppName("SparkVectorizer") - .set('spark.driver.memory', '300G') - .set('spark.driver.maxResultSize', '20G') - .set('spark.network.timeout', '7200s') - ) - -sc = SparkContext.getOrCreate(conf=conf) -sc.setLogLevel("FATAL") -spark = SparkSession(sc) -print(sc._conf.getAll()) # check context settings - -x = np.linspace(0, 3*np.pi, 500) -plt.plot(x, np.sin(x**2)) -plt.title('A simple chirp'); From f00e2f80696c537f2143cd408e9831e3fbf57c46 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 13:20:31 -0800 Subject: [PATCH 118/130] including libtemplate-perl as a dependency --- cloudbuild/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index aebaffd84..644219305 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -22,7 +22,8 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \ dd of="${bazel_repo_file}" status=none && \ apt-get update -qq RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \ - apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \ + apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} \ + libtemplate-perl > /dev/null 2>&1 && \ apt-get clean # Set bazel-${bazel_version} as the default bazel alternative in this container From 7118ebf5704602394fe8d24cdd79ccc3b27e32af Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 13:39:08 -0800 Subject: [PATCH 119/130] moved to dask-template-20250104 --- templates/dask/dask.sh.in | 63 --------------------------------------- 1 file changed, 63 deletions(-) delete mode 100644 templates/dask/dask.sh.in diff --git a/templates/dask/dask.sh.in b/templates/dask/dask.sh.in deleted file mode 100644 index 2f8450dd6..000000000 --- a/templates/dask/dask.sh.in +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -# -[% INSERT legal/license_header %] -# -[% PROCESS common/template_disclaimer %] -# -# This initialization action script will install Dask and other relevant -# libraries on a Dataproc cluster. This is supported for either "yarn" or -# "standalone" runtimes Please see dask.org and yarn.dask.org for more -# information. - -set -euxo pipefail - -[% INSERT common/util_functions %] - -[% INSERT dask/util_functions %] - -function main() { - # Install Dask - install_dask - - # In "standalone" mode, Dask relies on a systemd unit to launch. - # In "yarn" mode, it relies on a config.yaml file. - if [[ "${DASK_RUNTIME}" == "yarn" ]]; then - # Create Dask YARN config file - configure_dask_yarn - elif [[ "${DASK_RUNTIME}" == "standalone" ]]; then - # Create Dask service - install_systemd_dask_service - start_systemd_dask_service - - configure_knox_for_dask - - local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging 'false')" - if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then - configure_fluentd_for_dask - fi - else - echo "Unsupported Dask Runtime: ${DASK_RUNTIME}" - exit 1 - fi - - echo "Dask for ${DASK_RUNTIME} successfully initialized." -} - -function exit_handler() { - pip_exit_handler - common_exit_handler - return 0 -} - -function prepare_to_install(){ - prepare_common_env - prepare_conda_env - conda_env="$(get_metadata_attribute conda-env 'dask')" - readonly conda_env - prepare_dask_env - trap exit_handler EXIT -} - -prepare_to_install - -main From f2b50f74d4a8463f0c678e8042b0c61a7ec60cb2 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 13:39:40 -0800 Subject: [PATCH 120/130] moved to gpu-template-20250107 --- templates/gpu/install_gpu_driver.sh.in | 80 -------------------------- 1 file changed, 80 deletions(-) delete mode 100644 templates/gpu/install_gpu_driver.sh.in diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in deleted file mode 100644 index a7c4d353f..000000000 --- a/templates/gpu/install_gpu_driver.sh.in +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash -# -[% INSERT legal/license_header %] -# -[% PROCESS common/template_disclaimer %] -# -# This script installs NVIDIA GPU drivers and collects GPU utilization metrics. - -set -euxo pipefail - -[% INSERT common/util_functions %] - -[% INSERT common/install_functions %] - -[% INSERT gpu/util_functions %] - -[% INSERT gpu/install_functions %] - -[% INCLUDE gpu/yarn_functions %] - -[% INSERT gpu/spark_functions %] - -function main() { - install_gpu_driver_and_cuda - - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - install_gpu_agent -# install_gpu_monitoring_agent - echo 'GPU metrics agent successfully deployed.' - else - echo 'GPU metrics agent has not been installed.' - fi - configure_gpu_exclusive_mode - - setup_gpu_yarn - - echo "yarn setup complete" - - if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then - install_nvidia_nccl - install_nvidia_cudnn - fi - - if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then - install_spark_rapids - configure_gpu_script - echo "RAPIDS initialized with Spark runtime" - elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then - # we are not currently tooled for installing dask in this action. - echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh" - else - echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" - fi - - echo "main complete" - return 0 -} - -function exit_handler() { - set +e - gpu_install_exit_handler - gpu_exit_handler - pip_exit_handler - yarn_exit_handler - common_exit_handler - return 0 -} - -function prepare_to_install(){ - prepare_common_env - prepare_pip_env - prepare_gpu_env - prepare_gpu_install_env - trap exit_handler EXIT -} - -prepare_to_install - -main From 900c10a0d34bae4dbc50685c3cc42b7e7b45341b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 13:56:26 -0800 Subject: [PATCH 121/130] * include version in template disclaimer * include version in action generator --- templates/common/template_disclaimer | 9 ++++++--- templates/generate-action.pl | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/templates/common/template_disclaimer b/templates/common/template_disclaimer index 3b417deff..1c2d22b04 100644 --- a/templates/common/template_disclaimer +++ b/templates/common/template_disclaimer @@ -1,5 +1,8 @@ +# +# Google Cloud Dataproc Initialization Actions v[% IA_VERSION %] +# # This initialization action is generated from -# initialization-actions/templates/[% template_path %] +# initialization-actions/templates/[% template_path %].in # -# Modifications made directly to the generated file will be lost when -# the template is re-evaluated +# Modifications made directly to generated files will be lost when the +# templates are next evaluated. diff --git a/templates/generate-action.pl b/templates/generate-action.pl index 690acb409..2e1d344ff 100644 --- a/templates/generate-action.pl +++ b/templates/generate-action.pl @@ -7,6 +7,9 @@ use Template; use strict; +# Version of Initialization Actions we will generate +my $IA_VERSION="0.0.1"; + my $action = $ARGV[0]; my $v = { template_path => "${action}", From bef08b17cc0018b7426469b5fff49ab3b8bc255b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 19:02:43 -0800 Subject: [PATCH 122/130] migrated rapids.sh base template to rapids-template-20250106 --- templates/rapids/rapids.sh.in | 63 ----------------------------------- 1 file changed, 63 deletions(-) delete mode 100644 templates/rapids/rapids.sh.in diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in deleted file mode 100644 index 61b7247c0..000000000 --- a/templates/rapids/rapids.sh.in +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -# -[% INSERT legal/license_header %] -# -[% PROCESS common/template_disclaimer %] -# -# This initialization action script will install rapids on a Dataproc -# cluster. - -set -euxo pipefail - -[% INSERT common/util_functions %] - -[% INSERT gpu/util_functions %] - -[% INSERT dask/util_functions %] - -function main() { - # Install Dask with RAPIDS - install_dask_rapids - - # In "standalone" mode, Dask relies on a systemd unit to launch. - # In "yarn" mode, it relies a config.yaml file. - if [[ "${DASK_RUNTIME}" == "yarn" ]]; then - # Create cuda accelerated Dask YARN config file - configure_dask_yarn - echo "yarn setup complete" - else - # Create Dask service - install_systemd_dask_service - start_systemd_dask_service - - configure_knox_for_dask - - local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging 'false')" - if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then - configure_fluentd_for_dask - fi - fi -} - -function exit_handler() { - gpu_exit_handler - pip_exit_handler - conda_exit_handler - common_exit_handler - return 0 -} - -function prepare_to_install(){ - prepare_common_env - conda_env="$(get_metadata_attribute conda-env 'dask-rapids')" - readonly conda_env - prepare_dask_rapids_env - prepare_conda_env - prepare_pip_env - prepare_gpu_env - trap exit_handler EXIT -} - -prepare_to_install - -main From aa792c39fb597322c36a4f835503b8011e12c498 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 19:14:11 -0800 Subject: [PATCH 123/130] script to generate all actions from templates --- templates/generate-all-actions.sh | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 templates/generate-all-actions.sh diff --git a/templates/generate-all-actions.sh b/templates/generate-all-actions.sh new file mode 100644 index 000000000..2b25d99c5 --- /dev/null +++ b/templates/generate-all-actions.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +for tt in $(find templates -name '*.sh.in') ; do + genfile=`perl -e "print( q{${tt}} =~ m:templates/(.*?.sh).in: )"` + perl templates/generate-action.pl "${genfile}" | tee "${genfile}" > /tmp/$(basename $genfile) +done From 824bcf85e1d74287c86636f3e5e285fd680595db Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 20:16:02 -0800 Subject: [PATCH 124/130] spark prepare steps belong in common --- templates/common/util_functions | 26 ++++++++++++++++++++++++++ templates/gpu/spark_functions | 29 ----------------------------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 9a6407a7b..7cbef0849 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -491,6 +491,32 @@ function prepare_conda_env() { } function prepare_common_env() { + SPARK_NLP_VERSION="3.2.1" # Must include subminor version here + SPARK_JARS_DIR=/usr/lib/spark/jars + SPARK_CONF_DIR='/etc/spark/conf' + SPARK_BIGQUERY_VERSION="$(get_metadata_attribute spark-bigquery-connector-version "${DEFAULT_SPARK_BIGQUERY_VERSION:-0.22.0}")" + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" + + readonly SPARK_VERSION SPARK_BIGQUERY_VERSION SPARK_CONF_DIR SPARK_JARS_DIR SPARK_NLP_VERSION + + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 + fi + + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi + # Verify OS compatability and Secure boot state check_os check_secure_boot diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions index 25a99221e..fa29330de 100644 --- a/templates/gpu/spark_functions +++ b/templates/gpu/spark_functions @@ -41,32 +41,3 @@ function install_spark_rapids() { "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ "/usr/lib/spark/jars/${jar_basename}" } - -function prepare_spark_env() { - SPARK_NLP_VERSION="3.2.1" # Must include subminor version here - SPARK_JARS_DIR=/usr/lib/spark/jars - SPARK_CONF_DIR='/etc/spark/conf' - SPARK_BIGQUERY_VERSION="$(get_metadata_attribute spark-bigquery-connector-version "${DEFAULT_SPARK_BIGQUERY_VERSION:-0.22.0}")" - SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - - readonly SPARK_VERSION SPARK_BIGQUERY_VERSION SPARK_CONF_DIR SPARK_JARS_DIR SPARK_NLP_VERSION - - if version_lt "${SPARK_VERSION}" "3.1" || \ - version_ge "${SPARK_VERSION}" "4.0" ; then - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." - exit 1 - fi - - # Detect dataproc image version - if (! test -v DATAPROC_IMAGE_VERSION) ; then - if test -v DATAPROC_VERSION ; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" - else - if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" - elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" - elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" - else echo "Unknown dataproc image version" ; exit 1 ; fi - fi - fi - -} From 374ff96149207fc8e5a2ab705640f84f7beb4d74 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 11 Jan 2025 19:31:36 -0800 Subject: [PATCH 125/130] less noise in temp directory --- templates/generate-all-actions.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/templates/generate-all-actions.sh b/templates/generate-all-actions.sh index 2b25d99c5..ce5caef35 100644 --- a/templates/generate-all-actions.sh +++ b/templates/generate-all-actions.sh @@ -2,5 +2,6 @@ for tt in $(find templates -name '*.sh.in') ; do genfile=`perl -e "print( q{${tt}} =~ m:templates/(.*?.sh).in: )"` - perl templates/generate-action.pl "${genfile}" | tee "${genfile}" > /tmp/$(basename $genfile) + mkdir -p /tmp/init/$(dirname $genfile) + perl templates/generate-action.pl "${genfile}" | tee "${genfile}" > "/tmp/init/${genfile}" done From 5a37d94ca77f1a82525bad7336fda2dca92a73ce Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 15 Jan 2025 16:41:45 -0800 Subject: [PATCH 126/130] tested with much older versions of CUDA on an old dataproc image from pre-2023 --- templates/common/util_functions | 2 +- templates/gpu/install_functions | 60 ++++++++++++++++++++------ templates/gpu/mig_functions | 27 +++++++++++- templates/gpu/util_functions | 76 ++++++++++++++++++--------------- 4 files changed, 114 insertions(+), 51 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 7cbef0849..9c7bfeba9 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -554,7 +554,7 @@ function prepare_common_env() { if is_debuntu ; then clean_up_sources_lists - apt-get update -qq + apt-get update -qq --allow-releaseinfo-change apt-get -y clean apt-get -o DPkg::Lock::Timeout=60 -y autoremove if ge_debian12 ; then diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions index 746eb79bb..1ba76c236 100644 --- a/templates/gpu/install_functions +++ b/templates/gpu/install_functions @@ -46,7 +46,7 @@ function set_cuda_runfile_url() { local MAX_DRIVER_VERSION local MAX_CUDA_VERSION - local MIN_OPEN_DRIVER_VER="515.48.07" + MIN_OPEN_DRIVER_VER="515.43.04" local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER @@ -84,7 +84,33 @@ function set_cuda_runfile_url() { # driver version named in cuda runfile filename # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) +# 10.0.130/410.48 =https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux +# 10.1.234/418.87.00=https://developer.download.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.243_418.87.00_linux.run +# 10.2.89/440.33.01 =https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run +# 11.0.3/450.51.06 =https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run +# 11.1.1/455.42.00 =https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run +# 11.2.2/460.32.03 =https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run +# 11.3.1/465.19.01 =https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.19.01_linux.run +# 11.4.4/470.82.01 =https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run +# 11.5.2/495.29.05 =https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run +# 11.6.2/510.47.03 =https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run +# 11.7.1/515.65.01 =https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run +# 11.8.0/520.61.05 =https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run +# 12.0.1/525.85.12 =https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run +# 12.1.1/530.30.02 =https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run +# 12.2.2/535.104.05 =https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run +# 12.3.2/545.23.08 =https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run readonly -A drv_for_cuda=( + ["10.0.130"]="410.48" + ["10.1.234"]="418.87.00" + ["10.2.89"]="440.33.01" + ["11.0.3"]="450.51.06" + ["11.1.1"]="455.42.00" + ["11.2.2"]="460.32.03" + ["11.3.1"]="465.19.01" + ["11.4.4"]="470.82.01" + ["11.5.2"]="495.29.05" + ["11.6.2"]="510.47.03" ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" ["11.8.0"]="520.61.05" ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" @@ -108,7 +134,8 @@ function set_cuda_runfile_url() { CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" readonly CUDA_RUNFILE - if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then + # version naming and archive url were erratic prior to 11.0.3 + if ( version_ge "${CUDA_FULL_VERSION}" "11.0.3" && ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ) ; then echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" exit 1 fi @@ -292,13 +319,13 @@ function install_nvidia_nccl() { # Hopper: SM_90,SM_90a compute_90,compute_90a # Blackwell: SM_100, compute_100 NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86" + if version_gt "${CUDA_VERSION}" "11.6" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi if version_ge "${CUDA_VERSION}" "11.8" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" - fi + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi if version_ge "${CUDA_VERSION}" "12.0" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" - fi + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi mkdir -p "${workdir}" pushd "${workdir}" @@ -464,8 +491,8 @@ function add_repo_cuda() { } function build_driver_from_github() { - # non-GPL driver will have been built on rocky8 - if is_rocky8 ; then return 0 ; fi + # non-GPL driver will have been built on rocky8 or if driver version is prior to open kernel version + if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then return 0 ; fi pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { @@ -592,7 +619,7 @@ function install_nvidia_userspace_runfile() { local cache_hit="0" local local_tarball - if is_rocky8 ; then + if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" @@ -604,7 +631,9 @@ function install_nvidia_userspace_runfile() { if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then cache_hit="1" - runfile_args="--no-kernel-modules" + if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then + runfile_args="${runfile_args} --no-kernel-modules" + fi echo "cache hit" else install_build_dependencies @@ -619,10 +648,13 @@ function install_nvidia_userspace_runfile() { --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ " fi - runfile_args="--no-dkms ${signing_options}" + runfile_args="${signing_options}" + if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then + runfile_args="${runfile_args} --no-dkms" + fi fi } - else + elif version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then runfile_args="--no-kernel-modules" fi @@ -632,7 +664,7 @@ function install_nvidia_userspace_runfile() { --install-libglvnd \ --tmpdir="${tmpdir}" - if is_rocky8 ; then + if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then if [[ "${cache_hit}" == "1" ]] ; then gcloud storage cat "${gcs_tarball}" | tar -C / -xzv depmod -a diff --git a/templates/gpu/mig_functions b/templates/gpu/mig_functions index 233b2d02c..7ec29aa25 100644 --- a/templates/gpu/mig_functions +++ b/templates/gpu/mig_functions @@ -66,13 +66,36 @@ function enable_mig() { is_complete enable-mig && return # Start persistenced if it's not already running - if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi +# if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do # Write an ascii zero to the numa node indicator echo "0" | dd of="${f}" status=none done - time nvsmi --gpu-reset # 30s + # nvidia-smi --query-compute-apps=pid --format=csv,noheader + for svc in resourcemanager nodemanager; do + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl stop "hadoop-yarn-${svc}.service" + fi + done + time nvsmi --gpu-reset || { # 30s + echo "unable to reset gpu. Trying to stop services and kernel modules which may have a lock." + # TODO: find a way to reset the A100 without reboot + for tryno in {1..25} ; do ; removed="1" + for mod in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do + if lsmod | grep -q "${mod}" ; then rmmod $mod > /dev/null 2>&1 || removed="0" ; fi ; done + if [[ "${removed}" == "1" ]] ; then + echo "modules removed successfully" + nvsmi --gpu-reset + break + fi + done + } nvsmi -mig 1 + for svc in resourcemanager nodemanager; do + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl start "hadoop-yarn-${svc}.service" + fi + done clear_nvsmi_cache mark_complete enable-mig diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 48473d13b..565ec3ba0 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -1,51 +1,56 @@ function set_support_matrix() { # CUDA version and Driver version # https://docs.nvidia.com/deploy/cuda-compatibility/ - # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html + # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html#framework-matrix # https://developer.nvidia.com/cuda-downloads # Minimum supported version for open kernel driver is 515.43.04 # https://github.com/NVIDIA/open-gpu-kernel-modules/tags - # Rocky8: 12.0: 525.147.05 local latest latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" readonly -A DRIVER_FOR_CUDA=( - ["11.7"]="515.65.01" ["11.8"]="525.147.05" - ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01" + ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31" + ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03" + ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" + ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08" + ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) readonly -A DRIVER_SUBVER=( - ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" - ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" + ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" ["430"]="430.64" + ["435"]="435.21" ["440"]="440.100" ["450"]="450.119.03" + ["455"]="455.45.01" ["460"]="460.91.03" ["465"]="465.31" + ["470"]="470.256.02" ["495"]="495.46" ["510"]="510.108.03" + ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" + ["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.142" + ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.77" ) # https://developer.nvidia.com/cudnn-downloads - if is_debuntu ; then readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" - ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" + ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5" ["11.0"]="8.0.4" + ["11.1"]="8.0.5" ["11.2"]="8.1.1" ["11.3"]="8.2.1" ["11.4"]="8.2.4.15" + ["11.5"]="8.3.1.22" ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" + ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" + ["12.2"]="8.9.5" ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" + ["12.5"]="9.2.1.18" ["12.6"]="9.6.0.74" ) - elif is_rocky ; then - # rocky: - # 12.0: 8.8.1.3 - # 12.1: 8.9.3.28 - # 12.2: 8.9.7.29 - # 12.3: 9.0.0.312 - # 12.4: 9.1.1.17 - # 12.5: 9.2.1.18 - # 12.6: 9.5.1.17 - readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" - ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" - ) - fi # https://developer.nvidia.com/nccl/nccl-download - # 12.2: 2.19.3, 12.5: 2.21.5 readonly -A NCCL_FOR_CUDA=( - ["11.7"]="2.21.5" ["11.8"]="2.21.5" - ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" + ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3" + ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4" ["11.5"]="2.11.4" + ["11.6"]="2.12.10" ["11.7"]="2.12.12" ["11.8"]="2.21.5" + ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.2"]="2.19.3" + ["12.3"]="2.19.4" ["12.4"]="2.23.4" ["12.5"]="2.22.3" + ["12.6"]="2.23.4" ) readonly -A CUDA_SUBVER=( - ["11.7"]="11.7.1" ["11.8"]="11.8.0" - ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" + ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89" + ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2" + ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2" + ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0" + ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" + ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" + ["12.6"]="12.6.3" ) } @@ -131,7 +136,7 @@ function set_driver_version() { export DRIVER_VERSION DRIVER - gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + gpu_driver_url="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" exit 1 @@ -197,19 +202,22 @@ function prepare_gpu_env(){ # Verify SPARK compatability RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}") - readonly RAPIDS_RUNTIME + INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")" + readonly RAPIDS_RUNTIME INCLUDE_GPUS # determine whether we have nvidia-smi installed and working nvsmi } -# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades -# Users should run apt-mark unhold before they wish to upgrade these packages +# Hold all NVIDIA-related packages from upgrading either unintenionally or +# through use of services like unattended-upgrades +# +# Users should run apt-mark unhold before upgrading these packages function hold_nvidia_packages() { if ! is_debuntu ; then return ; fi - apt-mark hold nvidia-* - apt-mark hold libnvidia-* + apt-mark hold nvidia-* > /dev/null 2>&1 + apt-mark hold libnvidia-* > /dev/null 2>&1 if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then apt-mark hold xserver-xorg-video-nvidia* fi From 7662215766b520d006def883c8b0cf8dba440a1f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 16 Jan 2025 10:56:23 -0800 Subject: [PATCH 127/130] exercised older CUDA and mig a100 use case more ; added pytorch installation functionality --- templates/gpu/install_functions | 112 ++++++++++++++++++++++---------- templates/gpu/mig_functions | 55 ++++++++++++++-- templates/gpu/util_functions | 7 +- 3 files changed, 131 insertions(+), 43 deletions(-) diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions index 1ba76c236..8effce9b4 100644 --- a/templates/gpu/install_functions +++ b/templates/gpu/install_functions @@ -1,14 +1,15 @@ function set_cudnn_version() { - readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" + readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39" + readonly DEFAULT_CUDNN8_VERSION="8.3.1.22" readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" # Parameters for NVIDIA-provided cuDNN library DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} readonly DEFAULT_CUDNN_VERSION CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") - # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} - if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" + # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION} + if ( is_rocky && version_le "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then + CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}" elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then # cuDNN v8 is not distribution for ubuntu20+, debian12 CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" @@ -303,30 +304,6 @@ function install_nvidia_nccl() { local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" - # https://github.com/NVIDIA/nccl/blob/master/README.md - # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Fermi: SM_20, compute_30 - # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 - # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 - # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 - - # The following architectures are suppored by open kernel driver - # Volta: SM_70,SM_72, compute_70,compute_72 - # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 - - # The following architectures are supported by CUDA v11.8+ - # Ada: SM_89, compute_89 - # Hopper: SM_90,SM_90a compute_90,compute_90a - # Blackwell: SM_100, compute_100 - NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86" - if version_gt "${CUDA_VERSION}" "11.6" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi - if version_ge "${CUDA_VERSION}" "11.8" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi - if version_ge "${CUDA_VERSION}" "12.0" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi - mkdir -p "${workdir}" pushd "${workdir}" @@ -347,6 +324,30 @@ function install_nvidia_nccl() { local local_tarball="${workdir}/${build_tarball}" local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" + # https://github.com/NVIDIA/nccl/blob/master/README.md + # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Fermi: SM_20, compute_30 + # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 + # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 + # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 + + # The following architectures are suppored by open kernel driver + # Volta: SM_70,SM_72, compute_70,compute_72 + # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 + + # The following architectures are supported by CUDA v11.8+ + # Ada: SM_89, compute_89 + # Hopper: SM_90,SM_90a compute_90,compute_90a + # Blackwell: SM_100, compute_100 + NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86" + if version_gt "${CUDA_VERSION}" "11.6" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi + if version_ge "${CUDA_VERSION}" "11.8" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi + if version_ge "${CUDA_VERSION}" "12.0" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') if echo "${output}" | grep -q "${gcs_tarball}" ; then # cache hit - unpack from cache @@ -369,11 +370,12 @@ function install_nvidia_nccl() { export NVCC_GENCODE execute_with_retries make -j$(nproc) pkg.redhat.build fi - tar czvf "/${local_tarball}" "../${build_path}" - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - rm "${local_tarball}" + tar czvf "${local_tarball}" "../${build_path}" make clean popd + tar xzvf "${local_tarball}" + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" fi gcloud storage cat "${gcs_tarball}" | tar xz } @@ -415,16 +417,16 @@ function install_nvidia_cudnn() { apt-get -y install nvidia-cudnn else if is_cudnn8 ; then - install_local_cudnn8_repo + add_repo_cuda apt-get update -qq + # Ignore version requested and use the latest version in the package index + cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)" execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" - - uninstall_local_cudnn8_repo sync elif is_cudnn9 ; then install_cuda_keyring_pkg @@ -452,6 +454,48 @@ function install_nvidia_cudnn() { mark_complete cudnn } +function install_pytorch() { + if test -f "${workdir}/complete/pytorch" ; then return ; fi + local env + env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce') + local mc3=/opt/conda/miniconda3 + local envpath="${mc3}/envs/${env}" + # Set numa node to 0 for all GPUs + for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done + local verb=create + if test -d "${envpath}" ; then verb=install ; fi + + readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no') + case "${INCLUDE_PYTORCH^^}" in + "1" | "YES" | "TRUE" ) + local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" + + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + if echo "${output}" | grep -q "${gcs_tarball}" ; then + # cache hit - unpack from cache + echo "cache hit" + mkdir -p "${envpath}" + gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz + else + cudart_spec="cuda-cudart" + if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi + "${mc3}/bin/mamba" "${verb}" -n "${env}" \ + -c conda-forge -c nvidia -c rapidsai \ + numba pytorch tensorflow[and-cuda] rapids pyspark \ + "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" + pushd "${envpath}" + tar czf "${local_tarball}" . + popd + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + fi + ;; + * ) echo "skip pytorch install" ;; + esac + touch "${workdir}/complete/pytorch" +} + function add_nonfree_components() { if is_src_nvidia ; then return; fi if ge_debian12 ; then diff --git a/templates/gpu/mig_functions b/templates/gpu/mig_functions index 7ec29aa25..7d94b7dcf 100644 --- a/templates/gpu/mig_functions +++ b/templates/gpu/mig_functions @@ -65,38 +65,81 @@ function configure_mig_cgi() { function enable_mig() { is_complete enable-mig && return - # Start persistenced if it's not already running -# if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi + # All devices on the same numa node for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do # Write an ascii zero to the numa node indicator echo "0" | dd of="${f}" status=none done + + echo "Stopping services and kernel modules in preparation for enabling mig." + if ( ps auwx | grep -i nvidia\\-persistenced ) ; then killall -9 nvidia-persistenced ; fi + # nvidia-smi --query-compute-apps=pid --format=csv,noheader for svc in resourcemanager nodemanager; do if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then systemctl stop "hadoop-yarn-${svc}.service" fi done + # can lsof be used to determine what processes have a file with name =~ /nvidia/ under the /dev/ directory ? + # if so, stop the service which launches the process with the open filehandle + + MIG_GPU_LIST="`nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n ""`" + NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")" + +# root@cluster-1718310842-m:/tmp# for m in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do sudo rmmod $m ; done +# rmmod: ERROR: Module nvidia_drm is not currently loaded +# rmmod: ERROR: Module nvidia_modeset is not currently loaded +# rmmod: ERROR: Module nvidia_uvm is not currently loaded +# rmmod: ERROR: Module nvidia is not currently loaded +# root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --gpu-reset +# Resetting GPU 00000000:00:04.0 is not supported. +# root@cluster-1718310842-m:/tmp# nvidia-smi -i 0 --multi-instance-gpu=1 +# Warning: MIG mode is in pending enable state for GPU 00000000:00:04.0:Not Supported +# Reboot the system or try nvidia-smi --gpu-reset to make MIG mode effective on GPU 00000000:00:04.0 +# All done. +# root@cluster-1718310842-m:/tmp# echo $? +# 0 +# root@cluster-1718310842-m:/tmp# /usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader +# Disabled + + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + time nvsmi --gpu-reset || { # 30s - echo "unable to reset gpu. Trying to stop services and kernel modules which may have a lock." # TODO: find a way to reset the A100 without reboot + removed="1" for tryno in {1..25} ; do ; removed="1" for mod in nvidia_drm nvidia_modeset nvidia_uvm nvidia ; do if lsmod | grep -q "${mod}" ; then rmmod $mod > /dev/null 2>&1 || removed="0" ; fi ; done if [[ "${removed}" == "1" ]] ; then echo "modules removed successfully" - nvsmi --gpu-reset - break + nvsmi --gpu-reset && break fi done } - nvsmi -mig 1 + + if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then + for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do + if version_le "${CUDA_VERSION}" "11.6" ; then + nvsmi -i "${GPU_ID}" --multi-instance-gpu=1 + else + nvsmi -i "${GPU_ID}" --multi-instance-gpu 1 + fi + done + fi + if test -n "$(nvsmi -L)" ; then + # cache the result of the gpu query + ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') + echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt" + chmod a+r "/var/run/nvidia-gpu-index.txt" + fi for svc in resourcemanager nodemanager; do if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then systemctl start "hadoop-yarn-${svc}.service" fi done clear_nvsmi_cache + # Start persistenced if it's not already running + if ! ( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi mark_complete enable-mig } diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 565ec3ba0..eea7b3dd5 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -200,10 +200,11 @@ function prepare_gpu_env(){ readonly DEFAULT_RAPIDS_RUNTIME='SPARK' fi - # Verify SPARK compatability - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}") + # Set variables from metadata + RAPIDS_RUNTIME="$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")" INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")" - readonly RAPIDS_RUNTIME INCLUDE_GPUS + INCLUDE_PYTORCH="$(get_metadata_attribute 'include-pytorch' 'no')" + readonly RAPIDS_RUNTIME INCLUDE_GPUS INCLUDE_PYTORCH # determine whether we have nvidia-smi installed and working nvsmi From 0c3eb5162580d6e15692f1b2d2c3f7aa5ea9dd80 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 18 Jan 2025 21:35:48 -0800 Subject: [PATCH 128/130] create function to harden sshd config ; execute it before repairing old backports --- templates/common/util_functions | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index 9c7bfeba9..b99387d79 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -490,6 +490,24 @@ function prepare_conda_env() { fi } +function harden_sshd_config() { + # disable sha1 use in kex and kex-gss features + declare -rA feature_map=(["kex"]="kexalgorithms" ["kex-gss"]="gssapikexalgorithms") + for ftr in "${!feature_map[@]}" ; do + export feature=${feature_map[$ftr]} + sshd_config_line=$( + (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g'; + ssh -Q "${ftr}" ) \ + | sort | uniq | grep -iv sha1 | perl -e '@a=; + print("$ENV{feature} ",join(q",",map{ chomp; $_ }@a), $/) if @a') + grep -v "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new + echo "$sshd_config_line" >> /tmp/sshd_config_new + # TODO: test whether sshd will reload with this change before mv + mv /tmp/sshd_config_new /etc/ssh/sshd_config + done + systemctl reload ssh +} + function prepare_common_env() { SPARK_NLP_VERSION="3.2.1" # Must include subminor version here SPARK_JARS_DIR=/usr/lib/spark/jars @@ -550,9 +568,10 @@ function prepare_common_env() { is_complete prepare.common && return - repair_old_backports + harden_sshd_config if is_debuntu ; then + repair_old_backports clean_up_sources_lists apt-get update -qq --allow-releaseinfo-change apt-get -y clean From 576bbb61036e84531b37f324ce3482271d5cd0a6 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 22 Jan 2025 16:40:01 -0800 Subject: [PATCH 129/130] reviewed #1275 and brought closer to parity --- templates/common/util_functions | 22 +++--- templates/generate-action.pl | 6 +- templates/gpu/install_functions | 134 ++++++++++++++------------------ templates/gpu/util_functions | 74 +++++++++--------- templates/gpu/yarn_functions | 3 +- 5 files changed, 114 insertions(+), 125 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index b99387d79..aeea8a294 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -466,11 +466,11 @@ function install_dependencies() { } function prepare_pip_env() { - # Clear pip cache - # TODO: make this conditional on which OSs have pip without cache purge - test -d "${workdir}/python-venv" || python3 -m venv "${workdir}/python-venv" + test -d "${workdir}/python-venv" || /opt/conda/miniconda3/bin/python3 -m venv "${workdir}/python-venv" source "${workdir}/python-venv/bin/activate" + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge pip cache purge || echo "unable to purge pip cache" if is_ramdisk ; then # Download pip packages to tmpfs @@ -491,21 +491,25 @@ function prepare_conda_env() { } function harden_sshd_config() { - # disable sha1 use in kex and kex-gss features - declare -rA feature_map=(["kex"]="kexalgorithms" ["kex-gss"]="gssapikexalgorithms") + # disable sha1 and md5 use in kex and kex-gss features + declare -A feature_map=(["kex"]="kexalgorithms") + if ( is_rocky || version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ) ; then + feature_map["kex-gss"]="gssapikexalgorithms" ; fi for ftr in "${!feature_map[@]}" ; do export feature=${feature_map[$ftr]} sshd_config_line=$( (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g'; ssh -Q "${ftr}" ) \ - | sort | uniq | grep -iv sha1 | perl -e '@a=; - print("$ENV{feature} ",join(q",",map{ chomp; $_ }@a), $/) if @a') - grep -v "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new + | sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}; + print("$ENV{feature} ",join(",",map{ chomp; $_ }@a), $/) if "@a"') + grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new echo "$sshd_config_line" >> /tmp/sshd_config_new # TODO: test whether sshd will reload with this change before mv mv /tmp/sshd_config_new /etc/ssh/sshd_config done - systemctl reload ssh + local svc=ssh + if is_rocky ; then svc="sshd" ; fi + systemctl reload "${svc}" } function prepare_common_env() { diff --git a/templates/generate-action.pl b/templates/generate-action.pl index 2e1d344ff..334d6ecac 100644 --- a/templates/generate-action.pl +++ b/templates/generate-action.pl @@ -8,7 +8,7 @@ use strict; # Version of Initialization Actions we will generate -my $IA_VERSION="0.0.1"; +my $IA_VERSION="0.1.1"; my $action = $ARGV[0]; my $v = { @@ -22,7 +22,7 @@ sub usage{ This script evaluates a template to generate an initialization action. The output is printed to STDOUT. -Action templates reside under templates/$action and end in .sh.in +Action templates reside under templates/\${action}.in The argument is the destination action name, not the source. EOF @@ -34,7 +34,7 @@ sub usage{ my $tt = Template->new( { INCLUDE_PATH => "$ENV{PWD}/templates", - VARIABLES => $v, + VARIABLES => $v, INTERPOLATE => 0, }) || die "$Template::ERROR$/"; diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions index 8effce9b4..68183bc1f 100644 --- a/templates/gpu/install_functions +++ b/templates/gpu/install_functions @@ -4,16 +4,15 @@ function set_cudnn_version() { readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" # Parameters for NVIDIA-provided cuDNN library - DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} - readonly DEFAULT_CUDNN_VERSION + readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION} if ( is_rocky && version_le "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}" - elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then + elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then # cuDNN v8 is not distribution for ubuntu20+, debian12 CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" - elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then + elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 CUDNN_VERSION="8.8.0.121" fi @@ -106,7 +105,7 @@ function set_cuda_runfile_url() { ["10.1.234"]="418.87.00" ["10.2.89"]="440.33.01" ["11.0.3"]="450.51.06" - ["11.1.1"]="455.42.00" + ["11.1.1"]="455.32.00" ["11.2.2"]="460.32.03" ["11.3.1"]="465.19.01" ["11.4.4"]="470.82.01" @@ -130,17 +129,21 @@ function set_cuda_runfile_url() { local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") - readonly NVIDIA_CUDA_URL - - CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" - readonly CUDA_RUNFILE # version naming and archive url were erratic prior to 11.0.3 - if ( version_ge "${CUDA_FULL_VERSION}" "11.0.3" && ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ) ; then + if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" + if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then + echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead" + fi exit 1 fi + readonly NVIDIA_CUDA_URL + + CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" + readonly CUDA_RUNFILE + if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then @@ -152,50 +155,24 @@ function set_cuda_runfile_url() { fi } -function set_cudnn_tarball_url() { -CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" -CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" -if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then - # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" - if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then - # When cuDNN version is greater than or equal to 8.4.1.50 use this format - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" - fi - # Use legacy url format with one of the tarball name formats depending on version as above - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" -fi -if ( version_ge "${CUDA_VERSION}" "12.0" ); then - # Use modern url format When cuda version is greater than or equal to 12.0 - CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" -fi -readonly CUDNN_TARBALL -readonly CUDNN_TARBALL_URL -} - function install_cuda_keyring_pkg() { - if ( test -v CUDA_KEYRING_PKG_INSTALLED && - [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi + is_complete cuda-keyring-installed && return local kr_ver=1.1 curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ -o "${tmpdir}/cuda-keyring.deb" dpkg -i "${tmpdir}/cuda-keyring.deb" rm -f "${tmpdir}/cuda-keyring.deb" - CUDA_KEYRING_PKG_INSTALLED="1" + mark_complete cuda-keyring-installed } function uninstall_cuda_keyring_pkg() { apt-get purge -yq cuda-keyring - CUDA_KEYRING_PKG_INSTALLED="0" + mark_incomplete cuda-keyring-installed } function install_local_cuda_repo() { is_complete install-local-cuda-repo && return - - if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi - CUDA_LOCAL_REPO_INSTALLED="1" pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" @@ -219,7 +196,7 @@ function install_local_cuda_repo() { } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cuda-repo" + mark_incomplete install-local-cuda-repo } function install_local_cudnn_repo() { @@ -268,7 +245,7 @@ function install_local_cudnn8_repo() { # cache the cudnn package cache_fetched_package "${local_deb_url}" \ - "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ + "${pkg_bucket}/nvidia/cudnn/${CUDNN8_CUDA_VER}/${deb_fn}" \ "${local_deb_fn}" local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" @@ -322,41 +299,42 @@ function install_nvidia_nccl() { test -d "${workdir}/nccl/build" || { local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" - - # https://github.com/NVIDIA/nccl/blob/master/README.md - # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Fermi: SM_20, compute_30 - # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 - # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 - # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 - - # The following architectures are suppored by open kernel driver - # Volta: SM_70,SM_72, compute_70,compute_72 - # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 - - # The following architectures are supported by CUDA v11.8+ - # Ada: SM_89, compute_89 - # Hopper: SM_90,SM_90a compute_90,compute_90a - # Blackwell: SM_100, compute_100 - NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86" - if version_gt "${CUDA_VERSION}" "11.6" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi - if version_ge "${CUDA_VERSION}" "11.8" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi - if version_ge "${CUDA_VERSION}" "12.0" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi + local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}" output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') if echo "${output}" | grep -q "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" + gcloud storage cat "${gcs_tarball}" | tar xvz else # build and cache pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install install_build_dependencies + # https://github.com/NVIDIA/nccl/blob/master/README.md + # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Fermi: SM_20, compute_30 + # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 + # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 + # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 + + # The following architectures are suppored by open kernel driver + # Volta: SM_70,SM_72, compute_70,compute_72 + # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 + + # The following architectures are supported by CUDA v11.8+ + # Ada: SM_89, compute_89 + # Hopper: SM_90,SM_90a compute_90,compute_90a + # Blackwell: SM_100, compute_100 + NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86" + if version_gt "${CUDA_VERSION}" "11.6" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi + if version_ge "${CUDA_VERSION}" "11.8" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi + if version_ge "${CUDA_VERSION}" "12.0" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi + if is_debuntu ; then # These packages are required to build .deb packages from source execute_with_retries \ @@ -391,8 +369,8 @@ function install_nvidia_nccl() { } function install_nvidia_cudnn() { + if le_debian10 ; then return ; fi is_complete cudnn && return - local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version @@ -427,9 +405,10 @@ function install_nvidia_cudnn() { apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" - sync + + sync elif is_cudnn9 ; then - install_cuda_keyring_pkg + install_cuda_keyring_pkg apt-get update -qq @@ -438,7 +417,8 @@ function install_nvidia_cudnn() { "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" - sync + + sync else echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi @@ -462,8 +442,6 @@ function install_pytorch() { local envpath="${mc3}/envs/${env}" # Set numa node to 0 for all GPUs for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done - local verb=create - if test -d "${envpath}" ; then verb=install ; fi readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no') case "${INCLUDE_PYTORCH^^}" in @@ -479,6 +457,8 @@ function install_pytorch() { mkdir -p "${envpath}" gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz else + local verb=create + if test -d "${envpath}" ; then verb=install ; fi cudart_spec="cuda-cudart" if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi "${mc3}/bin/mamba" "${verb}" -n "${env}" \ @@ -536,7 +516,7 @@ function add_repo_cuda() { function build_driver_from_github() { # non-GPL driver will have been built on rocky8 or if driver version is prior to open kernel version - if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then return 0 ; fi + if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then return 0 ; fi pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { @@ -554,7 +534,7 @@ function build_driver_from_github() { local def_dir="${modulus_md5sum:-unsigned}" local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}") - local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then echo "cache hit" @@ -618,6 +598,7 @@ function build_driver_from_packages() { add_contrib_component apt-get update -qq execute_with_retries apt-get install -y -qq --no-install-recommends dkms + configure_dkms_certs execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" sync @@ -629,6 +610,7 @@ function build_driver_from_packages() { fi sync fi + clear_dkms_key } function install_nvidia_userspace_runfile() { @@ -708,7 +690,7 @@ function install_nvidia_userspace_runfile() { --install-libglvnd \ --tmpdir="${tmpdir}" - if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "515.43.04" ) ; then + if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then if [[ "${cache_hit}" == "1" ]] ; then gcloud storage cat "${gcs_tarball}" | tar -C / -xzv depmod -a @@ -732,7 +714,7 @@ function install_cuda_runfile() { local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ - "${pkg_bucket}/${CUDA_RUNFILE}" \ + "${pkg_bucket}/${CUDA_RUNFILE}" \ "${local_fn}" execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index eea7b3dd5..69d55a2cb 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -7,50 +7,52 @@ function set_support_matrix() { # Minimum supported version for open kernel driver is 515.43.04 # https://github.com/NVIDIA/open-gpu-kernel-modules/tags local latest - latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" + latest="$(curl -s https://us.download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" readonly -A DRIVER_FOR_CUDA=( - ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01" - ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31" - ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03" - ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" - ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08" - ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01" + ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31" + ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03" + ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" + ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08" + ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) readonly -A DRIVER_SUBVER=( - ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" ["430"]="430.64" - ["435"]="435.21" ["440"]="440.100" ["450"]="450.119.03" - ["455"]="455.45.01" ["460"]="460.91.03" ["465"]="465.31" - ["470"]="470.256.02" ["495"]="495.46" ["510"]="510.108.03" - ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" - ["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.142" - ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.77" + ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" + ["430"]="430.64" ["435"]="435.21" ["440"]="440.100" + ["450"]="450.119.03" ["455"]="455.45.01" ["460"]="460.91.03" + ["465"]="465.31" ["470"]="470.256.02" ["495"]="495.46" + ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05" + ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06" + ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03" + ["565"]="565.77" ) # https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( - ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5" ["11.0"]="8.0.4" - ["11.1"]="8.0.5" ["11.2"]="8.1.1" ["11.3"]="8.2.1" ["11.4"]="8.2.4.15" - ["11.5"]="8.3.1.22" ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" - ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" - ["12.2"]="8.9.5" ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" - ["12.5"]="9.2.1.18" ["12.6"]="9.6.0.74" + ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5" + ["11.0"]="8.0.4" ["11.1"]="8.0.5" ["11.2"]="8.1.1" + ["11.3"]="8.2.1" ["11.4"]="8.2.4.15" ["11.5"]="8.3.1.22" + ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5" + ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18" + ["12.6"]="9.6.0.74" ) # https://developer.nvidia.com/nccl/nccl-download readonly -A NCCL_FOR_CUDA=( - ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3" - ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4" ["11.5"]="2.11.4" - ["11.6"]="2.12.10" ["11.7"]="2.12.12" ["11.8"]="2.21.5" - ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.2"]="2.19.3" - ["12.3"]="2.19.4" ["12.4"]="2.23.4" ["12.5"]="2.22.3" - ["12.6"]="2.23.4" + ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3" + ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4" + ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12" + ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3" + ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4" + ["12.5"]="2.22.3" ["12.6"]="2.23.4" ) readonly -A CUDA_SUBVER=( - ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89" - ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2" - ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2" - ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0" - ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" - ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" - ["12.6"]="12.6.3" + ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89" + ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2" + ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2" + ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0" + ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" + ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" + ["12.6"]="12.6.3" ) } @@ -71,7 +73,7 @@ function set_cuda_version() { local CUDA_URL_VERSION CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then - DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" + DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION}" fi fi readonly DEFAULT_CUDA_VERSION @@ -114,10 +116,10 @@ function set_driver_version() { if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} - if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then + if curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then # use the version indicated by the cuda url as the default if it exists DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" - elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then + elif curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then # use the maximum sub-version available for the major version indicated in cuda url as the default DEFAULT_DRIVER="${driver_max_maj_version}" fi diff --git a/templates/gpu/yarn_functions b/templates/gpu/yarn_functions index d9040b1d6..c4194a2ea 100644 --- a/templates/gpu/yarn_functions +++ b/templates/gpu/yarn_functions @@ -52,6 +52,7 @@ EOF chmod a+rx "${gpus_resources_script}" local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" + if version_lt "${SPARK_VERSION}" "3.0" ; then return ; fi local executor_cores executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" @@ -75,9 +76,9 @@ EOF # query explain output won't show GPU operator, if the user has doubts # they can uncomment the line before seeing the GPU plan explain; # having AQE enabled gives user the best performance. +spark.executor.resource.gpu.discoveryScript=${gpus_resources_script} spark.executor.resource.gpu.amount=${gpu_count} spark.plugins=com.nvidia.spark.SQLPlugin -spark.executor.resource.gpu.discoveryScript=${gpus_resources_script} spark.executor.cores=${executor_cores} spark.executor.memory=${executor_memory_gb}G spark.dynamicAllocation.enabled=false From 989b445b20a2be99b22f169ab9e85f8def9be534 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 28 Jan 2025 17:39:25 -0800 Subject: [PATCH 130/130] changes from testing PR #1275 --- templates/common/util_functions | 7 ++- templates/gpu/install_functions | 96 +++++++++++++++++++++------------ templates/gpu/spark_functions | 5 +- templates/gpu/util_functions | 2 +- templates/gpu/yarn_functions | 30 ++++++++++- 5 files changed, 98 insertions(+), 42 deletions(-) diff --git a/templates/common/util_functions b/templates/common/util_functions index aeea8a294..42f01278b 100644 --- a/templates/common/util_functions +++ b/templates/common/util_functions @@ -7,9 +7,9 @@ function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | c # ( version_ge 2.0 2.1 ) evaluates to false # ( version_ge 2.2 2.1 ) evaluates to true function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; ) -function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) +function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge "$1" "$2" ; ) function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) -function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) +function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le "$1" "$2" ; ) function define_os_comparison_functions() { @@ -500,8 +500,7 @@ function harden_sshd_config() { sshd_config_line=$( (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g'; ssh -Q "${ftr}" ) \ - | sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}; - print("$ENV{feature} ",join(",",map{ chomp; $_ }@a), $/) if "@a"') + | sort -u | grep -v -ie sha1 -e md5 | paste -sd "," -) grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new echo "$sshd_config_line" >> /tmp/sshd_config_new # TODO: test whether sshd will reload with this change before mv diff --git a/templates/gpu/install_functions b/templates/gpu/install_functions index 68183bc1f..0ed0e8c8f 100644 --- a/templates/gpu/install_functions +++ b/templates/gpu/install_functions @@ -119,7 +119,7 @@ function set_cuda_runfile_url() { ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not - ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" + ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05" ) # Verify that the file with the indicated combination exists @@ -200,6 +200,7 @@ function uninstall_local_cuda_repo(){ } function install_local_cudnn_repo() { + # https://docs.nvidia.com/deeplearning/cudnn/sla/index.html is_complete install-local-cudnn-repo && return pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" @@ -368,6 +369,7 @@ function install_nvidia_nccl() { mark_complete nccl } +# https://docs.nvidia.com/deeplearning/cudnn/sla/index.html function install_nvidia_cudnn() { if le_debian10 ; then return ; fi is_complete cudnn && return @@ -435,45 +437,64 @@ function install_nvidia_cudnn() { } function install_pytorch() { - if test -f "${workdir}/complete/pytorch" ; then return ; fi + is_complete pytorch && return + local env env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce') local mc3=/opt/conda/miniconda3 local envpath="${mc3}/envs/${env}" + if [[ "${env}" == "base" ]]; then + echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi # Set numa node to 0 for all GPUs for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done - readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no') - case "${INCLUDE_PYTORCH^^}" in - "1" | "YES" | "TRUE" ) - local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" + local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" - output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') - if echo "${output}" | grep -q "${gcs_tarball}" ; then - # cache hit - unpack from cache - echo "cache hit" - mkdir -p "${envpath}" - gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz - else - local verb=create - if test -d "${envpath}" ; then verb=install ; fi - cudart_spec="cuda-cudart" - if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi - "${mc3}/bin/mamba" "${verb}" -n "${env}" \ - -c conda-forge -c nvidia -c rapidsai \ - numba pytorch tensorflow[and-cuda] rapids pyspark \ - "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" - pushd "${envpath}" - tar czf "${local_tarball}" . - popd - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - fi - ;; - * ) echo "skip pytorch install" ;; - esac - touch "${workdir}/complete/pytorch" + if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then + # do not build in tests with < 32 cores + sleep $(( ( RANDOM % 11 ) + 10 )) + while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do + sleep 5m + done + fi + + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + if echo "${output}" | grep -q "${gcs_tarball}" ; then + # cache hit - unpack from cache + echo "cache hit" + mkdir -p "${envpath}" + gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz + else + touch "${local_tarball}.building" + gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" + local verb=create + if test -d "${envpath}" ; then verb=install ; fi + cudart_spec="cuda-cudart" + if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi + + # Install pytorch and company to this environment + "${mc3}/bin/mamba" "${verb}" -n "${env}" \ + -c conda-forge -c nvidia -c rapidsai \ + numba pytorch tensorflow[and-cuda] rapids pyspark \ + "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" + + # Install jupyter kernel in this environment + "${envpath}/bin/python3" -m pip install ipykernel + + # package environment and cache in GCS + pushd "${envpath}" + tar czf "${local_tarball}" . + popd + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi + fi + + # register the environment as a selectable kernel + "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})" + + mark_complete pytorch } function add_nonfree_components() { @@ -508,7 +529,16 @@ function add_repo_nvidia_container_toolkit() { function add_repo_cuda() { if is_debuntu ; then - install_cuda_keyring_pkg # 11.7+, 12.0+ + if version_le "${CUDA_VERSION}" 11.6 ; then + local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg + local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" + echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ + | sudo tee "${sources_list_path}" + curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \ + -o "${kr_path}" + else + install_cuda_keyring_pkg # 11.7+, 12.0+ + fi elif is_rocky ; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" fi diff --git a/templates/gpu/spark_functions b/templates/gpu/spark_functions index fa29330de..731e01756 100644 --- a/templates/gpu/spark_functions +++ b/templates/gpu/spark_functions @@ -7,14 +7,15 @@ function download_spark_jar() { function install_spark_rapids() { # Update SPARK RAPIDS config - local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_SPARK_RAPIDS_VERSION + DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then - local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 + DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 fi readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions index 69d55a2cb..0270b41f3 100644 --- a/templates/gpu/util_functions +++ b/templates/gpu/util_functions @@ -60,7 +60,7 @@ function set_cuda_version() { case "${DATAPROC_IMAGE_VERSION}" in "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; - "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; + "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;; * ) echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" exit 1 diff --git a/templates/gpu/yarn_functions b/templates/gpu/yarn_functions index c4194a2ea..d7accf8f1 100644 --- a/templates/gpu/yarn_functions +++ b/templates/gpu/yarn_functions @@ -12,6 +12,25 @@ function configure_yarn_gpu_resources() { 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' + + # Older CapacityScheduler does not permit use of gpu resources ; switch to FairScheduler on 2.0 and below + if version_lt "${DATAPROC_IMAGE_VERSION}" "2.1" ; then + fs_xml="$HADOOP_CONF_DIR/fair-scheduler.xml" + set_hadoop_property 'yarn-site.xml' \ + 'yarn.resourcemanager.scheduler.class' 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler' + set_hadoop_property 'yarn-site.xml' \ + "yarn.scheduler.fair.user-as-default-queue" "false" + set_hadoop_property 'yarn-site.xml' \ + "yarn.scheduler.fair.allocation.file" "${fs_xml}" + set_hadoop_property 'yarn-site.xml' \ + 'yarn.scheduler.fair.resource-calculator' 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + cat > "${fs_xml}" < + + 1 + +EOF + fi } function configure_gpu_script() { @@ -44,9 +63,15 @@ function configure_gpu_script() { # # Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]} +set -e +resources_json="/dev/shm/nvidia/gpusResources.json" +if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi + +mkdir -p "$(dirname ${resources_json})" + ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') -echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} +echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}" EOF chmod a+rx "${gpus_resources_script}" @@ -78,7 +103,6 @@ EOF # having AQE enabled gives user the best performance. spark.executor.resource.gpu.discoveryScript=${gpus_resources_script} spark.executor.resource.gpu.amount=${gpu_count} -spark.plugins=com.nvidia.spark.SQLPlugin spark.executor.cores=${executor_cores} spark.executor.memory=${executor_memory_gb}G spark.dynamicAllocation.enabled=false @@ -86,6 +110,7 @@ spark.dynamicAllocation.enabled=false spark.task.resource.gpu.amount=${gpu_amount} spark.task.cpus=2 spark.yarn.unmanagedAM.enabled=false +spark.plugins=com.nvidia.spark.SQLPlugin ###### END : RAPIDS properties for Spark ${SPARK_VERSION} ###### EOF } @@ -97,6 +122,7 @@ function configure_yarn_nodemanager_gpu() { 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}" + configure_yarn_nodemanager }