@@ -183,6 +183,7 @@ readonly -A CUDA_SUBVER=(
183183
184184function set_cuda_version() {
185185 case " ${DATAPROC_IMAGE_VERSION} " in
186+ " 1.5" ) DEFAULT_CUDA_VERSION=" 11.6.2" ;;
186187 " 2.0" ) DEFAULT_CUDA_VERSION=" 12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
187188 " 2.1" ) DEFAULT_CUDA_VERSION=" 12.4.1" ;;
188189 " 2.2" ) DEFAULT_CUDA_VERSION=" 12.6.3" ;;
@@ -243,10 +244,10 @@ function set_driver_version() {
243244 if [[ " ${CUDA_URL_DRIVER_VERSION} " =~ ^[0-9]+.* [0-9]$ ]] ; then
244245 major_driver_version=" ${CUDA_URL_DRIVER_VERSION%% .* } "
245246 driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
246- if curl ${curl_retry_args} --head " ${nv_xf86_x64_base} /${CUDA_URL_DRIVER_VERSION} /NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION} .run" | grep -E -q ' ^ HTTP.*200\s*$ ' ; then
247+ if curl ${curl_retry_args} --head " ${nv_xf86_x64_base} /${CUDA_URL_DRIVER_VERSION} /NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION} .run" | grep -E -q ' HTTP.*200' ; then
247248 # use the version indicated by the cuda url as the default if it exists
248249 DEFAULT_DRIVER=" ${CUDA_URL_DRIVER_VERSION} "
249- elif curl ${curl_retry_args} --head " ${nv_xf86_x64_base} /${driver_max_maj_version} /NVIDIA-Linux-x86_64-${driver_max_maj_version} .run" | grep -E -q ' ^ HTTP.*200\s*$ ' ; then
250+ elif curl ${curl_retry_args} --head " ${nv_xf86_x64_base} /${driver_max_maj_version} /NVIDIA-Linux-x86_64-${driver_max_maj_version} .run" | grep -E -q ' HTTP.*200' ; then
250251 # use the maximum sub-version available for the major version indicated in cuda url as the default
251252 DEFAULT_DRIVER=" ${driver_max_maj_version} "
252253 fi
@@ -266,7 +267,7 @@ function set_driver_version() {
266267 export DRIVER_VERSION DRIVER
267268
268269 gpu_driver_url=" ${nv_xf86_x64_base} /${DRIVER_VERSION} /NVIDIA-Linux-x86_64-${DRIVER_VERSION} .run"
269- if ! curl ${curl_retry_args} --head " ${gpu_driver_url} " | grep -E -q ' ^ HTTP.*200\s*$ ' ; then
270+ if ! curl ${curl_retry_args} --head " ${gpu_driver_url} " | grep -E -q ' HTTP.*200' ; then
270271 echo " No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION} "
271272 exit 1
272273 fi
@@ -398,7 +399,7 @@ function set_cuda_runfile_url() {
398399
399400 NVIDIA_CUDA_URL=$( get_metadata_attribute ' cuda-url' " ${DEFAULT_NVIDIA_CUDA_URL} " )
400401
401- if ! curl ${curl_retry_args} --head " ${NVIDIA_CUDA_URL} " | grep -E -q ' ^ HTTP.*200\s*$ ' ; then
402+ if ! curl ${curl_retry_args} --head " ${NVIDIA_CUDA_URL} " | grep -E -q ' HTTP.*200' ; then
402403 echo " No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver} , CUDA_VERSION=${CUDA_FULL_VERSION} "
403404 if [[ " ${DEFAULT_NVIDIA_CUDA_URL} " != " ${NVIDIA_CUDA_URL} " ]]; then
404405 echo " consider [${DEFAULT_NVIDIA_CUDA_URL} ] instead"
@@ -628,16 +629,13 @@ function install_nvidia_nccl() {
628629 if [[ " $( hostname -s) " =~ ^test && " $( nproc) " < 32 ]] ; then
629630 # when running with fewer than 32 cores, yield to in-progress build
630631 sleep $(( ( RANDOM % 11 ) + 10 ))
631- if ${gsutil_cmd} ls " ${gcs_tarball} .building" ; then
632+ local output=" $( ${gsutil_stat_cmd} " ${gcs_tarball} .building" ) "
633+ if [[ " $? " == " 0" ]] ; then
632634 local build_start_time build_start_epoch timeout_epoch
633- if [[ " ${gsutil_cmd} " =~ " gsutil" ]] ; then
634- build_start_time=" $( ${gsutil_cmd} ls -L " ${gcs_tarball} .building" | awk -F' : +' ' /Creation time/ {print $2}' ) "
635- else
636- build_start_time=" $( ${gsutil_cmd} ls -j " ${gcs_tarball} .building" | jq -r .[0].metadata.timeCreated " ${local_tarball} .building.json" ) "
637- fi
638- build_start_epoch=" $( date -d " ${build_start_time} " +%s) "
635+ build_start_time=" $( echo ${output} | awk -F' : +' ' /.reation.time/ {print $2}' ) "
636+ build_start_epoch=" $( date -u -d " ${build_start_time} " +%s) "
639637 timeout_epoch=$(( build_start_epoch + 2700 )) # 45 minutes
640- while ${gsutil_cmd} ls -L " ${gcs_tarball} .building" ; do
638+ while ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; do
641639 local now_epoch=" $( date -u +%s) "
642640 if (( now_epoch > timeout_epoch )) ; then
643641 # detect unexpected build failure after 45m
@@ -649,8 +647,7 @@ function install_nvidia_nccl() {
649647 fi
650648 fi
651649
652- output=$( ${gsutil_cmd} ls " ${gcs_tarball} " 2>&1 || echo ' ' )
653- if echo " ${output} " | grep -q " ${gcs_tarball} " ; then
650+ if ${gsutil_stat_cmd} " ${gcs_tarball} " ; then
654651 # cache hit - unpack from cache
655652 echo " cache hit"
656653 ${gsutil_cmd} cat " ${gcs_tarball} " | tar xvz
@@ -705,7 +702,7 @@ function install_nvidia_nccl() {
705702 popd
706703 tar xzvf " ${local_tarball} "
707704 ${gsutil_cmd} cp " ${local_tarball} " " ${gcs_tarball} "
708- if ${gsutil_cmd} ls " ${gcs_tarball} .building" ; then ${gsutil_cmd} rm " ${gcs_tarball} .building" || true ; fi
705+ if ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; then ${gsutil_cmd} rm " ${gcs_tarball} .building" || true ; fi
709706 building_file=" "
710707 rm " ${local_tarball} "
711708 fi
@@ -796,6 +793,7 @@ function install_pytorch() {
796793 local env
797794 env=$( get_metadata_attribute ' gpu-conda-env' ' dpgce' )
798795 local mc3=/opt/conda/miniconda3
796+ [[ -d ${mc3} ]] || return
799797 local envpath=" ${mc3} /envs/${env} "
800798 if [[ " ${env} " == " base" ]]; then
801799 echo " WARNING: installing to base environment known to cause solve issues" ; envpath=" ${mc3} " ; fi
@@ -809,16 +807,12 @@ function install_pytorch() {
809807 if [[ " $( hostname -s) " =~ ^test && " $( nproc) " < 32 ]] ; then
810808 # when running with fewer than 32 cores, yield to in-progress build
811809 sleep $(( ( RANDOM % 11 ) + 10 ))
812- if ${gsutil_cmd} ls -j " ${gcs_tarball} .building" > " ${local_tarball} .building.json " ; then
810+ if ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; then
813811 local build_start_time build_start_epoch timeout_epoch
814- if [[ " ${gsutil_cmd} " =~ " gsutil" ]] ; then
815- build_start_time=" $( ${gsutil_cmd} ls -L " ${gcs_tarball} .building" | awk -F' : +' ' /Creation time/ {print $2}' ) "
816- else
817- build_start_time=" $( ${gsutil_cmd} ls -j " ${gcs_tarball} .building" | jq -r .[0].metadata.timeCreated " ${local_tarball} .building.json" ) "
818- fi
819- build_start_epoch=" $( date -d " ${build_start_time} " +%s) "
812+ build_start_time=" $( ${gsutil_stat_cmd} " ${gcs_tarball} .building" | awk -F' : +' ' /.reation.time/ {print $2}' ) "
813+ build_start_epoch=" $( date -u -d " ${build_start_time} " +%s) "
820814 timeout_epoch=$(( build_start_epoch + 2700 )) # 45 minutes
821- while ${gsutil_cmd} ls -L " ${gcs_tarball} .building" ; do
815+ while ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; do
822816 local now_epoch=" $( date -u +%s) "
823817 if (( now_epoch > timeout_epoch )) ; then
824818 # detect unexpected build failure after 45m
@@ -830,8 +824,7 @@ function install_pytorch() {
830824 fi
831825 fi
832826
833- output=$( ${gsutil_cmd} ls " ${gcs_tarball} " 2>&1 || echo ' ' )
834- if echo " ${output} " | grep -q " ${gcs_tarball} " ; then
827+ if ${gsutil_stat_cmd} " ${gcs_tarball} " ; then
835828 # cache hit - unpack from cache
836829 echo " cache hit"
837830 mkdir -p " ${envpath} "
@@ -859,7 +852,7 @@ function install_pytorch() {
859852 tar czf " ${local_tarball} " .
860853 popd
861854 ${gsutil_cmd} cp " ${local_tarball} " " ${gcs_tarball} "
862- if ${gsutil_cmd} ls " ${gcs_tarball} .building" ; then ${gsutil_cmd} rm " ${gcs_tarball} .building" || true ; fi
855+ if ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; then ${gsutil_cmd} rm " ${gcs_tarball} .building" || true ; fi
863856 building_file=" "
864857 fi
865858
@@ -1040,16 +1033,12 @@ function build_driver_from_github() {
10401033 if [[ " $( hostname -s) " =~ ^test && " $( nproc) " < 32 ]] ; then
10411034 # when running with fewer than 32 cores, yield to in-progress build
10421035 sleep $(( ( RANDOM % 11 ) + 10 ))
1043- if ${gsutil_cmd} ls -j " ${gcs_tarball} .building" > " ${local_tarball} .building.json " ; then
1036+ if ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; then
10441037 local build_start_time build_start_epoch timeout_epoch
1045- if [[ " ${gsutil_cmd} " =~ " gsutil" ]] ; then
1046- build_start_time=" $( ${gsutil_cmd} ls -L " ${gcs_tarball} .building" | awk -F' : +' ' /Creation time/ {print $2}' ) "
1047- else
1048- build_start_time=" $( ${gsutil_cmd} ls -j " ${gcs_tarball} .building" | jq -r .[0].metadata.timeCreated " ${local_tarball} .building.json" ) "
1049- fi
1050- build_start_epoch=" $( date -d " ${build_start_time} " +%s) "
1038+ build_start_time=" $( ${gsutil_stat_cmd} " ${gcs_tarball} .building" | awk -F' : +' ' /.reation.time/ {print $2}' ) "
1039+ build_start_epoch=" $( date -u -d " ${build_start_time} " +%s) "
10511040 timeout_epoch=$(( build_start_epoch + 2700 )) # 45 minutes
1052- while ${gsutil_cmd} ls -L " ${gcs_tarball} .building" ; do
1041+ while ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; do
10531042 local now_epoch=" $( date -u +%s) "
10541043 if (( now_epoch > timeout_epoch )) ; then
10551044 # detect unexpected build failure after 45m
@@ -1061,7 +1050,7 @@ function build_driver_from_github() {
10611050 fi
10621051 fi
10631052
1064- if ${gsutil_cmd} ls " ${gcs_tarball} " 2>&1 ; then
1053+ if ${gsutil_stat_cmd} " ${gcs_tarball} " 2>&1 ; then
10651054 echo " cache hit"
10661055 else
10671056 # build the kernel modules
@@ -1096,7 +1085,7 @@ function build_driver_from_github() {
10961085 " ${workdir} /open-gpu-kernel-modules/kernel-open/" * .log \
10971086 $( find /lib/modules/${uname_r} / -iname ' nvidia*.ko' )
10981087 ${gsutil_cmd} cp " ${local_tarball} " " ${gcs_tarball} "
1099- if ${gsutil_cmd} ls " ${gcs_tarball} .building" ; then ${gsutil_cmd} rm " ${gcs_tarball} .building" || true ; fi
1088+ if ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; then ${gsutil_cmd} rm " ${gcs_tarball} .building" || true ; fi
11001089 building_file=" "
11011090 rm " ${local_tarball} "
11021091 make clean
@@ -1195,16 +1184,13 @@ function install_nvidia_userspace_runfile() {
11951184 if [[ " $( hostname -s) " =~ ^test && " $( nproc) " < 32 ]] ; then
11961185 # when running with fewer than 32 cores, yield to in-progress build
11971186 sleep $(( ( RANDOM % 11 ) + 10 ))
1198- if ${gsutil_cmd} ls -j " ${gcs_tarball} .building" > " ${local_tarball} .building.json" ; then
1187+ local output=" $( ${gsutil_stat_cmd} " ${gcs_tarball} .building" ) "
1188+ if [[ $? == " 0" ]] ; then
11991189 local build_start_time build_start_epoch timeout_epoch
1200- if [[ " ${gsutil_cmd} " =~ " gsutil" ]] ; then
1201- build_start_time=" $( ${gsutil_cmd} ls -L " ${gcs_tarball} .building" | awk -F' : +' ' /Creation time/ {print $2}' ) "
1202- else
1203- build_start_time=" $( ${gsutil_cmd} ls -j " ${gcs_tarball} .building" | jq -r .[0].metadata.timeCreated " ${local_tarball} .building.json" ) "
1204- fi
1205- build_start_epoch=" $( date -d " ${build_start_time} " +%s) "
1190+ build_start_time=" $( echo ${output} | awk -F' : +' ' /.reation.time/ {print $2}' ) "
1191+ build_start_epoch=" $( date -u -d " ${build_start_time} " +%s) "
12061192 timeout_epoch=$(( build_start_epoch + 2700 )) # 45 minutes
1207- while ${gsutil_cmd} ls -L " ${gcs_tarball} .building" ; do
1193+ while ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; do
12081194 local now_epoch=" $( date -u +%s) "
12091195 if (( now_epoch > timeout_epoch )) ; then
12101196 # detect unexpected build failure after 45m
@@ -1216,7 +1202,7 @@ function install_nvidia_userspace_runfile() {
12161202 fi
12171203 fi
12181204
1219- if ${gsutil_cmd} ls " ${gcs_tarball} " ; then
1205+ if ${gsutil_stat_cmd} " ${gcs_tarball} " ; then
12201206 cache_hit=" 1"
12211207 if version_ge " ${DRIVER_VERSION} " " ${MIN_OPEN_DRIVER_VER} " ; then
12221208 runfile_args=" ${runfile_args} --no-kernel-modules"
@@ -1268,7 +1254,8 @@ function install_nvidia_userspace_runfile() {
12681254 /var/log/nvidia-installer.log \
12691255 $( find /lib/modules/${uname_r} / -iname ' nvidia*.ko' )
12701256 ${gsutil_cmd} cp " ${local_tarball} " " ${gcs_tarball} "
1271- if ${gsutil_cmd} ls " ${gcs_tarball} .building" ; then ${gsutil_cmd} rm " ${gcs_tarball} .building" || true ; fi
1257+
1258+ if ${gsutil_stat_cmd} " ${gcs_tarball} .building" ; then ${gsutil_cmd} rm " ${gcs_tarball} .building" || true ; fi
12721259 building_file=" "
12731260 fi
12741261 fi
@@ -1427,7 +1414,9 @@ function install_gpu_agent() {
14271414 | sed -e ' s/-u --format=/--format=/' \
14281415 | dd status=none of=" ${install_dir} /report_gpu_metrics.py"
14291416 local venv=" ${install_dir} /venv"
1430- /opt/conda/miniconda3/bin/python3 -m venv " ${venv} "
1417+ python_interpreter=" /opt/conda/miniconda3/bin/python3"
1418+ [[ -f " ${python_interpreter} " ]] || python_interpreter=" $( command -v python3) "
1419+ " ${python_interpreter} " -m venv " ${venv} "
14311420(
14321421 source " ${venv} /bin/activate"
14331422 python3 -m pip install --upgrade pip
@@ -1755,13 +1744,17 @@ function prepare_gpu_env(){
17551744 pci_device_id=" $( grep -h -i PCI_ID=10DE /sys/bus/pci/devices/* /uevent | head -1 | awk -F: ' {print $2}' ) "
17561745 pci_device_id_int=" $(( 16 #${pci_device_id} )) "
17571746 case " ${pci_device_id} " in
1758- " 15F8" ) gpu_type=" nvidia-tesla-p100" ;;
1759- " 1BB3" ) gpu_type=" nvidia-tesla-p4" ;;
1760- " 1DB1" ) gpu_type=" nvidia-tesla-v100" ;;
1761- " 1EB8" ) gpu_type=" nvidia-tesla-t4" ;;
1762- " 20*" ) gpu_type=" nvidia-tesla-a100" ;;
1763- " 23*" ) gpu_type=" nvidia-h100" ;; # install does not begin with image 2.0.68-debian10/cuda11.1
1764- " 27B8" ) gpu_type=" nvidia-l4" ;; # install does not complete with image 2.0.68-debian10/cuda11.1
1747+ " 15F8" ) gpu_type=" nvidia-tesla-p100" ;;
1748+ " 1BB3" ) gpu_type=" nvidia-tesla-p4" ;;
1749+ " 1DB1" ) gpu_type=" nvidia-tesla-v100" ;;
1750+ " 1EB8" ) gpu_type=" nvidia-tesla-t4" ;;
1751+ " 20B2" | \
1752+ " 20B5" | \
1753+ " 20F3" | \
1754+ " 20F5" ) gpu_type=" nvidia-tesla-a100-80gb" ;;
1755+ " 20*" ) gpu_type=" nvidia-tesla-a100" ;;
1756+ " 23*" ) gpu_type=" nvidia-h100" ;; # NB: install does not begin with legacy image 2.0.68-debian10/cuda11.1
1757+ " 27B8" ) gpu_type=" nvidia-l4" ;; # NB: install does not complete with legacy image 2.0.68-debian10/cuda11.1
17651758 esac
17661759
17671760 ACCELERATOR=" type=${gpu_type} ,count=${gpu_count} "
@@ -1929,7 +1922,7 @@ function cache_fetched_package() {
19291922 local gcs_fn=" $2 "
19301923 local local_fn=" $3 "
19311924
1932- if ${gsutil_cmd} ls " ${gcs_fn} " 2>&1 ; then
1925+ if ${gsutil_stat_cmd} " ${gcs_fn} " 2>&1 ; then
19331926 time ${gsutil_cmd} cp " ${gcs_fn} " " ${local_fn} "
19341927 else
19351928 time ( curl ${curl_retry_args} " ${src_url} " -o " ${local_fn} " && \
@@ -2048,7 +2041,7 @@ function exit_handler() {
20482041
20492042 # clean up incomplete build indicators
20502043 if test -n " ${building_file} " ; then
2051- if ${gsutil_cmd} ls " ${building_file} " ; then ${gsutil_cmd} rm " ${building_file} " || true ; fi
2044+ if ${gsutil_stat_cmd} " ${building_file} " ; then ${gsutil_cmd} rm " ${building_file} " || true ; fi
20522045 fi
20532046
20542047 set +ex
@@ -2180,7 +2173,9 @@ function mount_ramdisk(){
21802173 mount -t tmpfs tmpfs " ${tmpdir} "
21812174
21822175 # Download conda packages to tmpfs
2183- /opt/conda/miniconda3/bin/conda config --add pkgs_dirs " ${tmpdir} "
2176+ if [[ -f /opt/conda/miniconda3/bin/conda ]] ; then
2177+ /opt/conda/miniconda3/bin/conda config --add pkgs_dirs " ${tmpdir} "
2178+ fi
21842179
21852180 # Clear pip cache
21862181 # TODO: make this conditional on which OSs have pip without cache purge
@@ -2230,9 +2225,11 @@ function prepare_to_install(){
22302225 # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be
22312226 # used as a more performant replacement for `gsutil`
22322227 gsutil_cmd=" gcloud storage"
2228+ gsutil_stat_cmd=" gcloud storage objects describe"
22332229 gcloud_sdk_version=" $( gcloud --version | awk -F' SDK ' ' /Google Cloud SDK/ {print $2}' ) "
22342230 if version_lt " ${gcloud_sdk_version} " " 402.0.0" ; then
22352231 gsutil_cmd=" gsutil -o GSUtil:check_hashes=never"
2232+ gsutil_stat_cmd=" gsutil stat"
22362233 fi
22372234 curl_retry_args=" -fsSL --retry-connrefused --retry 10 --retry-max-time 30"
22382235
@@ -2302,7 +2299,7 @@ function check_os() {
23022299
23032300 SPARK_VERSION=" $( spark-submit --version 2>&1 | sed -n ' s/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) "
23042301 readonly SPARK_VERSION
2305- if version_lt " ${SPARK_VERSION} " " 3.1 " || \
2302+ if version_lt " ${SPARK_VERSION} " " 2.4 " || \
23062303 version_ge " ${SPARK_VERSION} " " 4.0" ; then
23072304 echo " Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
23082305 exit 1
@@ -2316,7 +2313,8 @@ function check_os() {
23162313 # When building custom-images, neither of the above variables
23172314 # are defined and we need to make a reasonable guess
23182315
2319- if version_lt " ${SPARK_VERSION} " " 3.2" ; then DATAPROC_IMAGE_VERSION=" 2.0"
2316+ if version_lt " ${SPARK_VERSION} " " 2.5" ; then DATAPROC_IMAGE_VERSION=" 1.5"
2317+ elif version_lt " ${SPARK_VERSION} " " 3.2" ; then DATAPROC_IMAGE_VERSION=" 2.0"
23202318 elif version_lt " ${SPARK_VERSION} " " 3.4" ; then DATAPROC_IMAGE_VERSION=" 2.1"
23212319 elif version_lt " ${SPARK_VERSION} " " 3.6" ; then DATAPROC_IMAGE_VERSION=" 2.2"
23222320 else echo " Unknown dataproc image version" ; exit 1 ; fi
@@ -2386,6 +2384,9 @@ function install_spark_rapids() {
23862384 # Update SPARK RAPIDS config
23872385 local DEFAULT_SPARK_RAPIDS_VERSION
23882386 DEFAULT_SPARK_RAPIDS_VERSION=" 24.08.1"
2387+ if version_ge " ${DATAPROC_IMAGE_VERSION} " " 2.2" ; then
2388+ DEFAULT_SPARK_RAPIDS_VERSION=" 25.02.1"
2389+ fi
23892390 local DEFAULT_XGBOOST_VERSION=" 1.7.6" # 2.1.3
23902391
23912392 # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
0 commit comments