Commit 1d9ec7f

Test GPU driver installer and cloud-sql-proxy actions with 1.5-debian10 (#1312)
Thank you Wendy & Axel for the reviews!
1 parent 46a53ed commit 1d9ec7f

File tree

2 files changed: 61 additions, 59 deletions

cloud-sql-proxy/cloud-sql-proxy.sh

Lines changed: 1 addition & 0 deletions
@@ -288,6 +288,7 @@ function get_java_property() {
 function get_dataproc_property() {
   local property_name=$1
   local property_value
+  [[ -f /etc/google-dataproc/dataproc.properties ]] || return
   property_value=$(get_java_property \
     /etc/google-dataproc/dataproc.properties "${property_name}")
   echo "${property_value}"

gpu/install_gpu_driver.sh

Lines changed: 60 additions & 59 deletions
@@ -183,6 +183,7 @@ readonly -A CUDA_SUBVER=(
 
 function set_cuda_version() {
   case "${DATAPROC_IMAGE_VERSION}" in
+    "1.5" ) DEFAULT_CUDA_VERSION="11.6.2" ;;
     "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
     "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
     "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
@@ -243,10 +244,10 @@ function set_driver_version() {
     if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
       major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
       driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
-      if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+      if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q 'HTTP.*200' ; then
         # use the version indicated by the cuda url as the default if it exists
         DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
-      elif curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+      elif curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q 'HTTP.*200' ; then
         # use the maximum sub-version available for the major version indicated in cuda url as the default
         DEFAULT_DRIVER="${driver_max_maj_version}"
       fi
@@ -266,7 +267,7 @@ function set_driver_version() {
   export DRIVER_VERSION DRIVER
 
   gpu_driver_url="${nv_xf86_x64_base}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-  if ! curl ${curl_retry_args} --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+  if ! curl ${curl_retry_args} --head "${gpu_driver_url}" | grep -E -q 'HTTP.*200' ; then
     echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
     exit 1
   fi
@@ -398,7 +399,7 @@ function set_cuda_runfile_url() {
 
   NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
 
-  if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+  if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q 'HTTP.*200' ; then
     echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
     if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then
       echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead"
@@ -628,16 +629,13 @@ function install_nvidia_nccl() {
   if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
     # when running with fewer than 32 cores, yield to in-progress build
     sleep $(( ( RANDOM % 11 ) + 10 ))
-    if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then
+    local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building")"
+    if [[ "$?" == "0" ]] ; then
       local build_start_time build_start_epoch timeout_epoch
-      if [[ "${gsutil_cmd}" =~ "gsutil" ]] ; then
-        build_start_time="$(${gsutil_cmd} ls -L "${gcs_tarball}.building" | awk -F': +' '/Creation time/ {print $2}')"
-      else
-        build_start_time="$(${gsutil_cmd} ls -j "${gcs_tarball}.building" | jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
-      fi
-      build_start_epoch="$(date -d "${build_start_time}" +%s)"
+      build_start_time="$(echo ${output} | awk -F': +' '/.reation.time/ {print $2}')"
+      build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
       timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
-      while ${gsutil_cmd} ls -L "${gcs_tarball}.building" ; do
+      while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do
         local now_epoch="$(date -u +%s)"
         if (( now_epoch > timeout_epoch )) ; then
           # detect unexpected build failure after 45m
@@ -649,8 +647,7 @@ function install_nvidia_nccl() {
     fi
   fi
 
-  output=$(${gsutil_cmd} ls "${gcs_tarball}" 2>&1 || echo '')
-  if echo "${output}" | grep -q "${gcs_tarball}" ; then
+  if ${gsutil_stat_cmd} "${gcs_tarball}" ; then
     # cache hit - unpack from cache
     echo "cache hit"
     ${gsutil_cmd} cat "${gcs_tarball}" | tar xvz
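The pattern recurring through install_nvidia_nccl and the functions below is a GCS object doing double duty as cache and advisory lock: a "${gcs_tarball}.building" sentinel marks an in-progress build, other test nodes poll it until it disappears (or until 45 minutes past its creation time), and the finished tarball is the cache entry. This commit replaces the `gsutil ls`-based probes with ${gsutil_stat_cmd}, relying on its exit status. A condensed sketch of the wait-then-check flow, omitting the timeout bookkeeping and using a placeholder object name:

    gsutil_stat_cmd="gsutil stat"   # or: gcloud storage objects describe
    gcs_tarball="gs://my-bucket/build-cache/artifact.tar.gz"   # placeholder

    # Yield while another node holds the .building sentinel.
    while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do sleep 30 ; done

    # Then consult the cache; stat's exit status reports existence.
    if ${gsutil_stat_cmd} "${gcs_tarball}" ; then echo "cache hit" ; fi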
@@ -705,7 +702,7 @@ function install_nvidia_nccl() {
     popd
     tar xzvf "${local_tarball}"
     ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
-    if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
+    if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
     building_file=""
     rm "${local_tarball}"
   fi
@@ -796,6 +793,7 @@ function install_pytorch() {
   local env
   env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
   local mc3=/opt/conda/miniconda3
+  [[ -d ${mc3} ]] || return
   local envpath="${mc3}/envs/${env}"
   if [[ "${env}" == "base" ]]; then
     echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi
@@ -809,16 +807,12 @@ function install_pytorch() {
   if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
     # when running with fewer than 32 cores, yield to in-progress build
     sleep $(( ( RANDOM % 11 ) + 10 ))
-    if ${gsutil_cmd} ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+    if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then
       local build_start_time build_start_epoch timeout_epoch
-      if [[ "${gsutil_cmd}" =~ "gsutil" ]] ; then
-        build_start_time="$(${gsutil_cmd} ls -L "${gcs_tarball}.building" | awk -F': +' '/Creation time/ {print $2}')"
-      else
-        build_start_time="$(${gsutil_cmd} ls -j "${gcs_tarball}.building" | jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
-      fi
-      build_start_epoch="$(date -d "${build_start_time}" +%s)"
+      build_start_time="$(${gsutil_stat_cmd} "${gcs_tarball}.building" | awk -F': +' '/.reation.time/ {print $2}')"
+      build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
       timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
-      while ${gsutil_cmd} ls -L "${gcs_tarball}.building" ; do
+      while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do
         local now_epoch="$(date -u +%s)"
         if (( now_epoch > timeout_epoch )) ; then
           # detect unexpected build failure after 45m
@@ -830,8 +824,7 @@ function install_pytorch() {
     fi
   fi
 
-  output=$(${gsutil_cmd} ls "${gcs_tarball}" 2>&1 || echo '')
-  if echo "${output}" | grep -q "${gcs_tarball}" ; then
+  if ${gsutil_stat_cmd} "${gcs_tarball}" ; then
     # cache hit - unpack from cache
     echo "cache hit"
     mkdir -p "${envpath}"
@@ -859,7 +852,7 @@ function install_pytorch() {
     tar czf "${local_tarball}" .
     popd
     ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
-    if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
+    if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
     building_file=""
   fi

@@ -1040,16 +1033,12 @@ function build_driver_from_github() {
   if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
     # when running with fewer than 32 cores, yield to in-progress build
     sleep $(( ( RANDOM % 11 ) + 10 ))
-    if ${gsutil_cmd} ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+    if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then
       local build_start_time build_start_epoch timeout_epoch
-      if [[ "${gsutil_cmd}" =~ "gsutil" ]] ; then
-        build_start_time="$(${gsutil_cmd} ls -L "${gcs_tarball}.building" | awk -F': +' '/Creation time/ {print $2}')"
-      else
-        build_start_time="$(${gsutil_cmd} ls -j "${gcs_tarball}.building" | jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
-      fi
-      build_start_epoch="$(date -d "${build_start_time}" +%s)"
+      build_start_time="$(${gsutil_stat_cmd} "${gcs_tarball}.building" | awk -F': +' '/.reation.time/ {print $2}')"
+      build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
       timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
-      while ${gsutil_cmd} ls -L "${gcs_tarball}.building" ; do
+      while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do
         local now_epoch="$(date -u +%s)"
         if (( now_epoch > timeout_epoch )) ; then
           # detect unexpected build failure after 45m
@@ -1061,7 +1050,7 @@ function build_driver_from_github() {
     fi
   fi
 
-  if ${gsutil_cmd} ls "${gcs_tarball}" 2>&1 ; then
+  if ${gsutil_stat_cmd} "${gcs_tarball}" 2>&1 ; then
    echo "cache hit"
   else
     # build the kernel modules
@@ -1096,7 +1085,7 @@ function build_driver_from_github() {
       "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
       $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
     ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
-    if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
+    if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
     building_file=""
     rm "${local_tarball}"
     make clean
@@ -1195,16 +1184,13 @@ function install_nvidia_userspace_runfile() {
   if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
     # when running with fewer than 32 cores, yield to in-progress build
     sleep $(( ( RANDOM % 11 ) + 10 ))
-    if ${gsutil_cmd} ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+    local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building")"
+    if [[ $? == "0" ]] ; then
       local build_start_time build_start_epoch timeout_epoch
-      if [[ "${gsutil_cmd}" =~ "gsutil" ]] ; then
-        build_start_time="$(${gsutil_cmd} ls -L "${gcs_tarball}.building" | awk -F': +' '/Creation time/ {print $2}')"
-      else
-        build_start_time="$(${gsutil_cmd} ls -j "${gcs_tarball}.building" | jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
-      fi
-      build_start_epoch="$(date -d "${build_start_time}" +%s)"
+      build_start_time="$(echo ${output} | awk -F': +' '/.reation.time/ {print $2}')"
+      build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
       timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
-      while ${gsutil_cmd} ls -L "${gcs_tarball}.building" ; do
+      while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do
        local now_epoch="$(date -u +%s)"
        if (( now_epoch > timeout_epoch )) ; then
          # detect unexpected build failure after 45m
@@ -1216,7 +1202,7 @@ function install_nvidia_userspace_runfile() {
     fi
   fi
 
-  if ${gsutil_cmd} ls "${gcs_tarball}" ; then
+  if ${gsutil_stat_cmd} "${gcs_tarball}" ; then
     cache_hit="1"
     if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
       runfile_args="${runfile_args} --no-kernel-modules"
@@ -1268,7 +1254,8 @@ function install_nvidia_userspace_runfile() {
       /var/log/nvidia-installer.log \
       $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
     ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
-    if ${gsutil_cmd} ls "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
+
+    if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
     building_file=""
   fi
   fi
@@ -1427,7 +1414,9 @@ function install_gpu_agent() {
     | sed -e 's/-u --format=/--format=/' \
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
   local venv="${install_dir}/venv"
-  /opt/conda/miniconda3/bin/python3 -m venv "${venv}"
+  python_interpreter="/opt/conda/miniconda3/bin/python3"
+  [[ -f "${python_interpreter}" ]] || python_interpreter="$(command -v python3)"
+  "${python_interpreter}" -m venv "${venv}"
   (
     source "${venv}/bin/activate"
     python3 -m pip install --upgrade pip
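This hunk stops assuming a Miniconda interpreter exists at /opt/conda/miniconda3/bin/python3 and falls back to the python3 on the PATH, which matters on images such as 1.5-debian10 where that conda tree may be absent. The same idiom generalizes to a list of candidates; a short sketch, with placeholder paths:

    # Pick the first usable python3 from a list of candidates.
    for candidate in /opt/conda/miniconda3/bin/python3 /usr/bin/python3 ; do
      [[ -x "${candidate}" ]] && python_interpreter="${candidate}" && break
    done
    "${python_interpreter:?no python3 found}" -m venv /tmp/example-venv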
@@ -1755,13 +1744,17 @@ function prepare_gpu_env(){
   pci_device_id="$(grep -h -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | head -1 | awk -F: '{print $2}')"
   pci_device_id_int="$((16#${pci_device_id}))"
   case "${pci_device_id}" in
-    "15F8" ) gpu_type="nvidia-tesla-p100" ;;
-    "1BB3" ) gpu_type="nvidia-tesla-p4" ;;
-    "1DB1" ) gpu_type="nvidia-tesla-v100" ;;
-    "1EB8" ) gpu_type="nvidia-tesla-t4" ;;
-    "20*" ) gpu_type="nvidia-tesla-a100" ;;
-    "23*" ) gpu_type="nvidia-h100" ;; # install does not begin with image 2.0.68-debian10/cuda11.1
-    "27B8" ) gpu_type="nvidia-l4" ;; # install does not complete with image 2.0.68-debian10/cuda11.1
+    "15F8" ) gpu_type="nvidia-tesla-p100" ;;
+    "1BB3" ) gpu_type="nvidia-tesla-p4" ;;
+    "1DB1" ) gpu_type="nvidia-tesla-v100" ;;
+    "1EB8" ) gpu_type="nvidia-tesla-t4" ;;
+    "20B2" | \
+    "20B5" | \
+    "20F3" | \
+    "20F5" ) gpu_type="nvidia-tesla-a100-80gb" ;;
+    "20*" ) gpu_type="nvidia-tesla-a100" ;;
+    "23*" ) gpu_type="nvidia-h100" ;; # NB: install does not begin with legacy image 2.0.68-debian10/cuda11.1
+    "27B8" ) gpu_type="nvidia-l4" ;; # NB: install does not complete with legacy image 2.0.68-debian10/cuda11.1
   esac
 
   ACCELERATOR="type=${gpu_type},count=${gpu_count}"
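The four new device IDs route A100 80GB variants to their own accelerator name; since a Bash case statement takes the first matching pattern, the specific IDs must appear before the generic "20*" entry. The $((16#...)) arithmetic feeding pci_device_id_int converts the hexadecimal ID from the uevent file to decimal, for example:

    # Hexadecimal PCI device id -> decimal, as in pci_device_id_int above.
    pci_device_id="20B5"              # one of the newly added ids
    echo "$((16#${pci_device_id}))"   # prints 8373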
@@ -1929,7 +1922,7 @@ function cache_fetched_package() {
   local gcs_fn="$2"
   local local_fn="$3"
 
-  if ${gsutil_cmd} ls "${gcs_fn}" 2>&1 ; then
+  if ${gsutil_stat_cmd} "${gcs_fn}" 2>&1 ; then
     time ${gsutil_cmd} cp "${gcs_fn}" "${local_fn}"
   else
     time ( curl ${curl_retry_args} "${src_url}" -o "${local_fn}" && \
@@ -2048,7 +2041,7 @@ function exit_handler() {
 
   # clean up incomplete build indicators
   if test -n "${building_file}" ; then
-    if ${gsutil_cmd} ls "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi
+    if ${gsutil_stat_cmd} "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi
   fi
 
   set +ex
@@ -2180,7 +2173,9 @@ function mount_ramdisk(){
   mount -t tmpfs tmpfs "${tmpdir}"
 
   # Download conda packages to tmpfs
-  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+  if [[ -f /opt/conda/miniconda3/bin/conda ]] ; then
+    /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+  fi
 
   # Clear pip cache
   # TODO: make this conditional on which OSs have pip without cache purge
@@ -2230,9 +2225,11 @@ function prepare_to_install(){
   # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be
   # used as a more performant replacement for `gsutil`
   gsutil_cmd="gcloud storage"
+  gsutil_stat_cmd="gcloud storage objects describe"
   gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"
   if version_lt "${gcloud_sdk_version}" "402.0.0" ; then
     gsutil_cmd="gsutil -o GSUtil:check_hashes=never"
+    gsutil_stat_cmd="gsutil stat"
   fi
   curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30"

@@ -2302,7 +2299,7 @@ function check_os() {
 
   SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
   readonly SPARK_VERSION
-  if version_lt "${SPARK_VERSION}" "3.1" || \
+  if version_lt "${SPARK_VERSION}" "2.4" || \
      version_ge "${SPARK_VERSION}" "4.0" ; then
     echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
     exit 1
@@ -2316,7 +2313,8 @@ function check_os() {
   # When building custom-images, neither of the above variables
   # are defined and we need to make a reasonable guess
 
-  if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+  if version_lt "${SPARK_VERSION}" "2.5" ; then DATAPROC_IMAGE_VERSION="1.5"
+  elif version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
   elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
   elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
   else echo "Unknown dataproc image version" ; exit 1 ; fi
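These two check_os hunks lower the supported Spark floor from 3.1 to 2.4 and extend the image-version guess accordingly: Spark 2.4.x, which ships on 1.5-debian10, now satisfies `version_lt "${SPARK_VERSION}" "2.5"` and maps to DATAPROC_IMAGE_VERSION="1.5". The comparison helpers are defined elsewhere in the script; a sort -V based sketch of how such helpers are commonly written, shown here as an assumption rather than the script's exact definitions:

    # version_lt A B: true when A sorts strictly before B as a version string.
    function version_le() { [[ "$1" == "$(printf '%s\n%s\n' "$1" "$2" | sort -V | head -n1)" ]] ; }
    function version_lt() { [[ "$1" != "$2" ]] && version_le "$1" "$2" ; }
    version_lt "2.4" "2.5" && echo "maps to Dataproc image 1.5"   # prints the message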
@@ -2386,6 +2384,9 @@ function install_spark_rapids() {
   # Update SPARK RAPIDS config
   local DEFAULT_SPARK_RAPIDS_VERSION
   DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then
+    DEFAULT_SPARK_RAPIDS_VERSION="25.02.1"
+  fi
   local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
