From c0ea6318e975de4b7a517c56c72180bcd656f65f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 7 Dec 2024 15:01:23 -0800
Subject: [PATCH 001/112] [gpu] toward a more consistent driver and CUDA
 install

gpu/install_gpu_driver.sh
  * exclusively using .run file installation method when available
  * build nccl from source
  * cache build artifacts from kernel driver and nccl
  * Tested more CUDA minor versions
  * gathering CUDA and driver version from URLs if passed
  * Printing warnings when combination provided is known to fail
  * waiting on apt lock when it exists
  * wrapping expensive functions in completion checks to reduce re-run time
  * fixed a problem with ops agent not installing ; using venv
  * Installing gcc-12 on ubuntu22 to fix kernel driver FTBFS
  * setting better spark defaults
  * skipping proxy setup if http-proxy metadata not set
  * added function to check secure-boot and os version compatability

gpu/manual-test-runner.sh
  * order commands correctly

gpu/test_gpu.py
  * clearer test skipping logic
  * added instructions on how to test pyspark
---
 gpu/install_gpu_driver.sh | 643 ++++++++++++++++++++++++++++----------
 gpu/manual-test-runner.sh |   4 +-
 gpu/test_gpu.py           |  25 +-
 3 files changed, 497 insertions(+), 175 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 25efb2a49..db6d630a1 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -53,7 +53,7 @@ function os_vercat()   ( set +x
                    else os_version ; fi ; )
 
 function repair_old_backports {
-  if ge_debian12 || ! is_debuntu ; then return ; fi
+  if ! is_debuntu ; then return ; fi
   # This script uses 'apt-get update' and is therefore potentially dependent on
   # backports repositories which have been archived.  In order to mitigate this
   # problem, we will use archive.debian.org for the oldoldstable repo
@@ -94,6 +94,7 @@ function print_metadata_value_if_exists() {
   return ${return_code}
 }
 
+# replicates /usr/share/google/get_metadata_value
 function get_metadata_value() (
   set +x
   local readonly varname=$1
@@ -117,10 +118,21 @@ function get_metadata_attribute() (
   get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
 )
 
-OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
-distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
 readonly OS_NAME
 
+# Fetch SPARK config
+SPARK_VERSION_ENV="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+readonly SPARK_VERSION_ENV
+if version_ge "${SPARK_VERSION_ENV}" "3.0" && \
+   version_lt "${SPARK_VERSION_ENV}" "4.0" ; then
+  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
+  readonly SPARK_VERSION="3.0"             # try ${SPARK_VERSION_ENV}
+else
+  echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+  exit 1
+fi
+
 # node role
 ROLE="$(get_metadata_attribute dataproc-role)"
 readonly ROLE
@@ -131,13 +143,13 @@ readonly ROLE
 # Rocky8: 12.0: 525.147.05
 readonly -A DRIVER_FOR_CUDA=(
           ["11.8"]="560.35.03"
-          ["12.0"]="525.60.13"  ["12.4"]="560.35.03"  ["12.6"]="560.35.03"
+          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="560.35.03"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
 )
 # https://developer.nvidia.com/cudnn-downloads
 if is_debuntu ; then
 readonly -A CUDNN_FOR_CUDA=(
           ["11.8"]="9.5.1.17"
-          ["12.0"]="9.5.1.17"   ["12.4"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"  ["12.6"]="9.5.1.17"
 )
 elif is_rocky ; then
 # rocky:
@@ -150,34 +162,65 @@ elif is_rocky ; then
 #   12.6: 9.5.1.17
 readonly -A CUDNN_FOR_CUDA=(
           ["11.8"]="9.5.1.17"
-          ["12.0"]="8.8.1.3"   ["12.4"]="9.1.1.17"   ["12.6"]="9.5.1.17"
+          ["12.0"]="8.8.1.3"   ["12.1"]="8.9.3.28"   ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"  ["12.6"]="9.5.1.17"
 )
 fi
 # https://developer.nvidia.com/nccl/nccl-download
 # 12.2: 2.19.3, 12.5: 2.21.5
 readonly -A NCCL_FOR_CUDA=(
           ["11.8"]="2.15.5"
-          ["12.0"]="2.16.5"  ["12.4"]="2.23.4"     ["12.6"]="2.23.4"
+          ["12.0"]="2.16.5"    ["12.1"]="2.18.3"     ["12.4"]="2.23.4"     ["12.5"]="2.21.5"  ["12.6"]="2.23.4"
 )
 readonly -A CUDA_SUBVER=(
           ["11.8"]="11.8.0"
-          ["12.0"]="12.0.0"  ["12.4"]="12.4.1"     ["12.6"]="12.6.2"
+          ["12.0"]="12.0.0"    ["12.1"]="12.1.1"     ["12.4"]="12.4.1"     ["12.5"]="12.5.0"  ["12.6"]="12.6.2"
+)
+# Debian 12
+# 12.3.101, 12.3.52
+# 12.4.127, 12.4.99
+# 12.5.82, 12.5.39
+# 12.6.77, 12.6.68, 12.6.37
+
+readonly -A cuda_toolkit_config_version=(
+          ["12.4"]="12.4.127"     ["12.6"]="12.6.77"
 )
 
 RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-readonly DEFAULT_CUDA_VERSION='12.4'
-CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
-if ( ( ge_debian12 || ge_rocky9 ) && version_le "${CUDA_VERSION%%.*}" "11" ) ; then
-  # CUDA 11 no longer supported on debian12 - 2024-11-22, rocky9 - 2024-11-27
-  CUDA_VERSION="${DEFAULT_CUDA_VERSION}"
-fi
 
-if ( version_ge "${CUDA_VERSION}" "12" && (le_debian11 || le_ubuntu18) ) ; then
-  # Only CUDA 12.0 supported on older debuntu
-  CUDA_VERSION="12.0"
-fi
-readonly CUDA_VERSION
-readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}"
+function set_cuda_version() {
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+
+  if [[ -n "${cuda_url}" ]] ; then
+    local CUDA_URL_VERSION
+    CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
+      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
+      CUDA_FULL_VERSION="${CUDA_URL_VERSION}"
+    fi
+  else
+    DEFAULT_CUDA_VERSION='12.4'
+  fi
+  readonly DEFAULT_CUDA_VERSION
+
+  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
+  readonly CUDA_VERSION
+  if ( ! test -v CUDA_FULL_VERSION ) ; then
+    CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
+  fi
+  readonly CUDA_FULL_VERSION
+
+  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
+    echo "CUDA 12.3.0 is the minimum CUDA 12 version on Debian 12"
+  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
+    echo "CUDA 12.1.1 is the maximum CUDA version on ubuntu18.  Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
+    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
+    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
+  fi
+}
+set_cuda_version
 
 function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
 function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
@@ -187,17 +230,58 @@ function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
 function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
 function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
 
-DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}"
-if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then
-                                         DEFAULT_DRIVER="560.28.03"  ; fi
-if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03"  ; fi
-if ( is_rocky    && le_cuda11 )   ; then DEFAULT_DRIVER="525.147.05" ; fi
-if ( is_ubuntu20 && le_cuda11 )   ; then DEFAULT_DRIVER="535.183.06" ; fi
-if ( is_rocky9   && ge_cuda12 )   ; then DEFAULT_DRIVER="565.57.01"  ; fi
-DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
+function set_driver_version() {
+  local gpu_driver_url
+  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')
+
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+
+  local DEFAULT_DRIVER
+  # Take default from gpu-driver-url metadata value
+  if [[ -n "${gpu_driver_url}" ]] ; then
+    DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')"
+    if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
+  # Take default from cuda-url metadata value as a backup
+  elif [[ -n "${cuda_url}" ]] ; then
+    CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" ; fi
+  fi
+
+  if ( ! test -v DEFAULT_DRIVER ) ; then
+  # Otherwise attempt to make an educated guess
+    DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
+#    if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then
+#                                             DEFAULT_DRIVER="560.28.03"  ; fi
+#    if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03"  ; fi
+#    if ( is_rocky    && le_cuda11 )   ; then DEFAULT_DRIVER="525.147.05" ; fi
+#    if ( is_ubuntu20 && le_cuda11 )   ; then DEFAULT_DRIVER="535.183.06" ; fi
+#    if ( is_rocky9   && ge_cuda12 )   ; then DEFAULT_DRIVER="565.57.01"  ; fi
+  fi
+
+  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
+
+  readonly DRIVER_VERSION
+  readonly DRIVER="${DRIVER_VERSION%%.*}"
 
-readonly DRIVER_VERSION
-readonly DRIVER=${DRIVER_VERSION%%.*}
+  export DRIVER_VERSION DRIVER
+
+  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
+    exit 1
+  fi
+
+  # Verify that the requested combination is supported
+  readonly CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run"
+  cuda_url="https://developer.download.nvidia.com/compute/cuda/${CUDA_FULL_VERSION}/local_installers/${CUDA_RUNFILE}"
+  if ! curl -s --head "${cuda_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${DRIVER_VERSION}, CUDA_VERSION=${CUDA_FULL_VERSION}"
+    exit 1
+  fi
+}
+
+set_driver_version
 
 readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
 readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
@@ -227,6 +311,11 @@ readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64
 
 readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
 
+USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
+readonly USERSPACE_FILENAME
+
+readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+
 # Short name for urls
 if is_ubuntu22  ; then
     # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
@@ -234,14 +323,14 @@ if is_ubuntu22  ; then
     # use packages from previous release until such time as nvidia
     # release ubuntu2204 builds
 
-    nccl_shortname="ubuntu2004"
     shortname="$(os_id)$(os_vercat)"
+    nccl_shortname="ubuntu2004"
 elif ge_rocky9 ; then
     # use packages from previous release until such time as nvidia
     # release rhel9 builds
 
-    nccl_shortname="rhel8"
     shortname="rhel9"
+    nccl_shortname="rhel8"
 elif is_rocky ; then
     shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
     nccl_shortname="${shortname}"
@@ -261,29 +350,55 @@ readonly NCCL_REPO_URL
 readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
 
 function set_cuda_runfile_url() {
-  local RUNFILE_DRIVER_VERSION="${DRIVER_VERSION}"
-  local RUNFILE_CUDA_VERSION="${CUDA_FULL_VERSION}"
-
-  if ge_cuda12 ; then
-    if ( le_debian11 || le_ubuntu18 ) ; then
-      RUNFILE_DRIVER_VERSION="525.60.13"
-      RUNFILE_CUDA_VERSION="12.0.0"
-    elif ( le_rocky8 && version_le "${DATAPROC_IMAGE_VERSION}" "2.0" ) ; then
-      RUNFILE_DRIVER_VERSION="525.147.05"
-      RUNFILE_CUDA_VERSION="12.0.0"
+  local MAX_DRIVER_VERSION
+  local MAX_CUDA_VERSION
+
+  local MIN_OPEN_DRIVER_VER="515.48.07"
+  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
+  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
+
+  if is_cuda12 ; then
+    if is_debian12 ; then
+      MIN_DRIVER_VERSION="545.23.06"
+      MIN_CUDA_VERSION="12.3.0"
+    elif is_debian10 ; then
+      MAX_DRIVER_VERSION="555.42.02"
+      MAX_CUDA_VERSION="12.5.0"
+    elif is_ubuntu18 ; then
+      MAX_DRIVER_VERSION="530.30.02"
+      MAX_CUDA_VERSION="12.1.1"
+    fi
+  elif ge_version "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    if le_debian10 ; then
+      # cuda 11 is not supported for <= debian10
+      MAX_CUDA_VERSION="0"
+      MAX_DRIVER_VERSION="0"
     fi
   else
-    RUNFILE_DRIVER_VERSION="520.61.05"
-    RUNFILE_CUDA_VERSION="11.8.0"
+    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
   fi
 
-  readonly RUNFILE_FILENAME="cuda_${RUNFILE_CUDA_VERSION}_${RUNFILE_DRIVER_VERSION}_linux.run"
-  CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${RUNFILE_CUDA_VERSION}"
-  DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${RUNFILE_FILENAME}"
+  if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
+    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  fi
+  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
+    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
+    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  fi
+
+  CUDA_FILENAME="cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run"
+  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
+  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_FILENAME}"
   readonly DEFAULT_NVIDIA_CUDA_URL
 
   NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
   readonly NVIDIA_CUDA_URL
+
+  CUDA_FILENAME="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
+  readonly CUDA_FILENAME
 }
 
 set_cuda_runfile_url
@@ -315,8 +430,6 @@ readonly CUDNN_TARBALL_URL
 GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
 readonly GPU_DRIVER_PROVIDER
 
-# Stackdriver GPU agent parameters
-readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
 # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
 INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
 readonly INSTALL_GPU_AGENT
@@ -336,7 +449,7 @@ function execute_with_retries() (
 
   if [[ "$cmd" =~ "^apt-get install" ]] ; then
     apt-get -y clean
-    apt-get -y autoremove
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
   fi
   for ((i = 0; i < 3; i++)); do
     set -x
@@ -455,43 +568,92 @@ function uninstall_local_cudnn8_repo() {
 }
 
 function install_nvidia_nccl() {
+  if test -f "${workdir}/nccl-complete" ; then return ; fi
+
   local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
 
-  if is_rocky ; then
-    execute_with_retries \
-      dnf -y -q install \
-        "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}"
-    sync
-  elif is_ubuntu ; then
-    install_cuda_keyring_pkg
+  # https://github.com/NVIDIA/nccl/blob/master/README.md
+  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Fermi:     SM_20,             compute_30
+  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+  # Volta:     SM_70,SM_72,       compute_70,compute_72
+  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+  # Ada:       SM_89,             compute_89
+  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
+  # Blackwell: SM_100,            compute_100
+                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
 
-    apt-get update -qq
+  mkdir -p "${workdir}"
+  pushd "${workdir}"
 
-    if is_ubuntu18 ; then
-      execute_with_retries \
-        apt-get install -q -y \
-          libnccl2 libnccl-dev
-      sync
+  test -d "${workdir}/nccl" || {
+    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "nccl-${NCCL_VERSION}-1" nccl
+  }
+
+  local build_path
+  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
+                       build_path="nccl/build/pkg/rpm/x86_64" ; fi
+
+  test -d "${workdir}/nccl/build" || {
+    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
+
+    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+    if echo "${output}" | grep -q "${gcs_tarball}" ; then
+      # cache hit - unpack from cache
+      echo "cache hit"
     else
-      execute_with_retries \
-        apt-get install -q -y \
-          "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}"
-      sync
+      # build and cache
+      pushd nccl
+      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
+      if is_debuntu ; then
+        # These packages are required to build .deb packages from source
+        execute_with_retries \
+          apt-get install -y -qq build-essential devscripts debhelper fakeroot
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.debian.build
+      elif is_rocky ; then
+        # These packages are required to build .rpm packages from source
+        execute_with_retries \
+          dnf -y -q install rpm-build rpmdevtools
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.redhat.build
+      fi
+      tar czvf "/${local_tarball}" "../${build_path}"
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
     fi
-  else
-    echo "Unsupported OS: '${OS_NAME}'"
-    # NB: this tarball is 10GB in size, but can be used to install NCCL on non-ubuntu systems
-    # wget https://developer.download.nvidia.com/hpc-sdk/24.7/nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
-    # tar xpzf nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
-    # nvhpc_2024_247_Linux_x86_64_cuda_multi/install
-    return
+    gcloud storage cat "${gcs_tarball}" | tar xz
+  }
+
+  if is_debuntu ; then
+    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
+  elif is_rocky ; then
+    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
   fi
+
+  popd
+  touch "${workdir}/nccl-complete"
 }
 
 function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
 function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
 
 function install_nvidia_cudnn() {
+  if test -f "${workdir}/cudnn-complete" ; then return ; fi
+
   local major_version
   major_version="${CUDNN_VERSION%%.*}"
   local cudnn_pkg_version
@@ -557,6 +719,7 @@ function install_nvidia_cudnn() {
   ldconfig
 
   echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
+  touch "${workdir}/cudnn-complete"
 }
 
 CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
@@ -686,19 +849,12 @@ function add_repo_nvidia_container_toolkit() {
 
 function add_repo_cuda() {
   if is_debuntu ; then
-    local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
-    local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
-    echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
-    | sudo tee "${sources_list_path}"
-    curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
-      -o "${kr_path}"
+    install_cuda_keyring_pkg # 11.7+, 12.0+
   elif is_rocky ; then
     execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
-    execute_with_retries "dnf clean all"
   fi
 }
 
-readonly uname_r=$(uname -r)
 function build_driver_from_github() {
   if is_ubuntu ; then
     mok_key=/var/lib/shim-signed/mok/MOK.priv
@@ -707,37 +863,58 @@ function build_driver_from_github() {
     mok_key=/var/lib/dkms/mok.key
     mok_der=/var/lib/dkms/mok.pub
   fi
-  workdir=/opt/install-nvidia-driver
-  mkdir -p "${workdir}"
   pushd "${workdir}"
+
   test -d "${workdir}/open-gpu-kernel-modules" || {
-    tarball_fn="${DRIVER_VERSION}.tar.gz"
+    local tarball_fn="${DRIVER_VERSION}.tar.gz"
     curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
       "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
       | tar xz
     mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
   }
-  cd open-gpu-kernel-modules
 
-  time make -j$(nproc) modules \
-    >  /var/log/open-gpu-kernel-modules-build.log \
-    2> /var/log/open-gpu-kernel-modules-build_error.log
-  sync
+  test -f "${workdir}/open-gpu-kernel-modules/kernel-open/nvidia.ko" || {
+    local build_tarball="kmod-build_${_shortname}_${DRIVER_VERSION}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
 
+    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+      echo "cache hit"
+    else
+      # build and cache kernel modules
+      pushd open-gpu-kernel-modules
+      execute_with_retries make -j$(nproc) modules \
+        >  kernel-open/build.log \
+        2> kernel-open/build_error.log
+      tar czvf "${local_tarball}" ../open-gpu-kernel-modules/kernel-open
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
+    fi
+    gcloud storage cat "${gcs_tarball}" | tar xzv
+  }
+
+  # Sign kernel modules
   if [[ -n "${PSN}" ]]; then
-    #configure_dkms_certs
-    for module in $(find kernel-open -name '*.ko'); do
+    for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do
       "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
       "${mok_key}" \
       "${mok_der}" \
       "${module}"
     done
-    #clear_dkms_key
   fi
 
-  make modules_install \
-    >> /var/log/open-gpu-kernel-modules-build.log \
-    2>> /var/log/open-gpu-kernel-modules-build_error.log
+  # install kernel modules
+  modinfo nvidia > /dev/null 2>&1 || {
+    pushd open-gpu-kernel-modules
+    make modules_install \
+        >>  kernel-open/build.log \
+        2>> kernel-open/build_error.log
+    depmod -a
+    popd
+  }
+
   popd
 }
 
@@ -776,23 +953,44 @@ function build_driver_from_packages() {
   #clear_dkms_key
 }
 
+function cache_fetched_package() {
+  local src_url="$1"
+  local gcs_fn="$2"
+  local local_fn="$3"
+
+  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
+    time gcloud storage cp "${gcs_fn}" "${local_fn}"
+  else
+    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
+           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
+  fi
+}
+
 function install_nvidia_userspace_runfile() {
-  if test -f "${tmpdir}/userspace-complete" ; then return ; fi
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${USERSPACE_URL}" -o "${tmpdir}/userspace.run"
-  execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}"
-  rm -f "${tmpdir}/userspace.run"
-  touch "${tmpdir}/userspace-complete"
+  if test -f "${workdir}/userspace-complete" ; then return ; fi
+  local local_fn="${tmpdir}/userspace.run"
+
+  cache_fetched_package "${USERSPACE_URL}" \
+                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
+                        "${local_fn}"
+
+  execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --silent --tmpdir="${tmpdir}"
+  rm -f "${local_fn}"
+  touch "${workdir}/userspace-complete"
   sync
 }
 
 function install_cuda_runfile() {
-  if test -f "${tmpdir}/cuda-complete" ; then return ; fi
-  time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run"
-  execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}"
-  rm -f "${tmpdir}/cuda.run"
-  touch "${tmpdir}/cuda-complete"
+  if test -f "${workdir}/cuda-complete" ; then return ; fi
+  local local_fn="${tmpdir}/cuda.run"
+
+  cache_fetched_package "${NVIDIA_CUDA_URL}" \
+			"${pkg_bucket}/${CUDA_FILENAME}" \
+                        "${local_fn}"
+
+  execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
+  rm -f "${local_fn}"
+  touch "${workdir}/cuda-complete"
   sync
 }
 
@@ -808,12 +1006,11 @@ function install_cuda_toolkit() {
   if is_debuntu ; then
 #    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
     execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
-    sync
   elif is_rocky ; then
     # rocky9: cuda-11-[7,8], cuda-12-[1..6]
     execute_with_retries dnf -y -q install "${cudatk_package}"
-    sync
   fi
+  sync
 }
 
 function load_kernel_module() {
@@ -830,13 +1027,30 @@ function load_kernel_module() {
   # TODO: if peermem is available, also modprobe nvidia-peermem
 }
 
+function install_cuda(){
+  if test -f "${workdir}/cuda-repo-complete" ; then return ; fi
+
+  if ( ge_debian12 && is_src_os ) ; then
+    echo "installed with the driver on ${OS_NAME}"
+    return 0
+  fi
+
+  # The OS package distributions are unreliable
+  install_cuda_runfile
+
+  # Includes cudNN packages
+  add_repo_cuda
+
+  touch "${workdir}/cuda-repo-complete"
+}
+
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
+  if test -f "${workdir}/gpu-driver-complete" ; then return ; fi
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
     add_repo_nvidia_container_toolkit
     apt-get update -qq
-    #configure_dkms_certs
     apt-get -yq install \
           nvidia-container-toolkit \
           dkms \
@@ -845,42 +1059,38 @@ function install_nvidia_gpu_driver() {
           nvidia-smi \
           libglvnd0 \
           libcuda1
-    #clear_dkms_key
-  elif ( le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ) ; then
+    echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully"
+    return 0
+  fi
 
-    install_nvidia_userspace_runfile
+  # OS driver packages do not produce reliable driver ; use runfile
+  install_nvidia_userspace_runfile
 
-    build_driver_from_github
+  build_driver_from_github
 
-    install_cuda_runfile
-  elif is_debuntu ; then
-    install_cuda_keyring_pkg
+  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+  touch "${workdir}/gpu-driver-complete"
+}
 
-    build_driver_from_packages
+function install_ops_agent(){
+  if test -f "${workdir}/ops-agent-complete" ; then return ; fi
 
-    install_cuda_toolkit
-  elif is_rocky ; then
-    add_repo_cuda
+  mkdir -p /opt/google
+  cd /opt/google
+  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
+  curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
+  execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
 
-    build_driver_from_packages
-
-    install_cuda_toolkit
-  else
-    echo "Unsupported OS: '${OS_NAME}'"
-    exit 1
-  fi
-  ldconfig
-  if is_src_os ; then
-    echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully"
-  else
-    echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  fi
+  touch "${workdir}/ops-agent-complete"
 }
 
 # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
 function install_gpu_agent() {
-  if ! command -v pip; then
-    execute_with_retries "apt-get install -y -qq python-pip"
+  # Stackdriver GPU agent parameters
+#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
+  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
+  if ( ! command -v pip && is_debuntu ) ; then
+    execute_with_retries "apt-get install -y -qq python3-pip"
   fi
   local install_dir=/opt/gpu-utilization-agent
   mkdir -p "${install_dir}"
@@ -890,7 +1100,13 @@ function install_gpu_agent() {
     "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
     | sed -e 's/-u --format=/--format=/' \
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
-  execute_with_retries pip install -r "${install_dir}/requirements.txt"
+  local venv="${install_dir}/venv"
+  python3 -m venv "${venv}"
+(
+  source "${venv}/bin/activate"
+  python3 -m pip install --upgrade pip
+  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
+)
   sync
 
   # Generate GPU service.
@@ -901,7 +1117,7 @@ Description=GPU Utilization Metric Agent
 [Service]
 Type=simple
 PIDFile=/run/gpu_agent.pid
-ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"'
+ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
 User=root
 Group=root
 WorkingDirectory=/
@@ -926,8 +1142,9 @@ function set_hadoop_property() {
     --clobber
 }
 
-function configure_yarn() {
-  if [[ -d "${HADOOP_CONF_DIR}" && ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
+function configure_yarn_resources() {
+  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
+  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
     printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
   fi
   set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
@@ -975,7 +1192,7 @@ function configure_gpu_exclusive_mode() {
   spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
   if [[ ${spark_version} != 3.* ]]; then
     # include exclusive mode on GPU
-    nvsmi -c EXCLUSIVE_PROCESS
+    nvidia-smi -c EXCLUSIVE_PROCESS
   fi
 }
 
@@ -1023,8 +1240,34 @@ EOF
   chmod a+rx "${gpus_resources_script}"
 
   local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
-  if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then
-    echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}"
+  if version_ge "${SPARK_VERSION}" "3.0" ; then
+    local gpu_count
+    gpu_count="$(lspci | grep NVIDIA | wc -l)"
+    local executor_cores
+    executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
+    local executor_memory
+    executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
+    local task_cpus=2
+    local gpu_amount
+    gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+
+    cat >>"${spark_defaults_conf}" <<EOF
+###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
+# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
+# query explain output won't show GPU operator, if the user has doubts
+# they can uncomment the line before seeing the GPU plan explain;
+# having AQE enabled gives user the best performance.
+spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
+spark.executor.resource.gpu.amount=${gpu_count}
+spark.executor.cores=${executor_cores}
+spark.executor.memory=${executor_memory_gb}G
+spark.dynamicAllocation.enabled=false
+# please update this config according to your application
+spark.task.resource.gpu.amount=${gpu_amount}
+spark.task.cpus=2
+spark.yarn.unmanagedAM.enabled=false
+###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
+EOF
   fi
 }
 
@@ -1058,7 +1301,7 @@ EOF
 
 function nvsmi() {
   local nvsmi="/usr/bin/nvidia-smi"
-  if   [[ "${nvsmi_works}" == "1" ]] ; then echo "nvidia-smi is working" >&2
+  if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
   elif [[ ! -f "${nvsmi}" ]]         ; then echo "nvidia-smi not installed" >&2 ; return 0
   elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
   else nvsmi_works="1" ; fi
@@ -1077,11 +1320,18 @@ function nvsmi() {
 function install_dependencies() {
   if is_debuntu ; then
     execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen
+    if is_ubuntu22 ; then
+      # On ubuntu22, the default compiler does not build some kernel module versions
+      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
+      execute_with_retries apt-get install -y -qq gcc-12
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-12
+    fi
   elif is_rocky ; then
     execute_with_retries dnf -y -q install pciutils gcc screen
 
     local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
-    local install_log="${tmpdir}/install.log"
     set +e
     eval "${dnf_cmd}" > "${install_log}" 2>&1
     local retval="$?"
@@ -1109,7 +1359,7 @@ function install_dependencies() {
 function main() {
   # This configuration should be run on all nodes
   # regardless if they have attached GPUs
-  configure_yarn
+  configure_yarn_resources
 
   # Detect NVIDIA GPU
   if (lspci | grep -q NVIDIA); then
@@ -1133,6 +1383,8 @@ function main() {
     if [[ $IS_MIG_ENABLED -eq 0 ]]; then
       install_nvidia_gpu_driver
 
+      install_cuda
+
       load_kernel_module
 
       if [[ -n ${CUDNN_VERSION} ]]; then
@@ -1141,7 +1393,8 @@ function main() {
       fi
       #Install GPU metrics collection in Stackdriver if needed
       if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-        install_gpu_agent
+        #install_ops_agent
+	install_gpu_agent
         echo 'GPU metrics agent successfully deployed.'
       else
         echo 'GPU metrics agent will not be installed.'
@@ -1316,7 +1569,7 @@ function exit_handler() {
   if is_debuntu ; then
     # Clean up OS package cache
     apt-get -y -qq clean
-    apt-get -y -qq autoremove
+    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
     # re-hold systemd package
     if ge_debian12 ; then
     apt-mark hold systemd libsystemd0 ; fi
@@ -1333,11 +1586,17 @@ function exit_handler() {
       /usr/local/cuda-1?.? \
       /opt/conda/miniconda3 | sort -h
   elif is_debian ; then
-    du -hs \
+    du -x -hs \
       /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /var/lib/{docker,mysql,} \
       /usr/lib \
+      /opt/nvidia/* \
       /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3 | sort -h
+      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+      /usr/bin \
+      /usr \
+      /var \
+      / 2>/dev/null | sort -h
   else
     du -hs \
       /var/lib/docker \
@@ -1382,7 +1641,11 @@ print( "    samples-taken: ", scalar @siz, $/,
 }
 
 function set_proxy(){
-  export METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy)"
+  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
+
+  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
+
+  export METADATA_HTTP_PROXY
   export http_proxy="${METADATA_HTTP_PROXY}"
   export https_proxy="${METADATA_HTTP_PROXY}"
   export HTTP_PROXY="${METADATA_HTTP_PROXY}"
@@ -1402,6 +1665,9 @@ function mount_ramdisk(){
   mkdir -p "${tmpdir}"
   mount -t tmpfs tmpfs "${tmpdir}"
 
+  # Download conda packages to tmpfs
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+
   # Clear pip cache
   # TODO: make this conditional on which OSs have pip without cache purge
   pip cache purge || echo "unable to purge pip cache"
@@ -1418,30 +1684,47 @@ function mount_ramdisk(){
 }
 
 function prepare_to_install(){
+  # Verify OS compatability and Secure boot state
+  check_os_and_secure_boot
+
+  workdir=/opt/install-dpgce
   nvsmi_works="0"
-  readonly bdcfg="/usr/local/bin/bdconfig"
   tmpdir=/tmp/
-  if ! is_debuntu && ! is_rocky ; then
-    echo "Unsupported OS: '$(os_name)'"
-    exit 1
-  fi
-
-  repair_old_backports
-
+  readonly temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  readonly uname_r=$(uname -r)
+  readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
+  mkdir -p "${workdir}"
   trap exit_handler EXIT
+  set_proxy
   mount_ramdisk
-  install_log="${tmpdir}/install.log"
+  configure_dkms_certs
 
-  set_proxy
+  readonly install_log="${tmpdir}/install.log"
+
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION_ENV}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION_ENV}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION_ENV}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
+
+  if test -f "${workdir}/prepare-complete" ; then return ; fi
+
+  repair_old_backports
 
   if is_debuntu ; then
     clean_up_sources_lists
     apt-get update -qq
     apt-get -y clean
-    sleep 5s
-    apt-get -y -qq autoremove
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
     if ge_debian12 ; then
     apt-mark unhold systemd libsystemd0 ; fi
   else
@@ -1453,15 +1736,41 @@ function prepare_to_install(){
     time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
   ) fi
 
-  configure_dkms_certs
-
   install_dependencies
 
   # Monitor disk usage in a screen session
   df / > "/run/disk-usage.log"
   touch "/run/keep-running-df"
-  screen -d -m -US keep-running-df \
+  screen -d -m -LUS keep-running-df \
     bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+
+  touch "${workdir}/prepare-complete"
+}
+
+# Verify if compatible linux distros and secure boot options are used
+function check_os_and_secure_boot() {
+  local SECURE_BOOT="disabled"
+  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
+  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
+      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+      exit 1
+  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
+      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
+      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
+  fi
+
+  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
+    exit 1
+  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
+    echo "Secure boot is enabled, but no signing material provided."
+    echo "Please either disable secure boot or provide signing material as per"
+    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+    return 1
+  fi
 }
 
 prepare_to_install
diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh
index 7545c1a1e..0c5b2fed1 100644
--- a/gpu/manual-test-runner.sh
+++ b/gpu/manual-test-runner.sh
@@ -5,8 +5,8 @@
 # To run the script, the following will bootstrap
 #
 # git clone git@github.com:LLC-Technologies-Collier/initialization-actions
-# git checkout gpu-20241121
 # cd initialization-actions
+# git checkout gpu-20241121
 # cp gpu/env.json.sample env.json
 # vi env.json
 # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest .
@@ -16,9 +16,7 @@
 #  To see a list of screen windows, press ^a "
 # Num Name
 #
-#   0 monitor
 #   1 2.0-debian10
-#   2 sh
 
 
 readonly timestamp="$(date +%F-%H-%M)"
diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index f8438915f..ec316b345 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -25,6 +25,12 @@ def verify_pyspark(self, name):
     # Verify that pyspark works
     self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
 
+  def verify_pytorch(self, name):
+    # Verify that pytorch works
+    self.assert_instance_command(name, "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node", 1)
+    #echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node
+    #echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python /tmp/prakasha-spark-test.py
+
   def verify_mig_instance(self, name):
     self.assert_instance_command(name,
         "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
@@ -64,6 +70,7 @@ def verify_instance_spark(self):
   def test_install_gpu_default_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
+
     if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
@@ -82,7 +89,13 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
-      if ( self.getImageOs() != 'rocky' ) or ( configuration != 'SINGLE' ) or ( configuration == 'SINGLE' and self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1") ):
+      if ( configuration == 'SINGLE' and \
+           self.getImageOs() == 'rocky' and \
+           self.getImageVersion() > pkg_resources.parse_version("2.1") ):
+        # Do not attempt this on single instance rocky clusters
+        no_op=1
+      else:
+        # verify that pyspark from command prompt works
         self.verify_pyspark(machine_name)
 
   @parameterized.parameters(
@@ -239,8 +252,9 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
-    and configuration == 'SINGLE':
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
 
     metadata = None
@@ -273,8 +287,9 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
     if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
-    and configuration == 'SINGLE':
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty")
 
     if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \

From f210adf9e650a357797bac7e3fa94dbc7dc967fb Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 7 Dec 2024 15:48:56 -0800
Subject: [PATCH 002/112] correcting driver for cuda 12.4

---
 gpu/install_gpu_driver.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index db6d630a1..bd8ef593f 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -143,7 +143,7 @@ readonly ROLE
 # Rocky8: 12.0: 525.147.05
 readonly -A DRIVER_FOR_CUDA=(
           ["11.8"]="560.35.03"
-          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="560.35.03"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.54.15"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
 )
 # https://developer.nvidia.com/cudnn-downloads
 if is_debuntu ; then

From f6ff5a3ae75f2a449f0082b2bacf6a3a244654b1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 7 Dec 2024 16:08:21 -0800
Subject: [PATCH 003/112] correcting cuda subversion.  12.4.0 instead of 12.4.1
 so that driver and cuda match up

---
 gpu/install_gpu_driver.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index bd8ef593f..a273075e8 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -143,7 +143,7 @@ readonly ROLE
 # Rocky8: 12.0: 525.147.05
 readonly -A DRIVER_FOR_CUDA=(
           ["11.8"]="560.35.03"
-          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.54.15"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.54.14"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
 )
 # https://developer.nvidia.com/cudnn-downloads
 if is_debuntu ; then
@@ -173,7 +173,7 @@ readonly -A NCCL_FOR_CUDA=(
 )
 readonly -A CUDA_SUBVER=(
           ["11.8"]="11.8.0"
-          ["12.0"]="12.0.0"    ["12.1"]="12.1.1"     ["12.4"]="12.4.1"     ["12.5"]="12.5.0"  ["12.6"]="12.6.2"
+          ["12.0"]="12.0.0"    ["12.1"]="12.1.1"     ["12.4"]="12.4.0"     ["12.5"]="12.5.0"  ["12.6"]="12.6.2"
 )
 # Debian 12
 # 12.3.101, 12.3.52

From e36b25bd1385dc7bfcd36d6aa7bbfe7b4347767e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 7 Dec 2024 17:24:06 -0800
Subject: [PATCH 004/112] corrected cannonical 11.8 driver version ; removed
 extra code and comment ; added better description of what is in the runfile

---
 gpu/install_gpu_driver.sh | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index a273075e8..07018bc0b 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -142,7 +142,7 @@ readonly ROLE
 # https://developer.nvidia.com/cuda-downloads
 # Rocky8: 12.0: 525.147.05
 readonly -A DRIVER_FOR_CUDA=(
-          ["11.8"]="560.35.03"
+          ["11.8"]="520.61.05"
           ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.54.14"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
 )
 # https://developer.nvidia.com/cudnn-downloads
@@ -175,15 +175,6 @@ readonly -A CUDA_SUBVER=(
           ["11.8"]="11.8.0"
           ["12.0"]="12.0.0"    ["12.1"]="12.1.1"     ["12.4"]="12.4.0"     ["12.5"]="12.5.0"  ["12.6"]="12.6.2"
 )
-# Debian 12
-# 12.3.101, 12.3.52
-# 12.4.127, 12.4.99
-# 12.5.82, 12.5.39
-# 12.6.77, 12.6.68, 12.6.37
-
-readonly -A cuda_toolkit_config_version=(
-          ["12.4"]="12.4.127"     ["12.6"]="12.6.77"
-)
 
 RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
 
@@ -967,6 +958,17 @@ function cache_fetched_package() {
 }
 
 function install_nvidia_userspace_runfile() {
+
+  # This .run file contains NV's OpenGL implementation as well as
+  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
+  # including glib (https://docs.gtk.org/glib/), and what appears to
+  # be a copy of the source from the kernel-open directory of for
+  # example DRIVER_VERSION=560.35.03
+  #
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
+  #
+  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
+  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
   if test -f "${workdir}/userspace-complete" ; then return ; fi
   local local_fn="${tmpdir}/userspace.run"
 

From a2400a7d844da91df512ef9feb0d210a518e67fe Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 8 Dec 2024 23:20:34 -0800
Subject: [PATCH 005/112] skipping most tests ; using 11.7 from the cuda 11
 line instead of the less well supported 11.8

---
 gpu/test_gpu.py | 53 ++++++++++++++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index ec316b345..10c491fb6 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -65,11 +65,12 @@ def verify_instance_spark(self):
   @parameterized.parameters(
       ("SINGLE",   ["m"], GPU_T4, None, None),
 #      ("STANDARD", ["m"], GPU_T4, None, None),
-      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"),
+#      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"),
   )
   def test_install_gpu_default_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
+    self.skipTest("Running only one test to build cache")
 
     if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
@@ -80,12 +81,12 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-highmem-8",
+        machine_type="n1-highmem-32",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
         timeout_in_minutes=90,
-        boot_disk_size="50GB")
+        boot_disk_size="60GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
@@ -104,6 +105,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
   def test_install_gpu_without_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
+    self.skipTest("Running only one test to build cache")
 
     self.skipTest("No need to regularly test not installing the agent")
 
@@ -134,6 +136,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
   def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider):
+    self.skipTest("Running only one test to build cache")
     if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
@@ -157,30 +160,30 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                                     machine_suffix))
 
   @parameterized.parameters(
-#       ("SINGLE", ["m"],               GPU_T4, None,   "12.0"),
-        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
-      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-#     ("STANDARD", ["w-0", "w-1"],      None,   GPU_T4, "11.8"),
+        ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
+        ("SINGLE", ["m"],               GPU_T4, None,   "11.7"),
+#      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
+#     ("STANDARD", ["w-0", "w-1"],      None,   GPU_T4, "11.7"),
   )
   def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+#    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+#      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-      self.skipTest("CUDA == 12.0 not supported on debian 12")
+#    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
+#    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
+#      self.skipTest("CUDA == 12.0 not supported on debian 12")
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
-    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
-          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+#    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+#    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
+#          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
+#      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
-    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+#    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
+#    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+#    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+#      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -198,13 +201,14 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
       self.verify_instance_nvcc(machine_name, cuda_version)
 
   @parameterized.parameters(
-      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"),
+      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.7"),
 #      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"),
       ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"),
   )
   def test_install_gpu_with_mig(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider, cuda_version):
+    self.skipTest("Running only one test to build cache")
 
     self.skipTest("Test is known to fail.  Skipping so that we can exercise others")
 
@@ -249,6 +253,8 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
   )
   def test_gpu_allocation(self, configuration, master_accelerator,
                           worker_accelerator, driver_provider):
+    self.skipTest("Running only one test to build cache")
+
     if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
@@ -274,15 +280,16 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     self.verify_instance_spark()
 
   @parameterized.parameters(
-    ("SINGLE", ["m"], GPU_T4, None, "11.8"),
+    ("SINGLE", ["m"], GPU_T4, None, "11.7"),
 #    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
     ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
+#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.7"),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
   )
   def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
+    self.skipTest("Running only one test to build cache")
 
     if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")

From a137719dce56e0404e4bf67f5e5e81b0876fa2a8 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 8 Dec 2024 23:21:30 -0800
Subject: [PATCH 006/112] verified that the cuda and driver versions match up

---
 gpu/install_gpu_driver.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 07018bc0b..5a2718291 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -142,13 +142,13 @@ readonly ROLE
 # https://developer.nvidia.com/cuda-downloads
 # Rocky8: 12.0: 525.147.05
 readonly -A DRIVER_FOR_CUDA=(
-          ["11.8"]="520.61.05"
+          ["11.7"]="515.65.01"  ["11.8"]="520.61.05"
           ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.54.14"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
 )
 # https://developer.nvidia.com/cudnn-downloads
 if is_debuntu ; then
 readonly -A CUDNN_FOR_CUDA=(
-          ["11.8"]="9.5.1.17"
+          ["11.7"]="9.5.1.17"  ["11.8"]="9.5.1.17"
           ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"  ["12.6"]="9.5.1.17"
 )
 elif is_rocky ; then
@@ -161,14 +161,14 @@ elif is_rocky ; then
 #   12.5: 9.2.1.18
 #   12.6: 9.5.1.17
 readonly -A CUDNN_FOR_CUDA=(
-          ["11.8"]="9.5.1.17"
-          ["12.0"]="8.8.1.3"   ["12.1"]="8.9.3.28"   ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"  ["12.6"]="9.5.1.17"
+          ["11.7"]="9.5.1.17"  ["11.8"]="9.5.1.17"
+          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"   ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"  ["12.6"]="9.5.1.17"
 )
 fi
 # https://developer.nvidia.com/nccl/nccl-download
 # 12.2: 2.19.3, 12.5: 2.21.5
 readonly -A NCCL_FOR_CUDA=(
-          ["11.8"]="2.15.5"
+          ["11.7"]="2.21.5"    ["11.8"]="2.21.5"
           ["12.0"]="2.16.5"    ["12.1"]="2.18.3"     ["12.4"]="2.23.4"     ["12.5"]="2.21.5"  ["12.6"]="2.23.4"
 )
 readonly -A CUDA_SUBVER=(

From 693bc7fe403907c329433122695414116a418033 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 8 Dec 2024 23:36:46 -0800
Subject: [PATCH 007/112] reducing log capture

---
 gpu/manual-test-runner.sh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh
index 0c5b2fed1..021528f6c 100644
--- a/gpu/manual-test-runner.sh
+++ b/gpu/manual-test-runner.sh
@@ -6,7 +6,7 @@
 #
 # git clone git@github.com:LLC-Technologies-Collier/initialization-actions
 # cd initialization-actions
-# git checkout gpu-20241121
+# git checkout gpu-20241207
 # cp gpu/env.json.sample env.json
 # vi env.json
 # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest .
@@ -33,7 +33,7 @@ export PROJECT_ID="$(jq    -r .PROJECT_ID           env.json)"
 export REGION="$(jq        -r .REGION               env.json)"
 export BUCKET="$(jq        -r .BUCKET               env.json)"
 
-gcs_log_dir="gs://${BUCKET}/${BUILD_ID}/logs"
+gcs_log_dir="gs://${BUCKET}/gpu-dpgce/builds/${BUILD_ID}/logs"
 
 function exit_handler() {
   RED='\\e[0;31m'
@@ -44,8 +44,11 @@ function exit_handler() {
   # TODO: list clusters which match our BUILD_ID and clean them up
   # TODO: remove any test related resources in the project
 
-  echo 'Uploading local logs to GCS bucket.'
-  gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/"
+  # We allow the user to monitor the logs from within screen session.
+  # Logs can be archived if necessary, but won't be unless needed.
+
+#  echo 'Uploading local logs to GCS bucket.'
+#  gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/"
 
   if [[ -f "${tmp_dir}/tests_success" ]]; then
     echo -e "${GREEN}Workflow succeeded${NC}, check logs at ${log_dir}/ or ${gcs_log_dir}/"

From 4ce1efc00cadb7e4554e2d36a4e0865de3c0fb85 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 8 Dec 2024 23:41:00 -0800
Subject: [PATCH 008/112] temporarily increasing machine shape for build
 caching

---
 gpu/test_gpu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 10c491fb6..0ae8aa8bd 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -189,12 +189,12 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-highmem-8",
+        machine_type="n1-highmem-64",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
         timeout_in_minutes=30,
-        boot_disk_size="50GB")
+        boot_disk_size="60GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)

From 05b3e2ba9ffe12ff418cd61795aeef9ab8f1830e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 9 Dec 2024 10:31:00 -0800
Subject: [PATCH 009/112] 64 is too many for a single T4

---
 gpu/test_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 0ae8aa8bd..309a4ae56 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -189,7 +189,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-highmem-64",
+        machine_type="n1-highmem-32",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,

From e2ab509fdefedcdaa0766bb073516affeaed2475 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 9 Dec 2024 10:53:19 -0800
Subject: [PATCH 010/112] added a subversion for 11.7

---
 gpu/install_gpu_driver.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 5a2718291..f4ee157fc 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -172,7 +172,7 @@ readonly -A NCCL_FOR_CUDA=(
           ["12.0"]="2.16.5"    ["12.1"]="2.18.3"     ["12.4"]="2.23.4"     ["12.5"]="2.21.5"  ["12.6"]="2.23.4"
 )
 readonly -A CUDA_SUBVER=(
-          ["11.8"]="11.8.0"
+          ["11.7"]="11.7.1"    ["11.8"]="11.8.0"
           ["12.0"]="12.0.0"    ["12.1"]="12.1.1"     ["12.4"]="12.4.0"     ["12.5"]="12.5.0"  ["12.6"]="12.6.2"
 )
 

From 1a39be64424e3795640ba9e508e9982407799135 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 9 Dec 2024 10:54:34 -0800
Subject: [PATCH 011/112] add more tests to the install function

---
 gpu/test_gpu.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 309a4ae56..5c69ea903 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -195,10 +195,13 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
         metadata=metadata,
         timeout_in_minutes=30,
         boot_disk_size="60GB")
+
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
       self.verify_instance_nvcc(machine_name, cuda_version)
+      self.verify_instance_pyspark(machine_name)
+      self.verify_instance_spark()
 
   @parameterized.parameters(
       ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.7"),

From 41ae06905b269d75e2e7bf84486fbb1c0f136c25 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 9 Dec 2024 11:14:50 -0800
Subject: [PATCH 012/112] only including architectures supported by this
 version of CUDA

---
 gpu/install_gpu_driver.sh | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index f4ee157fc..c00dcdfb9 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -359,7 +359,7 @@ function set_cuda_runfile_url() {
       MAX_DRIVER_VERSION="530.30.02"
       MAX_CUDA_VERSION="12.1.1"
     fi
-  elif ge_version "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+  elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
     if le_debian10 ; then
       # cuda 11 is not supported for <= debian10
       MAX_CUDA_VERSION="0"
@@ -576,8 +576,12 @@ function install_nvidia_nccl() {
   # Blackwell: SM_100,            compute_100
                   NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
   NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
+  if version_ge "${CUDA_VERSION}" "11.8" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
+  fi
+  if version_ge "${CUDA_VERSION}" "12.0" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
+  fi
 
   mkdir -p "${workdir}"
   pushd "${workdir}"

From 39ac28118239fb58c1e0f4045edc52d24295116a Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 10 Dec 2024 00:47:02 -0800
Subject: [PATCH 013/112] pinning down versions better ; more caching ; more
 ram disks ; new pytorch and tensorflow test functions

---
 gpu/install_gpu_driver.sh | 200 +++++++++++++++++++++++---------------
 gpu/test_gpu.py           |  72 ++++++++++++--
 gpu/verify_pytorch.py     |   8 ++
 gpu/verify_tensorflow.py  |  28 ++++++
 4 files changed, 221 insertions(+), 87 deletions(-)
 create mode 100644 gpu/verify_pytorch.py
 create mode 100644 gpu/verify_tensorflow.py

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index c00dcdfb9..738960a74 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -138,18 +138,26 @@ ROLE="$(get_metadata_attribute dataproc-role)"
 readonly ROLE
 
 # CUDA version and Driver version
+# https://docs.nvidia.com/deploy/cuda-compatibility/
 # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
 # https://developer.nvidia.com/cuda-downloads
+
+# Minimum supported version for open kernel driver is 515.43.04
+# https://github.com/NVIDIA/open-gpu-kernel-modules/tags
 # Rocky8: 12.0: 525.147.05
 readonly -A DRIVER_FOR_CUDA=(
-          ["11.7"]="515.65.01"  ["11.8"]="520.61.05"
-          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.54.14"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+          ["11.7"]="515.65.01"  ["11.8"]="525.60.13"
+          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.67"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+)
+readonly -A DRIVER_SUBVER=(
+          ["515"]="515.48.07"   ["520"]="520.56.06"  ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
+          ["545"]="545.29.06"   ["550"]="550.127.05" ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
 )
 # https://developer.nvidia.com/cudnn-downloads
 if is_debuntu ; then
 readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="9.5.1.17"  ["11.8"]="9.5.1.17"
-          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"  ["12.6"]="9.5.1.17"
+          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
+          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
 )
 elif is_rocky ; then
 # rocky:
@@ -161,19 +169,19 @@ elif is_rocky ; then
 #   12.5: 9.2.1.18
 #   12.6: 9.5.1.17
 readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="9.5.1.17"  ["11.8"]="9.5.1.17"
-          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"   ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"  ["12.6"]="9.5.1.17"
+          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
+          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
 )
 fi
 # https://developer.nvidia.com/nccl/nccl-download
 # 12.2: 2.19.3, 12.5: 2.21.5
 readonly -A NCCL_FOR_CUDA=(
-          ["11.7"]="2.21.5"    ["11.8"]="2.21.5"
-          ["12.0"]="2.16.5"    ["12.1"]="2.18.3"     ["12.4"]="2.23.4"     ["12.5"]="2.21.5"  ["12.6"]="2.23.4"
+          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
+          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
 )
 readonly -A CUDA_SUBVER=(
-          ["11.7"]="11.7.1"    ["11.8"]="11.8.0"
-          ["12.0"]="12.0.0"    ["12.1"]="12.1.1"     ["12.4"]="12.4.0"     ["12.5"]="12.5.0"  ["12.6"]="12.6.2"
+          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
+          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
 )
 
 RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
@@ -181,15 +189,17 @@ RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
 function set_cuda_version() {
   local cuda_url
   cuda_url=$(get_metadata_attribute 'cuda-url' '')
-
   if [[ -n "${cuda_url}" ]] ; then
+    # if cuda-url metadata variable has been passed, extract default version from url
     local CUDA_URL_VERSION
     CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
     if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
       DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
       CUDA_FULL_VERSION="${CUDA_URL_VERSION}"
     fi
-  else
+  fi
+
+  if ( ! test -v DEFAULT_CUDA_VERSION ) ; then
     DEFAULT_CUDA_VERSION='12.4'
   fi
   readonly DEFAULT_CUDA_VERSION
@@ -200,17 +210,8 @@ function set_cuda_version() {
     CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
   fi
   readonly CUDA_FULL_VERSION
-
-  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
-    echo "CUDA 12.3.0 is the minimum CUDA 12 version on Debian 12"
-  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
-    echo "CUDA 12.1.1 is the maximum CUDA version on ubuntu18.  Requested version: ${CUDA_VERSION}"
-  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
-    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
-  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
-    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
-  fi
 }
+
 set_cuda_version
 
 function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
@@ -235,19 +236,23 @@ function set_driver_version() {
     if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
   # Take default from cuda-url metadata value as a backup
   elif [[ -n "${cuda_url}" ]] ; then
-    CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
-    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" ; fi
+    local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
+      major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
+      driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
+      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the version indicated by the cuda url as the default if it exists
+	DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
+      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the maximum sub-version available for the major version indicated in cuda url as the default
+	DEFAULT_DRIVER="${driver_max_maj_version}"
+      fi
+    fi
   fi
 
   if ( ! test -v DEFAULT_DRIVER ) ; then
-  # Otherwise attempt to make an educated guess
+    # If a default driver version has not been extracted, use the default for this version of CUDA
     DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
-#    if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then
-#                                             DEFAULT_DRIVER="560.28.03"  ; fi
-#    if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03"  ; fi
-#    if ( is_rocky    && le_cuda11 )   ; then DEFAULT_DRIVER="525.147.05" ; fi
-#    if ( is_ubuntu20 && le_cuda11 )   ; then DEFAULT_DRIVER="535.183.06" ; fi
-#    if ( is_rocky9   && ge_cuda12 )   ; then DEFAULT_DRIVER="565.57.01"  ; fi
   fi
 
   DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
@@ -262,14 +267,6 @@ function set_driver_version() {
     echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
     exit 1
   fi
-
-  # Verify that the requested combination is supported
-  readonly CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run"
-  cuda_url="https://developer.download.nvidia.com/compute/cuda/${CUDA_FULL_VERSION}/local_installers/${CUDA_RUNFILE}"
-  if ! curl -s --head "${cuda_url}" | grep -E -q '^HTTP.*200\s*$' ; then
-    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${DRIVER_VERSION}, CUDA_VERSION=${CUDA_FULL_VERSION}"
-    exit 1
-  fi
 }
 
 set_driver_version
@@ -380,16 +377,46 @@ function set_cuda_runfile_url() {
     echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
   fi
 
-  CUDA_FILENAME="cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run"
+  # driver version named in cuda runfile filename
+  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
+  readonly -A drv_for_cuda=(
+          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
+          ["11.8.0"]="520.61.05"
+          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
+          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
+          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
+          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
+          ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
+          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not
+          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+  )
+
+  # Verify that the file with the indicated combination exists
+  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
+  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
   local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
-  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_FILENAME}"
-  readonly DEFAULT_NVIDIA_CUDA_URL
+  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
 
   NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
   readonly NVIDIA_CUDA_URL
 
-  CUDA_FILENAME="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
-  readonly CUDA_FILENAME
+  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
+  readonly CUDA_RUNFILE
+
+  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
+    exit 1
+  fi
+
+  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
+    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
+  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
+    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
+    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
+    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
+  fi
 }
 
 set_cuda_runfile_url
@@ -469,8 +496,23 @@ function uninstall_cuda_keyring_pkg() {
   CUDA_KEYRING_PKG_INSTALLED="0"
 }
 
-CUDA_LOCAL_REPO_INSTALLED="0"
+function cache_fetched_package() {
+  local src_url="$1"
+  local gcs_fn="$2"
+  local local_fn="$3"
+
+  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
+    time gcloud storage cp "${gcs_fn}" "${local_fn}"
+  else
+    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
+           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
+  fi
+}
+
+
 function install_local_cuda_repo() {
+  if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi
+
   if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
   CUDA_LOCAL_REPO_INSTALLED="1"
   pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
@@ -491,20 +533,21 @@ function install_local_cuda_repo() {
       "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
       -o /etc/apt/preferences.d/cuda-repository-pin-600
   fi
+
+  touch "${workdir}/install-local-cuda-repo-complete"
 }
 function uninstall_local_cuda_repo(){
   apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  CUDA_LOCAL_REPO_INSTALLED="0"
+  rm -f "${workdir}/install-local-cuda-repo-complete"
 }
 
-CUDNN_LOCAL_REPO_INSTALLED="0"
 CUDNN_PKG_NAME=""
 function install_local_cudnn_repo() {
-  if [[ "${CUDNN_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  pkgname="cudnn-local-repo-${shortname}-${CUDNN}"
+  if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi
+  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
   CUDNN_PKG_NAME="${pkgname}"
   local_deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN}/local_installers/${local_deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
 
   # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
   curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
@@ -514,20 +557,21 @@ function install_local_cudnn_repo() {
 
   rm -f "${tmpdir}/local-installer.deb"
 
-  cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
 
-  CUDNN_LOCAL_REPO_INSTALLED="1"
+  touch "${workdir}/install-local-cudnn-repo-complete"
 }
 
 function uninstall_local_cudnn_repo() {
   apt-get purge -yq "${CUDNN_PKG_NAME}"
-  CUDNN_LOCAL_REPO_INSTALLED="0"
+  rm -f "${workdir}/install-local-cudnn-repo-complete"
 }
 
 CUDNN8_LOCAL_REPO_INSTALLED="0"
 CUDNN8_PKG_NAME=""
 function install_local_cudnn8_repo() {
-  if [[ "${CUDNN8_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
+  if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi
+
   if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
   elif is_debian ; then cudnn8_shortname="debian11"
   else return 0 ; fi
@@ -541,21 +585,31 @@ function install_local_cudnn8_repo() {
 
   deb_fn="${pkgname}_1.0-1_amd64.deb"
   local_deb_fn="${tmpdir}/${deb_fn}"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-      "${local_deb_url}" -o "${local_deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
+
+  # cache the cudnn package
+  cache_fetched_package "${local_deb_url}" \
+                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
+                        "${local_deb_fn}"
+
+  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
+  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
+  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
+    mkdir -p "${cudnn_path}"
+    mount -t tmpfs tmpfs "${cudnn_path}"
+  fi
 
   dpkg -i "${local_deb_fn}"
 
   rm -f "${local_deb_fn}"
 
-  cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  CUDNN8_LOCAL_REPO_INSTALLED="1"
+  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  touch "${workdir}/install-local-cudnn8-repo-complete"
 }
 
 function uninstall_local_cudnn8_repo() {
   apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  CUDNN8_LOCAL_REPO_INSTALLED="0"
+  rm -f "${workdir}/install-local-cudnn8-repo-complete"
 }
 
 function install_nvidia_nccl() {
@@ -569,8 +623,12 @@ function install_nvidia_nccl() {
   # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
   # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
   # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+  # The following architectures are suppored by open kernel driver
   # Volta:     SM_70,SM_72,       compute_70,compute_72
   # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+  # The following architectures are supported by CUDA v11.8+
   # Ada:       SM_89,             compute_89
   # Hopper:    SM_90,SM_90a       compute_90,compute_90a
   # Blackwell: SM_100,            compute_100
@@ -672,7 +730,6 @@ function install_nvidia_cudnn() {
     if ge_debian12 && is_src_os ; then
       apt-get -y install nvidia-cudnn
     else
-      local CUDNN="${CUDNN_VERSION%.*}"
       if is_cudnn8 ; then
         install_local_cudnn8_repo
 
@@ -682,6 +739,8 @@ function install_nvidia_cudnn() {
           apt-get -y install --no-install-recommends \
             "libcudnn8=${cudnn_pkg_version}" \
             "libcudnn8-dev=${cudnn_pkg_version}"
+
+        uninstall_local_cudnn8_repo
 	sync
       elif is_cudnn9 ; then
 	install_cuda_keyring_pkg
@@ -948,19 +1007,6 @@ function build_driver_from_packages() {
   #clear_dkms_key
 }
 
-function cache_fetched_package() {
-  local src_url="$1"
-  local gcs_fn="$2"
-  local local_fn="$3"
-
-  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
-    time gcloud storage cp "${gcs_fn}" "${local_fn}"
-  else
-    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
-           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
-  fi
-}
-
 function install_nvidia_userspace_runfile() {
 
   # This .run file contains NV's OpenGL implementation as well as
@@ -991,7 +1037,7 @@ function install_cuda_runfile() {
   local local_fn="${tmpdir}/cuda.run"
 
   cache_fetched_package "${NVIDIA_CUDA_URL}" \
-			"${pkg_bucket}/${CUDA_FILENAME}" \
+			"${pkg_bucket}/${CUDA_RUNFILE}" \
                         "${local_fn}"
 
   execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
@@ -1562,8 +1608,8 @@ function exit_handler() {
     pip config unset global.cache-dir || echo "unable to unset global pip cache"
 
     # Clean up shared memory mounts
-    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
-      if grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ; then
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
         umount -f ${shmdir}
       fi
     done
diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 5c69ea903..536c7b4bf 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -6,16 +6,46 @@
 
 from integration_tests.dataproc_test_case import DataprocTestCase
 
+DEFAULT_TIMEOUT = 15  # minutes
 
 class NvidiaGpuDriverTestCase(DataprocTestCase):
   COMPONENT = "gpu"
   INIT_ACTIONS = ["gpu/install_gpu_driver.sh"]
   GPU_L4   = "type=nvidia-l4"
   GPU_T4   = "type=nvidia-tesla-t4"
-  GPU_V100 = "type=nvidia-tesla-v100" # not available in us-central1-a
+  GPU_V100 = "type=nvidia-tesla-v100"
   GPU_A100 = "type=nvidia-tesla-a100"
   GPU_H100 = "type=nvidia-h100-80gb,count=8"
 
+  # Tests for PyTorch
+  TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py"
+
+  # Tests for TensorFlow
+  TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py"
+
+  def assert_instance_command(self,
+                            instance,
+                            cmd,
+                            timeout_in_minutes=DEFAULT_TIMEOUT):
+
+    retry_count = 5
+
+    ssh_cmd='gcloud compute ssh {} --zone={} --command="{}"'.format(
+      instance, self.cluster_zone, cmd)
+
+    while retry_count > 0:
+      try:
+        ret_code, stdout, stderr = self.assert_command( ssh_cmd, timeout_in_minutes )
+        return ret_code, stdout, stderr
+      except Exception as e:
+        print("An error occurred: ", e)
+        retry_count -= 1
+        if retry_count > 0:
+          time.sleep(10)
+          continue
+        else:
+          raise
+
   def verify_instance(self, name):
     # Verify that nvidia-smi works
     time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience
@@ -26,10 +56,24 @@ def verify_pyspark(self, name):
     self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
 
   def verify_pytorch(self, name):
-    # Verify that pytorch works
-    self.assert_instance_command(name, "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node", 1)
-    #echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node
-    #echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python /tmp/prakasha-spark-test.py
+    test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                               self.TORCH_TEST_SCRIPT_FILE_NAME)
+    self.upload_test_file(test_filename, name)
+
+    verify_cmd = "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
+        self.TORCH_TEST_SCRIPT_FILE_NAME)
+    self.assert_instance_command(name, verify_cmd)
+    self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)
+
+  def verify_tensorflow(self, name):
+    test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                               self.TF_TEST_SCRIPT_FILE_NAME)
+    self.upload_test_file(test_filename, name)
+
+    verify_cmd = "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
+        self.TF_TEST_SCRIPT_FILE_NAME)
+    self.assert_instance_command(name, verify_cmd)
+    self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)
 
   def verify_mig_instance(self, name):
     self.assert_instance_command(name,
@@ -47,6 +91,14 @@ def verify_instance_nvcc(self, name, cuda_version):
     self.assert_instance_command(
         name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) )
 
+  def verify_instance_cuda_version(self, name, cuda_version):
+    self.assert_instance_command(
+        name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) )
+
+  def verify_instance_driver_version(self, name, driver_version):
+    self.assert_instance_command(
+        name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) )
+
   def verify_instance_spark(self):
     self.assert_dataproc_job(
       self.getClusterName(),
@@ -161,9 +213,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
 
   @parameterized.parameters(
         ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
-        ("SINGLE", ["m"],               GPU_T4, None,   "11.7"),
+        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
 #      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-#     ("STANDARD", ["w-0", "w-1"],      None,   GPU_T4, "11.7"),
+#     ("STANDARD", ["w-0", "w-1"],      None,   GPU_T4, "11.8"),
   )
   def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
@@ -204,7 +256,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
       self.verify_instance_spark()
 
   @parameterized.parameters(
-      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.7"),
+      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"),
 #      ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"),
       ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"),
   )
@@ -283,10 +335,10 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     self.verify_instance_spark()
 
   @parameterized.parameters(
-    ("SINGLE", ["m"], GPU_T4, None, "11.7"),
+    ("SINGLE", ["m"], GPU_T4, None, "11.8"),
 #    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
     ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.7"),
+#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
   )
   def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes,
diff --git a/gpu/verify_pytorch.py b/gpu/verify_pytorch.py
new file mode 100644
index 000000000..dd4910d97
--- /dev/null
+++ b/gpu/verify_pytorch.py
@@ -0,0 +1,8 @@
+import torch
+print("get CUDA details : == : ")
+use_cuda = torch.cuda.is_available()
+if use_cuda:
+    print('__CUDNN VERSION:', torch.backends.cudnn.version())
+    print('__Number CUDA Devices:', torch.cuda.device_count())
+    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
+    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)
diff --git a/gpu/verify_tensorflow.py b/gpu/verify_tensorflow.py
new file mode 100644
index 000000000..2faf2c717
--- /dev/null
+++ b/gpu/verify_tensorflow.py
@@ -0,0 +1,28 @@
+import tensorflow as tf
+print("Get GPU Details : ")
+print(tf.config.list_physical_devices('GPU'))
+#print(tf.test.is_gpu_available())
+
+if tf.test.gpu_device_name():
+    print('Default GPU Device:{}'.format(tf.test.gpu_device_name()))
+    print("Please install GPU version of TF")
+
+gpu_available = tf.config.list_physical_devices('GPU')
+print("gpu_available : " + str(gpu_available))
+
+#is_cuda_gpu_available = tf.config.list_physical_devices('GPU',cuda_only=True)
+is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True)
+print("is_cuda_gpu_available : " + str(is_cuda_gpu_available))
+
+#is_cuda_gpu_min_3 = tf.config.list_physical_devices('GPU',True, (3,0))
+is_cuda_gpu_min_3 = tf.test.is_gpu_available(True, (3,0))
+print("is_cuda_gpu_min_3 : " + str(is_cuda_gpu_min_3))
+
+from tensorflow.python.client import device_lib
+
+def get_available_gpus():
+    local_device_protos = device_lib.list_local_devices()
+    return [x.name for x in local_device_protos if x.device_type == 'GPU']
+
+print("Run GPU Functions Below : ")
+print(get_available_gpus())

From f116717a826f3fc3c9b1e5901bc2108bc040ad47 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 10 Dec 2024 11:31:49 -0800
Subject: [PATCH 014/112] using maximum from 8.9 series on rocky for 11.7

---
 gpu/install_gpu_driver.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 738960a74..911c2633a 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -169,7 +169,7 @@ elif is_rocky ; then
 #   12.5: 9.2.1.18
 #   12.6: 9.5.1.17
 readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
+          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
           ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
 )
 fi

From 976f869df675ae606b1cdeec96b035e1052e3e35 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 10 Dec 2024 12:34:36 -0800
Subject: [PATCH 015/112] skip full build

---
 cloudbuild/presubmit.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index eec7adb76..882acc4db 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,6 +70,7 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      continue
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0

From 6ef2fdba48106c8e53f9537a799192d3ca40e72a Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 10 Dec 2024 12:37:58 -0800
Subject: [PATCH 016/112] pinning to bazel-7.4.0

---
 gpu/Dockerfile | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/gpu/Dockerfile b/gpu/Dockerfile
index 1127293e1..23668a189 100644
--- a/gpu/Dockerfile
+++ b/gpu/Dockerfile
@@ -24,10 +24,15 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \
       | dd of=/etc/apt/sources.list.d/bazel.list status=none \
     && apt-get update -qq
 
-RUN apt-get autoremove -y -qq && \
-    apt-get install -y -qq default-jdk python3-setuptools bazel > /dev/null 2>&1 && \
+RUN apt-get update -y -qq && \
+    apt-get autoremove -y -qq && \
+    apt-get install -y -qq default-jdk python3-setuptools bazel-7.4.0 > /dev/null 2>&1 && \
     apt-get clean
 
+# Set bazel-7.4.0 as the default bazel alternative in this container
+RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-7.4.0 7 && \
+    update-alternatives --set bazel /usr/bin/bazel-7.4.0
+
 # Install here any utilities you find useful when troubleshooting
 RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-get clean
 

From 1539cdbae71a132a39a242f1f0eb2e48ff1b5fb0 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 10 Dec 2024 13:37:23 -0800
Subject: [PATCH 017/112] NCCL requires gcc-11 for cuda11

---
 gpu/install_gpu_driver.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 911c2633a..53c73e850 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1379,6 +1379,12 @@ function install_dependencies() {
       update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
       update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
       update-alternatives --set gcc /usr/bin/gcc-12
+    elif is_debian12 && is_cuda11 ; then
+      # On debian12, the default compiler does not build NCCL
+      execute_with_retries apt-get install -y -qq gcc-11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-11
     fi
   elif is_rocky ; then
     execute_with_retries dnf -y -q install pciutils gcc screen

From 9a54f4c947b7aff49a8f4260204f8c3482503498 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 10 Dec 2024 16:53:34 -0800
Subject: [PATCH 018/112] rocky8 is now building from the source in the .run
 file

---
 gpu/install_gpu_driver.sh | 73 ++++++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 27 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 53c73e850..ee5109241 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -147,10 +147,10 @@ readonly ROLE
 # Rocky8: 12.0: 525.147.05
 readonly -A DRIVER_FOR_CUDA=(
           ["11.7"]="515.65.01"  ["11.8"]="525.60.13"
-          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.67"  ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.67"     ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
 )
 readonly -A DRIVER_SUBVER=(
-          ["515"]="515.48.07"   ["520"]="520.56.06"  ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
+          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
           ["545"]="545.29.06"   ["550"]="550.127.05" ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
 )
 # https://developer.nvidia.com/cudnn-downloads
@@ -669,6 +669,7 @@ function install_nvidia_nccl() {
       # build and cache
       pushd nccl
       # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
+      install_build_dependencies
       if is_debuntu ; then
         # These packages are required to build .deb packages from source
         execute_with_retries \
@@ -910,13 +911,8 @@ function add_repo_cuda() {
 }
 
 function build_driver_from_github() {
-  if is_ubuntu ; then
-    mok_key=/var/lib/shim-signed/mok/MOK.priv
-    mok_der=/var/lib/shim-signed/mok/MOK.der
-  else
-    mok_key=/var/lib/dkms/mok.key
-    mok_der=/var/lib/dkms/mok.pub
-  fi
+  # closed driver will have been built on rocky8
+  if is_rocky8 ; then return 0 ; fi
   pushd "${workdir}"
 
   test -d "${workdir}/open-gpu-kernel-modules" || {
@@ -937,6 +933,7 @@ function build_driver_from_github() {
     else
       # build and cache kernel modules
       pushd open-gpu-kernel-modules
+      install_build_dependencies
       execute_with_retries make -j$(nproc) modules \
         >  kernel-open/build.log \
         2> kernel-open/build_error.log
@@ -1026,7 +1023,22 @@ function install_nvidia_userspace_runfile() {
                         "${pkg_bucket}/${USERSPACE_FILENAME}" \
                         "${local_fn}"
 
-  execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --silent --tmpdir="${tmpdir}"
+  if is_rocky8 ; then
+    install_build_dependencies
+
+    # build non-open driver
+    execute_with_retries bash "${local_fn}" \
+      --module-signing-hash sha256 \
+      --module-signing-x509-hash sha256 \
+      --module-signing-secret-key "${mok_key}" \
+      --module-signing-public-key "${mok_der}" \
+      --module-signing-script "/lib/modules/${uname_r}/build/scripts/sign-file" \
+      --no-dkms \
+      --install-libglvnd --silent --tmpdir="${tmpdir}"
+  else
+    # prepare to build from github
+    execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --silent --tmpdir="${tmpdir}"
+  fi
   rm -f "${local_fn}"
   touch "${workdir}/userspace-complete"
   sync
@@ -1369,25 +1381,17 @@ function nvsmi() {
   "${nvsmi}" $*
 }
 
-function install_dependencies() {
+function install_build_dependencies() {
+  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi
+
   if is_debuntu ; then
-    execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen
-    if is_ubuntu22 ; then
-      # On ubuntu22, the default compiler does not build some kernel module versions
-      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
-      execute_with_retries apt-get install -y -qq gcc-12
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
-      update-alternatives --set gcc /usr/bin/gcc-12
-    elif is_debian12 && is_cuda11 ; then
-      # On debian12, the default compiler does not build NCCL
-      execute_with_retries apt-get install -y -qq gcc-11
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
-      update-alternatives --set gcc /usr/bin/gcc-11
-    fi
+    execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" "gcc-${gcc_ver}" screen
+
+    update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${gcc_ver}" "${gcc_ver}"
+    update-alternatives --set gcc "/usr/bin/gcc-${gcc_ver}"
+
   elif is_rocky ; then
-    execute_with_retries dnf -y -q install pciutils gcc screen
+    execute_with_retries dnf -y -q install gcc
 
     local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
     set +e
@@ -1412,6 +1416,13 @@ function install_dependencies() {
 
     execute_with_retries "${dnf_cmd}"
   fi
+  touch "${workdir}/build-dependencies-complete"
+}
+
+function install_dependencies() {
+  pkg_list="pciutils screen"
+  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
+  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
 }
 
 function main() {
@@ -1754,6 +1765,14 @@ function prepare_to_install(){
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
+  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
+                      mok_der=/var/lib/shim-signed/mok/MOK.der
+                 else mok_key=/var/lib/dkms/mok.key
+                      mok_der=/var/lib/dkms/mok.pub ; fi
+
+  if   is_cuda11 ; then gcc_ver="11"
+  elif is_cuda12 ; then gcc_ver="12" ; fi
+
   mkdir -p "${workdir}"
   trap exit_handler EXIT
   set_proxy

From 33165186e54fa2720acef8e5ceaa0815da3cb2e1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 10 Dec 2024 17:07:16 -0800
Subject: [PATCH 019/112] reverting to previous state of only selecting a
 compiler version on latest releases

---
 gpu/install_gpu_driver.sh | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index ee5109241..2a8d8a7f8 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1385,10 +1385,20 @@ function install_build_dependencies() {
   if test -f "${workdir}/build-dependencies-complete" ; then return ; fi
 
   if is_debuntu ; then
-    execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" "gcc-${gcc_ver}" screen
-
-    update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${gcc_ver}" "${gcc_ver}"
-    update-alternatives --set gcc "/usr/bin/gcc-${gcc_ver}"
+    if is_ubuntu22 && is_cuda12 ; then
+      # On ubuntu22, the default compiler does not build some kernel module versions
+      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
+      execute_with_retries apt-get install -y -qq gcc-12
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-12
+    elif is_debian12 && is_cuda11 ; then
+      # On debian12, the default compiler does not build NCCL
+      execute_with_retries apt-get install -y -qq gcc-11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-11
+    fi
 
   elif is_rocky ; then
     execute_with_retries dnf -y -q install gcc

From 722e4363e9fd77457c792e58f08e17d2a87d6d85 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 10 Dec 2024 18:16:23 -0800
Subject: [PATCH 020/112] replaced literal path names with variable values ;
 indexing builds by the signing key used

---
 gpu/install_gpu_driver.sh | 50 ++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 2a8d8a7f8..c3c50a0d8 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -793,28 +793,27 @@ function configure_dkms_certs() {
     echo "Private key material exists"
 
     local expected_modulus_md5sum
-    expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum)
-    if [[ -n "${expected_modulus_md5sum}" ]]; then
+    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
+
       modulus_md5sum="${expected_modulus_md5sum}"
-    else
-      modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"
-    fi
 
-    # Verify that cert md5sum matches expected md5sum
-    if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then
+      # Verify that cert md5sum matches expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
         echo "unmatched rsa key modulus"
-    fi
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
+      fi
+      ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
 
-    # Verify that key md5sum matches expected md5sum
-    if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then
+      # Verify that key md5sum matches expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
         echo "unmatched x509 cert modulus"
+      fi
+    else
+      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
     fi
 
     return
   fi
 
-
   # Retrieve cloud secrets keys
   local sig_priv_secret_name
   sig_priv_secret_name="${PSN}"
@@ -841,16 +840,14 @@ function configure_dkms_certs() {
       | base64 --decode \
       | dd status=none of="${CA_TMPDIR}/db.der"
 
-  # symlink private key and copy public cert from volatile storage for DKMS
-  if is_ubuntu ; then
-    mkdir -p /var/lib/shim-signed/mok
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv
-    cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der
-  else
-    mkdir -p /var/lib/dkms/
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
-    cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub
-  fi
+  local mok_directory="$(dirname "${mok_key}")"
+  mkdir -p "${mok_directory}"
+
+  # symlink private key and copy public cert from volatile storage to DKMS directory
+  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
 }
 
 function clear_dkms_key {
@@ -858,7 +855,7 @@ function clear_dkms_key {
       echo "No signing secret provided.  skipping" >&2
       return 0
   fi
-  rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv
+  rm -rf "${CA_TMPDIR}" "${mok_key}"
 }
 
 function add_contrib_component() {
@@ -926,7 +923,11 @@ function build_driver_from_github() {
   test -f "${workdir}/open-gpu-kernel-modules/kernel-open/nvidia.ko" || {
     local build_tarball="kmod-build_${_shortname}_${DRIVER_VERSION}.tar.gz"
     local local_tarball="${workdir}/${build_tarball}"
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
+    local build_dir
+    if [[ -n "${modulus_md5sum}" ]] ; then build_dir="${modulus_md5sum}"
+                                      else build_dir="unsigned" ; fi
+
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}"
 
     if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
       echo "cache hit"
@@ -959,6 +960,7 @@ function build_driver_from_github() {
   # install kernel modules
   modinfo nvidia > /dev/null 2>&1 || {
     pushd open-gpu-kernel-modules
+    install_build_dependencies
     make modules_install \
         >>  kernel-open/build.log \
         2>> kernel-open/build_error.log

From f42fee6af0fea53c8376b183427165e18a41439e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 10 Dec 2024 20:53:18 -0800
Subject: [PATCH 021/112] moved variable definition to prepare function ; moved
 driver signing to build phase

---
 gpu/install_gpu_driver.sh | 40 ++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index c3c50a0d8..f16226d4d 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -777,9 +777,6 @@ function install_nvidia_cudnn() {
   touch "${workdir}/cudnn-complete"
 }
 
-CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
-PSN="$(get_metadata_attribute private_secret_name)"
-readonly PSN
 function configure_dkms_certs() {
   if [[ -z "${PSN}" ]]; then
       echo "No signing secret provided.  skipping";
@@ -794,22 +791,22 @@ function configure_dkms_certs() {
 
     local expected_modulus_md5sum
     expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
-
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
       modulus_md5sum="${expected_modulus_md5sum}"
 
       # Verify that cert md5sum matches expected md5sum
       if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched rsa key modulus"
+        echo "unmatched rsa key"
       fi
-      ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
 
       # Verify that key md5sum matches expected md5sum
       if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched x509 cert modulus"
+        echo "unmatched x509 cert"
       fi
     else
       modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
     fi
+    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
 
     return
   fi
@@ -938,6 +935,15 @@ function build_driver_from_github() {
       execute_with_retries make -j$(nproc) modules \
         >  kernel-open/build.log \
         2> kernel-open/build_error.log
+      # Sign kernel modules
+      if [[ -n "${PSN}" ]]; then
+        for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do
+          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
+          "${mok_key}" \
+          "${mok_der}" \
+          "${module}"
+        done
+      fi
       tar czvf "${local_tarball}" ../open-gpu-kernel-modules/kernel-open
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
       rm "${local_tarball}"
@@ -947,16 +953,6 @@ function build_driver_from_github() {
     gcloud storage cat "${gcs_tarball}" | tar xzv
   }
 
-  # Sign kernel modules
-  if [[ -n "${PSN}" ]]; then
-    for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do
-      "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
-      "${mok_key}" \
-      "${mok_der}" \
-      "${module}"
-    done
-  fi
-
   # install kernel modules
   modinfo nvidia > /dev/null 2>&1 || {
     pushd open-gpu-kernel-modules
@@ -1771,11 +1767,17 @@ function prepare_to_install(){
   workdir=/opt/install-dpgce
   nvsmi_works="0"
   tmpdir=/tmp/
-  readonly temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly temp_bucket
   readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
-  readonly uname_r=$(uname -r)
+  uname_r=$(uname -r)
+  readonly uname_r
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
+  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
+  readonly CA_TMPDIR
+  PSN="$(get_metadata_attribute private_secret_name)"
+  readonly PSN
 
   if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
                       mok_der=/var/lib/shim-signed/mok/MOK.der

From a13122eac8d13c43e456b1fbe6a4d7b65518c1d5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 11 Dec 2024 13:37:40 -0800
Subject: [PATCH 022/112] test whether variable is defined before checking its
 value

---
 gpu/install_gpu_driver.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index f16226d4d..c2aee539a 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -778,7 +778,7 @@ function install_nvidia_cudnn() {
 }
 
 function configure_dkms_certs() {
-  if [[ -z "${PSN}" ]]; then
+  if test -v PSN && [[ -z "${PSN}" ]]; then
       echo "No signing secret provided.  skipping";
       return 0
   fi
@@ -921,8 +921,9 @@ function build_driver_from_github() {
     local build_tarball="kmod-build_${_shortname}_${DRIVER_VERSION}.tar.gz"
     local local_tarball="${workdir}/${build_tarball}"
     local build_dir
-    if [[ -n "${modulus_md5sum}" ]] ; then build_dir="${modulus_md5sum}"
-                                      else build_dir="unsigned" ; fi
+    if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
+      then build_dir="${modulus_md5sum}"
+      else build_dir="unsigned" ; fi
 
     local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}"
 

From 3b720484d8c11ea1fb7e97f130fafac64e8039a9 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 11 Dec 2024 15:52:26 -0800
Subject: [PATCH 023/112] cache only the bins and logs

---
 gpu/install_gpu_driver.sh | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index c2aee539a..3c10579d4 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -147,7 +147,7 @@ readonly ROLE
 # Rocky8: 12.0: 525.147.05
 readonly -A DRIVER_FOR_CUDA=(
           ["11.7"]="515.65.01"  ["11.8"]="525.60.13"
-          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.67"     ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.127.05" ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
 )
 readonly -A DRIVER_SUBVER=(
           ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
@@ -905,7 +905,7 @@ function add_repo_cuda() {
 }
 
 function build_driver_from_github() {
-  # closed driver will have been built on rocky8
+  # non-GPL driver will have been built on rocky8
   if is_rocky8 ; then return 0 ; fi
   pushd "${workdir}"
 
@@ -917,8 +917,9 @@ function build_driver_from_github() {
     mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
   }
 
-  test -f "${workdir}/open-gpu-kernel-modules/kernel-open/nvidia.ko" || {
-    local build_tarball="kmod-build_${_shortname}_${DRIVER_VERSION}.tar.gz"
+  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
     local local_tarball="${workdir}/${build_tarball}"
     local build_dir
     if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
@@ -930,7 +931,7 @@ function build_driver_from_github() {
     if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
       echo "cache hit"
     else
-      # build and cache kernel modules
+      # build the kernel modules
       pushd open-gpu-kernel-modules
       install_build_dependencies
       execute_with_retries make -j$(nproc) modules \
@@ -945,26 +946,23 @@ function build_driver_from_github() {
           "${module}"
         done
       fi
-      tar czvf "${local_tarball}" ../open-gpu-kernel-modules/kernel-open
+      make modules_install \
+        >>  kernel-open/build.log \
+        2>> kernel-open/build_error.log
+      depmod -a
+      # Collect build logs and installed binaries
+      tar czvf "${local_tarball}" \
+        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
       rm "${local_tarball}"
       make clean
       popd
     fi
-    gcloud storage cat "${gcs_tarball}" | tar xzv
-  }
-
-  # install kernel modules
-  modinfo nvidia > /dev/null 2>&1 || {
-    pushd open-gpu-kernel-modules
-    install_build_dependencies
-    make modules_install \
-        >>  kernel-open/build.log \
-        2>> kernel-open/build_error.log
-    depmod -a
-    popd
+    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
   }
 
+  install_kernel_modules
   popd
 }
 

From 2cc19ce34b16cb76b113bad5eaede5b73c194bf6 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 11 Dec 2024 16:25:50 -0800
Subject: [PATCH 024/112] build index of kernel modules after unpacking ;
 remove call to non-existent function

---
 gpu/install_gpu_driver.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 3c10579d4..004df710b 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -960,9 +960,9 @@ function build_driver_from_github() {
       popd
     fi
     gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+    depmod -a
   }
 
-  install_kernel_modules
   popd
 }
 

From 5a2d78395b11f00f4ef3b22ac49b8b03612bcc2e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 11 Dec 2024 17:37:49 -0800
Subject: [PATCH 025/112] only build module dependency index once

---
 gpu/install_gpu_driver.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 004df710b..8a9b75413 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -949,7 +949,6 @@ function build_driver_from_github() {
       make modules_install \
         >>  kernel-open/build.log \
         2>> kernel-open/build_error.log
-      depmod -a
       # Collect build logs and installed binaries
       tar czvf "${local_tarball}" \
         "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \

From 1cf12ab52132a911018592986e78f0f3c3e15fa4 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 11 Dec 2024 21:44:47 -0800
Subject: [PATCH 026/112] skipping CUDA 11 NCCL build on debian12

---
 gpu/install_gpu_driver.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 8a9b75413..ddaba8dcd 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -210,6 +210,7 @@ function set_cuda_version() {
     CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
   fi
   readonly CUDA_FULL_VERSION
+
 }
 
 set_cuda_version
@@ -615,6 +616,11 @@ function uninstall_local_cudnn8_repo() {
 function install_nvidia_nccl() {
   if test -f "${workdir}/nccl-complete" ; then return ; fi
 
+  if is_cuda11 && is_debian12 ; then
+    echo "NCCL cannot be compiled for CUDA 11 on ${OS_NAME}"
+    return
+  fi
+
   local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
 
   # https://github.com/NVIDIA/nccl/blob/master/README.md
@@ -1388,12 +1394,6 @@ function install_build_dependencies() {
       update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
       update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
       update-alternatives --set gcc /usr/bin/gcc-12
-    elif is_debian12 && is_cuda11 ; then
-      # On debian12, the default compiler does not build NCCL
-      execute_with_retries apt-get install -y -qq gcc-11
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
-      update-alternatives --set gcc /usr/bin/gcc-11
     fi
 
   elif is_rocky ; then

From 77a95ff5442bb97096d7ec8d779c49b264f56778 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 11 Dec 2024 22:01:02 -0800
Subject: [PATCH 027/112] skip cuda11 on debian12, rocky9

---
 gpu/test_gpu.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 536c7b4bf..ed8e82008 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -232,10 +232,10 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
 #          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
 #      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
 
-#    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-#    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
-#    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-#      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
+    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(

From 0b2da1410f4819a643c3eb0dea19db28cdb47be6 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 11 Dec 2024 22:53:51 -0800
Subject: [PATCH 028/112] renamed verify_pyspark to verify_instance_pyspark

---
 gpu/test_gpu.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index ed8e82008..d154d6a55 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -51,10 +51,6 @@ def verify_instance(self, name):
     time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience
     self.assert_instance_command(name, "nvidia-smi", 1)
 
-  def verify_pyspark(self, name):
-    # Verify that pyspark works
-    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
-
   def verify_pytorch(self, name):
     test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                self.TORCH_TEST_SCRIPT_FILE_NAME)
@@ -91,6 +87,10 @@ def verify_instance_nvcc(self, name, cuda_version):
     self.assert_instance_command(
         name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) )
 
+  def verify_instance_pyspark(self, name):
+    # Verify that pyspark works
+    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
+
   def verify_instance_cuda_version(self, name, cuda_version):
     self.assert_instance_command(
         name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) )
@@ -148,8 +148,8 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
         # Do not attempt this on single instance rocky clusters
         no_op=1
       else:
-        # verify that pyspark from command prompt works
-        self.verify_pyspark(machine_name)
+        # verify that pyspark works from command prompt
+        self.verify_instance_pyspark(machine_name)
 
   @parameterized.parameters(
       ("SINGLE", ["m"], GPU_T4, None, None),

From 0c1df7f92a5a3fe3ea237e31f602e885894922bd Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 11 Dec 2024 23:16:56 -0800
Subject: [PATCH 029/112] failing somewhat gracefully ; skipping tests that
 would fail

---
 gpu/install_gpu_driver.sh | 4 ++++
 gpu/test_gpu.py           | 3 +--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index ddaba8dcd..b65417bf2 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -940,6 +940,10 @@ function build_driver_from_github() {
       # build the kernel modules
       pushd open-gpu-kernel-modules
       install_build_dependencies
+      if is_cuda11 && is_ubuntu22 ; then
+        echo "Kernel modules cannot be compiled for CUDA 11 on ${OS_NAME}"
+        exit 1
+      fi
       execute_with_retries make -j$(nproc) modules \
         >  kernel-open/build.log \
         2> kernel-open/build_error.log
diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index d154d6a55..4611ea5d6 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -233,9 +233,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
 #      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
 
     if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+      self.skipTest("CUDA < 12 not supported on Dataproc 2.2")
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(

From ce60b035ce0c23decd405fa14c065a01ae04eae8 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 11 Dec 2024 23:41:56 -0800
Subject: [PATCH 030/112] skipping single node tests for rocky8

---
 gpu/test_gpu.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 4611ea5d6..de08b4827 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -232,6 +232,11 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
 #          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
 #      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
 
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
+
     if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("CUDA < 12 not supported on Dataproc 2.2")

From d16e625b729b6c34c7362eda322c0a91331f62cb Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 00:03:00 -0800
Subject: [PATCH 031/112] re-enable other tests

---
 gpu/test_gpu.py | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index de08b4827..10b66c194 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -122,11 +122,6 @@ def verify_instance_spark(self):
   def test_install_gpu_default_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
-    self.skipTest("Running only one test to build cache")
-
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
     metadata = None
     if driver_provider is not None:
       metadata = "gpu-driver-provider={}".format(driver_provider)
@@ -157,13 +152,8 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
   def test_install_gpu_without_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
-    self.skipTest("Running only one test to build cache")
-
     self.skipTest("No need to regularly test not installing the agent")
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
     metadata = "install-gpu-agent=false"
     if driver_provider is not None:
       metadata += ",gpu-driver-provider={}".format(driver_provider)
@@ -188,9 +178,6 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
   def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider):
-    self.skipTest("Running only one test to build cache")
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
@@ -220,8 +207,6 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
   def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
-#    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-#      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
 #    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
 #    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
@@ -267,13 +252,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
   def test_install_gpu_with_mig(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider, cuda_version):
-    self.skipTest("Running only one test to build cache")
 
     self.skipTest("Test is known to fail.  Skipping so that we can exercise others")
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
     if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
     and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
       self.skipTest("CUDA == 12.0 not supported on debian 12")
@@ -312,10 +293,6 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
   )
   def test_gpu_allocation(self, configuration, master_accelerator,
                           worker_accelerator, driver_provider):
-    self.skipTest("Running only one test to build cache")
-
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
@@ -348,10 +325,6 @@ def test_gpu_allocation(self, configuration, master_accelerator,
   def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
-    self.skipTest("Running only one test to build cache")
-
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \

From 7284ad746535d7263849bb0cb712660c2c700e04 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 00:10:22 -0800
Subject: [PATCH 032/112] Specifying bazel version with variable

---
 gpu/Dockerfile | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/gpu/Dockerfile b/gpu/Dockerfile
index 23668a189..05724eb8c 100644
--- a/gpu/Dockerfile
+++ b/gpu/Dockerfile
@@ -15,8 +15,10 @@ RUN apt-get -qq update \
      curl jq less screen > /dev/null 2>&1  && apt-get clean
 
 # Install bazel signing key, repo and package
-ENV bazel_kr_path=/usr/share/keyrings/bazel-release.pub.gpg
-ENV bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8"
+ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg \
+    bazel_version=7.4.0 \
+    bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8" \
+    DEBIAN_FRONTEND=noninteractive
 
 RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \
       | gpg --dearmor -o "${bazel_kr_path}" \
@@ -24,14 +26,13 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \
       | dd of=/etc/apt/sources.list.d/bazel.list status=none \
     && apt-get update -qq
 
-RUN apt-get update -y -qq && \
-    apt-get autoremove -y -qq && \
-    apt-get install -y -qq default-jdk python3-setuptools bazel-7.4.0 > /dev/null 2>&1 && \
+RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
+    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
     apt-get clean
 
-# Set bazel-7.4.0 as the default bazel alternative in this container
-RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-7.4.0 7 && \
-    update-alternatives --set bazel /usr/bin/bazel-7.4.0
+# Set bazel-${bazel_version} as the default bazel alternative in this container
+RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_version} 1 && \
+    update-alternatives --set bazel /usr/bin/bazel-${bazel_version}
 
 # Install here any utilities you find useful when troubleshooting
 RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-get clean

From 35e4ba243b5259f9cb671ded62c11c2aab371834 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 00:40:04 -0800
Subject: [PATCH 033/112] fixing up some skip logic

---
 gpu/test_gpu.py | 60 ++++++++++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 31 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 10b66c194..60d51541e 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -200,9 +200,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
 
   @parameterized.parameters(
         ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
-        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
+#        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
 #      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-#     ("STANDARD", ["w-0", "w-1"],      None,   GPU_T4, "11.8"),
+      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"),
   )
   def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
@@ -212,30 +212,30 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
 #    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
 #      self.skipTest("CUDA == 12.0 not supported on debian 12")
 
-#    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
-#    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
-#          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-#      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
+    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
+          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
+
+    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
+    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("CUDA < 12 not supported on Dataproc 2.2")
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Dataproc 2.2")
-
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-highmem-32",
+        machine_type="n1-highmem-8",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
         timeout_in_minutes=30,
-        boot_disk_size="60GB")
+        boot_disk_size="50GB")
 
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
@@ -255,19 +255,18 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
 
     self.skipTest("Test is known to fail.  Skipping so that we can exercise others")
 
-    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-      self.skipTest("CUDA == 12.0 not supported on debian 12")
+#    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
+#    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
+#      self.skipTest("CUDA == 12.0 not supported on debian 12")
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
     if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+      self.skipTest("CUDA < 12 not supported on Dataproc 2.2")
 
     metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version)
 
@@ -326,24 +325,23 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
-    if configuration == 'SINGLE' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty")
-
-    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-      self.skipTest("CUDA == 12.0 not supported on debian 12")
+#    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
+#    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
+#      self.skipTest("CUDA == 12.0 not supported on debian 12")
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
     if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+      self.skipTest("CUDA < 12 not supported on Dataproc 2.2")
+
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
 
     metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(

From be3756926037610dc4454bdbfea9e30c30b6a98b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 01:08:11 -0800
Subject: [PATCH 034/112] replaced OS_NAME with _shortname

---
 gpu/install_gpu_driver.sh | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index b65417bf2..82953f3cd 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -617,7 +617,7 @@ function install_nvidia_nccl() {
   if test -f "${workdir}/nccl-complete" ; then return ; fi
 
   if is_cuda11 && is_debian12 ; then
-    echo "NCCL cannot be compiled for CUDA 11 on ${OS_NAME}"
+    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
     return
   fi
 
@@ -764,22 +764,14 @@ function install_nvidia_cudnn() {
         echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
       fi
     fi
-  elif is_ubuntu ; then
-    local -a packages
-    packages=(
-      "libcudnn${major_version}=${cudnn_pkg_version}"
-      "libcudnn${major_version}-dev=${cudnn_pkg_version}")
-    execute_with_retries \
-      apt-get install -q -y --no-install-recommends "${packages[*]}"
-    sync
   else
-    echo "Unsupported OS: '${OS_NAME}'"
+    echo "Unsupported OS: '${_shortname}'"
     exit 1
   fi
 
   ldconfig
 
-  echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
+  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
   touch "${workdir}/cudnn-complete"
 }
 
@@ -941,7 +933,7 @@ function build_driver_from_github() {
       pushd open-gpu-kernel-modules
       install_build_dependencies
       if is_cuda11 && is_ubuntu22 ; then
-        echo "Kernel modules cannot be compiled for CUDA 11 on ${OS_NAME}"
+        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
         exit 1
       fi
       execute_with_retries make -j$(nproc) modules \
@@ -1101,7 +1093,7 @@ function install_cuda(){
   if test -f "${workdir}/cuda-repo-complete" ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
-    echo "installed with the driver on ${OS_NAME}"
+    echo "installed with the driver on ${_shortname}"
     return 0
   fi
 
@@ -1129,7 +1121,7 @@ function install_nvidia_gpu_driver() {
           nvidia-smi \
           libglvnd0 \
           libcuda1
-    echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully"
+    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
     return 0
   fi
 

From c9d1d958d2fb4447423284a1b9e300a2dc95fb7b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 09:33:29 -0800
Subject: [PATCH 035/112] skip more single instance tests for rocky8

---
 gpu/test_gpu.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 60d51541e..d43071d3e 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -122,6 +122,11 @@ def verify_instance_spark(self):
   def test_install_gpu_default_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
+
     metadata = None
     if driver_provider is not None:
       metadata = "gpu-driver-provider={}".format(driver_provider)
@@ -137,12 +142,6 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
-      if ( configuration == 'SINGLE' and \
-           self.getImageOs() == 'rocky' and \
-           self.getImageVersion() > pkg_resources.parse_version("2.1") ):
-        # Do not attempt this on single instance rocky clusters
-        no_op=1
-      else:
         # verify that pyspark works from command prompt
         self.verify_instance_pyspark(machine_name)
 

From b63ae1704d82a4306beb59599a721abd3131e71d Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 10:30:46 -0800
Subject: [PATCH 036/112] fixing indentation ; skipping redundant test

---
 gpu/test_gpu.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index d43071d3e..3649a865c 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -142,8 +142,9 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
-        # verify that pyspark works from command prompt
-        self.verify_instance_pyspark(machine_name)
+      self.verify_instance_nvcc(machine_name, cuda_version)
+      self.verify_instance_pyspark(machine_name)
+      self.verify_instance_spark()
 
   @parameterized.parameters(
       ("SINGLE", ["m"], GPU_T4, None, None),
@@ -177,6 +178,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
   def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider):
+    self.skipTest("No need to regularly installing the agent on its own cluster ; this is exercised elsewhere")
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:

From 94c1f13f237bd7d00356974f47f72c9313bc1c0b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 10:57:31 -0800
Subject: [PATCH 037/112] remove retries of flakey tests

---
 cloudbuild/presubmit.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index 882acc4db..d9ae3c9bb 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -105,7 +105,6 @@ run_tests() {
   bazel test \
     --jobs="${max_parallel_tests}" \
     --local_test_jobs="${max_parallel_tests}" \
-    --flaky_test_attempts=3 \
     --action_env="INTERNAL_IP_SSH=true" \
     --test_output="all" \
     --noshow_progress \

From ac477b3ea6456e33b20e5b03c09417caf2c7602f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 11:07:14 -0800
Subject: [PATCH 038/112] oops ; need to define the cuda version to test for

---
 gpu/test_gpu.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 3649a865c..35f08f801 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -10,6 +10,7 @@
 
 class NvidiaGpuDriverTestCase(DataprocTestCase):
   COMPONENT = "gpu"
+  DEFAULT_CUDA_VERSION = "12.4"
   INIT_ACTIONS = ["gpu/install_gpu_driver.sh"]
   GPU_L4   = "type=nvidia-l4"
   GPU_T4   = "type=nvidia-tesla-t4"
@@ -142,7 +143,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
       self.verify_instance(machine_name)
-      self.verify_instance_nvcc(machine_name, cuda_version)
+      self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION)
       self.verify_instance_pyspark(machine_name)
       self.verify_instance_spark()
 

From db7aacf301dad1e5e46e4d0fbd39758d60e324c0 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 13:25:17 -0800
Subject: [PATCH 039/112] passing -q to gcloud to generate empty passphrase if
 no ssh key exists ; selecting a more modern version of the 550 driver

---
 gpu/install_gpu_driver.sh | 78 +++++++++++++++++++++------------------
 gpu/test_gpu.py           |  2 +-
 2 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 82953f3cd..19c578d38 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -137,52 +137,58 @@ fi
 ROLE="$(get_metadata_attribute dataproc-role)"
 readonly ROLE
 
-# CUDA version and Driver version
-# https://docs.nvidia.com/deploy/cuda-compatibility/
-# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
-# https://developer.nvidia.com/cuda-downloads
-
-# Minimum supported version for open kernel driver is 515.43.04
-# https://github.com/NVIDIA/open-gpu-kernel-modules/tags
-# Rocky8: 12.0: 525.147.05
-readonly -A DRIVER_FOR_CUDA=(
+function set_support_matrix() {
+  # CUDA version and Driver version
+  # https://docs.nvidia.com/deploy/cuda-compatibility/
+  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+  # https://developer.nvidia.com/cuda-downloads
+
+  # Minimum supported version for open kernel driver is 515.43.04
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
+  # Rocky8: 12.0: 525.147.05
+  local latest
+  latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
+  readonly -A DRIVER_FOR_CUDA=(
           ["11.7"]="515.65.01"  ["11.8"]="525.60.13"
-          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.127.05" ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
-)
-readonly -A DRIVER_SUBVER=(
+          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+  )
+  readonly -A DRIVER_SUBVER=(
           ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
-          ["545"]="545.29.06"   ["550"]="550.127.05" ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
-)
-# https://developer.nvidia.com/cudnn-downloads
-if is_debuntu ; then
-readonly -A CUDNN_FOR_CUDA=(
+          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
+  )
+  # https://developer.nvidia.com/cudnn-downloads
+  if is_debuntu ; then
+  readonly -A CUDNN_FOR_CUDA=(
           ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
           ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
-)
-elif is_rocky ; then
-# rocky:
-#   12.0: 8.8.1.3
-#   12.1: 8.9.3.28
-#   12.2: 8.9.7.29
-#   12.3: 9.0.0.312
-#   12.4: 9.1.1.17
-#   12.5: 9.2.1.18
-#   12.6: 9.5.1.17
-readonly -A CUDNN_FOR_CUDA=(
+  )
+  elif is_rocky ; then
+  # rocky:
+  #   12.0: 8.8.1.3
+  #   12.1: 8.9.3.28
+  #   12.2: 8.9.7.29
+  #   12.3: 9.0.0.312
+  #   12.4: 9.1.1.17
+  #   12.5: 9.2.1.18
+  #   12.6: 9.5.1.17
+  readonly -A CUDNN_FOR_CUDA=(
           ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
           ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
-)
-fi
-# https://developer.nvidia.com/nccl/nccl-download
-# 12.2: 2.19.3, 12.5: 2.21.5
-readonly -A NCCL_FOR_CUDA=(
+  )
+  fi
+  # https://developer.nvidia.com/nccl/nccl-download
+  # 12.2: 2.19.3, 12.5: 2.21.5
+  readonly -A NCCL_FOR_CUDA=(
           ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
           ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
-)
-readonly -A CUDA_SUBVER=(
+  )
+  readonly -A CUDA_SUBVER=(
           ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
           ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
-)
+  )
+}
+
+set_support_matrix
 
 RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
 
diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 35f08f801..dc0332ce9 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -31,7 +31,7 @@ def assert_instance_command(self,
 
     retry_count = 5
 
-    ssh_cmd='gcloud compute ssh {} --zone={} --command="{}"'.format(
+    ssh_cmd='gcloud compute -q ssh {} --zone={} --command="{}"'.format(
       instance, self.cluster_zone, cmd)
 
     while retry_count > 0:

From e152fd81525ac9dc6d127741d29da0cf66b9197c Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 16:05:49 -0800
Subject: [PATCH 040/112] including instructions on how to create a secure-boot
 key pair

---
 gpu/create-key-pair.sh | 135 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 gpu/create-key-pair.sh

diff --git a/gpu/create-key-pair.sh b/gpu/create-key-pair.sh
new file mode 100644
index 000000000..8f2a42a70
--- /dev/null
+++ b/gpu/create-key-pair.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This script creates a key pair and publishes to cloud secrets or
+# fetches an already published key pair from cloud secrets
+
+set -e
+
+# https://github.com/glevand/secure-boot-utils
+
+# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#adding-shielded-image
+
+# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#generating-security-keys-certificates
+
+# https://wiki.archlinux.org/title/Unified_Extensible_Firmware_Interface/Secure_Boot#Creating_keys
+
+ITERATION=042
+
+CURRENT_PROJECT_ID="$(gcloud config get project)"
+if [[ -z "${CURRENT_PROJECT_ID}" ]]; then
+    echo 'project is not set.  please set with `gcloud config set project ${PROJECT_ID}`' >&2
+    exit -1
+fi
+PROJECT_ID="${CURRENT_PROJECT_ID}"
+
+function create_key () {
+    local EFI_VAR_NAME="$1"
+    local CN_VAL="$2"
+    local PRIVATE_KEY="tls/${EFI_VAR_NAME}.rsa"
+    local CACERT="tls/${EFI_VAR_NAME}.pem"
+    local CACERT_DER="tls/${EFI_VAR_NAME}.der"
+    CA_KEY_SECRET_NAME="efi-${EFI_VAR_NAME}-priv-key-${ITERATION}"
+    CA_CERT_SECRET_NAME="efi-${EFI_VAR_NAME}-pub-key-${ITERATION}"
+    # If the secrets exist in secret manager, populate the tls/ directory
+    if [[ ! -f "${PRIVATE_KEY}" ]] && gcloud secrets describe "${CA_CERT_SECRET_NAME}" > /dev/null ; then
+      mkdir -p tls
+
+      gcloud secrets versions access "1" \
+        --project="${PROJECT_ID}" \
+        --secret="${CA_KEY_SECRET_NAME}" \
+        | dd of="${PRIVATE_KEY}" status=none
+
+      gcloud secrets versions access "1" \
+        --project="${PROJECT_ID}" \
+        --secret="${CA_CERT_SECRET_NAME}" \
+        | base64 --decode \
+        | dd of="${CACERT_DER}" status=none
+
+      # Create a PEM-format version of the cert
+      openssl x509 \
+        -inform DER \
+        -in "${CACERT_DER}" \
+        -outform PEM \
+        -out "${CACERT}"
+
+      MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt"
+      curl -s -L -o "${MS_UEFI_CA}" 'https://go.microsoft.com/fwlink/p/?linkid=321194'
+
+      echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt
+      echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt
+      modulus_md5sum="$(openssl rsa -noout -modulus -in ${PRIVATE_KEY} | openssl md5 | awk '{print $2}' | tee tls/modulus-md5sum.txt)"
+      return
+    fi
+
+    if [[ -f "${PRIVATE_KEY}" ]]; then
+        modulus_md5sum="$(cat tls/modulus-md5sum.txt)"
+        return
+    fi
+    mkdir -p tls
+
+    echo "generating '${CN_VAL}' '${CACERT}', '${CACERT_DER}' and '${PRIVATE_KEY}'" >&2
+    # Generate new x.509 key and cert
+    openssl req \
+            -newkey rsa:3072 \
+            -nodes \
+            -keyout "${PRIVATE_KEY}" \
+            -new \
+            -x509 \
+            -sha256 \
+            -days 3650 \
+            -subj "/CN=${CN_VAL}/" \
+            -out "${CACERT}"
+
+    # Create a DER-format version of the cert
+    openssl x509 \
+            -outform DER \
+            -in "${CACERT}" \
+            -outform DER \
+            -in "${CACERT}" \
+            -out "${CACERT_DER}"
+
+    # Create a new secret containing private key
+    gcloud secrets create "${CA_KEY_SECRET_NAME}" \
+           --project="${PROJECT_ID}" \
+           --replication-policy="automatic" \
+           --data-file="${PRIVATE_KEY}"
+
+    echo "Private key secret name: '${CA_KEY_SECRET_NAME}'" >&2
+    echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt
+
+    # Create a new secret containing public key
+    cat "${CACERT_DER}" | base64 > "${CACERT_DER}.base64"
+    gcloud secrets create "${CA_CERT_SECRET_NAME}" \
+           --project="${PROJECT_ID}" \
+           --replication-policy="automatic" \
+           --data-file="${CACERT_DER}.base64"
+
+    modulus_md5sum="$(openssl x509 -noout -modulus -in ${CACERT} | openssl md5 | awk '{print $2}')"
+    echo "modulus-md5sum: ${modulus_md5sum}" >&2
+    echo "${modulus_md5sum}" > tls/modulus-md5sum.txt
+    echo "Public key secret name: '${CA_CERT_SECRET_NAME}'" >&2
+    echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt
+
+}
+
+EFI_VAR_NAME=db
+
+create_key "${EFI_VAR_NAME}" "Cloud Dataproc Custom Image CA ${ITERATION}"
+
+echo "modulus_md5sum=${modulus_md5sum}"
+echo "private_secret_name=${CA_KEY_SECRET_NAME}"
+echo "public_secret_name=${CA_CERT_SECRET_NAME}"
+echo "secret_project=${PROJECT_ID}"
+echo "secret_version=1"

From f113ef8517bcd83aa3313fccef263fd99183d377 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 17:47:31 -0800
Subject: [PATCH 041/112] -e for expert, not -p for pro

---
 {gpu => cloudbuild}/create-key-pair.sh |  0
 gpu/install_gpu_driver.sh              | 30 +++++++++++++++++++-------
 2 files changed, 22 insertions(+), 8 deletions(-)
 rename {gpu => cloudbuild}/create-key-pair.sh (100%)

diff --git a/gpu/create-key-pair.sh b/cloudbuild/create-key-pair.sh
similarity index 100%
rename from gpu/create-key-pair.sh
rename to cloudbuild/create-key-pair.sh
diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 19c578d38..c048aa5ef 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1030,18 +1030,32 @@ function install_nvidia_userspace_runfile() {
   if is_rocky8 ; then
     install_build_dependencies
 
-    # build non-open driver
-    execute_with_retries bash "${local_fn}" \
-      --module-signing-hash sha256 \
+    local signing_options
+    signing_options=""
+    if [[ -n "${PSN}" ]]; then
+      signing_options="--module-signing-hash sha256 \
       --module-signing-x509-hash sha256 \
       --module-signing-secret-key "${mok_key}" \
       --module-signing-public-key "${mok_der}" \
       --module-signing-script "/lib/modules/${uname_r}/build/scripts/sign-file" \
+      "
+    fi
+
+    # build non-open driver
+    execute_with_retries bash "${local_fn}" -e -q \
+      ${signing_options} \
       --no-dkms \
-      --install-libglvnd --silent --tmpdir="${tmpdir}"
+      --install-libglvnd \
+      --ui=none \
+      --tmpdir="${tmpdir}" \
+    || {
+      cat /var/log/nvidia-installer.log
+      echo "unable to build kernel modules from runfile"
+      exit 1
+    }
   else
     # prepare to build from github
-    execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --silent --tmpdir="${tmpdir}"
+    execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --tmpdir="${tmpdir}"
   fi
   rm -f "${local_fn}"
   touch "${workdir}/userspace-complete"
@@ -1618,12 +1632,12 @@ function clean_up_sources_lists() {
 }
 
 function exit_handler() {
-  set +ex
-  echo "Exit handler invoked"
-
   # Purge private key material until next grant
   clear_dkms_key
 
+  set +ex
+  echo "Exit handler invoked"
+
   # Clear pip cache
   pip cache purge || echo "unable to purge pip cache"
 

From dfc433de40c697e4e517fac00dd5375375aeba79 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 19:08:05 -0800
Subject: [PATCH 042/112] updated 11.8 and 12.0 driver versions

---
 gpu/install_gpu_driver.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index c048aa5ef..87330d0ff 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -149,8 +149,8 @@ function set_support_matrix() {
   local latest
   latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
   readonly -A DRIVER_FOR_CUDA=(
-          ["11.7"]="515.65.01"  ["11.8"]="525.60.13"
-          ["12.0"]="525.60.13"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
+          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
   )
   readonly -A DRIVER_SUBVER=(
           ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"

From 77fc42af5afe0c692e8e4fb6e3b86d819d19e063 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 19:09:12 -0800
Subject: [PATCH 043/112] added a signature check test which allows granular
 selection of platform to test, but does not yet verify signatures

---
 gpu/test_gpu.py | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index dc0332ce9..9766d804f 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -244,7 +244,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
       self.verify_instance(machine_name)
       self.verify_instance_nvcc(machine_name, cuda_version)
       self.verify_instance_pyspark(machine_name)
-      self.verify_instance_spark()
+    self.verify_instance_spark()
 
   @parameterized.parameters(
       ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"),
@@ -346,6 +346,39 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
       self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
 
     metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
+    self.createCluster(
+      configuration,
+      self.INIT_ACTIONS,
+      machine_type="n1-highmem-8",
+      master_accelerator=master_accelerator,
+      worker_accelerator=worker_accelerator,
+      metadata=metadata,
+      timeout_in_minutes=30,
+      boot_disk_size="50GB",
+      scopes="https://www.googleapis.com/auth/monitoring.write")
+
+    for machine_suffix in machine_suffixes:
+      self.verify_instance("{}-{}".format(self.getClusterName(),machine_suffix))
+      self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),machine_suffix))
+
+    self.verify_instance_spark()
+
+  @parameterized.parameters(
+#    ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''),
+#    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
+    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'),
+#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
+#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
+  )
+  def tests_driver_signing(self, configuration, machine_suffixes,
+                           master_accelerator, worker_accelerator,
+                           cuda_version, image_os, image_version):
+
+    if self.getImageOs() != image_os:
+      self.skipTest("This test is only run on os {}".format(image_os))
+    if self.getImageVersion() != image_version:
+      self.skipTest("This test is only run on Dataproc Image Version {}".format(image_os))
+
     self.createCluster(
       configuration,
       self.INIT_ACTIONS,

From 8ed498e87a39f2c7fd3784147b70419bcd15595f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 20:06:46 -0800
Subject: [PATCH 044/112] tuning the layout of arguments to userspace.run

---
 gpu/install_gpu_driver.sh | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 87330d0ff..14bafaac3 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1035,9 +1035,9 @@ function install_nvidia_userspace_runfile() {
     if [[ -n "${PSN}" ]]; then
       signing_options="--module-signing-hash sha256 \
       --module-signing-x509-hash sha256 \
-      --module-signing-secret-key "${mok_key}" \
-      --module-signing-public-key "${mok_der}" \
-      --module-signing-script "/lib/modules/${uname_r}/build/scripts/sign-file" \
+      --module-signing-secret-key \"${mok_key}\" \
+      --module-signing-public-key \"${mok_der}\" \
+      --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
       "
     fi
 
@@ -1045,8 +1045,8 @@ function install_nvidia_userspace_runfile() {
     execute_with_retries bash "${local_fn}" -e -q \
       ${signing_options} \
       --no-dkms \
-      --install-libglvnd \
       --ui=none \
+      --install-libglvnd \
       --tmpdir="${tmpdir}" \
     || {
       cat /var/log/nvidia-installer.log
@@ -1055,7 +1055,11 @@ function install_nvidia_userspace_runfile() {
     }
   else
     # prepare to build from github
-    execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --tmpdir="${tmpdir}"
+    execute_with_retries bash "${local_fn}" -e -q \
+      --no-kernel-modules \
+      --ui=none \
+      --install-libglvnd \
+      --tmpdir="${tmpdir}"
   fi
   rm -f "${local_fn}"
   touch "${workdir}/userspace-complete"

From 842d7e5725b40f0d91a95aec2843f0ab9f798e7c Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 20:56:06 -0800
Subject: [PATCH 045/112] scoping DEFAULT_CUDA_VERSION correctly ; exercising
 rocky including kerberos on 12.6

---
 gpu/test_gpu.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 9766d804f..f4182519d 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -7,10 +7,10 @@
 from integration_tests.dataproc_test_case import DataprocTestCase
 
 DEFAULT_TIMEOUT = 15  # minutes
+DEFAULT_CUDA_VERSION = "12.4"
 
 class NvidiaGpuDriverTestCase(DataprocTestCase):
   COMPONENT = "gpu"
-  DEFAULT_CUDA_VERSION = "12.4"
   INIT_ACTIONS = ["gpu/install_gpu_driver.sh"]
   GPU_L4   = "type=nvidia-l4"
   GPU_T4   = "type=nvidia-tesla-t4"
@@ -138,7 +138,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=90,
+        timeout_in_minutes=90, # This cluster is sized and timed correctly to build the driver and nccl
         boot_disk_size="60GB")
     for machine_suffix in machine_suffixes:
       machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
@@ -366,7 +366,10 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
   @parameterized.parameters(
 #    ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''),
 #    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
+    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'),
     ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'),
+    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'),
+    ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
   )
@@ -386,7 +389,7 @@ def tests_driver_signing(self, configuration, machine_suffixes,
       master_accelerator=master_accelerator,
       worker_accelerator=worker_accelerator,
       metadata=metadata,
-      timeout_in_minutes=30,
+      timeout_in_minutes=30, # this test expects driver and nccl cache to be built and stashed before its run
       boot_disk_size="50GB",
       scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:

From bb35d11c98f7e4b15e2ed5f0bd0a66f946313a76 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 21:47:21 -0800
Subject: [PATCH 046/112] add a connect timeout to the ssh call instead of
 trying to patch around a longer than expected connection delay

---
 gpu/test_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index f4182519d..b876a2b05 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -31,7 +31,7 @@ def assert_instance_command(self,
 
     retry_count = 5
 
-    ssh_cmd='gcloud compute -q ssh {} --zone={} --command="{}"'.format(
+    ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format(
       instance, self.cluster_zone, cmd)
 
     while retry_count > 0:

From 2541a6f5b1a88d5b9bc777c260cf54fee8b97b5f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 21:51:42 -0800
Subject: [PATCH 047/112] add some entropy to the process

---
 gpu/test_gpu.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index b876a2b05..7386e111e 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -49,7 +49,9 @@ def assert_instance_command(self,
 
   def verify_instance(self, name):
     # Verify that nvidia-smi works
-    time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience
+    import random
+    # Many failed nvidia-smi attempts have been caused by impatience and temporal collisions
+    time.sleep( 3 + random.randint(1, 10) )
     self.assert_instance_command(name, "nvidia-smi", 1)
 
   def verify_pytorch(self, name):

From ab668ffe88fa75ac8010ef1540d22ae5765a18e8 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 22:38:26 -0800
Subject: [PATCH 048/112] perhaps a re-run would have fixed 2.0-rocky8 on that
 last run

---
 cloudbuild/presubmit.sh | 8 ++++++++
 gpu/run-bazel-tests.sh  | 1 -
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index d9ae3c9bb..0139636cb 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -105,6 +105,7 @@ run_tests() {
   bazel test \
     --jobs="${max_parallel_tests}" \
     --local_test_jobs="${max_parallel_tests}" \
+    --flaky_test_attempts=3 \
     --action_env="INTERNAL_IP_SSH=true" \
     --test_output="all" \
     --noshow_progress \
@@ -115,6 +116,13 @@ run_tests() {
 
 main() {
   cd /init-actions
+
+# TODO: once service account is granted permission to access the cloud
+# secrets, we can source this file and set signing material metadata
+# variables from the environment in the python code.
+
+#  eval "$(bash cloudbuild/create-key-pair.sh | sed -e 's/^/export /g')"
+
   configure_gcloud
   configure_gcloud_ssh_key
   initialize_git_repo
diff --git a/gpu/run-bazel-tests.sh b/gpu/run-bazel-tests.sh
index 8e7cd663d..ae717bf5b 100644
--- a/gpu/run-bazel-tests.sh
+++ b/gpu/run-bazel-tests.sh
@@ -17,7 +17,6 @@ declare -a TESTS_TO_RUN=('gpu:test_gpu')
 time bazel test \
   --jobs="${max_parallel_tests}" \
   --local_test_jobs="${max_parallel_tests}" \
-  --flaky_test_attempts=3 \
   --action_env="INTERNAL_IP_SSH=true" \
   --test_output="errors" \
   --test_arg="--image_version=${IMAGE_VERSION}" \

From 934289a3d35f5b200a8b419ef24dd2a4bf506d81 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 12 Dec 2024 23:49:58 -0800
Subject: [PATCH 049/112] increasing init action timeout to account for
 uncached builds

---
 gpu/test_gpu.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 7386e111e..61f0315ad 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -167,7 +167,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=30,
+        timeout_in_minutes=90,
         boot_disk_size="50GB")
     for machine_suffix in machine_suffixes:
       self.verify_instance("{}-{}".format(self.getClusterName(),
@@ -193,7 +193,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=30,
+        timeout_in_minutes=90,
         boot_disk_size="50GB",
         scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
@@ -238,7 +238,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=30,
+        timeout_in_minutes=90,
         boot_disk_size="50GB")
 
     for machine_suffix in machine_suffixes:
@@ -282,7 +282,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
-        timeout_in_minutes=30,
+        timeout_in_minutes=90,
         boot_disk_size="50GB",
         startup_script="gpu/mig.sh")
 
@@ -314,7 +314,7 @@ def test_gpu_allocation(self, configuration, master_accelerator,
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         boot_disk_size="50GB",
-        timeout_in_minutes=30)
+        timeout_in_minutes=90)
 
     self.verify_instance_spark()
 
@@ -355,7 +355,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
       master_accelerator=master_accelerator,
       worker_accelerator=worker_accelerator,
       metadata=metadata,
-      timeout_in_minutes=30,
+      timeout_in_minutes=90,
       boot_disk_size="50GB",
       scopes="https://www.googleapis.com/auth/monitoring.write")
 
@@ -391,7 +391,7 @@ def tests_driver_signing(self, configuration, machine_suffixes,
       master_accelerator=master_accelerator,
       worker_accelerator=worker_accelerator,
       metadata=metadata,
-      timeout_in_minutes=30, # this test expects driver and nccl cache to be built and stashed before its run
+      timeout_in_minutes=90,
       boot_disk_size="50GB",
       scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:

From e5920f8fd2c83d3cc0f0aa40fdd6346122ba6391 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 13 Dec 2024 17:15:44 -0800
Subject: [PATCH 050/112] cache non-open kernel build results

---
 gpu/install_gpu_driver.sh | 84 +++++++++++++++++++++++++--------------
 1 file changed, 55 insertions(+), 29 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 14bafaac3..e45bf8496 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1027,40 +1027,66 @@ function install_nvidia_userspace_runfile() {
                         "${pkg_bucket}/${USERSPACE_FILENAME}" \
                         "${local_fn}"
 
+  local runfile_args
+  runfile_args=""
+  local cache_hit="0"
+  local local_tarball
+
   if is_rocky8 ; then
-    install_build_dependencies
-
-    local signing_options
-    signing_options=""
-    if [[ -n "${PSN}" ]]; then
-      signing_options="--module-signing-hash sha256 \
-      --module-signing-x509-hash sha256 \
-      --module-signing-secret-key \"${mok_key}\" \
-      --module-signing-public-key \"${mok_der}\" \
-      --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
-      "
-    fi
+    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+      local_tarball="${workdir}/${build_tarball}"
+      local build_dir
+      if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
+        then build_dir="${modulus_md5sum}"
+        else build_dir="unsigned" ; fi
+
+      local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}"
+
+      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+        cache_hit="1"
+        runfile_args="--no-kernel-modules"
+        echo "cache hit"
+      else
+        install_build_dependencies
+
+        local signing_options
+        signing_options=""
+        if [[ -n "${PSN}" ]]; then
+          signing_options="--module-signing-hash sha256 \
+          --module-signing-x509-hash sha256 \
+          --module-signing-secret-key \"${mok_key}\" \
+          --module-signing-public-key \"${mok_der}\" \
+          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
+          "
+        fi
 
-    # build non-open driver
-    execute_with_retries bash "${local_fn}" -e -q \
-      ${signing_options} \
-      --no-dkms \
-      --ui=none \
-      --install-libglvnd \
-      --tmpdir="${tmpdir}" \
-    || {
-      cat /var/log/nvidia-installer.log
-      echo "unable to build kernel modules from runfile"
-      exit 1
+        runfile_args="--no-dkms ${signing_options}"
+      fi
     }
   else
-    # prepare to build from github
-    execute_with_retries bash "${local_fn}" -e -q \
-      --no-kernel-modules \
-      --ui=none \
-      --install-libglvnd \
-      --tmpdir="${tmpdir}"
+    runfile_args="--no-kernel-modules"
+  fi
+
+  execute_with_retries bash "${local_fn}" -e -q \
+    ${runfile_args} \
+    --ui=none \
+    --install-libglvnd \
+    --tmpdir="${tmpdir}"
+
+  if is_rocky8 ; then
+    if [[ "${cache_hit}" == "1" ]] ; then
+      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+      depmod -a
+    else
+      tar czvf "${local_tarball}" \
+        /var/log/nvidia-installer.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    fi
   fi
+
   rm -f "${local_fn}"
   touch "${workdir}/userspace-complete"
   sync

From 386177d2b433064f19f5ed21a8921eabc6cd4d52 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 13 Dec 2024 17:48:12 -0800
Subject: [PATCH 051/112] per-kernel sub-directory for kmod tarballs

---
 gpu/install_gpu_driver.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index e45bf8496..a42c7f440 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -930,7 +930,7 @@ function build_driver_from_github() {
       then build_dir="${modulus_md5sum}"
       else build_dir="unsigned" ; fi
 
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
     if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
       echo "cache hit"
@@ -1042,7 +1042,7 @@ function install_nvidia_userspace_runfile() {
         then build_dir="${modulus_md5sum}"
         else build_dir="unsigned" ; fi
 
-      local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}"
+      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
       if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
         cache_hit="1"

From b9668e0ef08d0a93f637561ae166e2605a499c28 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 13 Dec 2024 19:45:19 -0800
Subject: [PATCH 052/112] using upstream repo and branch

---
 gpu/manual-test-runner.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh
index 021528f6c..2527d6fd9 100644
--- a/gpu/manual-test-runner.sh
+++ b/gpu/manual-test-runner.sh
@@ -4,9 +4,9 @@
 #
 # To run the script, the following will bootstrap
 #
-# git clone git@github.com:LLC-Technologies-Collier/initialization-actions
+# git clone git@github.com:GoogleCloudDataproc/initialization-actions
 # cd initialization-actions
-# git checkout gpu-20241207
+# git checkout 2024.11
 # cp gpu/env.json.sample env.json
 # vi env.json
 # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest .

From 2f0148a43f51ea841bf2c4d9d402197912277692 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 13 Dec 2024 19:55:23 -0800
Subject: [PATCH 053/112] corrected grammar error

---
 gpu/test_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 61f0315ad..7c090ddea 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -181,7 +181,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
   def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider):
-    self.skipTest("No need to regularly installing the agent on its own cluster ; this is exercised elsewhere")
+    self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:

From 19b9ddb44c07f9b427cac0e5cb86b1fe93ace4a7 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 13 Dec 2024 20:13:18 -0800
Subject: [PATCH 054/112] testing Kerberos some more

---
 gpu/test_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 7c090ddea..1f1e472fa 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -174,7 +174,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
                                           machine_suffix))
 
   @parameterized.parameters(
-      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
+      ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
 #      ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"),
 #      ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"),
   )

From 1e5fc0f3c2d94c40405b6a93c572c26536cbe73d Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 13 Dec 2024 20:21:32 -0800
Subject: [PATCH 055/112] better implementation of numa node selection

---
 gpu/test_gpu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 1f1e472fa..a9093a2ba 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -59,7 +59,7 @@ def verify_pytorch(self, name):
                                self.TORCH_TEST_SCRIPT_FILE_NAME)
     self.upload_test_file(test_filename, name)
 
-    verify_cmd = "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
+    verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
         self.TORCH_TEST_SCRIPT_FILE_NAME)
     self.assert_instance_command(name, verify_cmd)
     self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)
@@ -69,7 +69,7 @@ def verify_tensorflow(self, name):
                                self.TF_TEST_SCRIPT_FILE_NAME)
     self.upload_test_file(test_filename, name)
 
-    verify_cmd = "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
+    verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
         self.TF_TEST_SCRIPT_FILE_NAME)
     self.assert_instance_command(name, verify_cmd)
     self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)

From 4023031c0e4a3a5517119997cdf35dad8f619e68 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 13 Dec 2024 20:27:13 -0800
Subject: [PATCH 056/112] this time with a test which is exercised

---
 gpu/test_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index a9093a2ba..404dab004 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -206,7 +206,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
         ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
 #        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
 #      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"),
+      ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"),
   )
   def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,

From 03f59a6ef028b5e172c51d3b31ca3f1ceecc44b5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Fri, 13 Dec 2024 23:16:40 -0800
Subject: [PATCH 057/112] skip debian11 on Kerberos

---
 gpu/test_gpu.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 404dab004..940c43c25 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -125,6 +125,8 @@ def verify_instance_spark(self):
   def test_install_gpu_default_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
+    self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
+
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
@@ -183,6 +185,11 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                   driver_provider):
     self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
 
+    if configuration == 'KERBEROS' \
+    and self.getImageOs() == 'debian' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      self.skipTest("KERBEROS fails on debian11")
+
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
       metadata += ",gpu-driver-provider={}".format(driver_provider)
@@ -212,6 +219,11 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
+    if configuration == 'KERBEROS' \
+    and self.getImageOs() == 'debian' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      self.skipTest("KERBEROS fails on debian11")
+
 #    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
 #    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
 #      self.skipTest("CUDA == 12.0 not supported on debian 12")
@@ -379,6 +391,11 @@ def tests_driver_signing(self, configuration, machine_suffixes,
                            master_accelerator, worker_accelerator,
                            cuda_version, image_os, image_version):
 
+    if configuration == 'KERBEROS' \
+    and self.getImageOs() == 'debian' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      self.skipTest("KERBEROS fails on debian11")
+
     if self.getImageOs() != image_os:
       self.skipTest("This test is only run on os {}".format(image_os))
     if self.getImageVersion() != image_version:

From f2146e362d802b2e290610ebb4cb4f6cbaa31dde Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 11:55:18 -0800
Subject: [PATCH 058/112] also skipping 2.1-ubuntu20 on kerberos clusters

---
 gpu/test_gpu.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 940c43c25..b41efb78f 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -186,9 +186,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
     self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
 
     if configuration == 'KERBEROS' \
-    and self.getImageOs() == 'debian' \
+    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("KERBEROS fails on debian11")
+      self.skipTest("KERBEROS fails on 2.1 aside from rocky")
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
@@ -220,9 +220,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    cuda_version):
 
     if configuration == 'KERBEROS' \
-    and self.getImageOs() == 'debian' \
+    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("KERBEROS fails on debian11")
+      self.skipTest("KERBEROS fails on 2.1 aside from rocky")
 
 #    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
 #    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
@@ -392,9 +392,9 @@ def tests_driver_signing(self, configuration, machine_suffixes,
                            cuda_version, image_os, image_version):
 
     if configuration == 'KERBEROS' \
-    and self.getImageOs() == 'debian' \
+    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("KERBEROS fails on debian11")
+      self.skipTest("KERBEROS fails on 2.1 aside from rocky")
 
     if self.getImageOs() != image_os:
       self.skipTest("This test is only run on os {}".format(image_os))

From 1cb99f859564918e617dcd49a73f72938e33caf2 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 12:27:36 -0800
Subject: [PATCH 059/112] re-adjusting tests to be performed ; adjusting rather
 than skipping known failure cases

---
 gpu/test_gpu.py | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index b41efb78f..5efce6381 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -130,7 +130,8 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
+      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
+      configuration='STANDARD'
 
     metadata = None
     if driver_provider is not None:
@@ -160,6 +161,12 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
     self.skipTest("No need to regularly test not installing the agent")
 
     metadata = "install-gpu-agent=false"
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
+      configuration='STANDARD'
+
     if driver_provider is not None:
       metadata += ",gpu-driver-provider={}".format(driver_provider)
     self.createCluster(
@@ -188,7 +195,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
     if configuration == 'KERBEROS' \
     and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("KERBEROS fails on 2.1 aside from rocky")
+      # KERBEROS fails on 2.1 aside from rocky
+      configuration="STANDARD"
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
@@ -210,7 +218,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                                     machine_suffix))
 
   @parameterized.parameters(
-        ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
+        ("SINGLE", ["m"],               GPU_T4, None,   "12.0"),
 #        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
 #      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
       ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"),
@@ -222,7 +230,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     if configuration == 'KERBEROS' \
     and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("KERBEROS fails on 2.1 aside from rocky")
+      # KERBEROS fails on 2.1 aside from rocky
+      configuration="STANDARD"
 
 #    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
 #    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
@@ -240,7 +249,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
+      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
+      configuration='STANDARD'
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -312,7 +322,8 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
+      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
+      configuration='STANDARD'
 
     metadata = None
     if driver_provider is not None:
@@ -357,7 +368,8 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
+      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
+      configuration='STANDARD'
 
     metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -394,7 +406,8 @@ def tests_driver_signing(self, configuration, machine_suffixes,
     if configuration == 'KERBEROS' \
     and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      self.skipTest("KERBEROS fails on 2.1 aside from rocky")
+      # KERBEROS fails on 2.1 aside from rocky
+      configuration="STANDARD"
 
     if self.getImageOs() != image_os:
       self.skipTest("This test is only run on os {}".format(image_os))

From 3a238d18aeeebbb6336cdb65d1eb86a6990391d6 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 12:37:47 -0800
Subject: [PATCH 060/112] more temporal variance

---
 gpu/test_gpu.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 5efce6381..6c3c703ec 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -51,7 +51,7 @@ def verify_instance(self, name):
     # Verify that nvidia-smi works
     import random
     # Many failed nvidia-smi attempts have been caused by impatience and temporal collisions
-    time.sleep( 3 + random.randint(1, 10) )
+    time.sleep( 3 + random.randint(1, 30) )
     self.assert_instance_command(name, "nvidia-smi", 1)
 
   def verify_pytorch(self, name):
@@ -179,8 +179,8 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
         timeout_in_minutes=90,
         boot_disk_size="50GB")
     for machine_suffix in machine_suffixes:
-      self.verify_instance("{}-{}".format(self.getClusterName(),
-                                          machine_suffix))
+      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+      self.verify_instance(machine_name)
 
   @parameterized.parameters(
       ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None),
@@ -212,10 +212,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
         boot_disk_size="50GB",
         scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
-      self.verify_instance("{}-{}".format(self.getClusterName(),
-                                          machine_suffix))
-      self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),
-                                                    machine_suffix))
+      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+      self.verify_instance(machine_name)
+      self.verify_instance_gpu_agent(machine_name)
 
   @parameterized.parameters(
         ("SINGLE", ["m"],               GPU_T4, None,   "12.0"),

From cc16aa8c9b82e4bb8d47a31f6396ab4b44506cbe Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 13:38:00 -0800
Subject: [PATCH 061/112] skipping CUDA=12.0 for ubuntu22

---
 gpu/test_gpu.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 6c3c703ec..8aa955c45 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -362,7 +362,16 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
 
     if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      # CUDA < 12 not supported on Dataproc 2.2
       self.skipTest("CUDA < 12 not supported on Dataproc 2.2")
+#      cuda_version="12.0" # consider this instead
+
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
+    and self.getImageOs() == 'ubuntu':
+      # CUDA <= 12 not supported on Dataproc 2.2 with ubuntu
+      self.skipTest("CUDA <= 12 not supported on Dataproc 2.2 with ubuntu")
+#      cuda_version="12.1" # consider this instead
+
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \

From 3ac04bc8e1ecbf5469f753d77784c0a91eb3aaf1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 13:56:41 -0800
Subject: [PATCH 062/112] kerberos not known to succeed on 2.0-rocky8

---
 gpu/test_gpu.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 8aa955c45..40bb64952 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -198,6 +198,12 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
       # KERBEROS fails on 2.1 aside from rocky
       configuration="STANDARD"
 
+    if configuration == 'KERBEROS' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.0"):
+      # KERBEROS fails on 2.0 with rocky
+      configuration="STANDARD"
+
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
       metadata += ",gpu-driver-provider={}".format(driver_provider)
@@ -232,6 +238,12 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
       # KERBEROS fails on 2.1 aside from rocky
       configuration="STANDARD"
 
+    if configuration == 'KERBEROS' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.0"):
+      # KERBEROS fails on 2.0 with rocky
+      configuration="STANDARD"
+
 #    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
 #    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
 #      self.skipTest("CUDA == 12.0 not supported on debian 12")
@@ -417,6 +429,12 @@ def tests_driver_signing(self, configuration, machine_suffixes,
       # KERBEROS fails on 2.1 aside from rocky
       configuration="STANDARD"
 
+    if configuration == 'KERBEROS' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.0"):
+      # KERBEROS fails on 2.0 with rocky
+      configuration="STANDARD"
+
     if self.getImageOs() != image_os:
       self.skipTest("This test is only run on os {}".format(image_os))
     if self.getImageVersion() != image_version:

From c6bf91a1e1952ee37b5eaad3eb7b336048820162 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 15:19:29 -0800
Subject: [PATCH 063/112] 2.2 dataproc images do not support CUDA <= 12.0

---
 gpu/test_gpu.py | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 40bb64952..c27eadb05 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -244,18 +244,14 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
       # KERBEROS fails on 2.0 with rocky
       configuration="STANDARD"
 
-#    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-#    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-#      self.skipTest("CUDA == 12.0 not supported on debian 12")
-
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
       self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Dataproc 2.2")
+      self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2")
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
@@ -301,9 +297,9 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
       self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Dataproc 2.2")
+      self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2")
 
     metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version)
 
@@ -372,16 +368,9 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
       self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      # CUDA < 12 not supported on Dataproc 2.2
-      self.skipTest("CUDA < 12 not supported on Dataproc 2.2")
-#      cuda_version="12.0" # consider this instead
-
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
-    and self.getImageOs() == 'ubuntu':
-      # CUDA <= 12 not supported on Dataproc 2.2 with ubuntu
-      self.skipTest("CUDA <= 12 not supported on Dataproc 2.2 with ubuntu")
+    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2")
 #      cuda_version="12.1" # consider this instead
 
 

From d1b3d48249901e7456c99878946bb62172ef0098 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 17:32:05 -0800
Subject: [PATCH 064/112] skipping SINGLE configuration for rocky8 again

---
 gpu/test_gpu.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index c27eadb05..8bd132922 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -257,7 +257,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      configuration='STANDARD'
+      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty")
+
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -330,7 +331,7 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      configuration='STANDARD'
+      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty")
 
     metadata = None
     if driver_provider is not None:
@@ -378,7 +379,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      configuration='STANDARD'
+      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty")
 
     metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(

From 751e7a0ae961a6be518c59dce3ba67fb144a30aa Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 19:06:04 -0800
Subject: [PATCH 065/112] not testing 2.0

---
 gpu/test_gpu.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 8bd132922..f9a1cfaaa 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -223,7 +223,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
       self.verify_instance_gpu_agent(machine_name)
 
   @parameterized.parameters(
-        ("SINGLE", ["m"],               GPU_T4, None,   "12.0"),
+        ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
 #        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
 #      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
       ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"),
@@ -394,9 +394,9 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
       scopes="https://www.googleapis.com/auth/monitoring.write")
 
     for machine_suffix in machine_suffixes:
-      self.verify_instance("{}-{}".format(self.getClusterName(),machine_suffix))
-      self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),machine_suffix))
-
+      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+      self.verify_instance(machine_name)
+      self.verify_instance_gpu_agent(machine_name)
     self.verify_instance_spark()
 
   @parameterized.parameters(
@@ -441,10 +441,9 @@ def tests_driver_signing(self, configuration, machine_suffixes,
       boot_disk_size="50GB",
       scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
-      self.verify_instance("{}-{}".format(self.getClusterName(),
-                                          machine_suffix))
-      self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),
-                                                    machine_suffix))
+      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
+      self.verify_instance(machine_name)
+      self.verify_instance_gpu_agent(machine_name)
 
     self.verify_instance_spark()
 

From e5e3a9e0016ff81c114720341fd62c756c42729b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 19:07:41 -0800
Subject: [PATCH 066/112] trying without test retries ; retries should happen
 within the test, not by re-running the test

---
 cloudbuild/presubmit.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index 0139636cb..9ed39d0ee 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -105,7 +105,6 @@ run_tests() {
   bazel test \
     --jobs="${max_parallel_tests}" \
     --local_test_jobs="${max_parallel_tests}" \
-    --flaky_test_attempts=3 \
     --action_env="INTERNAL_IP_SSH=true" \
     --test_output="all" \
     --noshow_progress \

From c1cd1d9bc84e6f8077b2e0183b2cda084d7d7628 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 14 Dec 2024 19:51:51 -0800
Subject: [PATCH 067/112] kerberos only works on 2.2

---
 gpu/test_gpu.py | 27 +++------------------------
 1 file changed, 3 insertions(+), 24 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index f9a1cfaaa..cc3f4447b 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -193,15 +193,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
     self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
 
     if configuration == 'KERBEROS' \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # KERBEROS fails on 2.1 aside from rocky
-      configuration="STANDARD"
-
-    if configuration == 'KERBEROS' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      # KERBEROS fails on 2.0 with rocky
+      # KERBEROS fails on 2.1
       configuration="STANDARD"
 
     metadata = "install-gpu-agent=true"
@@ -233,15 +226,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    cuda_version):
 
     if configuration == 'KERBEROS' \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # KERBEROS fails on 2.1 aside from rocky
-      configuration="STANDARD"
-
-    if configuration == 'KERBEROS' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      # KERBEROS fails on 2.0 with rocky
+      # KERBEROS fails on 2.1
       configuration="STANDARD"
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
@@ -414,15 +400,8 @@ def tests_driver_signing(self, configuration, machine_suffixes,
                            cuda_version, image_os, image_version):
 
     if configuration == 'KERBEROS' \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # KERBEROS fails on 2.1 aside from rocky
-      configuration="STANDARD"
-
-    if configuration == 'KERBEROS' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      # KERBEROS fails on 2.0 with rocky
+      # KERBEROS fails on 2.1
       configuration="STANDARD"
 
     if self.getImageOs() != image_os:

From eac2d462468383f7fb96616b79ebf929dd1a9cbb Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 15 Dec 2024 12:51:39 -0800
Subject: [PATCH 068/112] using expectedFailure instead of skipTest for tests
 which are known to fail

---
 gpu/test_gpu.py | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index cc3f4447b..164300f5d 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -233,18 +233,19 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
+      # CUDA > 12.4 not supported on older debian/ubuntu releases
+      self.expectedFailure()
 
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2")
+      # CUDA <= 12.0 not supported on Dataproc 2.2
+      self.expectedFailure()
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty")
-
+      self.expectedFailure()
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -282,11 +283,13 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
+      # CUDA > 12.4 not supported on older debian/ubuntu releases
+      self.expectedFailure()
 
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2")
+      self.expectedFailure()
+      # CUDA <= 12.0 not supported on Dataproc 2.2
 
     metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version)
 
@@ -317,7 +320,7 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty")
+      self.expectedFailure()
 
     metadata = None
     if driver_provider is not None:
@@ -348,24 +351,25 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
 
 #    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
 #    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-#      self.skipTest("CUDA == 12.0 not supported on debian 12")
+#      # CUDA == 12.0 not supported on debian 12
+#      self.expectedFailure()
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
+      # CUDA > 12.4 not supported on older debian/ubuntu releases
+      self.expectedFailure()
 
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2")
-#      cuda_version="12.1" # consider this instead
-
+      # CUDA <= 12.0 not supported on Dataproc 2.2
+      self.expectedFailure()
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty")
+      self.expectedFailure()
 
     metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(

From bf1f0c60be178dee2de11dae2ed7282bda28e470 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 15 Dec 2024 15:36:28 -0800
Subject: [PATCH 069/112] document one of the failure states

---
 gpu/test_gpu.py | 82 ++++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 42 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 164300f5d..a07ade732 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -4,6 +4,8 @@
 from absl.testing import absltest
 from absl.testing import parameterized
 
+import unittest
+
 from integration_tests.dataproc_test_case import DataprocTestCase
 
 DEFAULT_TIMEOUT = 15  # minutes
@@ -15,7 +17,7 @@ class NvidiaGpuDriverTestCase(DataprocTestCase):
   GPU_L4   = "type=nvidia-l4"
   GPU_T4   = "type=nvidia-tesla-t4"
   GPU_V100 = "type=nvidia-tesla-v100"
-  GPU_A100 = "type=nvidia-tesla-a100"
+  GPU_A100 = "type=nvidia-tesla-a100,count=2"
   GPU_H100 = "type=nvidia-h100-80gb,count=8"
 
   # Tests for PyTorch
@@ -120,7 +122,7 @@ def verify_instance_spark(self):
   @parameterized.parameters(
       ("SINGLE",   ["m"], GPU_T4, None, None),
 #      ("STANDARD", ["m"], GPU_T4, None, None),
-#      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"),
+      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"),
   )
   def test_install_gpu_default_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
@@ -130,8 +132,8 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      configuration='STANDARD'
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
 
     metadata = None
     if driver_provider is not None:
@@ -164,8 +166,8 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      configuration='STANDARD'
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
 
     if driver_provider is not None:
       metadata += ",gpu-driver-provider={}".format(driver_provider)
@@ -194,8 +196,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
 
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # KERBEROS fails on 2.1
-      configuration="STANDARD"
+      # ('KERBEROS fails with image version <= 2.1')
+      unittest.expectedFailure(self)
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
@@ -218,7 +220,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
   @parameterized.parameters(
         ("SINGLE", ["m"],               GPU_T4, None,   "12.4"),
 #        ("SINGLE", ["m"],               GPU_T4, None,   "11.8"),
-#      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
+      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
       ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"),
   )
   def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
@@ -227,25 +229,25 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
 
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # KERBEROS fails on 2.1
-      configuration="STANDARD"
+      # ('KERBEROS fails with image version <= 2.1')
+      unittest.expectedFailure(self)
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      # CUDA > 12.4 not supported on older debian/ubuntu releases
-      self.expectedFailure()
+      # ('CUDA > 12.4 not supported on older debian/ubuntu releases')
+      unittest.expectedFailure(self)
 
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      # CUDA <= 12.0 not supported on Dataproc 2.2
-      self.expectedFailure()
+      # ('CUDA <= 12.0 not supported on Dataproc 2.2')
+      unittest.expectedFailure(self)
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      self.expectedFailure()
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -273,23 +275,24 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
   def test_install_gpu_with_mig(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider, cuda_version):
-
-    self.skipTest("Test is known to fail.  Skipping so that we can exercise others")
-
-#    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-#    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-#      self.skipTest("CUDA == 12.0 not supported on debian 12")
+    # Operation [projects/.../regions/.../operations/...] failed:
+    # Invalid value for field 'resource.machineType': \
+    # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \
+    # 'machineTypes/a3-highgpu-8g'. \
+    # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature..
+    # ('This use case not thoroughly tested')
+    unittest.expectedFailure(self)
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      # CUDA > 12.4 not supported on older debian/ubuntu releases
-      self.expectedFailure()
+      # ('CUDA > 12.4 not supported on older debian/ubuntu releases')
+      unittest.expectedFailure(self)
 
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.expectedFailure()
-      # CUDA <= 12.0 not supported on Dataproc 2.2
+      # ('CUDA <= 12.0 not supported on Dataproc 2.2')
+      unittest.expectedFailure(self)
 
     metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version)
 
@@ -319,8 +322,8 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      self.expectedFailure()
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
 
     metadata = None
     if driver_provider is not None:
@@ -349,27 +352,22 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
-#    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-#    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-#      # CUDA == 12.0 not supported on debian 12
-#      self.expectedFailure()
-
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      # CUDA > 12.4 not supported on older debian/ubuntu releases
-      self.expectedFailure()
+      # ('CUDA > 12.4 not supported on older debian/ubuntu releases')
+      unittest.expectedFailure(self)
 
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      # CUDA <= 12.0 not supported on Dataproc 2.2
-      self.expectedFailure()
+      # ('CUDA <= 12.0 not supported on Dataproc 2.2')
+      unittest.expectedFailure(self)
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty
-      self.expectedFailure()
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
 
     metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -405,8 +403,8 @@ def tests_driver_signing(self, configuration, machine_suffixes,
 
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # KERBEROS fails on 2.1
-      configuration="STANDARD"
+      # ('KERBEROS fails with image version <= 2.1')
+      unittest.expectedFailure(self)
 
     if self.getImageOs() != image_os:
       self.skipTest("This test is only run on os {}".format(image_os))

From 12e6de99310e54e39cd39c63f34b4df854ab46a6 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sun, 15 Dec 2024 18:36:30 -0800
Subject: [PATCH 070/112] skipping expected failures

---
 gpu/test_gpu.py | 81 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 21 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index a07ade732..f260d5927 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -27,9 +27,9 @@ class NvidiaGpuDriverTestCase(DataprocTestCase):
   TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py"
 
   def assert_instance_command(self,
-                            instance,
-                            cmd,
-                            timeout_in_minutes=DEFAULT_TIMEOUT):
+                             instance,
+                             cmd,
+                             timeout_in_minutes=DEFAULT_TIMEOUT):
 
     retry_count = 5
 
@@ -119,6 +119,22 @@ def verify_instance_spark(self):
       +   "spark.yarn.unmanagedAM.enabled=false"
     )
 
+  def verify_driver_signature(self, name):
+    cert_path='/var/lib/dkms/mok.pub'
+    if self.getImageOs() == 'ubuntu':
+      cert_path='/var/lib/shim-signed/mok/MOK.der'
+
+    cert_verification_cmd = """
+perl -Mv5.10 -e '
+my $cert = ( qx{openssl x509 -inform DER -in {} -text}
+             =~ /Serial Number:.*? +(.+?)\s*$/ms );
+my $kmod = ( qx{modinfo nvidia}
+             =~ /^sig_key:\s+(\S+)/ms );
+exit 1 unless $cert eq lc $kmod
+'
+"""
+    self.assert_instance_command( name, cert_verification_cmd.format(cert_path) )
+
   @parameterized.parameters(
       ("SINGLE",   ["m"], GPU_T4, None, None),
 #      ("STANDARD", ["m"], GPU_T4, None, None),
@@ -134,6 +150,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
       unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = None
     if driver_provider is not None:
@@ -168,6 +185,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
       unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     if driver_provider is not None:
       metadata += ",gpu-driver-provider={}".format(driver_provider)
@@ -198,6 +216,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('KERBEROS fails with image version <= 2.1')
       unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
@@ -231,23 +250,24 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('KERBEROS fails with image version <= 2.1')
       unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      # ('CUDA > 12.4 not supported on older debian/ubuntu releases')
-      unittest.expectedFailure(self)
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      # ('CUDA <= 12.0 not supported on Dataproc 2.2')
-      unittest.expectedFailure(self)
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
       unittest.expectedFailure(self)
+      self.skipTest("known to fail")
+
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -282,17 +302,16 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
     # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature..
     # ('This use case not thoroughly tested')
     unittest.expectedFailure(self)
+    self.skipTest("known to fail")
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      # ('CUDA > 12.4 not supported on older debian/ubuntu releases')
-      unittest.expectedFailure(self)
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      # ('CUDA <= 12.0 not supported on Dataproc 2.2')
-      unittest.expectedFailure(self)
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
 
     metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version)
 
@@ -324,6 +343,7 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
       unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = None
     if driver_provider is not None:
@@ -355,19 +375,18 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      # ('CUDA > 12.4 not supported on older debian/ubuntu releases')
-      unittest.expectedFailure(self)
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
     if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      # ('CUDA <= 12.0 not supported on Dataproc 2.2')
-      unittest.expectedFailure(self)
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
 
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
       unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -390,10 +409,10 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
   @parameterized.parameters(
 #    ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''),
 #    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
-    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'),
+#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'),
     ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'),
-    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'),
-    ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'),
+#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'),
+#    ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
   )
@@ -401,10 +420,29 @@ def tests_driver_signing(self, configuration, machine_suffixes,
                            master_accelerator, worker_accelerator,
                            cuda_version, image_os, image_version):
 
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
+    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('KERBEROS fails with image version <= 2.1')
       unittest.expectedFailure(self)
+      self.skipTest("known to fail")
+
+    kvp_array=[]
+    import os
+
+    if "private_secret_name" in os.environ:
+      for env_var in ['public_secret_name', 'private_secret_name', 'secret_project', 'secret_version' 'modulus_md5sum']:
+        kvp_array.append( "{}={}".format( env_var, os.environ[env_var] ) )
+
+      if kvp_array[0] == "public_secret_name=":
+        self.skipTest("This test only runs when signing environment has been configured in presubmit.sh")
+    else:
+      self.skipTest("This test only runs when signing environment has been configured in presubmit.sh")
+
+    metadata = ",".join( kvp_array )
 
     if self.getImageOs() != image_os:
       self.skipTest("This test is only run on os {}".format(image_os))
@@ -422,9 +460,10 @@ def tests_driver_signing(self, configuration, machine_suffixes,
       boot_disk_size="50GB",
       scopes="https://www.googleapis.com/auth/monitoring.write")
     for machine_suffix in machine_suffixes:
-      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-      self.verify_instance_gpu_agent(machine_name)
+      hostname="{}-{}".format(self.getClusterName(),machine_suffix)
+      self.verify_instance(hostname)
+      self.verify_instance_gpu_agent(hostname)
+#      self.verify_driver_signature(hostname)
 
     self.verify_instance_spark()
 

From f7bf9abb2081087851260fa00325048b9f43fa8e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 16 Dec 2024 14:27:27 -0800
Subject: [PATCH 071/112] updated manual-test-runner.sh instructions

---
 gpu/manual-test-runner.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh
index 2527d6fd9..0199d62ad 100644
--- a/gpu/manual-test-runner.sh
+++ b/gpu/manual-test-runner.sh
@@ -6,7 +6,7 @@
 #
 # git clone git@github.com:GoogleCloudDataproc/initialization-actions
 # cd initialization-actions
-# git checkout 2024.11
+# git checkout 2024.12
 # cp gpu/env.json.sample env.json
 # vi env.json
 # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest .

From 47a6e3b7314c11adf59369b161dcd9ce27443828 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 23 Dec 2024 15:35:20 -0800
Subject: [PATCH 072/112] this one generated from template after refactor

---
 gpu/install_gpu_driver.sh | 1190 +++++++++++++++++++++----------------
 1 file changed, 682 insertions(+), 508 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index a42c7f440..8a483ad40 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -11,6 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+#
+# This initialization action is generated from
+# initialization-actions/templates/gpu/install_gpu_driver.sh.in
+#
+# Modifications made directly to the generated file will be lost when
+# the template is re-evaluated
+
 #
 # This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
 
@@ -25,25 +33,30 @@ function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
 function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
 
-readonly -A supported_os=(
-  ['debian']="10 11 12"
-  ['rocky']="8 9"
-  ['ubuntu']="18.04 20.04 22.04"
-)
+function define_os_comparison_functions() {
+
+  readonly -A supported_os=(
+    ['debian']="10 11 12"
+    ['rocky']="8 9"
+    ['ubuntu']="18.04 20.04 22.04"
+  )
 
-# dynamically define OS version test utility functions
-if [[ "$(os_id)" == "rocky" ]];
-then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
-else _os_version="$(os_version)"; fi
-for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
-  eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
-
-  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
-    eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
-    eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
-    eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+  # dynamically define OS version test utility functions
+  if [[ "$(os_id)" == "rocky" ]];
+  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
+  else _os_version="$(os_version)"; fi
+  for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
+    eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
+
+    for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
+      eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
+      eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
+      eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+    done
   done
-done
+}
+
+define_os_comparison_functions
 
 function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
 
@@ -118,24 +131,346 @@ function get_metadata_attribute() (
   get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
 )
 
-OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-readonly OS_NAME
+function execute_with_retries() (
+  set +x
+  local -r cmd="$*"
 
-# Fetch SPARK config
-SPARK_VERSION_ENV="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
-readonly SPARK_VERSION_ENV
-if version_ge "${SPARK_VERSION_ENV}" "3.0" && \
-   version_lt "${SPARK_VERSION_ENV}" "4.0" ; then
-  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
-  readonly SPARK_VERSION="3.0"             # try ${SPARK_VERSION_ENV}
-else
-  echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
-  exit 1
-fi
+  if [[ "$cmd" =~ "^apt-get install" ]] ; then
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+  fi
+  for ((i = 0; i < 3; i++)); do
+    set -x
+    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
+    set +x
+    if [[ $retval == 0 ]] ; then return 0 ; fi
+    sleep 5
+  done
+  return 1
+)
+
+function cache_fetched_package() {
+  local src_url="$1"
+  local gcs_fn="$2"
+  local local_fn="$3"
+
+  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
+    time gcloud storage cp "${gcs_fn}" "${local_fn}"
+  else
+    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
+           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
+  fi
+}
+
+function add_contrib_component() {
+  if ! is_debuntu ; then return ; fi
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-kernel-open-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib"
+
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
+  fi
+}
+
+function set_hadoop_property() {
+  local -r config_file=$1
+  local -r property=$2
+  local -r value=$3
+  "${bdcfg}" set_property \
+    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
+    --name "${property}" --value "${value}" \
+    --clobber
+}
+
+function configure_yarn_resources() {
+  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
+  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
+    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
+  fi
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
+
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+}
+
+# This configuration should be applied only if GPU is attached to the node
+function configure_yarn_nodemanager() {
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.container-executor.class' \
+    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+
+  # Fix local dirs access permissions
+  local yarn_local_dirs=()
+
+  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
+    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
+    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
+
+  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
+    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
+  fi
+}
+
+function clean_up_sources_lists() {
+  #
+  # bigtop (primary)
+  #
+  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
+
+  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
+    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
+
+    local regional_bigtop_repo_uri
+    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
+      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
+      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
+      cut -d ' ' -f 2 |
+      head -1)
+
+    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+    else
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+    fi
+
+    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
+    rm -f "${bigtop_kr_path}"
+    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
+
+    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  fi
+
+  #
+  # adoptium
+  #
+  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
+  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
+  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
+  rm -f "${adoptium_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+   | gpg --dearmor -o "${adoptium_kr_path}"
+  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
+   > /etc/apt/sources.list.d/adoptium.list
+
+
+  #
+  # docker
+  #
+  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
+  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
+  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
+
+  rm -f "${docker_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+    | gpg --dearmor -o "${docker_kr_path}"
+  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
+    > ${docker_repo_file}
+
+  #
+  # google cloud + logging/monitoring
+  #
+  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
+    rm -f /usr/share/keyrings/cloud.google.gpg
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
+      list_file="/etc/apt/sources.list.d/${list}.list"
+      if [[ -f "${list_file}" ]]; then
+        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
+      fi
+    done
+  fi
+
+  #
+  # cran-r
+  #
+  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
+    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
+    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
+    rm -f /usr/share/keyrings/cran-r.gpg
+    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
+    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
+  fi
+
+  #
+  # mysql
+  #
+  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
+    rm -f /usr/share/keyrings/mysql.gpg
+    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
+    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
+  fi
+
+  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
+
+}
+
+function set_proxy(){
+  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
+
+  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
+
+  export METADATA_HTTP_PROXY
+  export http_proxy="${METADATA_HTTP_PROXY}"
+  export https_proxy="${METADATA_HTTP_PROXY}"
+  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
+  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
+  local no_proxy_svc
+  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
+                      bigquery composer      pubsub bigquerydatatransfer dataflow \
+                      storage  datafusion    ; do
+    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
+  done
+
+  export NO_PROXY="${no_proxy}"
+}
+
+function mount_ramdisk(){
+  local free_mem
+  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
+  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+
+  # Write to a ramdisk instead of churning the persistent disk
+
+  tmpdir="/mnt/shm"
+  mkdir -p "${tmpdir}"
+  mount -t tmpfs tmpfs "${tmpdir}"
+
+  # Download conda packages to tmpfs
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+
+  # Clear pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
+  pip cache purge || echo "unable to purge pip cache"
+
+  # Download pip packages to tmpfs
+  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+
+  # Download OS packages to tmpfs
+  if is_debuntu ; then
+    mount -t tmpfs tmpfs /var/cache/apt/archives
+  else
+    mount -t tmpfs tmpfs /var/cache/dnf
+  fi
+}
+
+function check_os() {
+  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
+      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+      exit 1
+  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
+      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
+      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
+  fi
+
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+  readonly SPARK_VERSION
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    exit 1
+  fi
+
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
+}
+
+#
+# Generate repo file under /etc/apt/sources.list.d/
+#
+function apt_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local -r include_src="${4:-yes}"
+  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
+
+  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
+  if [[ "${include_src}" == "yes" ]] ; then
+    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
+  fi
+
+  apt-get update -qq
+}
+
+#
+# Generate repo file under /etc/yum.repos.d/
+#
+function dnf_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
+  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
+
+  curl -s -L "${repo_url}" \
+    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
+    | dd of="${repo_path}" status=progress
+}
+
+#
+# Keyrings default to
+# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
+# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
+#
+function os_add_repo() {
+  local -r repo_name="$1"
+  local -r signing_key_url="$2"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local kr_path
+  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
+
+  mkdir -p "$(dirname "${kr_path}")"
+
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
+    | gpg --import --no-default-keyring --keyring "${kr_path}"
+
+  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
+                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
+}
+
+
+readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+
+# Dataproc configurations
+readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+readonly HIVE_CONF_DIR='/etc/hive/conf'
+readonly SPARK_CONF_DIR='/etc/spark/conf'
 
-# node role
-ROLE="$(get_metadata_attribute dataproc-role)"
-readonly ROLE
 
 function set_support_matrix() {
   # CUDA version and Driver version
@@ -190,8 +525,6 @@ function set_support_matrix() {
 
 set_support_matrix
 
-RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-
 function set_cuda_version() {
   local cuda_url
   cuda_url=$(get_metadata_attribute 'cuda-url' '')
@@ -211,6 +544,10 @@ function set_cuda_version() {
   readonly DEFAULT_CUDA_VERSION
 
   CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
+  if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then
+    CUDA_FULL_VERSION="${CUDA_VERSION}"
+    CUDA_VERSION="${CUDA_VERSION%.*}"
+  fi
   readonly CUDA_VERSION
   if ( ! test -v CUDA_FULL_VERSION ) ; then
     CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
@@ -309,8 +646,6 @@ readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USER
 USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
 readonly USERSPACE_FILENAME
 
-readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
-
 # Short name for urls
 if is_ubuntu22  ; then
     # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
@@ -459,33 +794,10 @@ readonly GPU_DRIVER_PROVIDER
 INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
 readonly INSTALL_GPU_AGENT
 
-# Dataproc configurations
-readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-readonly HIVE_CONF_DIR='/etc/hive/conf'
-readonly SPARK_CONF_DIR='/etc/spark/conf'
-
 NVIDIA_SMI_PATH='/usr/bin'
 MIG_MAJOR_CAPS=0
 IS_MIG_ENABLED=0
 
-function execute_with_retries() (
-  set +x
-  local -r cmd="$*"
-
-  if [[ "$cmd" =~ "^apt-get install" ]] ; then
-    apt-get -y clean
-    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
-  fi
-  for ((i = 0; i < 3; i++)); do
-    set -x
-    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
-    set +x
-    if [[ $retval == 0 ]] ; then return 0 ; fi
-    sleep 5
-  done
-  return 1
-)
-
 CUDA_KEYRING_PKG_INSTALLED="0"
 function install_cuda_keyring_pkg() {
   if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
@@ -503,20 +815,6 @@ function uninstall_cuda_keyring_pkg() {
   CUDA_KEYRING_PKG_INSTALLED="0"
 }
 
-function cache_fetched_package() {
-  local src_url="$1"
-  local gcs_fn="$2"
-  local local_fn="$3"
-
-  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
-    time gcloud storage cp "${gcs_fn}" "${local_fn}"
-  else
-    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
-           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
-  fi
-}
-
-
 function install_local_cuda_repo() {
   if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi
 
@@ -719,7 +1017,6 @@ function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
 
 function install_nvidia_cudnn() {
   if test -f "${workdir}/cudnn-complete" ; then return ; fi
-
   local major_version
   major_version="${CUDNN_VERSION%%.*}"
   local cudnn_pkg_version
@@ -743,132 +1040,42 @@ function install_nvidia_cudnn() {
     if ge_debian12 && is_src_os ; then
       apt-get -y install nvidia-cudnn
     else
-      if is_cudnn8 ; then
-        install_local_cudnn8_repo
-
-        apt-get update -qq
-
-        execute_with_retries \
-          apt-get -y install --no-install-recommends \
-            "libcudnn8=${cudnn_pkg_version}" \
-            "libcudnn8-dev=${cudnn_pkg_version}"
-
-        uninstall_local_cudnn8_repo
-	sync
-      elif is_cudnn9 ; then
-	install_cuda_keyring_pkg
-
-        apt-get update -qq
-
-        execute_with_retries \
-          apt-get -y install --no-install-recommends \
-          "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
-          "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
-          "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
-	sync
-      else
-        echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
-      fi
-    fi
-  else
-    echo "Unsupported OS: '${_shortname}'"
-    exit 1
-  fi
-
-  ldconfig
-
-  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
-  touch "${workdir}/cudnn-complete"
-}
-
-function configure_dkms_certs() {
-  if test -v PSN && [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping";
-      return 0
-  fi
-
-  mkdir -p "${CA_TMPDIR}"
-
-  # If the private key exists, verify it
-  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
-    echo "Private key material exists"
-
-    local expected_modulus_md5sum
-    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
-    if [[ -n "${expected_modulus_md5sum}" ]]; then
-      modulus_md5sum="${expected_modulus_md5sum}"
-
-      # Verify that cert md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched rsa key"
-      fi
-
-      # Verify that key md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched x509 cert"
-      fi
-    else
-      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
-    fi
-    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-
-    return
-  fi
-
-  # Retrieve cloud secrets keys
-  local sig_priv_secret_name
-  sig_priv_secret_name="${PSN}"
-  local sig_pub_secret_name
-  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
-  local sig_secret_project
-  sig_secret_project="$(get_metadata_attribute secret_project)"
-  local sig_secret_version
-  sig_secret_version="$(get_metadata_attribute secret_version)"
-
-  # If metadata values are not set, do not write mok keys
-  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
-
-  # Write private material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_priv_secret_name}" \
-      | dd status=none of="${CA_TMPDIR}/db.rsa"
-
-  # Write public material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_pub_secret_name}" \
-      | base64 --decode \
-      | dd status=none of="${CA_TMPDIR}/db.der"
+      if is_cudnn8 ; then
+        install_local_cudnn8_repo
 
-  local mok_directory="$(dirname "${mok_key}")"
-  mkdir -p "${mok_directory}"
+        apt-get update -qq
 
-  # symlink private key and copy public cert from volatile storage to DKMS directory
-  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+            "libcudnn8=${cudnn_pkg_version}" \
+            "libcudnn8-dev=${cudnn_pkg_version}"
 
-  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
-}
+        uninstall_local_cudnn8_repo
+	sync
+      elif is_cudnn9 ; then
+	install_cuda_keyring_pkg
 
-function clear_dkms_key {
-  if [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping" >&2
-      return 0
+        apt-get update -qq
+
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+          "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
+	sync
+      else
+        echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
+      fi
+    fi
+  else
+    echo "Unsupported OS: '${_shortname}'"
+    exit 1
   fi
-  rm -rf "${CA_TMPDIR}" "${mok_key}"
-}
 
-function add_contrib_component() {
-  if ge_debian12 ; then
-      # Include in sources file components on which nvidia-kernel-open-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib"
+  ldconfig
 
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
-  fi
+  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
+  touch "${workdir}/cudnn-complete"
 }
 
 function add_nonfree_components() {
@@ -884,20 +1091,21 @@ function add_nonfree_components() {
   fi
 }
 
+#
+# Install package signing key and add corresponding repository
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
 function add_repo_nvidia_container_toolkit() {
-  if is_debuntu ; then
-      local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
-      local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list
-      # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
-      test -f "${kr_path}" ||
-        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
-          | gpg --dearmor -o "${kr_path}"
-
-      test -f "${sources_list_path}" ||
-        curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
-          | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \
-          | tee "${sources_list_path}"
-  fi
+  local nvctk_root="https://nvidia.github.io/libnvidia-container"
+  local signing_key_url="${nvctk_root}/gpgkey"
+  local repo_data
+
+  if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
+                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
+
+  os_add_repo nvidia-container-toolkit \
+              "${signing_key_url}" \
+              "${repo_data}" \
+              "no"
 }
 
 function add_repo_cuda() {
@@ -1150,27 +1358,44 @@ function install_cuda(){
   # The OS package distributions are unreliable
   install_cuda_runfile
 
-  # Includes cudNN packages
+  # Includes CUDA packages
   add_repo_cuda
 
   touch "${workdir}/cuda-repo-complete"
 }
 
+function install_nvidia_container_toolkit() {
+  local container_runtime_default
+    if command -v docker     ; then container_runtime_default='docker'
+  elif command -v containerd ; then container_runtime_default='containerd'
+  elif command -v crio       ; then container_runtime_default='crio'
+                               else container_runtime_default='' ; fi
+  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
+
+  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi
+
+  add_repo_nvidia_container_toolkit
+  if is_debuntu ; then
+    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
+    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
+  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
+  systemctl restart "${CONTAINER_RUNTIME}"
+}
+
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
   if test -f "${workdir}/gpu-driver-complete" ; then return ; fi
+
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
-    add_repo_nvidia_container_toolkit
     apt-get update -qq
     apt-get -yq install \
-          nvidia-container-toolkit \
-          dkms \
-          nvidia-open-kernel-dkms \
-          nvidia-open-kernel-support \
-          nvidia-smi \
-          libglvnd0 \
-          libcuda1
+        dkms \
+        nvidia-open-kernel-dkms \
+        nvidia-open-kernel-support \
+        nvidia-smi \
+        libglvnd0 \
+        libcuda1
     echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
     return 0
   fi
@@ -1244,60 +1469,6 @@ EOF
   systemctl --no-reload --now enable gpu-utilization-agent.service
 }
 
-function set_hadoop_property() {
-  local -r config_file=$1
-  local -r property=$2
-  local -r value=$3
-  "${bdcfg}" set_property \
-    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
-    --name "${property}" --value "${value}" \
-    --clobber
-}
-
-function configure_yarn_resources() {
-  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
-  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
-    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
-  fi
-  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
-
-  set_hadoop_property 'capacity-scheduler.xml' \
-    'yarn.scheduler.capacity.resource-calculator' \
-    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
-
-  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
-}
-
-# This configuration should be applied only if GPU is attached to the node
-function configure_yarn_nodemanager() {
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.container-executor.class' \
-    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
-
-  # Fix local dirs access permissions
-  local yarn_local_dirs=()
-
-  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
-    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
-    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
-
-  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
-    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
-  fi
-}
-
 function configure_gpu_exclusive_mode() {
   # check if running spark 3, if not, enable GPU exclusive mode
   local spark_version
@@ -1429,53 +1600,239 @@ function nvsmi() {
   "${nvsmi}" $*
 }
 
-function install_build_dependencies() {
-  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi
+function install_build_dependencies() {
+  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi
+
+  if is_debuntu ; then
+    if is_ubuntu22 && is_cuda12 ; then
+      # On ubuntu22, the default compiler does not build some kernel module versions
+      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
+      execute_with_retries apt-get install -y -qq gcc-12
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-12
+    fi
+
+  elif is_rocky ; then
+    execute_with_retries dnf -y -q install gcc
+
+    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
+    set +e
+    eval "${dnf_cmd}" > "${install_log}" 2>&1
+    local retval="$?"
+    set -e
+
+    if [[ "${retval}" == "0" ]] ; then return ; fi
+
+    if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
+      # this kernel-devel may have been migrated to the vault
+      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
+      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
+      dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
+        "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
+       )"
+    fi
+
+    execute_with_retries "${dnf_cmd}"
+  fi
+  touch "${workdir}/build-dependencies-complete"
+}
+
+function install_dependencies() {
+  pkg_list="pciutils screen"
+  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
+  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
+}
+
+function prepare_gpu_env(){
+  # Verify SPARK compatability
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+
+  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
+  nvsmi_works="0"
+
+  if   is_cuda11 ; then gcc_ver="11"
+  elif is_cuda12 ; then gcc_ver="12" ; fi
+}
+
+# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
+# Users should run apt-mark unhold before they wish to upgrade these packages
+function hold_nvidia_packages() {
+  apt-mark hold nvidia-*
+  apt-mark hold libnvidia-*
+  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
+    apt-mark hold xserver-xorg-video-nvidia*
+  fi
+}
+
+function delete_mig_instances() (
+  # delete all instances
+  set +e
+  nvidia-smi mig -dci
+
+  case "${?}" in
+    "0" ) echo "compute instances deleted"            ;;
+    "2" ) echo "invalid argument"                     ;;
+    "6" ) echo "No compute instances found to delete" ;;
+    *   ) echo "unrecognized return code"             ;;
+  esac
+
+  nvidia-smi mig -dgi
+  case "${?}" in
+    "0" ) echo "compute instances deleted"        ;;
+    "2" ) echo "invalid argument"                 ;;
+    "6" ) echo "No GPU instances found to delete" ;;
+    *   ) echo "unrecognized return code"         ;;
+  esac
+)
+
+# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles
+function configure_mig_cgi() {
+  delete_mig_instances
+  META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')"
+  if test -n "${META_MIG_CGI_VALUE}"; then
+    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
+  else
+    if lspci | grep -q H100 ; then
+      # run the following command to list placement profiles
+      # nvidia-smi mig -lgipp
+      #
+      # This is the result when using H100 instances on 20241220
+      # GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
+      # GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
+      # GPU  0 Profile ID 14 Placements: {0,2,4}:2
+      # GPU  0 Profile ID  9 Placements: {0,4}:4
+      # GPU  0 Profile ID  5 Placement : {0}:4
+      # GPU  0 Profile ID  0 Placement : {0}:8
+
+      # For H100 3D controllers, use profile 19, 7x1G instances
+      nvidia-smi mig -cgi 19 -C
+    elif lspci | grep -q A100 ; then
+      # Dataproc only supports A100s right now split in 2 if not specified
+      # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
+      nvidia-smi mig -cgi 9,9 -C
+    else
+      echo "unrecognized 3D controller"
+    fi
+  fi
+}
+
+function enable_mig() {
+  nvidia-smi -mig 1
+}
+
+
+function configure_dkms_certs() {
+  if test -v PSN && [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping";
+      return 0
+  fi
+
+  mkdir -p "${CA_TMPDIR}"
+
+  # If the private key exists, verify it
+  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
+    echo "Private key material exists"
+
+    local expected_modulus_md5sum
+    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
+      modulus_md5sum="${expected_modulus_md5sum}"
+
+      # Verify that cert md5sum matches expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key"
+      fi
+
+      # Verify that key md5sum matches expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert"
+      fi
+    else
+      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
+    fi
+    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+
+    return
+  fi
+
+  # Retrieve cloud secrets keys
+  local sig_priv_secret_name
+  sig_priv_secret_name="${PSN}"
+  local sig_pub_secret_name
+  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
+  local sig_secret_project
+  sig_secret_project="$(get_metadata_attribute secret_project)"
+  local sig_secret_version
+  sig_secret_version="$(get_metadata_attribute secret_version)"
+
+  # If metadata values are not set, do not write mok keys
+  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
+
+  # Write private material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_priv_secret_name}" \
+      | dd status=none of="${CA_TMPDIR}/db.rsa"
+
+  # Write public material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_pub_secret_name}" \
+      | base64 --decode \
+      | dd status=none of="${CA_TMPDIR}/db.der"
+
+  local mok_directory="$(dirname "${mok_key}")"
+  mkdir -p "${mok_directory}"
+
+  # symlink private key and copy public cert from volatile storage to DKMS directory
+  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
+}
+
+function clear_dkms_key {
+  if [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping" >&2
+      return 0
+  fi
+  rm -rf "${CA_TMPDIR}" "${mok_key}"
+}
 
-  if is_debuntu ; then
-    if is_ubuntu22 && is_cuda12 ; then
-      # On ubuntu22, the default compiler does not build some kernel module versions
-      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
-      execute_with_retries apt-get install -y -qq gcc-12
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
-      update-alternatives --set gcc /usr/bin/gcc-12
-    fi
+function check_secure_boot() {
+  local SECURE_BOOT="disabled"
+  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
 
-  elif is_rocky ; then
-    execute_with_retries dnf -y -q install gcc
+  PSN="$(get_metadata_attribute private_secret_name)"
+  readonly PSN
 
-    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
-    set +e
-    eval "${dnf_cmd}" > "${install_log}" 2>&1
-    local retval="$?"
-    set -e
+  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
+    exit 1
+  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
+    echo "Secure boot is enabled, but no signing material provided."
+    echo "Please either disable secure boot or provide signing material as per"
+    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+    return 1
+  fi
 
-    if [[ "${retval}" == "0" ]] ; then return ; fi
+  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
+  readonly CA_TMPDIR
 
-    if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
-      # this kernel-devel may have been migrated to the vault
-      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
-      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
-      dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
-        "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
-       )"
-    fi
+  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
+                      mok_der=/var/lib/shim-signed/mok/MOK.der
+                 else mok_key=/var/lib/dkms/mok.key
+                      mok_der=/var/lib/dkms/mok.pub ; fi
 
-    execute_with_retries "${dnf_cmd}"
-  fi
-  touch "${workdir}/build-dependencies-complete"
+  configure_dkms_certs
 }
 
-function install_dependencies() {
-  pkg_list="pciutils screen"
-  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
-  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
-}
 
 function main() {
   # This configuration should be run on all nodes
@@ -1503,9 +1860,8 @@ function main() {
     # if mig is enabled drivers would have already been installed
     if [[ $IS_MIG_ENABLED -eq 0 ]]; then
       install_nvidia_gpu_driver
-
+      install_nvidia_container_toolkit
       install_cuda
-
       load_kernel_module
 
       if [[ -n ${CUDNN_VERSION} ]]; then
@@ -1556,109 +1912,11 @@ function main() {
   fi
 
   # Restart YARN services if they are running already
-  if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then
-    systemctl restart hadoop-yarn-resourcemanager.service
-  fi
-  if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then
-    systemctl restart hadoop-yarn-nodemanager.service
-  fi
-}
-
-function clean_up_sources_lists() {
-  #
-  # bigtop (primary)
-  #
-  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
-
-  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
-    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
-
-    local regional_bigtop_repo_uri
-    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
-      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
-      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
-      cut -d ' ' -f 2 |
-      head -1)
-
-    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
-    else
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+  for svc in resourcemanager nodemanager; do
+    if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
+      systemctl restart hadoop-yarn-${svc}.service
     fi
-
-    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
-    rm -f "${bigtop_kr_path}"
-    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
-
-    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-  fi
-
-  #
-  # adoptium
-  #
-  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
-  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
-  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
-  rm -f "${adoptium_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
-   | gpg --dearmor -o "${adoptium_kr_path}"
-  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
-   > /etc/apt/sources.list.d/adoptium.list
-
-
-  #
-  # docker
-  #
-  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
-  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
-  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
-
-  rm -f "${docker_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
-    | gpg --dearmor -o "${docker_kr_path}"
-  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
-    > ${docker_repo_file}
-
-  #
-  # google cloud + logging/monitoring
-  #
-  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
-    rm -f /usr/share/keyrings/cloud.google.gpg
-    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
-    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
-      list_file="/etc/apt/sources.list.d/${list}.list"
-      if [[ -f "${list_file}" ]]; then
-        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
-      fi
-    done
-  fi
-
-  #
-  # cran-r
-  #
-  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
-    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
-    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
-    rm -f /usr/share/keyrings/cran-r.gpg
-    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
-      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
-    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
-  fi
-
-  #
-  # mysql
-  #
-  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
-    rm -f /usr/share/keyrings/mysql.gpg
-    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
-      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
-    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
-  fi
-
-  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
-
+  done
 }
 
 function exit_handler() {
@@ -1694,6 +1952,7 @@ function exit_handler() {
     # re-hold systemd package
     if ge_debian12 ; then
     apt-mark hold systemd libsystemd0 ; fi
+    hold_nvidia_packages
   else
     dnf clean all
   fi
@@ -1761,55 +2020,21 @@ print( "    samples-taken: ", scalar @siz, $/,
   return 0
 }
 
-function set_proxy(){
-  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
-
-  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
-
-  export METADATA_HTTP_PROXY
-  export http_proxy="${METADATA_HTTP_PROXY}"
-  export https_proxy="${METADATA_HTTP_PROXY}"
-  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
-  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
-  export no_proxy=metadata.google.internal,169.254.169.254
-  export NO_PROXY=metadata.google.internal,169.254.169.254
-}
-
-function mount_ramdisk(){
-  local free_mem
-  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
-
-  # Write to a ramdisk instead of churning the persistent disk
-
-  tmpdir="/mnt/shm"
-  mkdir -p "${tmpdir}"
-  mount -t tmpfs tmpfs "${tmpdir}"
-
-  # Download conda packages to tmpfs
-  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
-
-  # Clear pip cache
-  # TODO: make this conditional on which OSs have pip without cache purge
-  pip cache purge || echo "unable to purge pip cache"
+function prepare_to_install(){
+  # Verify OS compatability and Secure boot state
+  check_os
+  check_secure_boot
 
-  # Download pip packages to tmpfs
-  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+  prepare_gpu_env
 
-  # Download OS packages to tmpfs
-  if is_debuntu ; then
-    mount -t tmpfs tmpfs /var/cache/apt/archives
-  else
-    mount -t tmpfs tmpfs /var/cache/dnf
-  fi
-}
+  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
+  readonly OS_NAME
 
-function prepare_to_install(){
-  # Verify OS compatability and Secure boot state
-  check_os_and_secure_boot
+  # node role
+  ROLE="$(get_metadata_attribute dataproc-role)"
+  readonly ROLE
 
   workdir=/opt/install-dpgce
-  nvsmi_works="0"
   tmpdir=/tmp/
   temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
   readonly temp_bucket
@@ -1818,39 +2043,14 @@ function prepare_to_install(){
   readonly uname_r
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
-  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
-  readonly CA_TMPDIR
-  PSN="$(get_metadata_attribute private_secret_name)"
-  readonly PSN
-
-  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
-                      mok_der=/var/lib/shim-signed/mok/MOK.der
-                 else mok_key=/var/lib/dkms/mok.key
-                      mok_der=/var/lib/dkms/mok.pub ; fi
-
-  if   is_cuda11 ; then gcc_ver="11"
-  elif is_cuda12 ; then gcc_ver="12" ; fi
 
   mkdir -p "${workdir}"
   trap exit_handler EXIT
   set_proxy
   mount_ramdisk
-  configure_dkms_certs
 
   readonly install_log="${tmpdir}/install.log"
 
-  # Detect dataproc image version
-  if (! test -v DATAPROC_IMAGE_VERSION) ; then
-    if test -v DATAPROC_VERSION ; then
-      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
-    else
-      if   version_lt "${SPARK_VERSION_ENV}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
-      elif version_lt "${SPARK_VERSION_ENV}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
-      elif version_lt "${SPARK_VERSION_ENV}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
-      else echo "Unknown dataproc image version" ; exit 1 ; fi
-    fi
-  fi
-
   if test -f "${workdir}/prepare-complete" ; then return ; fi
 
   repair_old_backports
@@ -1882,32 +2082,6 @@ function prepare_to_install(){
   touch "${workdir}/prepare-complete"
 }
 
-# Verify if compatible linux distros and secure boot options are used
-function check_os_and_secure_boot() {
-  local SECURE_BOOT="disabled"
-  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
-  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
-      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
-      exit 1
-  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
-      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
-      exit 1
-  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
-      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
-      exit 1
-  fi
-
-  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
-    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
-    exit 1
-  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
-    echo "Secure boot is enabled, but no signing material provided."
-    echo "Please either disable secure boot or provide signing material as per"
-    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
-    return 1
-  fi
-}
-
 prepare_to_install
 
 main

From 26719af037ee77ecfb8328dec04931ba5b032abd Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 23 Dec 2024 20:26:25 -0800
Subject: [PATCH 073/112] do not point to local rpm pgp key

---
 gpu/install_gpu_driver.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 8a483ad40..d485e19ce 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -437,8 +437,8 @@ function dnf_add_repo() {
   local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
 
   curl -s -L "${repo_url}" \
-    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
     | dd of="${repo_path}" status=progress
+#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
 }
 
 #

From 74c09f4e6362b131b2e165eded7869b74c8247da Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 23 Dec 2024 23:58:40 -0800
Subject: [PATCH 074/112] re-ordering to reduce delta from master

---
 gpu/install_gpu_driver.sh | 1025 ++++++++++++++++++-------------------
 1 file changed, 503 insertions(+), 522 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index d485e19ce..8164fc44e 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -12,13 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#
-# This initialization action is generated from
-# initialization-actions/templates/gpu/install_gpu_driver.sh.in
-#
-# Modifications made directly to the generated file will be lost when
-# the template is re-evaluated
-
 #
 # This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
 
@@ -33,30 +26,25 @@ function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
 function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
 
-function define_os_comparison_functions() {
-
-  readonly -A supported_os=(
-    ['debian']="10 11 12"
-    ['rocky']="8 9"
-    ['ubuntu']="18.04 20.04 22.04"
-  )
+readonly -A supported_os=(
+  ['debian']="10 11 12"
+  ['rocky']="8 9"
+  ['ubuntu']="18.04 20.04 22.04"
+)
 
-  # dynamically define OS version test utility functions
-  if [[ "$(os_id)" == "rocky" ]];
-  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
-  else _os_version="$(os_version)"; fi
-  for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
-    eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
-
-    for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
-      eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
-      eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
-      eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
-    done
+# dynamically define OS version test utility functions
+if [[ "$(os_id)" == "rocky" ]];
+then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
+else _os_version="$(os_version)"; fi
+for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
+  eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
+
+  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
+    eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
+    eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
+    eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
   done
-}
-
-define_os_comparison_functions
+done
 
 function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
 
@@ -131,399 +119,64 @@ function get_metadata_attribute() (
   get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
 )
 
-function execute_with_retries() (
-  set +x
-  local -r cmd="$*"
-
-  if [[ "$cmd" =~ "^apt-get install" ]] ; then
-    apt-get -y clean
-    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
-  fi
-  for ((i = 0; i < 3; i++)); do
-    set -x
-    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
-    set +x
-    if [[ $retval == 0 ]] ; then return 0 ; fi
-    sleep 5
-  done
-  return 1
+OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+readonly OS_NAME
+
+# node role
+ROLE="$(get_metadata_attribute dataproc-role)"
+readonly ROLE
+
+# CUDA version and Driver version
+# https://docs.nvidia.com/deploy/cuda-compatibility/
+# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+# https://developer.nvidia.com/cuda-downloads
+
+# Minimum supported version for open kernel driver is 515.43.04
+# https://github.com/NVIDIA/open-gpu-kernel-modules/tags
+# Rocky8: 12.0: 525.147.05
+latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
+readonly -A DRIVER_FOR_CUDA=(
+        ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
+        ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+)
+readonly -A DRIVER_SUBVER=(
+        ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
+        ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
+)
+# https://developer.nvidia.com/cudnn-downloads
+if is_debuntu ; then
+readonly -A CUDNN_FOR_CUDA=(
+        ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
+        ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+)
+elif is_rocky ; then
+# rocky:
+#   12.0: 8.8.1.3
+#   12.1: 8.9.3.28
+#   12.2: 8.9.7.29
+#   12.3: 9.0.0.312
+#   12.4: 9.1.1.17
+#   12.5: 9.2.1.18
+#   12.6: 9.5.1.17
+readonly -A CUDNN_FOR_CUDA=(
+        ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
+        ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
+)
+fi
+# https://developer.nvidia.com/nccl/nccl-download
+# 12.2: 2.19.3, 12.5: 2.21.5
+readonly -A NCCL_FOR_CUDA=(
+        ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
+        ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
+)
+readonly -A CUDA_SUBVER=(
+        ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
+        ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
 )
 
-function cache_fetched_package() {
-  local src_url="$1"
-  local gcs_fn="$2"
-  local local_fn="$3"
-
-  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
-    time gcloud storage cp "${gcs_fn}" "${local_fn}"
-  else
-    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
-           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
-  fi
-}
-
-function add_contrib_component() {
-  if ! is_debuntu ; then return ; fi
-  if ge_debian12 ; then
-      # Include in sources file components on which nvidia-kernel-open-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib"
-
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
-  fi
-}
-
-function set_hadoop_property() {
-  local -r config_file=$1
-  local -r property=$2
-  local -r value=$3
-  "${bdcfg}" set_property \
-    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
-    --name "${property}" --value "${value}" \
-    --clobber
-}
-
-function configure_yarn_resources() {
-  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
-  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
-    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
-  fi
-  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
-
-  set_hadoop_property 'capacity-scheduler.xml' \
-    'yarn.scheduler.capacity.resource-calculator' \
-    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
-
-  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
-}
-
-# This configuration should be applied only if GPU is attached to the node
-function configure_yarn_nodemanager() {
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.container-executor.class' \
-    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
-
-  # Fix local dirs access permissions
-  local yarn_local_dirs=()
-
-  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
-    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
-    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
-
-  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
-    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
-  fi
-}
-
-function clean_up_sources_lists() {
-  #
-  # bigtop (primary)
-  #
-  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
-
-  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
-    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
-
-    local regional_bigtop_repo_uri
-    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
-      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
-      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
-      cut -d ' ' -f 2 |
-      head -1)
-
-    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
-    else
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
-    fi
-
-    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
-    rm -f "${bigtop_kr_path}"
-    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
-
-    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-  fi
-
-  #
-  # adoptium
-  #
-  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
-  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
-  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
-  rm -f "${adoptium_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
-   | gpg --dearmor -o "${adoptium_kr_path}"
-  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
-   > /etc/apt/sources.list.d/adoptium.list
-
-
-  #
-  # docker
-  #
-  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
-  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
-  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
-
-  rm -f "${docker_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
-    | gpg --dearmor -o "${docker_kr_path}"
-  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
-    > ${docker_repo_file}
-
-  #
-  # google cloud + logging/monitoring
-  #
-  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
-    rm -f /usr/share/keyrings/cloud.google.gpg
-    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
-    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
-      list_file="/etc/apt/sources.list.d/${list}.list"
-      if [[ -f "${list_file}" ]]; then
-        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
-      fi
-    done
-  fi
-
-  #
-  # cran-r
-  #
-  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
-    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
-    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
-    rm -f /usr/share/keyrings/cran-r.gpg
-    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
-      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
-    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
-  fi
-
-  #
-  # mysql
-  #
-  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
-    rm -f /usr/share/keyrings/mysql.gpg
-    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
-      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
-    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
-  fi
-
-  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
-
-}
-
-function set_proxy(){
-  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
-
-  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
-
-  export METADATA_HTTP_PROXY
-  export http_proxy="${METADATA_HTTP_PROXY}"
-  export https_proxy="${METADATA_HTTP_PROXY}"
-  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
-  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
-  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
-  local no_proxy_svc
-  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
-                      bigquery composer      pubsub bigquerydatatransfer dataflow \
-                      storage  datafusion    ; do
-    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
-  done
-
-  export NO_PROXY="${no_proxy}"
-}
-
-function mount_ramdisk(){
-  local free_mem
-  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
-
-  # Write to a ramdisk instead of churning the persistent disk
-
-  tmpdir="/mnt/shm"
-  mkdir -p "${tmpdir}"
-  mount -t tmpfs tmpfs "${tmpdir}"
-
-  # Download conda packages to tmpfs
-  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
-
-  # Clear pip cache
-  # TODO: make this conditional on which OSs have pip without cache purge
-  pip cache purge || echo "unable to purge pip cache"
-
-  # Download pip packages to tmpfs
-  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
-
-  # Download OS packages to tmpfs
-  if is_debuntu ; then
-    mount -t tmpfs tmpfs /var/cache/apt/archives
-  else
-    mount -t tmpfs tmpfs /var/cache/dnf
-  fi
-}
-
-function check_os() {
-  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
-      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
-      exit 1
-  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
-      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
-      exit 1
-  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
-      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
-      exit 1
-  fi
-
-  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
-  readonly SPARK_VERSION
-  if version_lt "${SPARK_VERSION}" "3.1" || \
-     version_ge "${SPARK_VERSION}" "4.0" ; then
-    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
-    exit 1
-  fi
-
-  # Detect dataproc image version
-  if (! test -v DATAPROC_IMAGE_VERSION) ; then
-    if test -v DATAPROC_VERSION ; then
-      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
-    else
-      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
-      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
-      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
-      else echo "Unknown dataproc image version" ; exit 1 ; fi
-    fi
-  fi
-}
-
-#
-# Generate repo file under /etc/apt/sources.list.d/
-#
-function apt_add_repo() {
-  local -r repo_name="$1"
-  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
-  local -r include_src="${4:-yes}"
-  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
-  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
-
-  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
-  if [[ "${include_src}" == "yes" ]] ; then
-    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
-  fi
-
-  apt-get update -qq
-}
-
-#
-# Generate repo file under /etc/yum.repos.d/
-#
-function dnf_add_repo() {
-  local -r repo_name="$1"
-  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
-  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
-  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
-
-  curl -s -L "${repo_url}" \
-    | dd of="${repo_path}" status=progress
-#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
-}
-
-#
-# Keyrings default to
-# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
-# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
-#
-function os_add_repo() {
-  local -r repo_name="$1"
-  local -r signing_key_url="$2"
-  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
-  local kr_path
-  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
-                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
-
-  mkdir -p "$(dirname "${kr_path}")"
-
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
-    | gpg --import --no-default-keyring --keyring "${kr_path}"
-
-  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
-                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
-}
-
-
-readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
-
-# Dataproc configurations
-readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-readonly HIVE_CONF_DIR='/etc/hive/conf'
-readonly SPARK_CONF_DIR='/etc/spark/conf'
-
-
-function set_support_matrix() {
-  # CUDA version and Driver version
-  # https://docs.nvidia.com/deploy/cuda-compatibility/
-  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
-  # https://developer.nvidia.com/cuda-downloads
-
-  # Minimum supported version for open kernel driver is 515.43.04
-  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
-  # Rocky8: 12.0: 525.147.05
-  local latest
-  latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
-  readonly -A DRIVER_FOR_CUDA=(
-          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
-          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
-  )
-  readonly -A DRIVER_SUBVER=(
-          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
-          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
-  )
-  # https://developer.nvidia.com/cudnn-downloads
-  if is_debuntu ; then
-  readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
-          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
-  )
-  elif is_rocky ; then
-  # rocky:
-  #   12.0: 8.8.1.3
-  #   12.1: 8.9.3.28
-  #   12.2: 8.9.7.29
-  #   12.3: 9.0.0.312
-  #   12.4: 9.1.1.17
-  #   12.5: 9.2.1.18
-  #   12.6: 9.5.1.17
-  readonly -A CUDNN_FOR_CUDA=(
-          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
-          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
-  )
-  fi
-  # https://developer.nvidia.com/nccl/nccl-download
-  # 12.2: 2.19.3, 12.5: 2.21.5
-  readonly -A NCCL_FOR_CUDA=(
-          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
-          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
-  )
-  readonly -A CUDA_SUBVER=(
-          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
-          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
-  )
-}
-
-set_support_matrix
+# Verify SPARK compatability
+RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
 
 function set_cuda_version() {
   local cuda_url
@@ -602,7 +255,7 @@ function set_driver_version() {
   DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
 
   readonly DRIVER_VERSION
-  readonly DRIVER="${DRIVER_VERSION%%.*}"
+  readonly DRIVER=${DRIVER_VERSION%%.*}
 
   export DRIVER_VERSION DRIVER
 
@@ -653,14 +306,14 @@ if is_ubuntu22  ; then
     # use packages from previous release until such time as nvidia
     # release ubuntu2204 builds
 
-    shortname="$(os_id)$(os_vercat)"
     nccl_shortname="ubuntu2004"
+    shortname="$(os_id)$(os_vercat)"
 elif ge_rocky9 ; then
     # use packages from previous release until such time as nvidia
     # release rhel9 builds
 
-    shortname="rhel9"
     nccl_shortname="rhel8"
+    shortname="rhel9"
 elif is_rocky ; then
     shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
     nccl_shortname="${shortname}"
@@ -794,10 +447,33 @@ readonly GPU_DRIVER_PROVIDER
 INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
 readonly INSTALL_GPU_AGENT
 
+# Dataproc configurations
+readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+readonly HIVE_CONF_DIR='/etc/hive/conf'
+readonly SPARK_CONF_DIR='/etc/spark/conf'
+
 NVIDIA_SMI_PATH='/usr/bin'
 MIG_MAJOR_CAPS=0
 IS_MIG_ENABLED=0
 
+function execute_with_retries() (
+  set +x
+  local -r cmd="$*"
+
+  if [[ "$cmd" =~ "^apt-get install" ]] ; then
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+  fi
+  for ((i = 0; i < 3; i++)); do
+    set -x
+    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
+    set +x
+    if [[ $retval == 0 ]] ; then return 0 ; fi
+    sleep 5
+  done
+  return 1
+)
+
 CUDA_KEYRING_PKG_INSTALLED="0"
 function install_cuda_keyring_pkg() {
   if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
@@ -818,8 +494,6 @@ function uninstall_cuda_keyring_pkg() {
 function install_local_cuda_repo() {
   if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi
 
-  if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  CUDA_LOCAL_REPO_INSTALLED="1"
   pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
   CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
   readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
@@ -1068,14 +742,105 @@ function install_nvidia_cudnn() {
       fi
     fi
   else
-    echo "Unsupported OS: '${_shortname}'"
+    echo "Unsupported OS: '${OS_NAME}'"
     exit 1
   fi
 
   ldconfig
 
-  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
   touch "${workdir}/cudnn-complete"
+  echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
+}
+
+function configure_dkms_certs() {
+  if test -v PSN && [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping";
+      return 0
+  fi
+
+  mkdir -p "${CA_TMPDIR}"
+
+  # If the private key exists, verify it
+  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
+    echo "Private key material exists"
+
+    local expected_modulus_md5sum
+    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
+      modulus_md5sum="${expected_modulus_md5sum}"
+
+      # Verify that cert md5sum matches expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key"
+      fi
+
+      # Verify that key md5sum matches expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert"
+      fi
+    else
+      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
+    fi
+    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+
+    return
+  fi
+
+  # Retrieve cloud secrets keys
+  local sig_priv_secret_name
+  sig_priv_secret_name="${PSN}"
+  local sig_pub_secret_name
+  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
+  local sig_secret_project
+  sig_secret_project="$(get_metadata_attribute secret_project)"
+  local sig_secret_version
+  sig_secret_version="$(get_metadata_attribute secret_version)"
+
+  # If metadata values are not set, do not write mok keys
+  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
+
+  # Write private material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_priv_secret_name}" \
+      | dd status=none of="${CA_TMPDIR}/db.rsa"
+
+  # Write public material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_pub_secret_name}" \
+      | base64 --decode \
+      | dd status=none of="${CA_TMPDIR}/db.der"
+
+  local mok_directory="$(dirname "${mok_key}")"
+  mkdir -p "${mok_directory}"
+
+  # symlink private key and copy public cert from volatile storage to DKMS directory
+  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
+}
+
+function clear_dkms_key {
+  if [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping" >&2
+      return 0
+  fi
+  rm -rf "${CA_TMPDIR}" "${mok_key}"
+}
+
+function add_contrib_component() {
+  if ! is_debuntu ; then return ; fi
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-kernel-open-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib"
+
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
+  fi
 }
 
 function add_nonfree_components() {
@@ -1116,13 +881,14 @@ function add_repo_cuda() {
   fi
 }
 
+readonly uname_r=$(uname -r)
+
 function build_driver_from_github() {
   # non-GPL driver will have been built on rocky8
   if is_rocky8 ; then return 0 ; fi
   pushd "${workdir}"
-
   test -d "${workdir}/open-gpu-kernel-modules" || {
-    local tarball_fn="${DRIVER_VERSION}.tar.gz"
+    tarball_fn="${DRIVER_VERSION}.tar.gz"
     curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
       "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
       | tar xz
@@ -1469,6 +1235,60 @@ EOF
   systemctl --no-reload --now enable gpu-utilization-agent.service
 }
 
+function set_hadoop_property() {
+  local -r config_file=$1
+  local -r property=$2
+  local -r value=$3
+  "${bdcfg}" set_property \
+    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
+    --name "${property}" --value "${value}" \
+    --clobber
+}
+
+function configure_yarn_resources() {
+  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
+  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
+    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
+  fi
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
+
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+}
+
+# This configuration should be applied only if GPU is attached to the node
+function configure_yarn_nodemanager() {
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.container-executor.class' \
+    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+
+  # Fix local dirs access permissions
+  local yarn_local_dirs=()
+
+  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
+    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
+    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
+
+  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
+    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
+  fi
+}
+
 function configure_gpu_exclusive_mode() {
   # check if running spark 3, if not, enable GPU exclusive mode
   local spark_version
@@ -1649,8 +1469,6 @@ function install_dependencies() {
 }
 
 function prepare_gpu_env(){
-  # Verify SPARK compatability
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
 
   readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
   nvsmi_works="0"
@@ -1721,90 +1539,12 @@ function configure_mig_cgi() {
     fi
   fi
 }
-
-function enable_mig() {
-  nvidia-smi -mig 1
-}
-
-
-function configure_dkms_certs() {
-  if test -v PSN && [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping";
-      return 0
-  fi
-
-  mkdir -p "${CA_TMPDIR}"
-
-  # If the private key exists, verify it
-  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
-    echo "Private key material exists"
-
-    local expected_modulus_md5sum
-    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
-    if [[ -n "${expected_modulus_md5sum}" ]]; then
-      modulus_md5sum="${expected_modulus_md5sum}"
-
-      # Verify that cert md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched rsa key"
-      fi
-
-      # Verify that key md5sum matches expected md5sum
-      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched x509 cert"
-      fi
-    else
-      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
-    fi
-    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-
-    return
-  fi
-
-  # Retrieve cloud secrets keys
-  local sig_priv_secret_name
-  sig_priv_secret_name="${PSN}"
-  local sig_pub_secret_name
-  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
-  local sig_secret_project
-  sig_secret_project="$(get_metadata_attribute secret_project)"
-  local sig_secret_version
-  sig_secret_version="$(get_metadata_attribute secret_version)"
-
-  # If metadata values are not set, do not write mok keys
-  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
-
-  # Write private material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_priv_secret_name}" \
-      | dd status=none of="${CA_TMPDIR}/db.rsa"
-
-  # Write public material to volatile storage
-  gcloud secrets versions access "${sig_secret_version}" \
-         --project="${sig_secret_project}" \
-         --secret="${sig_pub_secret_name}" \
-      | base64 --decode \
-      | dd status=none of="${CA_TMPDIR}/db.der"
-
-  local mok_directory="$(dirname "${mok_key}")"
-  mkdir -p "${mok_directory}"
-
-  # symlink private key and copy public cert from volatile storage to DKMS directory
-  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
-  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
-
-  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
-}
-
-function clear_dkms_key {
-  if [[ -z "${PSN}" ]]; then
-      echo "No signing secret provided.  skipping" >&2
-      return 0
-  fi
-  rm -rf "${CA_TMPDIR}" "${mok_key}"
+
+function enable_mig() {
+  nvidia-smi -mig 1
 }
 
+
 function check_secure_boot() {
   local SECURE_BOOT="disabled"
   SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
@@ -1919,6 +1659,116 @@ function main() {
   done
 }
 
+function cache_fetched_package() {
+  local src_url="$1"
+  local gcs_fn="$2"
+  local local_fn="$3"
+
+  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
+    time gcloud storage cp "${gcs_fn}" "${local_fn}"
+  else
+    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
+           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
+  fi
+}
+
+function clean_up_sources_lists() {
+  #
+  # bigtop (primary)
+  #
+  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
+
+  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
+    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
+
+    local regional_bigtop_repo_uri
+    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
+      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
+      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
+      cut -d ' ' -f 2 |
+      head -1)
+
+    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+    else
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+    fi
+
+    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
+    rm -f "${bigtop_kr_path}"
+    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
+
+    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  fi
+
+  #
+  # adoptium
+  #
+  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
+  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
+  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
+  rm -f "${adoptium_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+   | gpg --dearmor -o "${adoptium_kr_path}"
+  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
+   > /etc/apt/sources.list.d/adoptium.list
+
+
+  #
+  # docker
+  #
+  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
+  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
+  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
+
+  rm -f "${docker_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+    | gpg --dearmor -o "${docker_kr_path}"
+  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
+    > ${docker_repo_file}
+
+  #
+  # google cloud + logging/monitoring
+  #
+  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
+    rm -f /usr/share/keyrings/cloud.google.gpg
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
+      list_file="/etc/apt/sources.list.d/${list}.list"
+      if [[ -f "${list_file}" ]]; then
+        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
+      fi
+    done
+  fi
+
+  #
+  # cran-r
+  #
+  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
+    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
+    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
+    rm -f /usr/share/keyrings/cran-r.gpg
+    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
+    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
+  fi
+
+  #
+  # mysql
+  #
+  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
+    rm -f /usr/share/keyrings/mysql.gpg
+    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
+    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
+  fi
+
+  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
+
+}
+
 function exit_handler() {
   # Purge private key material until next grant
   clear_dkms_key
@@ -2020,6 +1870,56 @@ print( "    samples-taken: ", scalar @siz, $/,
   return 0
 }
 
+function set_proxy(){
+  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
+
+  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
+
+  export METADATA_HTTP_PROXY
+  export http_proxy="${METADATA_HTTP_PROXY}"
+  export https_proxy="${METADATA_HTTP_PROXY}"
+  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
+  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
+  local no_proxy_svc
+  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
+                      bigquery composer      pubsub bigquerydatatransfer dataflow \
+                      storage  datafusion    ; do
+    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
+  done
+
+  export NO_PROXY="${no_proxy}"
+}
+
+function mount_ramdisk(){
+  local free_mem
+  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
+  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+
+  # Write to a ramdisk instead of churning the persistent disk
+
+  tmpdir="/mnt/shm"
+  mkdir -p "${tmpdir}"
+  mount -t tmpfs tmpfs "${tmpdir}"
+
+  # Download conda packages to tmpfs
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+
+  # Clear pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
+  pip cache purge || echo "unable to purge pip cache"
+
+  # Download pip packages to tmpfs
+  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+
+  # Download OS packages to tmpfs
+  if is_debuntu ; then
+    mount -t tmpfs tmpfs /var/cache/apt/archives
+  else
+    mount -t tmpfs tmpfs /var/cache/dnf
+  fi
+}
+
 function prepare_to_install(){
   # Verify OS compatability and Secure boot state
   check_os
@@ -2027,20 +1927,11 @@ function prepare_to_install(){
 
   prepare_gpu_env
 
-  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-  readonly OS_NAME
-
-  # node role
-  ROLE="$(get_metadata_attribute dataproc-role)"
-  readonly ROLE
-
   workdir=/opt/install-dpgce
   tmpdir=/tmp/
   temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
   readonly temp_bucket
   readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
-  uname_r=$(uname -r)
-  readonly uname_r
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
@@ -2082,6 +1973,96 @@ function prepare_to_install(){
   touch "${workdir}/prepare-complete"
 }
 
+function check_os() {
+  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
+      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+      exit 1
+  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
+      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
+      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
+  fi
+
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+  readonly SPARK_VERSION
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    exit 1
+  fi
+
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
+}
+
+#
+# Generate repo file under /etc/apt/sources.list.d/
+#
+function apt_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local -r include_src="${4:-yes}"
+  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
+
+  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
+  if [[ "${include_src}" == "yes" ]] ; then
+    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
+  fi
+
+  apt-get update -qq
+}
+
+#
+# Generate repo file under /etc/yum.repos.d/
+#
+function dnf_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
+  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
+
+  curl -s -L "${repo_url}" \
+    | dd of="${repo_path}" status=progress
+#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
+}
+
+#
+# Keyrings default to
+# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
+# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
+#
+function os_add_repo() {
+  local -r repo_name="$1"
+  local -r signing_key_url="$2"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local kr_path
+  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
+
+  mkdir -p "$(dirname "${kr_path}")"
+
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
+    | gpg --import --no-default-keyring --keyring "${kr_path}"
+
+  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
+                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
+}
+
+
+readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+
 prepare_to_install
 
 main

From 53c1ef1c0a4ae308347078499457a7658f2cc670 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 09:29:37 -0800
Subject: [PATCH 075/112] custom image usage can come later

---
 cloudbuild/presubmit.sh | 132 ----------------------------------------
 1 file changed, 132 deletions(-)
 delete mode 100644 cloudbuild/presubmit.sh

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
deleted file mode 100644
index 9ed39d0ee..000000000
--- a/cloudbuild/presubmit.sh
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/bin/bash
-
-set -euxo pipefail
-
-# Declare global variable for passing tests between functions
-declare -a TESTS_TO_RUN
-
-configure_gcloud() {
-  gcloud config set core/disable_prompts TRUE
-  gcloud config set compute/region us-central1
-}
-
-configure_gcloud_ssh_key() {
-  mkdir "${HOME}/.ssh"
-
-  gcloud kms decrypt --location=global --keyring=presubmit --key=presubmit \
-    --ciphertext-file=cloudbuild/ssh-key.enc \
-    --plaintext-file="${HOME}/.ssh/google_compute_engine"
-
-  gcloud kms decrypt --location=global --keyring=presubmit --key=presubmit \
-    --ciphertext-file=cloudbuild/ssh-key.pub.enc \
-    --plaintext-file="${HOME}/.ssh/google_compute_engine.pub"
-
-  chmod 600 "${HOME}/.ssh/google_compute_engine"
-}
-
-# Fetches master branch from GitHub and "resets" local changes to be relative to it,
-# so we can diff what changed relatively to master branch.
-initialize_git_repo() {
-  rm -fr .git
-  git config --global init.defaultBranch main
-  git init
-
-  git config user.email "ia-tests@presubmit.example.com"
-  git config user.name "ia-tests"
-
-  git remote add origin "https://github.com/GoogleCloudDataproc/initialization-actions.git"
-  git fetch origin master
-  # Fetch all PRs to get history for PRs created from forked repos
-  git fetch origin +refs/pull/*/merge:refs/remotes/origin/pr/* > /dev/null 2>&1
-
-  git reset --hard "${COMMIT_SHA}"
-
-  git rebase origin/master
-}
-
-# This function adds all changed files to git "index" and diffs them against master branch
-# to determine all changed files and looks for tests in directories with changed files.
-determine_tests_to_run() {
-  # Infer the files that changed
-  mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD)
-  mapfile -t CHANGED_FILES < <(git diff origin/master --name-only)
-  echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}"
-  echo "Changed files: ${CHANGED_FILES[*]}"
-
-  # Run all tests if common directories modified by deleting files
-  if [[ "${#DELETED_BUILD_FILES[@]}" -gt 0 ]]; then
-    echo "All tests will be run: the following BUILD files '${DELETED_BUILD_FILES[*]}' were removed"
-    TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
-    return 0
-  fi
-
-  set +x
-  # Determines init actions directories that were changed
-  declare -a changed_dirs
-  for changed_file in "${CHANGED_FILES[@]}"; do
-    local changed_dir
-    changed_dir="$(dirname "${changed_file}")/"
-    # Convert `init/internal/` dir to `init/`
-    changed_dir="${changed_dir%%/*}/"
-    # Run all tests if common directories modified
-    if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
-      continue
-      echo "All tests will be run: '${changed_dir}' was changed"
-      TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
-      return 0
-    fi
-    # Hack to workaround empty array expansion on old versions of Bash.
-    # See: https://stackoverflow.com/a/7577209/3227693
-    if [[ $changed_dir != ./ ]] && [[ ${changed_dirs[*]+" ${changed_dirs[*]} "} != *" ${changed_dir} "* ]]; then
-      changed_dirs+=("$changed_dir")
-    fi
-  done
-  echo "Changed directories: ${changed_dirs[*]}"
-
-  # Determines test target in changed init action directories to run
-  for changed_dir in "${changed_dirs[@]}"; do
-    # NOTE: The ::-1 removes the trailing '/'
-    local test_name=${changed_dir::-1}
-    # Some of our py_tests (that has dashes in the name) are defined in the top-level directory
-    if [[ $test_name == *"-"* ]]; then
-      local test_target=":test_${test_name//-/_}"
-    else
-      local test_target="${test_name}:test_${test_name}"
-    fi
-    TESTS_TO_RUN+=("${test_target}")
-  done
-  echo "Tests: ${TESTS_TO_RUN[*]}"
-
-  set -x
-}
-
-run_tests() {
-  local -r max_parallel_tests=20
-  bazel test \
-    --jobs="${max_parallel_tests}" \
-    --local_test_jobs="${max_parallel_tests}" \
-    --action_env="INTERNAL_IP_SSH=true" \
-    --test_output="all" \
-    --noshow_progress \
-    --noshow_loading_progress \
-    --test_arg="--image_version=${IMAGE_VERSION}" \
-    "${TESTS_TO_RUN[@]}"
-}
-
-main() {
-  cd /init-actions
-
-# TODO: once service account is granted permission to access the cloud
-# secrets, we can source this file and set signing material metadata
-# variables from the environment in the python code.
-
-#  eval "$(bash cloudbuild/create-key-pair.sh | sed -e 's/^/export /g')"
-
-  configure_gcloud
-  configure_gcloud_ssh_key
-  initialize_git_repo
-  determine_tests_to_run
-  run_tests
-}
-
-main

From 97046b13b1ce48bb6a916c8fc3c68cf61af7fbdd Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 09:30:25 -0800
Subject: [PATCH 076/112] see #1283

---
 cloudbuild/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index aebaffd84..2ea91e3e5 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -21,8 +21,8 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
     echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" | \
     dd of="${bazel_repo_file}" status=none && \
     apt-get update -qq
-RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
-    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
+RUN apt-get autoremove -y -qq && \
+    apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \
     apt-get clean
 
 # Set bazel-${bazel_version} as the default bazel alternative in this container

From 484308b2c4e81f19acce7a5dfa045b263192d425 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 09:32:16 -0800
Subject: [PATCH 077/112] replaced incorrectly removed presubmit.sh and removed
 custom image key creation script intended to be removed in
 70f37b638e8309a669625844034946fc1b51037a

---
 cloudbuild/create-key-pair.sh | 135 ----------------------------------
 cloudbuild/presubmit.sh       | 125 +++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 135 deletions(-)
 delete mode 100644 cloudbuild/create-key-pair.sh
 create mode 100644 cloudbuild/presubmit.sh

diff --git a/cloudbuild/create-key-pair.sh b/cloudbuild/create-key-pair.sh
deleted file mode 100644
index 8f2a42a70..000000000
--- a/cloudbuild/create-key-pair.sh
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/bin/bash
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS-IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# This script creates a key pair and publishes to cloud secrets or
-# fetches an already published key pair from cloud secrets
-
-set -e
-
-# https://github.com/glevand/secure-boot-utils
-
-# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#adding-shielded-image
-
-# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#generating-security-keys-certificates
-
-# https://wiki.archlinux.org/title/Unified_Extensible_Firmware_Interface/Secure_Boot#Creating_keys
-
-ITERATION=042
-
-CURRENT_PROJECT_ID="$(gcloud config get project)"
-if [[ -z "${CURRENT_PROJECT_ID}" ]]; then
-    echo 'project is not set.  please set with `gcloud config set project ${PROJECT_ID}`' >&2
-    exit -1
-fi
-PROJECT_ID="${CURRENT_PROJECT_ID}"
-
-function create_key () {
-    local EFI_VAR_NAME="$1"
-    local CN_VAL="$2"
-    local PRIVATE_KEY="tls/${EFI_VAR_NAME}.rsa"
-    local CACERT="tls/${EFI_VAR_NAME}.pem"
-    local CACERT_DER="tls/${EFI_VAR_NAME}.der"
-    CA_KEY_SECRET_NAME="efi-${EFI_VAR_NAME}-priv-key-${ITERATION}"
-    CA_CERT_SECRET_NAME="efi-${EFI_VAR_NAME}-pub-key-${ITERATION}"
-    # If the secrets exist in secret manager, populate the tls/ directory
-    if [[ ! -f "${PRIVATE_KEY}" ]] && gcloud secrets describe "${CA_CERT_SECRET_NAME}" > /dev/null ; then
-      mkdir -p tls
-
-      gcloud secrets versions access "1" \
-        --project="${PROJECT_ID}" \
-        --secret="${CA_KEY_SECRET_NAME}" \
-        | dd of="${PRIVATE_KEY}" status=none
-
-      gcloud secrets versions access "1" \
-        --project="${PROJECT_ID}" \
-        --secret="${CA_CERT_SECRET_NAME}" \
-        | base64 --decode \
-        | dd of="${CACERT_DER}" status=none
-
-      # Create a PEM-format version of the cert
-      openssl x509 \
-        -inform DER \
-        -in "${CACERT_DER}" \
-        -outform PEM \
-        -out "${CACERT}"
-
-      MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt"
-      curl -s -L -o "${MS_UEFI_CA}" 'https://go.microsoft.com/fwlink/p/?linkid=321194'
-
-      echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt
-      echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt
-      modulus_md5sum="$(openssl rsa -noout -modulus -in ${PRIVATE_KEY} | openssl md5 | awk '{print $2}' | tee tls/modulus-md5sum.txt)"
-      return
-    fi
-
-    if [[ -f "${PRIVATE_KEY}" ]]; then
-        modulus_md5sum="$(cat tls/modulus-md5sum.txt)"
-        return
-    fi
-    mkdir -p tls
-
-    echo "generating '${CN_VAL}' '${CACERT}', '${CACERT_DER}' and '${PRIVATE_KEY}'" >&2
-    # Generate new x.509 key and cert
-    openssl req \
-            -newkey rsa:3072 \
-            -nodes \
-            -keyout "${PRIVATE_KEY}" \
-            -new \
-            -x509 \
-            -sha256 \
-            -days 3650 \
-            -subj "/CN=${CN_VAL}/" \
-            -out "${CACERT}"
-
-    # Create a DER-format version of the cert
-    openssl x509 \
-            -outform DER \
-            -in "${CACERT}" \
-            -outform DER \
-            -in "${CACERT}" \
-            -out "${CACERT_DER}"
-
-    # Create a new secret containing private key
-    gcloud secrets create "${CA_KEY_SECRET_NAME}" \
-           --project="${PROJECT_ID}" \
-           --replication-policy="automatic" \
-           --data-file="${PRIVATE_KEY}"
-
-    echo "Private key secret name: '${CA_KEY_SECRET_NAME}'" >&2
-    echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt
-
-    # Create a new secret containing public key
-    cat "${CACERT_DER}" | base64 > "${CACERT_DER}.base64"
-    gcloud secrets create "${CA_CERT_SECRET_NAME}" \
-           --project="${PROJECT_ID}" \
-           --replication-policy="automatic" \
-           --data-file="${CACERT_DER}.base64"
-
-    modulus_md5sum="$(openssl x509 -noout -modulus -in ${CACERT} | openssl md5 | awk '{print $2}')"
-    echo "modulus-md5sum: ${modulus_md5sum}" >&2
-    echo "${modulus_md5sum}" > tls/modulus-md5sum.txt
-    echo "Public key secret name: '${CA_CERT_SECRET_NAME}'" >&2
-    echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt
-
-}
-
-EFI_VAR_NAME=db
-
-create_key "${EFI_VAR_NAME}" "Cloud Dataproc Custom Image CA ${ITERATION}"
-
-echo "modulus_md5sum=${modulus_md5sum}"
-echo "private_secret_name=${CA_KEY_SECRET_NAME}"
-echo "public_secret_name=${CA_CERT_SECRET_NAME}"
-echo "secret_project=${PROJECT_ID}"
-echo "secret_version=1"
diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
new file mode 100644
index 000000000..eec7adb76
--- /dev/null
+++ b/cloudbuild/presubmit.sh
@@ -0,0 +1,125 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+# Declare global variable for passing tests between functions
+declare -a TESTS_TO_RUN
+
+configure_gcloud() {
+  gcloud config set core/disable_prompts TRUE
+  gcloud config set compute/region us-central1
+}
+
+configure_gcloud_ssh_key() {
+  mkdir "${HOME}/.ssh"
+
+  gcloud kms decrypt --location=global --keyring=presubmit --key=presubmit \
+    --ciphertext-file=cloudbuild/ssh-key.enc \
+    --plaintext-file="${HOME}/.ssh/google_compute_engine"
+
+  gcloud kms decrypt --location=global --keyring=presubmit --key=presubmit \
+    --ciphertext-file=cloudbuild/ssh-key.pub.enc \
+    --plaintext-file="${HOME}/.ssh/google_compute_engine.pub"
+
+  chmod 600 "${HOME}/.ssh/google_compute_engine"
+}
+
+# Fetches master branch from GitHub and "resets" local changes to be relative to it,
+# so we can diff what changed relatively to master branch.
+initialize_git_repo() {
+  rm -fr .git
+  git config --global init.defaultBranch main
+  git init
+
+  git config user.email "ia-tests@presubmit.example.com"
+  git config user.name "ia-tests"
+
+  git remote add origin "https://github.com/GoogleCloudDataproc/initialization-actions.git"
+  git fetch origin master
+  # Fetch all PRs to get history for PRs created from forked repos
+  git fetch origin +refs/pull/*/merge:refs/remotes/origin/pr/* > /dev/null 2>&1
+
+  git reset --hard "${COMMIT_SHA}"
+
+  git rebase origin/master
+}
+
+# This function adds all changed files to git "index" and diffs them against master branch
+# to determine all changed files and looks for tests in directories with changed files.
+determine_tests_to_run() {
+  # Infer the files that changed
+  mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD)
+  mapfile -t CHANGED_FILES < <(git diff origin/master --name-only)
+  echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}"
+  echo "Changed files: ${CHANGED_FILES[*]}"
+
+  # Run all tests if common directories modified by deleting files
+  if [[ "${#DELETED_BUILD_FILES[@]}" -gt 0 ]]; then
+    echo "All tests will be run: the following BUILD files '${DELETED_BUILD_FILES[*]}' were removed"
+    TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
+    return 0
+  fi
+
+  set +x
+  # Determines init actions directories that were changed
+  declare -a changed_dirs
+  for changed_file in "${CHANGED_FILES[@]}"; do
+    local changed_dir
+    changed_dir="$(dirname "${changed_file}")/"
+    # Convert `init/internal/` dir to `init/`
+    changed_dir="${changed_dir%%/*}/"
+    # Run all tests if common directories modified
+    if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      echo "All tests will be run: '${changed_dir}' was changed"
+      TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
+      return 0
+    fi
+    # Hack to workaround empty array expansion on old versions of Bash.
+    # See: https://stackoverflow.com/a/7577209/3227693
+    if [[ $changed_dir != ./ ]] && [[ ${changed_dirs[*]+" ${changed_dirs[*]} "} != *" ${changed_dir} "* ]]; then
+      changed_dirs+=("$changed_dir")
+    fi
+  done
+  echo "Changed directories: ${changed_dirs[*]}"
+
+  # Determines test target in changed init action directories to run
+  for changed_dir in "${changed_dirs[@]}"; do
+    # NOTE: The ::-1 removes the trailing '/'
+    local test_name=${changed_dir::-1}
+    # Some of our py_tests (that has dashes in the name) are defined in the top-level directory
+    if [[ $test_name == *"-"* ]]; then
+      local test_target=":test_${test_name//-/_}"
+    else
+      local test_target="${test_name}:test_${test_name}"
+    fi
+    TESTS_TO_RUN+=("${test_target}")
+  done
+  echo "Tests: ${TESTS_TO_RUN[*]}"
+
+  set -x
+}
+
+run_tests() {
+  local -r max_parallel_tests=20
+  bazel test \
+    --jobs="${max_parallel_tests}" \
+    --local_test_jobs="${max_parallel_tests}" \
+    --flaky_test_attempts=3 \
+    --action_env="INTERNAL_IP_SSH=true" \
+    --test_output="all" \
+    --noshow_progress \
+    --noshow_loading_progress \
+    --test_arg="--image_version=${IMAGE_VERSION}" \
+    "${TESTS_TO_RUN[@]}"
+}
+
+main() {
+  cd /init-actions
+  configure_gcloud
+  configure_gcloud_ssh_key
+  initialize_git_repo
+  determine_tests_to_run
+  run_tests
+}
+
+main

From 61b94da8ad289fa51bba8528ab3744bf321002ac Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 09:36:14 -0800
Subject: [PATCH 078/112] revert nearly to master

---
 gpu/manual-test-runner.sh | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh
index 0199d62ad..3f126670b 100644
--- a/gpu/manual-test-runner.sh
+++ b/gpu/manual-test-runner.sh
@@ -5,18 +5,21 @@
 # To run the script, the following will bootstrap
 #
 # git clone git@github.com:GoogleCloudDataproc/initialization-actions
-# cd initialization-actions
 # git checkout 2024.12
+# cd initialization-actions
 # cp gpu/env.json.sample env.json
 # vi env.json
 # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest .
 # time docker run -it gpu-init-actions-runner:latest gpu/manual-test-runner.sh
 #
 # The bazel run(s) happen in separate screen windows.
+#  To create a new screen window, press ^a c
 #  To see a list of screen windows, press ^a "
 # Num Name
 #
+#   0 monitor
 #   1 2.0-debian10
+#   2 sh
 
 
 readonly timestamp="$(date +%F-%H-%M)"
@@ -33,7 +36,7 @@ export PROJECT_ID="$(jq    -r .PROJECT_ID           env.json)"
 export REGION="$(jq        -r .REGION               env.json)"
 export BUCKET="$(jq        -r .BUCKET               env.json)"
 
-gcs_log_dir="gs://${BUCKET}/gpu-dpgce/builds/${BUILD_ID}/logs"
+gcs_log_dir="gs://${BUCKET}/${BUILD_ID}/logs"
 
 function exit_handler() {
   RED='\\e[0;31m'
@@ -44,11 +47,8 @@ function exit_handler() {
   # TODO: list clusters which match our BUILD_ID and clean them up
   # TODO: remove any test related resources in the project
 
-  # We allow the user to monitor the logs from within screen session.
-  # Logs can be archived if necessary, but won't be unless needed.
-
-#  echo 'Uploading local logs to GCS bucket.'
-#  gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/"
+  echo 'Uploading local logs to GCS bucket.'
+  gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/"
 
   if [[ -f "${tmp_dir}/tests_success" ]]; then
     echo -e "${GREEN}Workflow succeeded${NC}, check logs at ${log_dir}/ or ${gcs_log_dir}/"

From 8b4f4f8623d241d4a022a9346e893279d32cd1ce Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 09:39:58 -0800
Subject: [PATCH 079/112] can include extended test suite later

---
 gpu/verify_pytorch.py    |  8 --------
 gpu/verify_tensorflow.py | 28 ----------------------------
 2 files changed, 36 deletions(-)
 delete mode 100644 gpu/verify_pytorch.py
 delete mode 100644 gpu/verify_tensorflow.py

diff --git a/gpu/verify_pytorch.py b/gpu/verify_pytorch.py
deleted file mode 100644
index dd4910d97..000000000
--- a/gpu/verify_pytorch.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import torch
-print("get CUDA details : == : ")
-use_cuda = torch.cuda.is_available()
-if use_cuda:
-    print('__CUDNN VERSION:', torch.backends.cudnn.version())
-    print('__Number CUDA Devices:', torch.cuda.device_count())
-    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
-    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)
diff --git a/gpu/verify_tensorflow.py b/gpu/verify_tensorflow.py
deleted file mode 100644
index 2faf2c717..000000000
--- a/gpu/verify_tensorflow.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import tensorflow as tf
-print("Get GPU Details : ")
-print(tf.config.list_physical_devices('GPU'))
-#print(tf.test.is_gpu_available())
-
-if tf.test.gpu_device_name():
-    print('Default GPU Device:{}'.format(tf.test.gpu_device_name()))
-    print("Please install GPU version of TF")
-
-gpu_available = tf.config.list_physical_devices('GPU')
-print("gpu_available : " + str(gpu_available))
-
-#is_cuda_gpu_available = tf.config.list_physical_devices('GPU',cuda_only=True)
-is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True)
-print("is_cuda_gpu_available : " + str(is_cuda_gpu_available))
-
-#is_cuda_gpu_min_3 = tf.config.list_physical_devices('GPU',True, (3,0))
-is_cuda_gpu_min_3 = tf.test.is_gpu_available(True, (3,0))
-print("is_cuda_gpu_min_3 : " + str(is_cuda_gpu_min_3))
-
-from tensorflow.python.client import device_lib
-
-def get_available_gpus():
-    local_device_protos = device_lib.list_local_devices()
-    return [x.name for x in local_device_protos if x.device_type == 'GPU']
-
-print("Run GPU Functions Below : ")
-print(get_available_gpus())

From 3bc45ff78d525ba1c562c9a0f0d7ad27d5365d7e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 09:42:45 -0800
Subject: [PATCH 080/112] order commands correctly

---
 gpu/manual-test-runner.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh
index 3f126670b..37982bfe4 100644
--- a/gpu/manual-test-runner.sh
+++ b/gpu/manual-test-runner.sh
@@ -5,8 +5,8 @@
 # To run the script, the following will bootstrap
 #
 # git clone git@github.com:GoogleCloudDataproc/initialization-actions
-# git checkout 2024.12
 # cd initialization-actions
+# git checkout 2024.12
 # cp gpu/env.json.sample env.json
 # vi env.json
 # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest .

From 6a76b4ec05bd1e55752d82b4e0d377c12bf4b8f6 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 24 Dec 2024 09:56:01 -0800
Subject: [PATCH 081/112] placing all completion files in a common directory

---
 gpu/install_gpu_driver.sh | 112 +++++++++-----------------------------
 1 file changed, 27 insertions(+), 85 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 8164fc44e..212aa6fbe 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -492,7 +492,7 @@ function uninstall_cuda_keyring_pkg() {
 }
 
 function install_local_cuda_repo() {
-  if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi
+  if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi
 
   pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
   CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
@@ -513,16 +513,16 @@ function install_local_cuda_repo() {
       -o /etc/apt/preferences.d/cuda-repository-pin-600
   fi
 
-  touch "${workdir}/install-local-cuda-repo-complete"
+  touch "${workdir}/complete/install-local-cuda-repo"
 }
 function uninstall_local_cuda_repo(){
   apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  rm -f "${workdir}/install-local-cuda-repo-complete"
+  rm -f "${workdir}/complete/install-local-cuda-repo"
 }
 
 CUDNN_PKG_NAME=""
 function install_local_cudnn_repo() {
-  if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi
+  if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi
   pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
   CUDNN_PKG_NAME="${pkgname}"
   local_deb_fn="${pkgname}_1.0-1_amd64.deb"
@@ -538,18 +538,18 @@ function install_local_cudnn_repo() {
 
   cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
 
-  touch "${workdir}/install-local-cudnn-repo-complete"
+  touch "${workdir}/complete/install-local-cudnn-repo"
 }
 
 function uninstall_local_cudnn_repo() {
   apt-get purge -yq "${CUDNN_PKG_NAME}"
-  rm -f "${workdir}/install-local-cudnn-repo-complete"
+  rm -f "${workdir}/complete/install-local-cudnn-repo"
 }
 
 CUDNN8_LOCAL_REPO_INSTALLED="0"
 CUDNN8_PKG_NAME=""
 function install_local_cudnn8_repo() {
-  if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi
+  if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi
 
   if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
   elif is_debian ; then cudnn8_shortname="debian11"
@@ -583,16 +583,16 @@ function install_local_cudnn8_repo() {
   rm -f "${local_deb_fn}"
 
   cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  touch "${workdir}/install-local-cudnn8-repo-complete"
+  touch "${workdir}/complete/install-local-cudnn8-repo"
 }
 
 function uninstall_local_cudnn8_repo() {
   apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  rm -f "${workdir}/install-local-cudnn8-repo-complete"
+  rm -f "${workdir}/complete/install-local-cudnn8-repo"
 }
 
 function install_nvidia_nccl() {
-  if test -f "${workdir}/nccl-complete" ; then return ; fi
+  if test -f "${workdir}/complete/nccl" ; then return ; fi
 
   if is_cuda11 && is_debian12 ; then
     echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
@@ -683,14 +683,14 @@ function install_nvidia_nccl() {
   fi
 
   popd
-  touch "${workdir}/nccl-complete"
+  touch "${workdir}/complete/nccl"
 }
 
 function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
 function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
 
 function install_nvidia_cudnn() {
-  if test -f "${workdir}/cudnn-complete" ; then return ; fi
+  if test -f "${workdir}/complete/cudnn" ; then return ; fi
   local major_version
   major_version="${CUDNN_VERSION%%.*}"
   local cudnn_pkg_version
@@ -748,7 +748,7 @@ function install_nvidia_cudnn() {
 
   ldconfig
 
-  touch "${workdir}/cudnn-complete"
+  touch "${workdir}/complete/cudnn"
   echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
 }
 
@@ -994,7 +994,7 @@ function install_nvidia_userspace_runfile() {
   #
   # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
   # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
-  if test -f "${workdir}/userspace-complete" ; then return ; fi
+  if test -f "${workdir}/complete/userspace" ; then return ; fi
   local local_fn="${tmpdir}/userspace.run"
 
   cache_fetched_package "${USERSPACE_URL}" \
@@ -1062,12 +1062,12 @@ function install_nvidia_userspace_runfile() {
   fi
 
   rm -f "${local_fn}"
-  touch "${workdir}/userspace-complete"
+  touch "${workdir}/complete/userspace"
   sync
 }
 
 function install_cuda_runfile() {
-  if test -f "${workdir}/cuda-complete" ; then return ; fi
+  if test -f "${workdir}/complete/cuda" ; then return ; fi
   local local_fn="${tmpdir}/cuda.run"
 
   cache_fetched_package "${NVIDIA_CUDA_URL}" \
@@ -1076,7 +1076,7 @@ function install_cuda_runfile() {
 
   execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
   rm -f "${local_fn}"
-  touch "${workdir}/cuda-complete"
+  touch "${workdir}/complete/cuda"
   sync
 }
 
@@ -1114,7 +1114,7 @@ function load_kernel_module() {
 }
 
 function install_cuda(){
-  if test -f "${workdir}/cuda-repo-complete" ; then return ; fi
+  if test -f "${workdir}/complete/cuda-repo" ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     echo "installed with the driver on ${_shortname}"
@@ -1127,7 +1127,7 @@ function install_cuda(){
   # Includes CUDA packages
   add_repo_cuda
 
-  touch "${workdir}/cuda-repo-complete"
+  touch "${workdir}/complete/cuda-repo"
 }
 
 function install_nvidia_container_toolkit() {
@@ -1150,7 +1150,7 @@ function install_nvidia_container_toolkit() {
 
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
-  if test -f "${workdir}/gpu-driver-complete" ; then return ; fi
+  if test -f "${workdir}/complete/gpu-driver" ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
@@ -1172,7 +1172,7 @@ function install_nvidia_gpu_driver() {
   build_driver_from_github
 
   echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  touch "${workdir}/gpu-driver-complete"
+  touch "${workdir}/complete/gpu-driver"
 }
 
 function install_ops_agent(){
@@ -1184,7 +1184,7 @@ function install_ops_agent(){
   curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
   execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
 
-  touch "${workdir}/ops-agent-complete"
+  touch "${workdir}/complete/ops-agent"
 }
 
 # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
@@ -1421,7 +1421,7 @@ function nvsmi() {
 }
 
 function install_build_dependencies() {
-  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi
+  if test -f "${workdir}/complete/build-dependencies" ; then return ; fi
 
   if is_debuntu ; then
     if is_ubuntu22 && is_cuda12 ; then
@@ -1459,7 +1459,7 @@ function install_build_dependencies() {
 
     execute_with_retries "${dnf_cmd}"
   fi
-  touch "${workdir}/build-dependencies-complete"
+  touch "${workdir}/complete/build-dependencies"
 }
 
 function install_dependencies() {
@@ -1487,64 +1487,6 @@ function hold_nvidia_packages() {
   fi
 }
 
-function delete_mig_instances() (
-  # delete all instances
-  set +e
-  nvidia-smi mig -dci
-
-  case "${?}" in
-    "0" ) echo "compute instances deleted"            ;;
-    "2" ) echo "invalid argument"                     ;;
-    "6" ) echo "No compute instances found to delete" ;;
-    *   ) echo "unrecognized return code"             ;;
-  esac
-
-  nvidia-smi mig -dgi
-  case "${?}" in
-    "0" ) echo "compute instances deleted"        ;;
-    "2" ) echo "invalid argument"                 ;;
-    "6" ) echo "No GPU instances found to delete" ;;
-    *   ) echo "unrecognized return code"         ;;
-  esac
-)
-
-# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles
-function configure_mig_cgi() {
-  delete_mig_instances
-  META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')"
-  if test -n "${META_MIG_CGI_VALUE}"; then
-    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
-  else
-    if lspci | grep -q H100 ; then
-      # run the following command to list placement profiles
-      # nvidia-smi mig -lgipp
-      #
-      # This is the result when using H100 instances on 20241220
-      # GPU  0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1
-      # GPU  0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1
-      # GPU  0 Profile ID 15 Placements: {0,2,4,6}:2
-      # GPU  0 Profile ID 14 Placements: {0,2,4}:2
-      # GPU  0 Profile ID  9 Placements: {0,4}:4
-      # GPU  0 Profile ID  5 Placement : {0}:4
-      # GPU  0 Profile ID  0 Placement : {0}:8
-
-      # For H100 3D controllers, use profile 19, 7x1G instances
-      nvidia-smi mig -cgi 19 -C
-    elif lspci | grep -q A100 ; then
-      # Dataproc only supports A100s right now split in 2 if not specified
-      # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances
-      nvidia-smi mig -cgi 9,9 -C
-    else
-      echo "unrecognized 3D controller"
-    fi
-  fi
-}
-
-function enable_mig() {
-  nvidia-smi -mig 1
-}
-
-
 function check_secure_boot() {
   local SECURE_BOOT="disabled"
   SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
@@ -1935,14 +1877,14 @@ function prepare_to_install(){
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
-  mkdir -p "${workdir}"
+  mkdir -p "${workdir}/complete"
   trap exit_handler EXIT
   set_proxy
   mount_ramdisk
 
   readonly install_log="${tmpdir}/install.log"
 
-  if test -f "${workdir}/prepare-complete" ; then return ; fi
+  if test -f "${workdir}/complete/prepare" ; then return ; fi
 
   repair_old_backports
 
@@ -1970,7 +1912,7 @@ function prepare_to_install(){
   screen -d -m -LUS keep-running-df \
     bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
 
-  touch "${workdir}/prepare-complete"
+  touch "${workdir}/complete/prepare"
 }
 
 function check_os() {

From e59214640d65f4f807eb4865af3ddc71daea0986 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 13 Jan 2025 13:39:47 -0800
Subject: [PATCH 082/112] extend supported version list to include latest
 release of each minor version and their associated driver

---
 gpu/install_gpu_driver.sh | 89 ++++++++++++++++++++++++---------------
 1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 212aa6fbe..9d6bfc135 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -137,42 +137,51 @@ readonly ROLE
 # Rocky8: 12.0: 525.147.05
 latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
 readonly -A DRIVER_FOR_CUDA=(
-        ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
-        ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+    ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01"
+    ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31"
+    ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03"
+    ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
+    ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08"
+    ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03"
 )
 readonly -A DRIVER_SUBVER=(
-        ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
-        ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
+    ["410"]="410.104" ["415"]="415.27" ["418"]="418.113"
+    ["430"]="430.64" ["435"]="435.21" ["440"]="440.100"
+    ["450"]="450.119.03" ["455"]="455.45.01" ["460"]="460.91.03"
+    ["465"]="465.31" ["470"]="470.256.02" ["495"]="495.46"
+    ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05"
+    ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06"
+    ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03"
+    ["565"]="565.77"
 )
 # https://developer.nvidia.com/cudnn-downloads
-if is_debuntu ; then
 readonly -A CUDNN_FOR_CUDA=(
-        ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
-        ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+    ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5"
+    ["11.0"]="8.0.4" ["11.1"]="8.0.5" ["11.2"]="8.1.1"
+    ["11.3"]="8.2.1" ["11.4"]="8.2.4.15" ["11.5"]="8.3.1.22"
+    ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17"
+    ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5"
+    ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18"
+    ["12.6"]="9.6.0.74"
 )
-elif is_rocky ; then
-# rocky:
-#   12.0: 8.8.1.3
-#   12.1: 8.9.3.28
-#   12.2: 8.9.7.29
-#   12.3: 9.0.0.312
-#   12.4: 9.1.1.17
-#   12.5: 9.2.1.18
-#   12.6: 9.5.1.17
-readonly -A CUDNN_FOR_CUDA=(
-        ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
-        ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
-)
-fi
 # https://developer.nvidia.com/nccl/nccl-download
 # 12.2: 2.19.3, 12.5: 2.21.5
 readonly -A NCCL_FOR_CUDA=(
-        ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
-        ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
+    ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3"
+    ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4"
+    ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12"
+    ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3"
+    ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4"
+    ["12.5"]="2.22.3" ["12.6"]="2.23.4"
 )
 readonly -A CUDA_SUBVER=(
-        ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
-        ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
+    ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89"
+    ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2"
+    ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2"
+    ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0"
+    ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2"
+    ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1"
+    ["12.6"]="12.6.3"
 )
 
 # Verify SPARK compatability
@@ -375,15 +384,25 @@ function set_cuda_runfile_url() {
   # driver version named in cuda runfile filename
   # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
   readonly -A drv_for_cuda=(
-          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
-          ["11.8.0"]="520.61.05"
-          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
-          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
-          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
-          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
-          ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
-          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not
-          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+      ["10.0.130"]="410.48"
+      ["10.1.234"]="418.87.00"
+      ["10.2.89"]="440.33.01"
+      ["11.0.3"]="450.51.06"
+      ["11.1.1"]="455.42.00"
+      ["11.2.2"]="460.32.03"
+      ["11.3.1"]="465.19.01"
+      ["11.4.4"]="470.82.01"
+      ["11.5.2"]="495.29.05"
+      ["11.6.2"]="510.47.03"
+      ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
+      ["11.8.0"]="520.61.05"
+      ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
+      ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
+      ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
+      ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
+      ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
+      ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
+      ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
   )
 
   # Verify that the file with the indicated combination exists
@@ -1890,7 +1909,7 @@ function prepare_to_install(){
 
   if is_debuntu ; then
     clean_up_sources_lists
-    apt-get update -qq
+    apt-get --allow-releaseinfo-change update
     apt-get -y clean
     apt-get -o DPkg::Lock::Timeout=60 -y autoremove
     if ge_debian12 ; then

From 4559ecc1ce4f1979658d73d4302bf3e45d856012 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 13 Jan 2025 13:41:14 -0800
Subject: [PATCH 083/112] tested with CUDA 11.6.2/510.108.03

* nccl build completes successfully on debian10

* account for nvidia-smi ABI change post 11.6
---
 gpu/install_gpu_driver.sh | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 9d6bfc135..71bef8293 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -345,7 +345,7 @@ function set_cuda_runfile_url() {
   local MAX_DRIVER_VERSION
   local MAX_CUDA_VERSION
 
-  local MIN_OPEN_DRIVER_VER="515.48.07"
+  MIN_OPEN_DRIVER_VER="515.43.04"
   local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
   local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
 
@@ -904,7 +904,7 @@ readonly uname_r=$(uname -r)
 
 function build_driver_from_github() {
   # non-GPL driver will have been built on rocky8
-  if is_rocky8 ; then return 0 ; fi
+  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then return 0 ; fi
   pushd "${workdir}"
   test -d "${workdir}/open-gpu-kernel-modules" || {
     tarball_fn="${DRIVER_VERSION}.tar.gz"
@@ -1025,7 +1025,7 @@ function install_nvidia_userspace_runfile() {
   local cache_hit="0"
   local local_tarball
 
-  if is_rocky8 ; then
+  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then
     local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
     test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
       local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
@@ -1039,7 +1039,9 @@ function install_nvidia_userspace_runfile() {
 
       if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
         cache_hit="1"
-        runfile_args="--no-kernel-modules"
+        if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
+          runfile_args="${runfile_args} --no-kernel-modules"
+        fi
         echo "cache hit"
       else
         install_build_dependencies
@@ -1054,11 +1056,13 @@ function install_nvidia_userspace_runfile() {
           --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
           "
         fi
-
-        runfile_args="--no-dkms ${signing_options}"
+        runfile_args="${signing_options}"
+        if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
+          runfile_args="${runfile_args} --no-dkms"
+        fi
       fi
     }
-  else
+  elif version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
     runfile_args="--no-kernel-modules"
   fi
 
@@ -1499,8 +1503,8 @@ function prepare_gpu_env(){
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
 # Users should run apt-mark unhold before they wish to upgrade these packages
 function hold_nvidia_packages() {
-  apt-mark hold nvidia-*
-  apt-mark hold libnvidia-*
+#  apt-mark hold nvidia-*
+#  apt-mark hold libnvidia-*
   if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
     apt-mark hold xserver-xorg-video-nvidia*
   fi

From 16c8485b54e8ed1fe5fa4c5610f4ff29768a4761 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 13 Jan 2025 18:35:06 -0800
Subject: [PATCH 084/112] exercised with cuda 11.1

* cleaned up nccl build and pack code a bit
* no longer installing cudnn from local debian repo
* unpacking nccl from cache immediately rather than waiting until
  later in the code
* determine cudnn version by what is available in the repo
* less noise from apt-mark hold
* nccl build tested on 11.1 and 11.6
* account for abi change in nvidia-smi
---
 gpu/install_gpu_driver.sh | 87 ++++++++++++++++++++++-----------------
 1 file changed, 49 insertions(+), 38 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 71bef8293..373fe664a 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -277,7 +277,8 @@ function set_driver_version() {
 
 set_driver_version
 
-readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
+readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
 readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
 
 # Parameters for NVIDIA-provided cuDNN library
@@ -285,9 +286,9 @@ readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
 CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
 function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
 function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
-# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-  CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+# The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
+if is_rocky  && (version_lt "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}") ; then
+  CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
 elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
   # cuDNN v8 is not distribution for ubuntu20+, debian12
   CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
@@ -620,30 +621,6 @@ function install_nvidia_nccl() {
 
   local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
 
-  # https://github.com/NVIDIA/nccl/blob/master/README.md
-  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
-  # Fermi:     SM_20,             compute_30
-  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
-  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
-  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
-
-  # The following architectures are suppored by open kernel driver
-  # Volta:     SM_70,SM_72,       compute_70,compute_72
-  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
-
-  # The following architectures are supported by CUDA v11.8+
-  # Ada:       SM_89,             compute_89
-  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
-  # Blackwell: SM_100,            compute_100
-                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
-  if version_ge "${CUDA_VERSION}" "11.8" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
-  fi
-  if version_ge "${CUDA_VERSION}" "12.0" ; then
-    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
-  fi
-
   mkdir -p "${workdir}"
   pushd "${workdir}"
 
@@ -668,11 +645,37 @@ function install_nvidia_nccl() {
     if echo "${output}" | grep -q "${gcs_tarball}" ; then
       # cache hit - unpack from cache
       echo "cache hit"
+      gcloud storage cat "${gcs_tarball}" | tar xvz
     else
       # build and cache
       pushd nccl
       # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
       install_build_dependencies
+
+      # https://github.com/NVIDIA/nccl/blob/master/README.md
+      # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+      # Fermi:     SM_20,             compute_30
+      # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+      # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+      # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+      # The following architectures are suppored by open kernel driver
+      # Volta:     SM_70,SM_72,       compute_70,compute_72
+      # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+      # The following architectures are supported by CUDA v11.8+
+      # Ada:       SM_89,             compute_89
+      # Hopper:    SM_90,SM_90a       compute_90,compute_90a
+      # Blackwell: SM_100,            compute_100
+                      NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
+      if version_gt "${CUDA_VERSION}" "11.6" ; then
+        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
+      if version_ge "${CUDA_VERSION}" "11.8" ; then
+        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
+      if version_ge "${CUDA_VERSION}" "12.0" ; then
+        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
+
       if is_debuntu ; then
         # These packages are required to build .deb packages from source
         execute_with_retries \
@@ -686,13 +689,13 @@ function install_nvidia_nccl() {
         export NVCC_GENCODE
         execute_with_retries make -j$(nproc) pkg.redhat.build
       fi
-      tar czvf "/${local_tarball}" "../${build_path}"
-      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      rm "${local_tarball}"
+      tar czvf "${local_tarball}" "../${build_path}"
       make clean
       popd
+      tar xzvf "${local_tarball}"
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
     fi
-    gcloud storage cat "${gcs_tarball}" | tar xz
   }
 
   if is_debuntu ; then
@@ -734,16 +737,17 @@ function install_nvidia_cudnn() {
       apt-get -y install nvidia-cudnn
     else
       if is_cudnn8 ; then
-        install_local_cudnn8_repo
+        add_repo_cuda
 
         apt-get update -qq
+	# Ignore version requested and use the latest version in the package index
+	cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)"
 
         execute_with_retries \
           apt-get -y install --no-install-recommends \
             "libcudnn8=${cudnn_pkg_version}" \
             "libcudnn8-dev=${cudnn_pkg_version}"
 
-        uninstall_local_cudnn8_repo
 	sync
       elif is_cudnn9 ; then
 	install_cuda_keyring_pkg
@@ -1503,8 +1507,10 @@ function prepare_gpu_env(){
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
 # Users should run apt-mark unhold before they wish to upgrade these packages
 function hold_nvidia_packages() {
-#  apt-mark hold nvidia-*
-#  apt-mark hold libnvidia-*
+  if ! is_debuntu ; then return ; fi
+
+  apt-mark hold nvidia-*    > /dev/null 2>&1
+  apt-mark hold libnvidia-* > /dev/null 2>&1
   if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
     apt-mark hold xserver-xorg-video-nvidia*
   fi
@@ -1587,17 +1593,22 @@ function main() {
         rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
       done
 
-      MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")"
       if test -n "$(nvsmi -L)" ; then
 	# cache the result of the gpu query
         ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
         echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
+	chmod a+r "/var/run/nvidia-gpu-index.txt"
       fi
+      MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e V100 -e A100 -e H100 || echo -n "")"
       NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
       if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
         # enable MIG on every GPU
 	for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do
-	  nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+          if version_le "${CUDA_VERSION}" "11.6" ; then
+            nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
+          else
+	    nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+          fi
 	done
 
         NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'

From afd5f2f4f15cb1edcf0d6c7e9e0a1b94e08700f3 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 14 Jan 2025 14:33:27 -0800
Subject: [PATCH 085/112] reverting cloudbuild/Dockerfile to master

---
 cloudbuild/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index 2ea91e3e5..aebaffd84 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -21,8 +21,8 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
     echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" | \
     dd of="${bazel_repo_file}" status=none && \
     apt-get update -qq
-RUN apt-get autoremove -y -qq && \
-    apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \
+RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
+    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
     apt-get clean
 
 # Set bazel-${bazel_version} as the default bazel alternative in this container

From 2272f97cb1c9ebbd29491311a289247ae33720d5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 14 Jan 2025 16:10:57 -0800
Subject: [PATCH 086/112] nvidia is 404ing for download.nvidia.com ; using
 us.download.nvidia.com

---
 gpu/install_gpu_driver.sh | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 373fe664a..f93992cb4 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -246,10 +246,10 @@ function set_driver_version() {
     if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
       major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
       driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
-      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+      if curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
         # use the version indicated by the cuda url as the default if it exists
 	DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
-      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+      elif curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
         # use the maximum sub-version available for the major version indicated in cuda url as the default
 	DEFAULT_DRIVER="${driver_max_maj_version}"
       fi
@@ -268,7 +268,7 @@ function set_driver_version() {
 
   export DRIVER_VERSION DRIVER
 
-  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  gpu_driver_url="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
   if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
     echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
     exit 1
@@ -302,7 +302,7 @@ readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
 readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
 
 # Parameters for NVIDIA-provided Debian GPU driver
-readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+readonly DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
 
 readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
 
@@ -383,7 +383,7 @@ function set_cuda_runfile_url() {
   fi
 
   # driver version named in cuda runfile filename
-  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
+  # (these may not be actual driver versions - see https://us.download.nvidia.com/XFree86/Linux-x86_64/)
   readonly -A drv_for_cuda=(
       ["10.0.130"]="410.48"
       ["10.1.234"]="418.87.00"
@@ -401,7 +401,7 @@ function set_cuda_runfile_url() {
       ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
       ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
       ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
-      ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
+      ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/
       ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
       ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
   )
@@ -1599,11 +1599,11 @@ function main() {
         echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
 	chmod a+r "/var/run/nvidia-gpu-index.txt"
       fi
-      MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e V100 -e A100 -e H100 || echo -n "")"
+      MIG_GPU_LIST="$(nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n "")"
       NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
       if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
         # enable MIG on every GPU
-	for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do
+	for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do
           if version_le "${CUDA_VERSION}" "11.6" ; then
             nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
           else

From 3b2dc66fdd366d43a9b769c6769308e608870de4 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 14 Jan 2025 21:45:20 -0800
Subject: [PATCH 087/112] skipping rocky9

---
 gpu/test_gpu.py | 54 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index f260d5927..e1ced1f41 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -4,8 +4,6 @@
 from absl.testing import absltest
 from absl.testing import parameterized
 
-import unittest
-
 from integration_tests.dataproc_test_case import DataprocTestCase
 
 DEFAULT_TIMEOUT = 15  # minutes
@@ -18,7 +16,7 @@ class NvidiaGpuDriverTestCase(DataprocTestCase):
   GPU_T4   = "type=nvidia-tesla-t4"
   GPU_V100 = "type=nvidia-tesla-v100"
   GPU_A100 = "type=nvidia-tesla-a100,count=2"
-  GPU_H100 = "type=nvidia-h100-80gb,count=8"
+  GPU_H100 = "type=nvidia-h100-80gb,count=2"
 
   # Tests for PyTorch
   TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py"
@@ -56,12 +54,20 @@ def verify_instance(self, name):
     time.sleep( 3 + random.randint(1, 30) )
     self.assert_instance_command(name, "nvidia-smi", 1)
 
+  def verify_pyspark(self, name):
+    # Verify that pyspark works
+    self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1)
+
   def verify_pytorch(self, name):
     test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                self.TORCH_TEST_SCRIPT_FILE_NAME)
     self.upload_test_file(test_filename, name)
 
-    verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
+    conda_env="dpgce"
+    verify_cmd = \
+      "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \
+      "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \
+      "${envpath}/bin/python {}".format(
         self.TORCH_TEST_SCRIPT_FILE_NAME)
     self.assert_instance_command(name, verify_cmd)
     self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)
@@ -70,8 +76,11 @@ def verify_tensorflow(self, name):
     test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                self.TF_TEST_SCRIPT_FILE_NAME)
     self.upload_test_file(test_filename, name)
-
-    verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format(
+    # all on a single numa node
+    verify_cmd = \
+      "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format("dpgce") + \
+      "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \
+      "${envpath}/bin/python {}".format(
         self.TF_TEST_SCRIPT_FILE_NAME)
     self.assert_instance_command(name, verify_cmd)
     self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)
@@ -149,7 +158,6 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     metadata = None
@@ -184,7 +192,6 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     if driver_provider is not None:
@@ -215,7 +222,6 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('KERBEROS fails with image version <= 2.1')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=true"
@@ -246,10 +252,12 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('KERBEROS fails with image version <= 2.1')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
@@ -265,7 +273,6 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
 
@@ -298,10 +305,9 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
     # Operation [projects/.../regions/.../operations/...] failed:
     # Invalid value for field 'resource.machineType': \
     # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \
-    # 'machineTypes/a3-highgpu-8g'. \
+    # 'machineTypes/a3-highgpu-2g'. \
     # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature..
     # ('This use case not thoroughly tested')
-    unittest.expectedFailure(self)
     self.skipTest("known to fail")
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
@@ -318,7 +324,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        master_machine_type="a3-highgpu-8g",
+        master_machine_type="a3-highgpu-2g",
         worker_machine_type="a2-highgpu-2g",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
@@ -338,11 +344,17 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
   def test_gpu_allocation(self, configuration, master_accelerator,
                           worker_accelerator, driver_provider):
 
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
+    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     metadata = None
@@ -372,6 +384,9 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
@@ -385,7 +400,6 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
@@ -416,18 +430,16 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
   )
-  def tests_driver_signing(self, configuration, machine_suffixes,
+  def untested_driver_signing(self, configuration, machine_suffixes,
                            master_accelerator, worker_accelerator,
                            cuda_version, image_os, image_version):
 
-    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
-    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('KERBEROS fails with image version <= 2.1')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     kvp_array=[]

From 0c420b70c6a5dc4ded7b50d29b50fbe198827d70 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 14 Jan 2025 21:45:58 -0800
Subject: [PATCH 088/112] * adding version 12.6 to the support matrix *
 changing layout of gcs package folder * install_pytorch function created and
 called when cuDNN is being installed

---
 gpu/install_gpu_driver.sh | 71 ++++++++++++++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 12 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index f93992cb4..b91046422 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -142,7 +142,7 @@ readonly -A DRIVER_FOR_CUDA=(
     ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03"
     ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
     ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08"
-    ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03"
+    ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142"
 )
 readonly -A DRIVER_SUBVER=(
     ["410"]="410.104" ["415"]="415.27" ["418"]="418.113"
@@ -403,7 +403,7 @@ function set_cuda_runfile_url() {
       ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
       ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/
       ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
-      ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+      ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05"
   )
 
   # Verify that the file with the indicated combination exists
@@ -413,16 +413,20 @@ function set_cuda_runfile_url() {
   local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
 
   NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
-  readonly NVIDIA_CUDA_URL
-
-  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
-  readonly CUDA_RUNFILE
 
   if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
     echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
+    if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then
+      echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead"
+    fi
     exit 1
   fi
 
+  readonly NVIDIA_CUDA_URL
+
+  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
+  readonly CUDA_RUNFILE
+
   if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
     echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
   elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
@@ -588,7 +592,7 @@ function install_local_cudnn8_repo() {
 
   # cache the cudnn package
   cache_fetched_package "${local_deb_url}" \
-                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
+                        "${pkg_bucket}/nvidia/cudnn/${CUDNN8_CUDA_VER}/${deb_fn}" \
                         "${local_deb_fn}"
 
   local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
@@ -639,7 +643,7 @@ function install_nvidia_nccl() {
   test -d "${workdir}/nccl/build" || {
     local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
     local local_tarball="${workdir}/${build_tarball}"
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}"
 
     output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
     if echo "${output}" | grep -q "${gcs_tarball}" ; then
@@ -775,6 +779,48 @@ function install_nvidia_cudnn() {
   echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
 }
 
+function install_pytorch() {
+  if test -f "${workdir}/complete/pytorch" ; then return ; fi
+  local env
+  env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
+  local mc3=/opt/conda/miniconda3
+  local envpath="${mc3}/envs/${env}"
+  # Set numa node to 0 for all GPUs
+  for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
+  local verb=create
+  if test -d "${envpath}" ; then verb=install ; fi
+
+  readonly USE_PYTORCH=$(get_metadata_attribute 'use-pytorch' 'no')
+  case "${USE_PYTORCH^^}" in
+    "1" | "YES" | "TRUE" )
+      local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+      local local_tarball="${workdir}/${build_tarball}"
+      local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+
+      output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+      if echo "${output}" | grep -q "${gcs_tarball}" ; then
+        # cache hit - unpack from cache
+        echo "cache hit"
+        mkdir -p "${envpath}"
+        gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+      else
+	cudart_spec="cuda-cudart"
+        if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+        "${mc3}/bin/mamba" "${verb}" -n "${env}" \
+	  -c conda-forge -c nvidia -c rapidsai \
+          numba pytorch tensorflow[and-cuda] rapids pyspark \
+          "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+        pushd "${envpath}"
+        tar czf "${local_tarball}" .
+	popd
+	gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      fi
+      ;;
+    * ) echo "skip pytorch install" ;;
+  esac
+  touch "${workdir}/complete/pytorch"
+}
+
 function configure_dkms_certs() {
   if test -v PSN && [[ -z "${PSN}" ]]; then
       echo "No signing secret provided.  skipping";
@@ -927,7 +973,7 @@ function build_driver_from_github() {
       then build_dir="${modulus_md5sum}"
       else build_dir="unsigned" ; fi
 
-    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
     if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
       echo "cache hit"
@@ -1021,7 +1067,7 @@ function install_nvidia_userspace_runfile() {
   local local_fn="${tmpdir}/userspace.run"
 
   cache_fetched_package "${USERSPACE_URL}" \
-                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
+                        "${pkg_bucket}/nvidia/${USERSPACE_FILENAME}" \
                         "${local_fn}"
 
   local runfile_args
@@ -1039,7 +1085,7 @@ function install_nvidia_userspace_runfile() {
         then build_dir="${modulus_md5sum}"
         else build_dir="unsigned" ; fi
 
-      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+      local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
       if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
         cache_hit="1"
@@ -1098,7 +1144,7 @@ function install_cuda_runfile() {
   local local_fn="${tmpdir}/cuda.run"
 
   cache_fetched_package "${NVIDIA_CUDA_URL}" \
-			"${pkg_bucket}/${CUDA_RUNFILE}" \
+			"${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \
                         "${local_fn}"
 
   execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
@@ -1578,6 +1624,7 @@ function main() {
       if [[ -n ${CUDNN_VERSION} ]]; then
         install_nvidia_nccl
         install_nvidia_cudnn
+        install_pytorch
       fi
       #Install GPU metrics collection in Stackdriver if needed
       if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then

From f69d071f68b93888e93260b269d7f652a5e6f282 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 15 Jan 2025 08:40:55 -0800
Subject: [PATCH 089/112] incorrect version check removed

---
 gpu/test_gpu.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index e1ced1f41..6ee2fb845 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -347,10 +347,6 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
       self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
-    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
-
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):

From 73ffce5d29b6dce7d7c7392eff99e5485fcdad84 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 21 Jan 2025 12:04:00 -0800
Subject: [PATCH 090/112] only install pytorch if include-pytorch metadata set
 to true

---
 gpu/install_gpu_driver.sh | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index b91046422..41a489447 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -790,8 +790,7 @@ function install_pytorch() {
   local verb=create
   if test -d "${envpath}" ; then verb=install ; fi
 
-  readonly USE_PYTORCH=$(get_metadata_attribute 'use-pytorch' 'no')
-  case "${USE_PYTORCH^^}" in
+  case "${INCLUDE_PYTORCH^^}" in
     "1" | "YES" | "TRUE" )
       local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
       local local_tarball="${workdir}/${build_tarball}"
@@ -1548,6 +1547,9 @@ function prepare_gpu_env(){
 
   if   is_cuda11 ; then gcc_ver="11"
   elif is_cuda12 ; then gcc_ver="12" ; fi
+
+  INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
+  readonly INCLUDE_PYTORCH
 }
 
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
@@ -1624,8 +1626,10 @@ function main() {
       if [[ -n ${CUDNN_VERSION} ]]; then
         install_nvidia_nccl
         install_nvidia_cudnn
-        install_pytorch
       fi
+      case "${INCLUDE_PYTORCH^^}" in
+        "1" | "YES" | "TRUE" ) install_pytorch ;;
+      esac
       #Install GPU metrics collection in Stackdriver if needed
       if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
         #install_ops_agent

From 521df6288f6a4935639249c745e409fa95117ce8 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 21 Jan 2025 13:40:57 -0800
Subject: [PATCH 091/112] since call to install_pytorch is protected by
 metadata check, skip metadata check within the function ; create new function
 harden_sshd_config and call it

---
 gpu/install_gpu_driver.sh | 76 +++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 31 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 41a489447..63dbf493b 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -787,36 +787,31 @@ function install_pytorch() {
   local envpath="${mc3}/envs/${env}"
   # Set numa node to 0 for all GPUs
   for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
-  local verb=create
-  if test -d "${envpath}" ; then verb=install ; fi
-
-  case "${INCLUDE_PYTORCH^^}" in
-    "1" | "YES" | "TRUE" )
-      local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
-      local local_tarball="${workdir}/${build_tarball}"
-      local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
-
-      output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
-      if echo "${output}" | grep -q "${gcs_tarball}" ; then
-        # cache hit - unpack from cache
-        echo "cache hit"
-        mkdir -p "${envpath}"
-        gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
-      else
-	cudart_spec="cuda-cudart"
-        if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
-        "${mc3}/bin/mamba" "${verb}" -n "${env}" \
-	  -c conda-forge -c nvidia -c rapidsai \
-          numba pytorch tensorflow[and-cuda] rapids pyspark \
-          "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
-        pushd "${envpath}"
-        tar czf "${local_tarball}" .
-	popd
-	gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      fi
-      ;;
-    * ) echo "skip pytorch install" ;;
-  esac
+
+  local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+  local local_tarball="${workdir}/${build_tarball}"
+  local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+
+  output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+  if echo "${output}" | grep -q "${gcs_tarball}" ; then
+    # cache hit - unpack from cache
+    echo "cache hit"
+    mkdir -p "${envpath}"
+    gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+  else
+    local verb=create
+    if test -d "${envpath}" ; then verb=install ; fi
+    cudart_spec="cuda-cudart"
+    if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+    "${mc3}/bin/mamba" "${verb}" -n "${env}" \
+      -c conda-forge -c nvidia -c rapidsai \
+      numba pytorch tensorflow[and-cuda] rapids pyspark \
+      "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+    pushd "${envpath}"
+    tar czf "${local_tarball}" .
+    popd
+    gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+  fi
   touch "${workdir}/complete/pytorch"
 }
 
@@ -1947,6 +1942,24 @@ function mount_ramdisk(){
   fi
 }
 
+function harden_sshd_config() {
+  # disable sha1 and md5 use in kex and kex-gss features
+  declare -rA feature_map=(["kex"]="kexalgorithms" ["kex-gss"]="gssapikexalgorithms")
+  for ftr in "${!feature_map[@]}" ; do
+    export feature=${feature_map[$ftr]}
+    sshd_config_line=$(
+      (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
+       ssh -Q "${ftr}" ) \
+      | sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}<STDIN>;
+      print("$ENV{feature} ",join(q",",map{ chomp; $_ }@a), $/) if "@a"')
+    grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
+    echo "$sshd_config_line" >> /tmp/sshd_config_new
+    # TODO: test whether sshd will reload with this change before mv
+    mv /tmp/sshd_config_new /etc/ssh/sshd_config
+  done
+  systemctl reload ssh
+}
+
 function prepare_to_install(){
   # Verify OS compatability and Secure boot state
   check_os
@@ -1971,9 +1984,10 @@ function prepare_to_install(){
 
   if test -f "${workdir}/complete/prepare" ; then return ; fi
 
-  repair_old_backports
+  harden_sshd_config
 
   if is_debuntu ; then
+    repair_old_backports
     clean_up_sources_lists
     apt-get --allow-releaseinfo-change update
     apt-get -y clean

From c0b60b2b1e34576f0489664012c9ea0b2cf46d47 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 21 Jan 2025 13:47:55 -0800
Subject: [PATCH 092/112] increasing timeout and machine shape to reduce
 no-cache build time

---
 gpu/test_gpu.py                         | 14 +++++++-------
 integration_tests/dataproc_test_case.py |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 6ee2fb845..e9c2d92ad 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -6,7 +6,7 @@
 
 from integration_tests.dataproc_test_case import DataprocTestCase
 
-DEFAULT_TIMEOUT = 15  # minutes
+DEFAULT_TIMEOUT = 45  # minutes
 DEFAULT_CUDA_VERSION = "12.4"
 
 class NvidiaGpuDriverTestCase(DataprocTestCase):
@@ -199,7 +199,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-highmem-8",
+        machine_type="n1-standard-16",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
@@ -230,7 +230,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-highmem-8",
+        machine_type="n1-standard-16",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
@@ -280,7 +280,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
-        machine_type="n1-highmem-8",
+        machine_type="n1-standard-16",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         metadata=metadata,
@@ -361,7 +361,7 @@ def test_gpu_allocation(self, configuration, master_accelerator,
         configuration,
         self.INIT_ACTIONS,
         metadata=metadata,
-        machine_type="n1-highmem-8",
+        machine_type="n1-standard-16",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         boot_disk_size="50GB",
@@ -402,7 +402,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
     self.createCluster(
       configuration,
       self.INIT_ACTIONS,
-      machine_type="n1-highmem-8",
+      machine_type="n1-standard-16",
       master_accelerator=master_accelerator,
       worker_accelerator=worker_accelerator,
       metadata=metadata,
@@ -460,7 +460,7 @@ def untested_driver_signing(self, configuration, machine_suffixes,
     self.createCluster(
       configuration,
       self.INIT_ACTIONS,
-      machine_type="n1-highmem-8",
+      machine_type="n1-standard-16",
       master_accelerator=master_accelerator,
       worker_accelerator=worker_accelerator,
       metadata=metadata,
diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index 936718498..314603ea1 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -23,7 +23,7 @@
 
 INTERNAL_IP_SSH = os.getenv("INTERNAL_IP_SSH", "false").lower() == "true"
 
-DEFAULT_TIMEOUT = 15  # minutes
+DEFAULT_TIMEOUT = 45  # minutes
 
 
 class DataprocTestCase(parameterized.TestCase):

From 30c97c4ccfc76921258474075c7197bb7ed6a496 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 21 Jan 2025 14:01:06 -0800
Subject: [PATCH 093/112] skip full test run due to edits to integration_tests
 directory

---
 cloudbuild/presubmit.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index eec7adb76..8f5a0a4b1 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,6 +70,7 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      continue # to be removed before merge
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0

From 84b1fb9dee4d21949549256a9a7bb0e7907d21a4 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 21 Jan 2025 14:09:37 -0800
Subject: [PATCH 094/112] ubuntu18 does not know about kex-gss ; use correct
 driver version number for cuda 11.1.1 url generation

---
 gpu/install_gpu_driver.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 63dbf493b..b98a5c9f2 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -389,7 +389,7 @@ function set_cuda_runfile_url() {
       ["10.1.234"]="418.87.00"
       ["10.2.89"]="440.33.01"
       ["11.0.3"]="450.51.06"
-      ["11.1.1"]="455.42.00"
+      ["11.1.1"]="455.32.00"
       ["11.2.2"]="460.32.03"
       ["11.3.1"]="465.19.01"
       ["11.4.4"]="470.82.01"
@@ -1944,7 +1944,8 @@ function mount_ramdisk(){
 
 function harden_sshd_config() {
   # disable sha1 and md5 use in kex and kex-gss features
-  declare -rA feature_map=(["kex"]="kexalgorithms" ["kex-gss"]="gssapikexalgorithms")
+  declare -A feature_map=(["kex"]="kexalgorithms")
+  if ( ! is_ubuntu || ge_ubuntu20 ) ; then feature_map["kex-gss"]="gssapikexalgorithms" ; fi
   for ftr in "${!feature_map[@]}" ; do
     export feature=${feature_map[$ftr]}
     sshd_config_line=$(

From 11cbe953dd46bafea6bfb0bb04a6f50e0626ef79 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 21 Jan 2025 19:48:17 -0800
Subject: [PATCH 095/112] on rocky9 sshd service is called sshd instead of ssh
 as the rest of the platforms call it

---
 gpu/install_gpu_driver.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index b98a5c9f2..f7b5900f1 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1958,7 +1958,9 @@ function harden_sshd_config() {
     # TODO: test whether sshd will reload with this change before mv
     mv /tmp/sshd_config_new /etc/ssh/sshd_config
   done
-  systemctl reload ssh
+  local svc=ssh
+  if ge_rocky9 ; then svc="sshd" ; fi
+  systemctl reload "${svc}"
 }
 
 function prepare_to_install(){

From 56fe50cf4a5e9ed10cf41ab3d47734f7e02948bc Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 21 Jan 2025 21:05:54 -0800
Subject: [PATCH 096/112] kex-gss is new in debian11

---
 gpu/install_gpu_driver.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index f7b5900f1..7a0801081 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1945,7 +1945,8 @@ function mount_ramdisk(){
 function harden_sshd_config() {
   # disable sha1 and md5 use in kex and kex-gss features
   declare -A feature_map=(["kex"]="kexalgorithms")
-  if ( ! is_ubuntu || ge_ubuntu20 ) ; then feature_map["kex-gss"]="gssapikexalgorithms" ; fi
+  if ( is_rocky || version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ) ; then
+    feature_map["kex-gss"]="gssapikexalgorithms" ; fi
   for ftr in "${!feature_map[@]}" ; do
     export feature=${feature_map[$ftr]}
     sshd_config_line=$(

From b1cd1d0c5864233b2f81f6b7cabba72b226782c4 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 21 Jan 2025 21:22:22 -0800
Subject: [PATCH 097/112] all rocky call it sshd it seems

---
 gpu/install_gpu_driver.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 7a0801081..dcda8154a 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1960,7 +1960,7 @@ function harden_sshd_config() {
     mv /tmp/sshd_config_new /etc/ssh/sshd_config
   done
   local svc=ssh
-  if ge_rocky9 ; then svc="sshd" ; fi
+  if is_rocky ; then svc="sshd" ; fi
   systemctl reload "${svc}"
 }
 

From ca94393c8555e15ccceaaacbad0d3813e41c7b9e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 21 Jan 2025 22:02:35 -0800
Subject: [PATCH 098/112] cudnn no longer available on debian10

---
 gpu/install_gpu_driver.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index dcda8154a..188ffcd7b 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -717,6 +717,7 @@ function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
 
 function install_nvidia_cudnn() {
   if test -f "${workdir}/complete/cudnn" ; then return ; fi
+  if le_debian10 ; then return ; fi
   local major_version
   major_version="${CUDNN_VERSION%%.*}"
   local cudnn_pkg_version

From 1d2166c53e4919a9e1bb34c22ad72373d5f5d83b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 22 Jan 2025 16:31:42 -0800
Subject: [PATCH 099/112] compared with #1282 ; this change matches parity more
 closely

---
 gpu/install_gpu_driver.sh | 231 ++++++++++++++++++++++----------------
 1 file changed, 135 insertions(+), 96 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 188ffcd7b..b79c67d6b 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -134,8 +134,7 @@ readonly ROLE
 
 # Minimum supported version for open kernel driver is 515.43.04
 # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
-# Rocky8: 12.0: 525.147.05
-latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
+latest="$(curl -s https://us.download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
 readonly -A DRIVER_FOR_CUDA=(
     ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01"
     ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31"
@@ -165,7 +164,6 @@ readonly -A CUDNN_FOR_CUDA=(
     ["12.6"]="9.6.0.74"
 )
 # https://developer.nvidia.com/nccl/nccl-download
-# 12.2: 2.19.3, 12.5: 2.21.5
 readonly -A NCCL_FOR_CUDA=(
     ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3"
     ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4"
@@ -184,10 +182,16 @@ readonly -A CUDA_SUBVER=(
     ["12.6"]="12.6.3"
 )
 
-# Verify SPARK compatability
-RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-
 function set_cuda_version() {
+  case "${DATAPROC_IMAGE_VERSION}" in
+    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
+    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    *   )
+      echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
+      exit 1
+      ;;
+  esac
   local cuda_url
   cuda_url=$(get_metadata_attribute 'cuda-url' '')
   if [[ -n "${cuda_url}" ]] ; then
@@ -195,14 +199,9 @@ function set_cuda_version() {
     local CUDA_URL_VERSION
     CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
     if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
-      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
-      CUDA_FULL_VERSION="${CUDA_URL_VERSION}"
+      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION}"
     fi
   fi
-
-  if ( ! test -v DEFAULT_CUDA_VERSION ) ; then
-    DEFAULT_CUDA_VERSION='12.4'
-  fi
   readonly DEFAULT_CUDA_VERSION
 
   CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
@@ -215,7 +214,6 @@ function set_cuda_version() {
     CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
   fi
   readonly CUDA_FULL_VERSION
-
 }
 
 set_cuda_version
@@ -264,7 +262,7 @@ function set_driver_version() {
   DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
 
   readonly DRIVER_VERSION
-  readonly DRIVER=${DRIVER_VERSION%%.*}
+  readonly DRIVER="${DRIVER_VERSION%%.*}"
 
   export DRIVER_VERSION DRIVER
 
@@ -498,25 +496,24 @@ function execute_with_retries() (
   return 1
 )
 
-CUDA_KEYRING_PKG_INSTALLED="0"
 function install_cuda_keyring_pkg() {
-  if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
+  is_complete cuda-keyring-installed && return
   local kr_ver=1.1
   curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
     -o "${tmpdir}/cuda-keyring.deb"
   dpkg -i "${tmpdir}/cuda-keyring.deb"
   rm -f "${tmpdir}/cuda-keyring.deb"
-  CUDA_KEYRING_PKG_INSTALLED="1"
+  mark_complete cuda-keyring-installed
 }
 
 function uninstall_cuda_keyring_pkg() {
   apt-get purge -yq cuda-keyring
-  CUDA_KEYRING_PKG_INSTALLED="0"
+  mark_incomplete cuda-keyring-installed
 }
 
 function install_local_cuda_repo() {
-  if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi
+  is_complete install-local-cuda-repo && return
 
   pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
   CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
@@ -537,16 +534,15 @@ function install_local_cuda_repo() {
       -o /etc/apt/preferences.d/cuda-repository-pin-600
   fi
 
-  touch "${workdir}/complete/install-local-cuda-repo"
+  mark_complete install-local-cuda-repo
 }
 function uninstall_local_cuda_repo(){
   apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cuda-repo"
+  mark_incomplete install-local-cuda-repo
 }
 
-CUDNN_PKG_NAME=""
 function install_local_cudnn_repo() {
-  if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi
+  is_complete install-local-cudnn-repo && return
   pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
   CUDNN_PKG_NAME="${pkgname}"
   local_deb_fn="${pkgname}_1.0-1_amd64.deb"
@@ -562,18 +558,16 @@ function install_local_cudnn_repo() {
 
   cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
 
-  touch "${workdir}/complete/install-local-cudnn-repo"
+  mark_complete install-local-cudnn-repo
 }
 
 function uninstall_local_cudnn_repo() {
   apt-get purge -yq "${CUDNN_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cudnn-repo"
+  mark_incomplete install-local-cudnn-repo
 }
 
-CUDNN8_LOCAL_REPO_INSTALLED="0"
-CUDNN8_PKG_NAME=""
 function install_local_cudnn8_repo() {
-  if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi
+  is_complete install-local-cudnn8-repo && return
 
   if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
   elif is_debian ; then cudnn8_shortname="debian11"
@@ -607,16 +601,16 @@ function install_local_cudnn8_repo() {
   rm -f "${local_deb_fn}"
 
   cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  touch "${workdir}/complete/install-local-cudnn8-repo"
+  mark_complete install-local-cudnn8-repo
 }
 
 function uninstall_local_cudnn8_repo() {
   apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  rm -f "${workdir}/complete/install-local-cudnn8-repo"
+  mark_incomplete install-local-cudnn8-repo
 }
 
 function install_nvidia_nccl() {
-  if test -f "${workdir}/complete/nccl" ; then return ; fi
+  is_complete nccl && return
 
   if is_cuda11 && is_debian12 ; then
     echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
@@ -709,14 +703,14 @@ function install_nvidia_nccl() {
   fi
 
   popd
-  touch "${workdir}/complete/nccl"
+  mark_complete nccl
 }
 
 function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
 function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
 
 function install_nvidia_cudnn() {
-  if test -f "${workdir}/complete/cudnn" ; then return ; fi
+  is_complete cudnn && return
   if le_debian10 ; then return ; fi
   local major_version
   major_version="${CUDNN_VERSION%%.*}"
@@ -764,6 +758,7 @@ function install_nvidia_cudnn() {
           "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
           "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
           "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
+
 	sync
       else
         echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
@@ -776,8 +771,8 @@ function install_nvidia_cudnn() {
 
   ldconfig
 
-  touch "${workdir}/complete/cudnn"
   echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
+  mark_complete cudnn
 }
 
 function install_pytorch() {
@@ -948,7 +943,7 @@ function add_repo_cuda() {
 readonly uname_r=$(uname -r)
 
 function build_driver_from_github() {
-  # non-GPL driver will have been built on rocky8
+  # non-GPL driver will have been built on rocky8 or if driver version is prior to open kernel version
   if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then return 0 ; fi
   pushd "${workdir}"
   test -d "${workdir}/open-gpu-kernel-modules" || {
@@ -976,7 +971,7 @@ function build_driver_from_github() {
       # build the kernel modules
       pushd open-gpu-kernel-modules
       install_build_dependencies
-      if is_cuda11 && is_ubuntu22 ; then
+      if ( is_cuda11 && is_ubuntu22 ) ; then
         echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
         exit 1
       fi
@@ -985,12 +980,14 @@ function build_driver_from_github() {
         2> kernel-open/build_error.log
       # Sign kernel modules
       if [[ -n "${PSN}" ]]; then
+        configure_dkms_certs
         for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do
           "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
           "${mok_key}" \
           "${mok_der}" \
           "${module}"
         done
+        clear_dkms_key
       fi
       make modules_install \
         >>  kernel-open/build.log \
@@ -1030,12 +1027,12 @@ function build_driver_from_packages() {
     add_contrib_component
     apt-get update -qq
     execute_with_retries apt-get install -y -qq --no-install-recommends dkms
-    #configure_dkms_certs
+    configure_dkms_certs
     execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
     sync
 
   elif is_rocky ; then
-    #configure_dkms_certs
+    configure_dkms_certs
     if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
       echo "nvidia-driver:${DRIVER}-dkms installed successfully"
     else
@@ -1043,7 +1040,7 @@ function build_driver_from_packages() {
     fi
     sync
   fi
-  #clear_dkms_key
+  clear_dkms_key
 }
 
 function install_nvidia_userspace_runfile() {
@@ -1058,7 +1055,7 @@ function install_nvidia_userspace_runfile() {
   #
   # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
   # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
-  if test -f "${workdir}/complete/userspace" ; then return ; fi
+  is_complete userspace && return
   local local_fn="${tmpdir}/userspace.run"
 
   cache_fetched_package "${USERSPACE_URL}" \
@@ -1090,7 +1087,7 @@ function install_nvidia_userspace_runfile() {
         echo "cache hit"
       else
         install_build_dependencies
-
+        configure_dkms_certs
         local signing_options
         signing_options=""
         if [[ -n "${PSN}" ]]; then
@@ -1117,11 +1114,12 @@ function install_nvidia_userspace_runfile() {
     --install-libglvnd \
     --tmpdir="${tmpdir}"
 
-  if is_rocky8 ; then
+  if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then
     if [[ "${cache_hit}" == "1" ]] ; then
       gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
       depmod -a
     else
+      clear_dkms_key
       tar czvf "${local_tarball}" \
         /var/log/nvidia-installer.log \
         $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
@@ -1130,21 +1128,22 @@ function install_nvidia_userspace_runfile() {
   fi
 
   rm -f "${local_fn}"
-  touch "${workdir}/complete/userspace"
+  mark_complete userspace
   sync
 }
 
 function install_cuda_runfile() {
-  if test -f "${workdir}/complete/cuda" ; then return ; fi
+  is_complete cuda && return
+
   local local_fn="${tmpdir}/cuda.run"
 
   cache_fetched_package "${NVIDIA_CUDA_URL}" \
-			"${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \
+                        "${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \
                         "${local_fn}"
 
   execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
   rm -f "${local_fn}"
-  touch "${workdir}/complete/cuda"
+  mark_complete cuda
   sync
 }
 
@@ -1170,7 +1169,9 @@ function install_cuda_toolkit() {
 function load_kernel_module() {
   # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
   for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
-    rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+    ( set +e
+      rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
+    )
   done
 
   depmod -a
@@ -1182,7 +1183,8 @@ function load_kernel_module() {
 }
 
 function install_cuda(){
-  if test -f "${workdir}/complete/cuda-repo" ; then return ; fi
+  is_complete cuda-repo && return
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     echo "installed with the driver on ${_shortname}"
@@ -1195,10 +1197,12 @@ function install_cuda(){
   # Includes CUDA packages
   add_repo_cuda
 
-  touch "${workdir}/complete/cuda-repo"
+  mark_complete cuda-repo
 }
 
 function install_nvidia_container_toolkit() {
+  is_complete install-nvtk && return
+
   local container_runtime_default
     if command -v docker     ; then container_runtime_default='docker'
   elif command -v containerd ; then container_runtime_default='containerd'
@@ -1214,11 +1218,14 @@ function install_nvidia_container_toolkit() {
     execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
   nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
   systemctl restart "${CONTAINER_RUNTIME}"
+
+  mark_complete install-nvtk
 }
 
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
-  if test -f "${workdir}/complete/gpu-driver" ; then return ; fi
+  is_complete gpu-driver && return
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
@@ -1240,11 +1247,11 @@ function install_nvidia_gpu_driver() {
   build_driver_from_github
 
   echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  touch "${workdir}/complete/gpu-driver"
+  mark_complete gpu-driver
 }
 
 function install_ops_agent(){
-  if test -f "${workdir}/ops-agent-complete" ; then return ; fi
+  is_complete ops-agent && return
 
   mkdir -p /opt/google
   cd /opt/google
@@ -1252,7 +1259,7 @@ function install_ops_agent(){
   curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
   execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
 
-  touch "${workdir}/complete/ops-agent"
+  mark_complete ops-agent
 }
 
 # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
@@ -1272,7 +1279,7 @@ function install_gpu_agent() {
     | sed -e 's/-u --format=/--format=/' \
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
   local venv="${install_dir}/venv"
-  python3 -m venv "${venv}"
+  /opt/conda/miniconda3/bin/python3 -m venv "${venv}"
 (
   source "${venv}/bin/activate"
   python3 -m pip install --upgrade pip
@@ -1329,11 +1336,12 @@ function configure_yarn_resources() {
 
 # This configuration should be applied only if GPU is attached to the node
 function configure_yarn_nodemanager() {
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
   set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
   set_hadoop_property 'yarn-site.xml' \
     'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
   set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
   set_hadoop_property 'yarn-site.xml' \
     'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
   set_hadoop_property 'yarn-site.xml' \
@@ -1358,13 +1366,12 @@ function configure_yarn_nodemanager() {
 }
 
 function configure_gpu_exclusive_mode() {
-  # check if running spark 3, if not, enable GPU exclusive mode
-  local spark_version
-  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
-  if [[ ${spark_version} != 3.* ]]; then
-    # include exclusive mode on GPU
-    nvidia-smi -c EXCLUSIVE_PROCESS
-  fi
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
+  # only run this function when spark < 3.0
+  if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
+  # include exclusive mode on GPU
+  nvsmi -c EXCLUSIVE_PROCESS
+  clear_nvsmi_cache
 }
 
 function fetch_mig_scripts() {
@@ -1376,6 +1383,7 @@ function fetch_mig_scripts() {
 }
 
 function configure_gpu_script() {
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
   # Download GPU discovery script
   local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
   mkdir -p ${spark_gpu_script_dir}
@@ -1402,6 +1410,7 @@ function configure_gpu_script() {
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
 
 ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
 
@@ -1411,18 +1420,18 @@ EOF
   chmod a+rx "${gpus_resources_script}"
 
   local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
-  if version_ge "${SPARK_VERSION}" "3.0" ; then
-    local gpu_count
-    gpu_count="$(lspci | grep NVIDIA | wc -l)"
-    local executor_cores
-    executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
-    local executor_memory
-    executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
-    local task_cpus=2
-    local gpu_amount
-    gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
-
-    cat >>"${spark_defaults_conf}" <<EOF
+  if version_lt "${SPARK_VERSION}" "3.0" ; then return ; fi
+
+  local executor_cores
+  executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
+  local executor_memory
+  executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
+  local task_cpus=2
+  local gpu_amount
+#  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+  gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
+
+  cat >>"${spark_defaults_conf}" <<EOF
 ###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
 # Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
 # query explain output won't show GPU operator, if the user has doubts
@@ -1430,6 +1439,7 @@ EOF
 # having AQE enabled gives user the best performance.
 spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
 spark.executor.resource.gpu.amount=${gpu_count}
+spark.plugins=com.nvidia.spark.SQLPlugin
 spark.executor.cores=${executor_cores}
 spark.executor.memory=${executor_memory_gb}G
 spark.dynamicAllocation.enabled=false
@@ -1439,10 +1449,10 @@ spark.task.cpus=2
 spark.yarn.unmanagedAM.enabled=false
 ###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
 EOF
-  fi
 }
 
 function configure_gpu_isolation() {
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
   # enable GPU isolation
   sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
   if [[ $IS_MIG_ENABLED -ne 0 ]]; then
@@ -1477,7 +1487,7 @@ function nvsmi() {
   elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
   else nvsmi_works="1" ; fi
 
-  if [[ "$1" == "-L" ]] ; then
+  if test -v 1 && [[ "$1" == "-L" ]] ; then
     local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
     if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
     else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi
@@ -1489,7 +1499,7 @@ function nvsmi() {
 }
 
 function install_build_dependencies() {
-  if test -f "${workdir}/complete/build-dependencies" ; then return ; fi
+  is_complete build-dependencies && return
 
   if is_debuntu ; then
     if is_ubuntu22 && is_cuda12 ; then
@@ -1527,25 +1537,57 @@ function install_build_dependencies() {
 
     execute_with_retries "${dnf_cmd}"
   fi
-  touch "${workdir}/complete/build-dependencies"
+  mark_complete build-dependencies
+}
+
+function is_complete() {
+  phase="$1"
+  test -f "${workdir}/complete/${phase}"
+}
+
+function mark_complete() {
+  phase="$1"
+  touch "${workdir}/complete/${phase}"
+}
+
+function mark_incomplete() {
+  phase="$1"
+  rm -f "${workdir}/complete/${phase}"
 }
 
 function install_dependencies() {
+  is_complete install-dependencies && return 0
+
   pkg_list="pciutils screen"
   if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
   elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
+  mark_complete install-dependencies
 }
 
 function prepare_gpu_env(){
 
+  set +e
+  gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
+  set -e
+
   readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
   nvsmi_works="0"
 
   if   is_cuda11 ; then gcc_ver="11"
   elif is_cuda12 ; then gcc_ver="12" ; fi
 
-  INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
-  readonly INCLUDE_PYTORCH
+  if ! test -v DEFAULT_RAPIDS_RUNTIME ; then
+    readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
+  fi
+
+  # Set variables from metadata
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
+  INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")"
+  INCLUDE_PYTORCH="$(get_metadata_attribute 'include-pytorch' 'no')"
+  readonly RAPIDS_RUNTIME INCLUDE_GPUS INCLUDE_PYTORCH
+
+  # determine whether we have nvidia-smi installed and working
+  nvsmi
 }
 
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades
@@ -1584,8 +1626,6 @@ function check_secure_boot() {
                       mok_der=/var/lib/shim-signed/mok/MOK.der
                  else mok_key=/var/lib/dkms/mok.key
                       mok_der=/var/lib/dkms/mok.pub ; fi
-
-  configure_dkms_certs
 }
 
 
@@ -1836,15 +1876,12 @@ function exit_handler() {
       /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
       /usr/lib \
       /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
       /opt/conda/miniconda3 | sort -h
   elif is_debian ; then
     du -x -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
       /var/lib/{docker,mysql,} \
-      /usr/lib \
       /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
       /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
       /usr/bin \
       /usr \
@@ -1853,11 +1890,9 @@ function exit_handler() {
   else
     du -hs \
       /var/lib/docker \
-      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
+      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
       /usr/lib64/google-cloud-sdk \
-      /usr/lib \
       /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
       /opt/conda/miniconda3
   fi
 
@@ -1874,11 +1909,12 @@ function exit_handler() {
   perl -e '@siz=( sort { $a => $b }
                    map { (split)[2] =~ /^(\d+)/ }
                   grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting};
 print( "    samples-taken: ", scalar @siz, $/,
-       "maximum-disk-used: $max", $/,
-       "minimum-disk-used: $min", $/,
-       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+       "starting-disk-used: $starting", $/,
+       "maximum-disk-used:  $max", $/,
+       "minimum-disk-used:  $min", $/,
+       "     increased-by:  $inc", $/ )' < "/run/disk-usage.log"
 
   echo "exit_handler has completed"
 
@@ -1987,18 +2023,21 @@ function prepare_to_install(){
 
   readonly install_log="${tmpdir}/install.log"
 
-  if test -f "${workdir}/complete/prepare" ; then return ; fi
+  is_complete prepare.common && return
 
   harden_sshd_config
 
   if is_debuntu ; then
     repair_old_backports
     clean_up_sources_lists
-    apt-get --allow-releaseinfo-change update
+    apt-get update -qq --allow-releaseinfo-change
     apt-get -y clean
     apt-get -o DPkg::Lock::Timeout=60 -y autoremove
     if ge_debian12 ; then
     apt-mark unhold systemd libsystemd0 ; fi
+    if is_ubuntu ; then
+      while ! command -v gcloud ; do sleep 5s ; done
+    fi
   else
     dnf clean all
   fi
@@ -2016,7 +2055,7 @@ function prepare_to_install(){
   screen -d -m -LUS keep-running-df \
     bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
 
-  touch "${workdir}/complete/prepare"
+  mark_complete prepare.common
 }
 
 function check_os() {

From 50142f6ee1b8ece3bfc168dcb6aeef2d23bb6824 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 22 Jan 2025 17:21:04 -0800
Subject: [PATCH 100/112] slightly better variable declaration ordering ; it is
 better still in the templates/ directory from #1282

---
 gpu/install_gpu_driver.sh | 95 ++++++++++++++++++++-------------------
 1 file changed, 48 insertions(+), 47 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index b79c67d6b..a48e624e9 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -216,8 +216,6 @@ function set_cuda_version() {
   readonly CUDA_FULL_VERSION
 }
 
-set_cuda_version
-
 function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
 function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
 function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )
@@ -273,39 +271,27 @@ function set_driver_version() {
   fi
 }
 
-set_driver_version
-
-readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
-readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
-readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
-
-# Parameters for NVIDIA-provided cuDNN library
-readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
-CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
-function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
-# The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
-if is_rocky  && (version_lt "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}") ; then
-  CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
-elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
-  # cuDNN v8 is not distribution for ubuntu20+, debian12
-  CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
-elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
-  # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
-  CUDNN_VERSION="8.8.0.121"
-fi
-readonly CUDNN_VERSION
-
-readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
-readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
-
-# Parameters for NVIDIA-provided Debian GPU driver
-readonly DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-
-readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+function set_cudnn_version() {
+  readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
+  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+  # Parameters for NVIDIA-provided cuDNN library
+  readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+  # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION}
+  if ( is_rocky  && version_lt "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then
+    CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
+  elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
+    # cuDNN v8 is not distribution for ubuntu20+, debian12
+    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+  elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
+    # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
+    CUDNN_VERSION="8.8.0.121"
+  fi
+  readonly CUDNN_VERSION
+}
 
-USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
-readonly USERSPACE_FILENAME
 
 # Short name for urls
 if is_ubuntu22  ; then
@@ -330,15 +316,14 @@ else
     nccl_shortname="${shortname}"
 fi
 
-# Parameters for NVIDIA-provided package repositories
-readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
-readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
+function set_nv_urls() {
+  # Parameters for NVIDIA-provided package repositories
+  readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
+  readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
 
-# Parameters for NVIDIA-provided NCCL library
-readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
-NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
-readonly NCCL_REPO_URL
-readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
+  # Parameter for NVIDIA-provided Rocky Linux GPU driver
+  readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+}
 
 function set_cuda_runfile_url() {
   local MAX_DRIVER_VERSION
@@ -436,11 +421,7 @@ function set_cuda_runfile_url() {
   fi
 }
 
-set_cuda_runfile_url
-
-# Parameter for NVIDIA-provided Rocky Linux GPU driver
-readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
-
+function set_cudnn_tarball_url() {
 CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
 CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
 if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
@@ -460,6 +441,7 @@ if ( version_ge "${CUDA_VERSION}" "12.0" ); then
 fi
 readonly CUDNN_TARBALL
 readonly CUDNN_TARBALL_URL
+}
 
 # Whether to install NVIDIA-provided or OS-provided GPU driver
 GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
@@ -610,6 +592,9 @@ function uninstall_local_cudnn8_repo() {
 }
 
 function install_nvidia_nccl() {
+  readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+
   is_complete nccl && return
 
   if is_cuda11 && is_debian12 ; then
@@ -1044,6 +1029,13 @@ function build_driver_from_packages() {
 }
 
 function install_nvidia_userspace_runfile() {
+  # Parameters for NVIDIA-provided Debian GPU driver
+  readonly DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+
+  readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+
+  USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
+  readonly USERSPACE_FILENAME
 
   # This .run file contains NV's OpenGL implementation as well as
   # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
@@ -1565,6 +1557,10 @@ function install_dependencies() {
 }
 
 function prepare_gpu_env(){
+  #set_support_matrix
+
+  set_cuda_version
+  set_driver_version
 
   set +e
   gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
@@ -1588,6 +1584,11 @@ function prepare_gpu_env(){
 
   # determine whether we have nvidia-smi installed and working
   nvsmi
+
+  set_nv_urls
+  set_cuda_runfile_url
+  set_cudnn_version
+  set_cudnn_tarball_url
 }
 
 # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades

From 6363203f7ee077b2e26eb1cebe41fa7d0f43bb63 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 22 Jan 2025 18:34:33 -0800
Subject: [PATCH 101/112] install spark rapids

---
 gpu/install_gpu_driver.sh | 40 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index a48e624e9..f9df64b31 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1566,7 +1566,6 @@ function prepare_gpu_env(){
   gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
   set -e
 
-  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
   nvsmi_works="0"
 
   if   is_cuda11 ; then gcc_ver="11"
@@ -1708,6 +1707,8 @@ function main() {
     fi
 
     configure_yarn_nodemanager
+    if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
+      install_spark_rapids ; fi
     configure_gpu_script
     configure_gpu_isolation
   elif [[ "${ROLE}" == "Master" ]]; then
@@ -2149,6 +2150,43 @@ function os_add_repo() {
 
 readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
 
+function install_spark_rapids() {
+  # Update SPARK RAPIDS config
+  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
+
+  # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
+  local -r scala_ver="2.12"
+
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
+    local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+  fi
+
+  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
+  readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
+
+  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
+  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
+  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
+
+  local jar_basename
+
+  jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+
+  jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+
+  jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
+  cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+}
+
 prepare_to_install
 
 main

From dba00dfef9b292681b5e75ad15e348ae7bbafc8e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 22 Jan 2025 20:57:37 -0800
Subject: [PATCH 102/112] cache the results of nvidia-smi --query-gpu

---
 gpu/install_gpu_driver.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index f9df64b31..18e694d73 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1404,9 +1404,15 @@ function configure_gpu_script() {
 #
 # Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
 
+set -e
+resources_json="/dev/shm/nvidia/gpusResources.json"
+if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi
+
+mkdir -p "$(dirname ${resources_json})"
+
 ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
 
-echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
+echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}"
 EOF
 
   chmod a+rx "${gpus_resources_script}"

From 96a8d6d01a4ff525c99f153f04fc8330df9edfdb Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 23 Jan 2025 06:50:15 -0800
Subject: [PATCH 103/112] reduce development time

---
 cloudbuild/presubmit.sh                 | 1 -
 integration_tests/dataproc_test_case.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index 8f5a0a4b1..f796dd1f8 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -105,7 +105,6 @@ run_tests() {
   bazel test \
     --jobs="${max_parallel_tests}" \
     --local_test_jobs="${max_parallel_tests}" \
-    --flaky_test_attempts=3 \
     --action_env="INTERNAL_IP_SSH=true" \
     --test_output="all" \
     --noshow_progress \
diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index 314603ea1..4e4848523 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -180,7 +180,7 @@ def createCluster(self,
         if not FLAGS.skip_cleanup:
           args.append("--max-age=60m")
 
-        args.append("--max-idle=25m")
+        args.append("--max-idle=45m")
 
         cmd = "{} dataproc clusters create {} {}".format(
             "gcloud beta" if beta else "gcloud", self.name, " ".join(args))

From 11f099c804f33f5b63e69767584f735b55bf815d Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 23 Jan 2025 07:56:06 -0800
Subject: [PATCH 104/112] exercising more CUDA variants ; testing whether tests
 fail on long runs

---
 gpu/test_gpu.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index e9c2d92ad..3ec053e0e 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -370,10 +370,10 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     self.verify_instance_spark()
 
   @parameterized.parameters(
-    ("SINGLE", ["m"], GPU_T4, None, "11.8"),
-#    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
-    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
+#    ("SINGLE", ["m"], GPU_T4, None, "11.8"),
+    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
+#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
+    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
   )
   def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes,

From 8ae2c0a2ecd995c49f87e458272b8f5cb8b3e4fe Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 23 Jan 2025 11:26:10 -0800
Subject: [PATCH 105/112] try to reduce concurrent builds ; extend build time
 further ; only enable spark rapids on images >= 2.1

---
 gpu/install_gpu_driver.sh               | 59 ++++++++++++++++++++++++-
 integration_tests/dataproc_test_case.py |  4 +-
 2 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 18e694d73..c0da65e34 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -624,6 +624,14 @@ function install_nvidia_nccl() {
     local local_tarball="${workdir}/${build_tarball}"
     local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}"
 
+    if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
+      # do not build in tests with < 32 cores
+      sleep $(( ( RANDOM % 11 ) + 10 ))
+      while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
+        sleep 5m
+      done
+    fi
+
     output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
     if echo "${output}" | grep -q "${gcs_tarball}" ; then
       # cache hit - unpack from cache
@@ -631,6 +639,8 @@ function install_nvidia_nccl() {
       gcloud storage cat "${gcs_tarball}" | tar xvz
     else
       # build and cache
+      touch "${local_tarball}.building"
+      gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
       pushd nccl
       # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
       install_build_dependencies
@@ -677,6 +687,7 @@ function install_nvidia_nccl() {
       popd
       tar xzvf "${local_tarball}"
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      gcloud storage rm "${gcs_tarball}.building"
       rm "${local_tarball}"
     fi
   }
@@ -773,6 +784,14 @@ function install_pytorch() {
   local local_tarball="${workdir}/${build_tarball}"
   local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
 
+  if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
+    # do not build in tests with < 32 cores
+    sleep $(( ( RANDOM % 11 ) + 10 ))
+    while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
+      sleep 5m
+    done
+  fi
+
   output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
   if echo "${output}" | grep -q "${gcs_tarball}" ; then
     # cache hit - unpack from cache
@@ -780,6 +799,8 @@ function install_pytorch() {
     mkdir -p "${envpath}"
     gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
   else
+    touch "${local_tarball}.building"
+    gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
     local verb=create
     if test -d "${envpath}" ; then verb=install ; fi
     cudart_spec="cuda-cudart"
@@ -792,6 +813,7 @@ function install_pytorch() {
     tar czf "${local_tarball}" .
     popd
     gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    gcloud storage rm "${gcs_tarball}.building"
   fi
   touch "${workdir}/complete/pytorch"
 }
@@ -950,10 +972,20 @@ function build_driver_from_github() {
 
     local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
+    if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
+      # do not build in tests with < 32 cores
+      sleep $(( ( RANDOM % 11 ) + 10 ))
+      while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
+        sleep 5m
+      done
+    fi
+
     if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
       echo "cache hit"
     else
       # build the kernel modules
+      touch "${local_tarball}.building"
+      gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
       pushd open-gpu-kernel-modules
       install_build_dependencies
       if ( is_cuda11 && is_ubuntu22 ) ; then
@@ -982,6 +1014,7 @@ function build_driver_from_github() {
         "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
         $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      gcloud storage rm "${gcs_tarball}.building"
       rm "${local_tarball}"
       make clean
       popd
@@ -1071,6 +1104,14 @@ function install_nvidia_userspace_runfile() {
 
       local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
+      if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
+        # do not build in tests with < 32 cores
+        sleep $(( ( RANDOM % 11 ) + 10 ))
+        while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
+          sleep 5m
+        done
+      fi
+
       if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
         cache_hit="1"
         if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
@@ -1078,6 +1119,9 @@ function install_nvidia_userspace_runfile() {
         fi
         echo "cache hit"
       else
+        # build the kernel modules
+        touch "${local_tarball}.building"
+        gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
         install_build_dependencies
         configure_dkms_certs
         local signing_options
@@ -1116,6 +1160,7 @@ function install_nvidia_userspace_runfile() {
         /var/log/nvidia-installer.log \
         $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      gcloud storage rm "${gcs_tarball}.building"
     fi
   fi
 
@@ -1429,6 +1474,13 @@ EOF
 #  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
   gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
 
+  plugin_line=""
+  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
+    if version_ge "${DATAPROC_IMAGE_VERSION}" 2.1 ; then
+      plugin_line="spark.plugins=com.nvidia.spark.SQLPlugin"
+    fi
+  fi
+
   cat >>"${spark_defaults_conf}" <<EOF
 ###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
 # Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
@@ -1437,7 +1489,6 @@ EOF
 # having AQE enabled gives user the best performance.
 spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
 spark.executor.resource.gpu.amount=${gpu_count}
-spark.plugins=com.nvidia.spark.SQLPlugin
 spark.executor.cores=${executor_cores}
 spark.executor.memory=${executor_memory_gb}G
 spark.dynamicAllocation.enabled=false
@@ -1445,6 +1496,7 @@ spark.dynamicAllocation.enabled=false
 spark.task.resource.gpu.amount=${gpu_amount}
 spark.task.cpus=2
 spark.yarn.unmanagedAM.enabled=false
+${plugin_line}
 ###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
 EOF
 }
@@ -1714,7 +1766,10 @@ function main() {
 
     configure_yarn_nodemanager
     if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
-      install_spark_rapids ; fi
+      if version_ge "${DATAPROC_IMAGE_VERSION}" 2.1 ; then
+        install_spark_rapids
+      fi
+    fi
     configure_gpu_script
     configure_gpu_isolation
   elif [[ "${ROLE}" == "Master" ]]; then
diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index 4e4848523..8f08472bd 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -178,9 +178,9 @@ def createCluster(self,
           args.append("--zone={}".format(self.cluster_zone))
 
         if not FLAGS.skip_cleanup:
-          args.append("--max-age=60m")
+          args.append("--max-age=120m")
 
-        args.append("--max-idle=45m")
+        args.append("--max-idle=60m")
 
         cmd = "{} dataproc clusters create {} {}".format(
             "gcloud beta" if beta else "gcloud", self.name, " ".join(args))

From 02732e183363e99bc4a0e523726ee4306cd1bc54 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 23 Jan 2025 17:23:48 -0800
Subject: [PATCH 106/112] fixed bug with spark rapids version assignment ; more
 conservative about requirements for ramdisk ; roll back spark.SQLPlugin
 change

---
 gpu/install_gpu_driver.sh | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index c0da65e34..a419be423 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1374,6 +1374,7 @@ function configure_yarn_resources() {
 # This configuration should be applied only if GPU is attached to the node
 function configure_yarn_nodemanager() {
   if [[ "${gpu_count}" == "0" ]] ; then return ; fi
+
   set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
   set_hadoop_property 'yarn-site.xml' \
     'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
@@ -1474,13 +1475,6 @@ EOF
 #  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
   gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
 
-  plugin_line=""
-  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
-    if version_ge "${DATAPROC_IMAGE_VERSION}" 2.1 ; then
-      plugin_line="spark.plugins=com.nvidia.spark.SQLPlugin"
-    fi
-  fi
-
   cat >>"${spark_defaults_conf}" <<EOF
 ###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
 # Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
@@ -1496,7 +1490,7 @@ spark.dynamicAllocation.enabled=false
 spark.task.resource.gpu.amount=${gpu_amount}
 spark.task.cpus=2
 spark.yarn.unmanagedAM.enabled=false
-${plugin_line}
+spark.plugins=com.nvidia.spark.SQLPlugin
 ###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
 EOF
 }
@@ -1765,11 +1759,7 @@ function main() {
     fi
 
     configure_yarn_nodemanager
-    if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
-      if version_ge "${DATAPROC_IMAGE_VERSION}" 2.1 ; then
-        install_spark_rapids
-      fi
-    fi
+    install_spark_rapids
     configure_gpu_script
     configure_gpu_isolation
   elif [[ "${ROLE}" == "Master" ]]; then
@@ -2016,12 +2006,12 @@ function set_proxy(){
 function mount_ramdisk(){
   local free_mem
   free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+  if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi
 
   # Write to a ramdisk instead of churning the persistent disk
 
   tmpdir="/mnt/shm"
-  mkdir -p "${tmpdir}"
+  mkdir -p "${tmpdir}/pkgs_dirs"
   mount -t tmpfs tmpfs "${tmpdir}"
 
   # Download conda packages to tmpfs
@@ -2212,15 +2202,18 @@ function os_add_repo() {
 readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
 
 function install_spark_rapids() {
+  if [[ "${RAPIDS_RUNTIME}" != "SPARK" ]]; then return ; fi
+
   # Update SPARK RAPIDS config
-  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_SPARK_RAPIDS_VERSION
+  DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
   local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
 
   if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
-    local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
   fi
 
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})

From 57fef50d238e61b336915920322523c3a5e862a8 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 23 Jan 2025 18:48:49 -0800
Subject: [PATCH 107/112] * gpu does not work on capacity scheduler on dataproc
 2.0 ; use fair * protect against race condition on removing the .building
 files * add logic for pre-11.7 cuda package repo back in * clean up and
 verify yarn config

---
 gpu/install_gpu_driver.sh | 48 +++++++++++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index a419be423..e2d5c6591 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -687,7 +687,7 @@ function install_nvidia_nccl() {
       popd
       tar xzvf "${local_tarball}"
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      gcloud storage rm "${gcs_tarball}.building"
+      if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
       rm "${local_tarball}"
     fi
   }
@@ -813,7 +813,7 @@ function install_pytorch() {
     tar czf "${local_tarball}" .
     popd
     gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-    gcloud storage rm "${gcs_tarball}.building"
+    if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
   fi
   touch "${workdir}/complete/pytorch"
 }
@@ -941,7 +941,16 @@ function add_repo_nvidia_container_toolkit() {
 
 function add_repo_cuda() {
   if is_debuntu ; then
-    install_cuda_keyring_pkg # 11.7+, 12.0+
+    if version_le "${CUDA_VERSION}" 11.6 ; then
+      local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
+      local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
+      echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
+      | sudo tee "${sources_list_path}"
+      curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
+        -o "${kr_path}"
+    else
+      install_cuda_keyring_pkg # 11.7+, 12.0+
+    fi
   elif is_rocky ; then
     execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
   fi
@@ -1014,7 +1023,7 @@ function build_driver_from_github() {
         "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
         $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      gcloud storage rm "${gcs_tarball}.building"
+      if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
       rm "${local_tarball}"
       make clean
       popd
@@ -1160,7 +1169,7 @@ function install_nvidia_userspace_runfile() {
         /var/log/nvidia-installer.log \
         $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
-      gcloud storage rm "${gcs_tarball}.building"
+      if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
     fi
   fi
 
@@ -1369,13 +1378,32 @@ function configure_yarn_resources() {
     'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
 
   set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
+
+  # Older CapacityScheduler does not permit use of gpu resources ; switch to FairScheduler on 2.0 and below
+  if version_lt "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
+    fs_xml="$HADOOP_CONF_DIR/fair-scheduler.xml"
+    set_hadoop_property 'yarn-site.xml' \
+      'yarn.resourcemanager.scheduler.class' 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler'
+    set_hadoop_property 'yarn-site.xml' \
+      "yarn.scheduler.fair.user-as-default-queue" "false"
+    set_hadoop_property 'yarn-site.xml' \
+      "yarn.scheduler.fair.allocation.file" "${fs_xml}"
+    set_hadoop_property 'yarn-site.xml' \
+      'yarn.scheduler.fair.resource-calculator' 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+    cat > "${fs_xml}" <<EOF
+<!-- ${fs_xml} -->
+<allocations>
+  <queueMaxAppsDefault>1</queueMaxAppsDefault>
+</allocations>
+EOF
+  fi
 }
 
 # This configuration should be applied only if GPU is attached to the node
 function configure_yarn_nodemanager() {
   if [[ "${gpu_count}" == "0" ]] ; then return ; fi
-
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
   set_hadoop_property 'yarn-site.xml' \
     'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
   set_hadoop_property 'yarn-site.xml' \
@@ -1387,9 +1415,9 @@ function configure_yarn_nodemanager() {
   set_hadoop_property 'yarn-site.xml' \
     'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
   set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.container-executor.class' \
-    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+    'yarn.nodemanager.container-executor.class' 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.group' 'yarn'
 
   # Fix local dirs access permissions
   local yarn_local_dirs=()

From cc5abca91c4170fa600cf659f881092637eddb0c Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 23 Jan 2025 22:50:21 -0800
Subject: [PATCH 108/112] revert test_install_gpu_cuda_nvidia_with_spark_job
 cuda versions

---
 gpu/test_gpu.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 3ec053e0e..e9c2d92ad 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -370,10 +370,10 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     self.verify_instance_spark()
 
   @parameterized.parameters(
-#    ("SINGLE", ["m"], GPU_T4, None, "11.8"),
-    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
-#    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
-    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
+    ("SINGLE", ["m"], GPU_T4, None, "11.8"),
+#    ("STANDARD", ["m"], GPU_T4, None, "12.0"),
+    ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"),
+#    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"),
 #    ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"),
   )
   def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes,

From 8936442e1b2f9546d2a49e58e9754afdbf9d8c67 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 28 Jan 2025 13:56:52 -0800
Subject: [PATCH 109/112] configure for use with JupyterLab

---
 gpu/install_gpu_driver.sh | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index e2d5c6591..acf3e21db 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -772,15 +772,18 @@ function install_nvidia_cudnn() {
 }
 
 function install_pytorch() {
-  if test -f "${workdir}/complete/pytorch" ; then return ; fi
+  is_complete pytorch && return
+
   local env
   env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
   local mc3=/opt/conda/miniconda3
   local envpath="${mc3}/envs/${env}"
+  if [[ "${env}" == "base" ]]; then
+    echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi
   # Set numa node to 0 for all GPUs
   for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done
 
-  local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+  local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
   local local_tarball="${workdir}/${build_tarball}"
   local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
 
@@ -805,17 +808,28 @@ function install_pytorch() {
     if test -d "${envpath}" ; then verb=install ; fi
     cudart_spec="cuda-cudart"
     if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
+
+    # Install pytorch and company to this environment
     "${mc3}/bin/mamba" "${verb}" -n "${env}" \
       -c conda-forge -c nvidia -c rapidsai \
       numba pytorch tensorflow[and-cuda] rapids pyspark \
       "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
+
+    # Install jupyter kernel in this environment
+    "${envpath}/bin/python3" -m pip install ipykernel
+
+    # package environment and cache in GCS
     pushd "${envpath}"
     tar czf "${local_tarball}" .
     popd
     gcloud storage cp "${local_tarball}" "${gcs_tarball}"
     if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
   fi
-  touch "${workdir}/complete/pytorch"
+
+  # register the environment as a selectable kernel
+  "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})"
+
+  mark_complete pytorch
 }
 
 function configure_dkms_certs() {
@@ -2067,11 +2081,11 @@ function harden_sshd_config() {
     feature_map["kex-gss"]="gssapikexalgorithms" ; fi
   for ftr in "${!feature_map[@]}" ; do
     export feature=${feature_map[$ftr]}
-    sshd_config_line=$(
+    sshd_config_line="${feature} $(
       (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
        ssh -Q "${ftr}" ) \
-      | sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}<STDIN>;
-      print("$ENV{feature} ",join(q",",map{ chomp; $_ }@a), $/) if "@a"')
+      | sort -u | grep -v -ie sha1 -e md5 | paste -sd "," -)"
+
     grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
     echo "$sshd_config_line" >> /tmp/sshd_config_new
     # TODO: test whether sshd will reload with this change before mv

From 0bc3c1f29876a30758d1cb3db22d2ff965fc75c4 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 29 Jan 2025 14:31:36 -0800
Subject: [PATCH 110/112] 2.2 should use 12.6.3 (latest)

---
 gpu/install_gpu_driver.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index acf3e21db..917816bd1 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -186,7 +186,7 @@ function set_cuda_version() {
   case "${DATAPROC_IMAGE_VERSION}" in
     "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
     "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
-    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
     *   )
       echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
       exit 1

From e56ddd0fbef897c5fb2ab2d2397e5a4f3a72b330 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Sat, 1 Feb 2025 15:32:48 -0800
Subject: [PATCH 111/112] Addressing review from cnauroth

gpu/install_gpu_driver.sh:
* use the same retry arguments in all calls to curl
* correct 12.3's driver and sub-version
* improve logic for pause as other workers perform build
* remove call to undefined clear_nvsmi_cache
* move closing "fi" to line of its own
* added comments for unclear logic
* removed commented code
* remove unused curl for latest driver version

gpu/test_gpu.py
* removed excess test
* added comment about numa node selection
* removed skips of rocky9 ; 2.2.44-rocky9 build succeeds
---
 gpu/install_gpu_driver.sh | 192 +++++++++++++++++++++++++-------------
 gpu/test_gpu.py           |  56 ++---------
 2 files changed, 137 insertions(+), 111 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 917816bd1..6fc243fd2 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -61,9 +61,9 @@ function repair_old_backports {
 
   # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
   debdists="https://deb.debian.org/debian/dists"
-  oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
-  oldstable=$(   curl -s "${debdists}/oldstable/Release"    | awk '/^Codename/ {print $2}');
-  stable=$(      curl -s "${debdists}/stable/Release"       | awk '/^Codename/ {print $2}');
+  oldoldstable=$(curl ${curl_retry_args} "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
+  oldstable=$(   curl ${curl_retry_args} "${debdists}/oldstable/Release"    | awk '/^Codename/ {print $2}');
+  stable=$(      curl ${curl_retry_args} "${debdists}/stable/Release"       | awk '/^Codename/ {print $2}');
 
   matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )
 
@@ -134,13 +134,12 @@ readonly ROLE
 
 # Minimum supported version for open kernel driver is 515.43.04
 # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
-latest="$(curl -s https://us.download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
 readonly -A DRIVER_FOR_CUDA=(
     ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01"
     ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31"
     ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03"
     ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
-    ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08"
+    ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.29.06"
     ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142"
 )
 readonly -A DRIVER_SUBVER=(
@@ -231,6 +230,8 @@ function set_driver_version() {
   local cuda_url
   cuda_url=$(get_metadata_attribute 'cuda-url' '')
 
+  local nv_xf86_x64_base="https://us.download.nvidia.com/XFree86/Linux-x86_64"
+
   local DEFAULT_DRIVER
   # Take default from gpu-driver-url metadata value
   if [[ -n "${gpu_driver_url}" ]] ; then
@@ -242,12 +243,12 @@ function set_driver_version() {
     if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
       major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
       driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
-      if curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+      if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
         # use the version indicated by the cuda url as the default if it exists
-	DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
-      elif curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
+      elif curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
         # use the maximum sub-version available for the major version indicated in cuda url as the default
-	DEFAULT_DRIVER="${driver_max_maj_version}"
+        DEFAULT_DRIVER="${driver_max_maj_version}"
       fi
     fi
   fi
@@ -264,8 +265,8 @@ function set_driver_version() {
 
   export DRIVER_VERSION DRIVER
 
-  gpu_driver_url="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+  gpu_driver_url="${nv_xf86_x64_base}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  if ! curl ${curl_retry_args} --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
     echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
     exit 1
   fi
@@ -397,7 +398,7 @@ function set_cuda_runfile_url() {
 
   NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
 
-  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+  if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
     echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
     if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then
       echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead"
@@ -481,7 +482,7 @@ function execute_with_retries() (
 function install_cuda_keyring_pkg() {
   is_complete cuda-keyring-installed && return
   local kr_ver=1.1
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+  curl ${curl_retry_args} \
     "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
     -o "${tmpdir}/cuda-keyring.deb"
   dpkg -i "${tmpdir}/cuda-keyring.deb"
@@ -503,7 +504,7 @@ function install_local_cuda_repo() {
   readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
   readonly DIST_KEYRING_DIR="/var/${pkgname}"
 
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+  curl ${curl_retry_args} \
     "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
 
   dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
@@ -511,7 +512,7 @@ function install_local_cuda_repo() {
   cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
 
   if is_ubuntu ; then
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    curl ${curl_retry_args} \
       "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
       -o /etc/apt/preferences.d/cuda-repository-pin-600
   fi
@@ -531,7 +532,7 @@ function install_local_cudnn_repo() {
   local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
 
   # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+  curl ${curl_retry_args} \
     "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
 
   dpkg -i "${tmpdir}/local-installer.deb"
@@ -609,7 +610,7 @@ function install_nvidia_nccl() {
 
   test -d "${workdir}/nccl" || {
     local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    curl ${curl_retry_args} \
       "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
       | tar xz
     mv "nccl-${NCCL_VERSION}-1" nccl
@@ -625,11 +626,22 @@ function install_nvidia_nccl() {
     local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}"
 
     if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
-      # do not build in tests with < 32 cores
+      # when running with fewer than 32 cores, yield to in-progress build
       sleep $(( ( RANDOM % 11 ) + 10 ))
-      while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
-        sleep 5m
-      done
+      if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+        local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
+        local build_start_epoch="$(date -d "${build_start_time}" +%s)"
+        local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
+        while gsutil ls -L "${gcs_tarball}.building" ; do
+          local now_epoch="$(date -u +%s)"
+          if (( now_epoch > timeout_epoch )) ; then
+            # detect unexpected build failure after 45m
+            gsutil rm "${gcs_tarball}.building"
+            break
+          fi
+          sleep 5m
+        done
+      fi
     fi
 
     output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
@@ -641,6 +653,7 @@ function install_nvidia_nccl() {
       # build and cache
       touch "${local_tarball}.building"
       gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+      building_file="${gcs_tarball}.building"
       pushd nccl
       # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
       install_build_dependencies
@@ -688,6 +701,7 @@ function install_nvidia_nccl() {
       tar xzvf "${local_tarball}"
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
       if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+      building_file=""
       rm "${local_tarball}"
     fi
   }
@@ -735,17 +749,17 @@ function install_nvidia_cudnn() {
         add_repo_cuda
 
         apt-get update -qq
-	# Ignore version requested and use the latest version in the package index
-	cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)"
+        # Ignore version requested and use the latest version in the package index
+        cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)"
 
         execute_with_retries \
           apt-get -y install --no-install-recommends \
             "libcudnn8=${cudnn_pkg_version}" \
             "libcudnn8-dev=${cudnn_pkg_version}"
 
-	sync
+        sync
       elif is_cudnn9 ; then
-	install_cuda_keyring_pkg
+        install_cuda_keyring_pkg
 
         apt-get update -qq
 
@@ -755,7 +769,7 @@ function install_nvidia_cudnn() {
           "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
           "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
 
-	sync
+        sync
       else
         echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
       fi
@@ -788,11 +802,22 @@ function install_pytorch() {
   local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
 
   if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
-    # do not build in tests with < 32 cores
+    # when running with fewer than 32 cores, yield to in-progress build
     sleep $(( ( RANDOM % 11 ) + 10 ))
-    while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
-      sleep 5m
-    done
+    if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+      local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
+      local build_start_epoch="$(date -d "${build_start_time}" +%s)"
+      local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
+      while gsutil ls -L "${gcs_tarball}.building" ; do
+        local now_epoch="$(date -u +%s)"
+        if (( now_epoch > timeout_epoch )) ; then
+          # detect unexpected build failure after 45m
+          gsutil rm "${gcs_tarball}.building"
+          break
+        fi
+        sleep 5m
+      done
+    fi
   fi
 
   output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
@@ -804,6 +829,7 @@ function install_pytorch() {
   else
     touch "${local_tarball}.building"
     gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+    building_file="${gcs_tarball}.building"
     local verb=create
     if test -d "${envpath}" ; then verb=install ; fi
     cudart_spec="cuda-cudart"
@@ -824,6 +850,7 @@ function install_pytorch() {
     popd
     gcloud storage cp "${local_tarball}" "${gcs_tarball}"
     if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+    building_file=""
   fi
 
   # register the environment as a selectable kernel
@@ -960,7 +987,7 @@ function add_repo_cuda() {
       local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
       echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
       | sudo tee "${sources_list_path}"
-      curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
+      curl ${curl_retry_args} "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
         -o "${kr_path}"
     else
       install_cuda_keyring_pkg # 11.7+, 12.0+
@@ -978,7 +1005,7 @@ function build_driver_from_github() {
   pushd "${workdir}"
   test -d "${workdir}/open-gpu-kernel-modules" || {
     tarball_fn="${DRIVER_VERSION}.tar.gz"
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    curl ${curl_retry_args} \
       "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
       | tar xz
     mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
@@ -996,11 +1023,22 @@ function build_driver_from_github() {
     local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
     if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
-      # do not build in tests with < 32 cores
+      # when running with fewer than 32 cores, yield to in-progress build
       sleep $(( ( RANDOM % 11 ) + 10 ))
-      while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
-        sleep 5m
-      done
+      if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+        local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
+        local build_start_epoch="$(date -d "${build_start_time}" +%s)"
+        local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
+        while gsutil ls -L "${gcs_tarball}.building" ; do
+          local now_epoch="$(date -u +%s)"
+          if (( now_epoch > timeout_epoch )) ; then
+            # detect unexpected build failure after 45m
+            gsutil rm "${gcs_tarball}.building"
+            break
+          fi
+          sleep 5m
+        done
+      fi
     fi
 
     if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
@@ -1009,6 +1047,7 @@ function build_driver_from_github() {
       # build the kernel modules
       touch "${local_tarball}.building"
       gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+      building_file="${gcs_tarball}.building"
       pushd open-gpu-kernel-modules
       install_build_dependencies
       if ( is_cuda11 && is_ubuntu22 ) ; then
@@ -1038,6 +1077,7 @@ function build_driver_from_github() {
         $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
       if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+      building_file=""
       rm "${local_tarball}"
       make clean
       popd
@@ -1128,11 +1168,22 @@ function install_nvidia_userspace_runfile() {
       local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
 
       if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
-        # do not build in tests with < 32 cores
+        # when running with fewer than 32 cores, yield to in-progress build
         sleep $(( ( RANDOM % 11 ) + 10 ))
-        while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
-          sleep 5m
-        done
+        if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then
+          local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")"
+          local build_start_epoch="$(date -d "${build_start_time}" +%s)"
+          local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
+          while gsutil ls -L "${gcs_tarball}.building" ; do
+            local now_epoch="$(date -u +%s)"
+            if (( now_epoch > timeout_epoch )) ; then
+              # detect unexpected build failure after 45m
+              gsutil rm "${gcs_tarball}.building"
+              break
+            fi
+            sleep 5m
+          done
+        fi
       fi
 
       if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
@@ -1145,6 +1196,7 @@ function install_nvidia_userspace_runfile() {
         # build the kernel modules
         touch "${local_tarball}.building"
         gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
+        building_file="${gcs_tarball}.building"
         install_build_dependencies
         configure_dkms_certs
         local signing_options
@@ -1184,6 +1236,7 @@ function install_nvidia_userspace_runfile() {
         $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
       gcloud storage cp "${local_tarball}" "${gcs_tarball}"
       if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
+      building_file=""
     fi
   fi
 
@@ -1316,7 +1369,7 @@ function install_ops_agent(){
   mkdir -p /opt/google
   cd /opt/google
   # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
-  curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
+  curl ${curl_retry_args} -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
   execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
 
   mark_complete ops-agent
@@ -1332,9 +1385,9 @@ function install_gpu_agent() {
   fi
   local install_dir=/opt/gpu-utilization-agent
   mkdir -p "${install_dir}"
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+  curl ${curl_retry_args} \
     "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+  curl ${curl_retry_args} \
     "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
     | sed -e 's/-u --format=/--format=/' \
     | dd status=none of="${install_dir}/report_gpu_metrics.py"
@@ -1451,7 +1504,6 @@ function configure_gpu_exclusive_mode() {
   if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
   # include exclusive mode on GPU
   nvsmi -c EXCLUSIVE_PROCESS
-  clear_nvsmi_cache
 }
 
 function fetch_mig_scripts() {
@@ -1653,6 +1705,9 @@ function install_dependencies() {
 function prepare_gpu_env(){
   #set_support_matrix
 
+  # if set, this variable includes a gcs path to a build-in-progress indicator
+  building_file=""
+
   set_cuda_version
   set_driver_version
 
@@ -1763,7 +1818,7 @@ function main() {
       #Install GPU metrics collection in Stackdriver if needed
       if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
         #install_ops_agent
-	install_gpu_agent
+        install_gpu_agent
         echo 'GPU metrics agent successfully deployed.'
       else
         echo 'GPU metrics agent will not be installed.'
@@ -1775,22 +1830,22 @@ function main() {
       done
 
       if test -n "$(nvsmi -L)" ; then
-	# cache the result of the gpu query
+        # cache the result of the gpu query
         ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
         echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
-	chmod a+r "/var/run/nvidia-gpu-index.txt"
+        chmod a+r "/var/run/nvidia-gpu-index.txt"
       fi
       MIG_GPU_LIST="$(nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n "")"
       NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
       if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
         # enable MIG on every GPU
-	for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do
+        for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do
           if version_le "${CUDA_VERSION}" "11.6" ; then
             nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
           else
-	    nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+            nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
           fi
-	done
+        done
 
         NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
         MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
@@ -1825,7 +1880,7 @@ function cache_fetched_package() {
   if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
     time gcloud storage cp "${gcs_fn}" "${local_fn}"
   else
-    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
+    time ( curl ${curl_retry_args} "${src_url}" -o "${local_fn}" && \
            gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
   fi
 }
@@ -1854,7 +1909,7 @@ function clean_up_sources_lists() {
 
     local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
     rm -f "${bigtop_kr_path}"
-    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+    curl ${curl_retry_args} \
       "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
 
     sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
@@ -1868,7 +1923,7 @@ function clean_up_sources_lists() {
   local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
   local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
   rm -f "${adoptium_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+  curl ${curl_retry_args} "${key_url}" \
    | gpg --dearmor -o "${adoptium_kr_path}"
   echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
    > /etc/apt/sources.list.d/adoptium.list
@@ -1882,7 +1937,7 @@ function clean_up_sources_lists() {
   local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
 
   rm -f "${docker_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+  curl ${curl_retry_args} "${docker_key_url}" \
     | gpg --dearmor -o "${docker_kr_path}"
   echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
     > ${docker_repo_file}
@@ -1892,7 +1947,7 @@ function clean_up_sources_lists() {
   #
   if ls /etc/apt/sources.list.d/google-cloud*.list ; then
     rm -f /usr/share/keyrings/cloud.google.gpg
-    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+    curl ${curl_retry_args} https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
     for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
       list_file="/etc/apt/sources.list.d/${list}.list"
       if [[ -f "${list_file}" ]]; then
@@ -1908,7 +1963,7 @@ function clean_up_sources_lists() {
     keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
     if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
     rm -f /usr/share/keyrings/cran-r.gpg
-    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+    curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
       gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
     sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
   fi
@@ -1918,7 +1973,7 @@ function clean_up_sources_lists() {
   #
   if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
     rm -f /usr/share/keyrings/mysql.gpg
-    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+    curl ${curl_retry_args} 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
       gpg --dearmor -o /usr/share/keyrings/mysql.gpg
     sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
   fi
@@ -1931,6 +1986,11 @@ function exit_handler() {
   # Purge private key material until next grant
   clear_dkms_key
 
+  # clean up incomplete build indicators
+  if test -n "${building_file}" ; then
+    if gcloud storage ls "${building_file}" ; then gcloud storage rm "${building_file}" || true ; fi
+  fi
+
   set +ex
   echo "Exit handler invoked"
 
@@ -2078,9 +2138,11 @@ function harden_sshd_config() {
   # disable sha1 and md5 use in kex and kex-gss features
   declare -A feature_map=(["kex"]="kexalgorithms")
   if ( is_rocky || version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ) ; then
-    feature_map["kex-gss"]="gssapikexalgorithms" ; fi
+    feature_map["kex-gss"]="gssapikexalgorithms"
+  fi
   for ftr in "${!feature_map[@]}" ; do
-    export feature=${feature_map[$ftr]}
+    local feature=${feature_map[$ftr]}
+    local sshd_config_line
     sshd_config_line="${feature} $(
       (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
        ssh -Q "${ftr}" ) \
@@ -2089,7 +2151,7 @@ function harden_sshd_config() {
     grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
     echo "$sshd_config_line" >> /tmp/sshd_config_new
     # TODO: test whether sshd will reload with this change before mv
-    mv /tmp/sshd_config_new /etc/ssh/sshd_config
+    mv -f /tmp/sshd_config_new /etc/ssh/sshd_config
   done
   local svc=ssh
   if is_rocky ; then svc="sshd" ; fi
@@ -2101,6 +2163,8 @@ function prepare_to_install(){
   check_os
   check_secure_boot
 
+  curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30"
+
   prepare_gpu_env
 
   workdir=/opt/install-dpgce
@@ -2178,6 +2242,9 @@ function check_os() {
     if test -v DATAPROC_VERSION ; then
       DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
     else
+      # When building custom-images, neither of the above variables
+      # are defined and we need to make a reasonable guess
+
       if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
       elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
       elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
@@ -2213,9 +2280,8 @@ function dnf_add_repo() {
   local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
   local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
 
-  curl -s -L "${repo_url}" \
+  curl ${curl_retry_args} "${repo_url}" \
     | dd of="${repo_path}" status=progress
-#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
 }
 
 #
@@ -2233,7 +2299,7 @@ function os_add_repo() {
 
   mkdir -p "$(dirname "${kr_path}")"
 
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
+  curl ${curl_retry_args} "${signing_key_url}" \
     | gpg --import --no-default-keyring --keyring "${kr_path}"
 
   if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index e9c2d92ad..3d6dbd416 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -64,6 +64,12 @@ def verify_pytorch(self, name):
     self.upload_test_file(test_filename, name)
 
     conda_env="dpgce"
+
+    # until the numa node is selected, every time the GPU is accessed
+    # from pytorch, log noise about numa node not being selected is
+    # printed to the console. Selecting numa node before the python is
+    # executed improves readability of the diagnostic information.
+
     verify_cmd = \
       "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \
       "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \
@@ -77,8 +83,9 @@ def verify_tensorflow(self, name):
                                self.TF_TEST_SCRIPT_FILE_NAME)
     self.upload_test_file(test_filename, name)
     # all on a single numa node
+    conda_env="dpgce"
     verify_cmd = \
-      "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format("dpgce") + \
+      "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \
       "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \
       "${envpath}/bin/python {}".format(
         self.TF_TEST_SCRIPT_FILE_NAME)
@@ -144,41 +151,6 @@ def verify_driver_signature(self, name):
 """
     self.assert_instance_command( name, cert_verification_cmd.format(cert_path) )
 
-  @parameterized.parameters(
-      ("SINGLE",   ["m"], GPU_T4, None, None),
-#      ("STANDARD", ["m"], GPU_T4, None, None),
-      ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"),
-  )
-  def test_install_gpu_default_agent(self, configuration, machine_suffixes,
-                                     master_accelerator, worker_accelerator,
-                                     driver_provider):
-    self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
-
-    if configuration == 'SINGLE' \
-    and self.getImageOs() == 'rocky' \
-    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
-      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      self.skipTest("known to fail")
-
-    metadata = None
-    if driver_provider is not None:
-      metadata = "gpu-driver-provider={}".format(driver_provider)
-    self.createCluster(
-        configuration,
-        self.INIT_ACTIONS,
-        machine_type="n1-highmem-32",
-        master_accelerator=master_accelerator,
-        worker_accelerator=worker_accelerator,
-        metadata=metadata,
-        timeout_in_minutes=90, # This cluster is sized and timed correctly to build the driver and nccl
-        boot_disk_size="60GB")
-    for machine_suffix in machine_suffixes:
-      machine_name="{}-{}".format(self.getClusterName(),machine_suffix)
-      self.verify_instance(machine_name)
-      self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION)
-      self.verify_instance_pyspark(machine_name)
-      self.verify_instance_spark()
-
   @parameterized.parameters(
       ("SINGLE", ["m"], GPU_T4, None, None),
   )
@@ -252,9 +224,6 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('KERBEROS fails with image version <= 2.1')
@@ -344,9 +313,6 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
   def test_gpu_allocation(self, configuration, master_accelerator,
                           worker_accelerator, driver_provider):
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
     if configuration == 'SINGLE' \
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
@@ -380,9 +346,6 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
@@ -430,9 +393,6 @@ def untested_driver_signing(self, configuration, machine_suffixes,
                            master_accelerator, worker_accelerator,
                            cuda_version, image_os, image_version):
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('KERBEROS fails with image version <= 2.1')

From 3384a4de73b9e10105d894331f28b8ee19bb263f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Thu, 6 Feb 2025 11:48:25 -0800
Subject: [PATCH 112/112] reverting changes to presubmit.sh

---
 cloudbuild/presubmit.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index f796dd1f8..eec7adb76 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,7 +70,6 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
-      continue # to be removed before merge
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0
@@ -105,6 +104,7 @@ run_tests() {
   bazel test \
     --jobs="${max_parallel_tests}" \
     --local_test_jobs="${max_parallel_tests}" \
+    --flaky_test_attempts=3 \
     --action_env="INTERNAL_IP_SSH=true" \
     --test_output="all" \
     --noshow_progress \