From c0ea6318e975de4b7a517c56c72180bcd656f65f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 7 Dec 2024 15:01:23 -0800 Subject: [PATCH 001/112] [gpu] toward a more consistent driver and CUDA install gpu/install_gpu_driver.sh * exclusively using .run file installation method when available * build nccl from source * cache build artifacts from kernel driver and nccl * Tested more CUDA minor versions * gathering CUDA and driver version from URLs if passed * Printing warnings when combination provided is known to fail * waiting on apt lock when it exists * wrapping expensive functions in completion checks to reduce re-run time * fixed a problem with ops agent not installing ; using venv * Installing gcc-12 on ubuntu22 to fix kernel driver FTBFS * setting better spark defaults * skipping proxy setup if http-proxy metadata not set * added function to check secure-boot and os version compatability gpu/manual-test-runner.sh * order commands correctly gpu/test_gpu.py * clearer test skipping logic * added instructions on how to test pyspark --- gpu/install_gpu_driver.sh | 643 ++++++++++++++++++++++++++++---------- gpu/manual-test-runner.sh | 4 +- gpu/test_gpu.py | 25 +- 3 files changed, 497 insertions(+), 175 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 25efb2a49..db6d630a1 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -53,7 +53,7 @@ function os_vercat() ( set +x else os_version ; fi ; ) function repair_old_backports { - if ge_debian12 || ! is_debuntu ; then return ; fi + if ! is_debuntu ; then return ; fi # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this # problem, we will use archive.debian.org for the oldoldstable repo @@ -94,6 +94,7 @@ function print_metadata_value_if_exists() { return ${return_code} } +# replicates /usr/share/google/get_metadata_value function get_metadata_value() ( set +x local readonly varname=$1 @@ -117,10 +118,21 @@ function get_metadata_attribute() ( get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" ) -OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') -distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" readonly OS_NAME +# Fetch SPARK config +SPARK_VERSION_ENV="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" +readonly SPARK_VERSION_ENV +if version_ge "${SPARK_VERSION_ENV}" "3.0" && \ + version_lt "${SPARK_VERSION_ENV}" "4.0" ; then + readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 + readonly SPARK_VERSION="3.0" # try ${SPARK_VERSION_ENV} +else + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 +fi + # node role ROLE="$(get_metadata_attribute dataproc-role)" readonly ROLE @@ -131,13 +143,13 @@ readonly ROLE # Rocky8: 12.0: 525.147.05 readonly -A DRIVER_FOR_CUDA=( ["11.8"]="560.35.03" - ["12.0"]="525.60.13" ["12.4"]="560.35.03" ["12.6"]="560.35.03" + ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="560.35.03" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) # https://developer.nvidia.com/cudnn-downloads if is_debuntu ; then readonly -A CUDNN_FOR_CUDA=( ["11.8"]="9.5.1.17" - ["12.0"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.6"]="9.5.1.17" + ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" ) elif is_rocky ; then # rocky: @@ -150,34 +162,65 @@ elif is_rocky ; then # 12.6: 9.5.1.17 readonly -A CUDNN_FOR_CUDA=( ["11.8"]="9.5.1.17" - ["12.0"]="8.8.1.3" ["12.4"]="9.1.1.17" ["12.6"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" ) fi # https://developer.nvidia.com/nccl/nccl-download # 12.2: 2.19.3, 12.5: 2.21.5 readonly -A NCCL_FOR_CUDA=( ["11.8"]="2.15.5" - ["12.0"]="2.16.5" ["12.4"]="2.23.4" ["12.6"]="2.23.4" + ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" ) readonly -A CUDA_SUBVER=( ["11.8"]="11.8.0" - ["12.0"]="12.0.0" ["12.4"]="12.4.1" ["12.6"]="12.6.2" + ["12.0"]="12.0.0" ["12.1"]="12.1.1" ["12.4"]="12.4.1" ["12.5"]="12.5.0" ["12.6"]="12.6.2" +) +# Debian 12 +# 12.3.101, 12.3.52 +# 12.4.127, 12.4.99 +# 12.5.82, 12.5.39 +# 12.6.77, 12.6.68, 12.6.37 + +readonly -A cuda_toolkit_config_version=( + ["12.4"]="12.4.127" ["12.6"]="12.6.77" ) RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') -readonly DEFAULT_CUDA_VERSION='12.4' -CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") -if ( ( ge_debian12 || ge_rocky9 ) && version_le "${CUDA_VERSION%%.*}" "11" ) ; then - # CUDA 11 no longer supported on debian12 - 2024-11-22, rocky9 - 2024-11-27 - CUDA_VERSION="${DEFAULT_CUDA_VERSION}" -fi -if ( version_ge "${CUDA_VERSION}" "12" && (le_debian11 || le_ubuntu18) ) ; then - # Only CUDA 12.0 supported on older debuntu - CUDA_VERSION="12.0" -fi -readonly CUDA_VERSION -readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}" +function set_cuda_version() { + local cuda_url + cuda_url=$(get_metadata_attribute 'cuda-url' '') + + if [[ -n "${cuda_url}" ]] ; then + local CUDA_URL_VERSION + CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" + if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then + DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" + CUDA_FULL_VERSION="${CUDA_URL_VERSION}" + fi + else + DEFAULT_CUDA_VERSION='12.4' + fi + readonly DEFAULT_CUDA_VERSION + + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") + readonly CUDA_VERSION + if ( ! test -v CUDA_FULL_VERSION ) ; then + CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} + fi + readonly CUDA_FULL_VERSION + + if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then + echo "CUDA 12.3.0 is the minimum CUDA 12 version on Debian 12" + elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then + echo "CUDA 12.1.1 is the maximum CUDA version on ubuntu18. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then + echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then + echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" + fi +} +set_cuda_version function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) @@ -187,17 +230,58 @@ function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; ) function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; ) -DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}" -if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then - DEFAULT_DRIVER="560.28.03" ; fi -if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03" ; fi -if ( is_rocky && le_cuda11 ) ; then DEFAULT_DRIVER="525.147.05" ; fi -if ( is_ubuntu20 && le_cuda11 ) ; then DEFAULT_DRIVER="535.183.06" ; fi -if ( is_rocky9 && ge_cuda12 ) ; then DEFAULT_DRIVER="565.57.01" ; fi -DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") +function set_driver_version() { + local gpu_driver_url + gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '') + + local cuda_url + cuda_url=$(get_metadata_attribute 'cuda-url' '') + + local DEFAULT_DRIVER + # Take default from gpu-driver-url metadata value + if [[ -n "${gpu_driver_url}" ]] ; then + DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')" + if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi + # Take default from cuda-url metadata value as a backup + elif [[ -n "${cuda_url}" ]] ; then + CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')" + if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" ; fi + fi + + if ( ! test -v DEFAULT_DRIVER ) ; then + # Otherwise attempt to make an educated guess + DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} +# if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then +# DEFAULT_DRIVER="560.28.03" ; fi +# if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03" ; fi +# if ( is_rocky && le_cuda11 ) ; then DEFAULT_DRIVER="525.147.05" ; fi +# if ( is_ubuntu20 && le_cuda11 ) ; then DEFAULT_DRIVER="535.183.06" ; fi +# if ( is_rocky9 && ge_cuda12 ) ; then DEFAULT_DRIVER="565.57.01" ; fi + fi + + DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") + + readonly DRIVER_VERSION + readonly DRIVER="${DRIVER_VERSION%%.*}" -readonly DRIVER_VERSION -readonly DRIVER=${DRIVER_VERSION%%.*} + export DRIVER_VERSION DRIVER + + gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" + exit 1 + fi + + # Verify that the requested combination is supported + readonly CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run" + cuda_url="https://developer.download.nvidia.com/compute/cuda/${CUDA_FULL_VERSION}/local_installers/${CUDA_RUNFILE}" + if ! curl -s --head "${cuda_url}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${DRIVER_VERSION}, CUDA_VERSION=${CUDA_FULL_VERSION}" + exit 1 + fi +} + +set_driver_version readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" @@ -227,6 +311,11 @@ readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64 readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") +USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" +readonly USERSPACE_FILENAME + +readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" + # Short name for urls if is_ubuntu22 ; then # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at @@ -234,14 +323,14 @@ if is_ubuntu22 ; then # use packages from previous release until such time as nvidia # release ubuntu2204 builds - nccl_shortname="ubuntu2004" shortname="$(os_id)$(os_vercat)" + nccl_shortname="ubuntu2004" elif ge_rocky9 ; then # use packages from previous release until such time as nvidia # release rhel9 builds - nccl_shortname="rhel8" shortname="rhel9" + nccl_shortname="rhel8" elif is_rocky ; then shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" nccl_shortname="${shortname}" @@ -261,29 +350,55 @@ readonly NCCL_REPO_URL readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub function set_cuda_runfile_url() { - local RUNFILE_DRIVER_VERSION="${DRIVER_VERSION}" - local RUNFILE_CUDA_VERSION="${CUDA_FULL_VERSION}" - - if ge_cuda12 ; then - if ( le_debian11 || le_ubuntu18 ) ; then - RUNFILE_DRIVER_VERSION="525.60.13" - RUNFILE_CUDA_VERSION="12.0.0" - elif ( le_rocky8 && version_le "${DATAPROC_IMAGE_VERSION}" "2.0" ) ; then - RUNFILE_DRIVER_VERSION="525.147.05" - RUNFILE_CUDA_VERSION="12.0.0" + local MAX_DRIVER_VERSION + local MAX_CUDA_VERSION + + local MIN_OPEN_DRIVER_VER="515.48.07" + local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" + local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER + + if is_cuda12 ; then + if is_debian12 ; then + MIN_DRIVER_VERSION="545.23.06" + MIN_CUDA_VERSION="12.3.0" + elif is_debian10 ; then + MAX_DRIVER_VERSION="555.42.02" + MAX_CUDA_VERSION="12.5.0" + elif is_ubuntu18 ; then + MAX_DRIVER_VERSION="530.30.02" + MAX_CUDA_VERSION="12.1.1" + fi + elif ge_version "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + if le_debian10 ; then + # cuda 11 is not supported for <= debian10 + MAX_CUDA_VERSION="0" + MAX_DRIVER_VERSION="0" fi else - RUNFILE_DRIVER_VERSION="520.61.05" - RUNFILE_CUDA_VERSION="11.8.0" + echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" fi - readonly RUNFILE_FILENAME="cuda_${RUNFILE_CUDA_VERSION}_${RUNFILE_DRIVER_VERSION}_linux.run" - CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${RUNFILE_CUDA_VERSION}" - DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${RUNFILE_FILENAME}" + if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then + echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}" + fi + if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then + echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" + elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then + echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" + fi + + CUDA_FILENAME="cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run" + local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" + local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_FILENAME}" readonly DEFAULT_NVIDIA_CUDA_URL NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") readonly NVIDIA_CUDA_URL + + CUDA_FILENAME="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" + readonly CUDA_FILENAME } set_cuda_runfile_url @@ -315,8 +430,6 @@ readonly CUDNN_TARBALL_URL GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') readonly GPU_DRIVER_PROVIDER -# Stackdriver GPU agent parameters -readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') readonly INSTALL_GPU_AGENT @@ -336,7 +449,7 @@ function execute_with_retries() ( if [[ "$cmd" =~ "^apt-get install" ]] ; then apt-get -y clean - apt-get -y autoremove + apt-get -o DPkg::Lock::Timeout=60 -y autoremove fi for ((i = 0; i < 3; i++)); do set -x @@ -455,43 +568,92 @@ function uninstall_local_cudnn8_repo() { } function install_nvidia_nccl() { + if test -f "${workdir}/nccl-complete" ; then return ; fi + local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" - if is_rocky ; then - execute_with_retries \ - dnf -y -q install \ - "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" - sync - elif is_ubuntu ; then - install_cuda_keyring_pkg + # https://github.com/NVIDIA/nccl/blob/master/README.md + # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Fermi: SM_20, compute_30 + # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 + # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 + # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 + # Volta: SM_70,SM_72, compute_70,compute_72 + # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 + # Ada: SM_89, compute_89 + # Hopper: SM_90,SM_90a compute_90,compute_90a + # Blackwell: SM_100, compute_100 + NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" - apt-get update -qq + mkdir -p "${workdir}" + pushd "${workdir}" - if is_ubuntu18 ; then - execute_with_retries \ - apt-get install -q -y \ - libnccl2 libnccl-dev - sync + test -d "${workdir}/nccl" || { + local tarball_fn="v${NCCL_VERSION}-1.tar.gz" + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ + | tar xz + mv "nccl-${NCCL_VERSION}-1" nccl + } + + local build_path + if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else + build_path="nccl/build/pkg/rpm/x86_64" ; fi + + test -d "${workdir}/nccl/build" || { + local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" + + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + if echo "${output}" | grep -q "${gcs_tarball}" ; then + # cache hit - unpack from cache + echo "cache hit" else - execute_with_retries \ - apt-get install -q -y \ - "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" - sync + # build and cache + pushd nccl + # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install + if is_debuntu ; then + # These packages are required to build .deb packages from source + execute_with_retries \ + apt-get install -y -qq build-essential devscripts debhelper fakeroot + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.debian.build + elif is_rocky ; then + # These packages are required to build .rpm packages from source + execute_with_retries \ + dnf -y -q install rpm-build rpmdevtools + export NVCC_GENCODE + execute_with_retries make -j$(nproc) pkg.redhat.build + fi + tar czvf "/${local_tarball}" "../${build_path}" + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd fi - else - echo "Unsupported OS: '${OS_NAME}'" - # NB: this tarball is 10GB in size, but can be used to install NCCL on non-ubuntu systems - # wget https://developer.download.nvidia.com/hpc-sdk/24.7/nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz - # tar xpzf nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz - # nvhpc_2024_247_Linux_x86_64_cuda_multi/install - return + gcloud storage cat "${gcs_tarball}" | tar xz + } + + if is_debuntu ; then + dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb" + elif is_rocky ; then + rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm" fi + + popd + touch "${workdir}/nccl-complete" } function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { + if test -f "${workdir}/cudnn-complete" ; then return ; fi + local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version @@ -557,6 +719,7 @@ function install_nvidia_cudnn() { ldconfig echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." + touch "${workdir}/cudnn-complete" } CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" @@ -686,19 +849,12 @@ function add_repo_nvidia_container_toolkit() { function add_repo_cuda() { if is_debuntu ; then - local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg - local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" - echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ - | sudo tee "${sources_list_path}" - curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \ - -o "${kr_path}" + install_cuda_keyring_pkg # 11.7+, 12.0+ elif is_rocky ; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" - execute_with_retries "dnf clean all" fi } -readonly uname_r=$(uname -r) function build_driver_from_github() { if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv @@ -707,37 +863,58 @@ function build_driver_from_github() { mok_key=/var/lib/dkms/mok.key mok_der=/var/lib/dkms/mok.pub fi - workdir=/opt/install-nvidia-driver - mkdir -p "${workdir}" pushd "${workdir}" + test -d "${workdir}/open-gpu-kernel-modules" || { - tarball_fn="${DRIVER_VERSION}.tar.gz" + local tarball_fn="${DRIVER_VERSION}.tar.gz" curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ | tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules } - cd open-gpu-kernel-modules - time make -j$(nproc) modules \ - > /var/log/open-gpu-kernel-modules-build.log \ - 2> /var/log/open-gpu-kernel-modules-build_error.log - sync + test -f "${workdir}/open-gpu-kernel-modules/kernel-open/nvidia.ko" || { + local build_tarball="kmod-build_${_shortname}_${DRIVER_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + echo "cache hit" + else + # build and cache kernel modules + pushd open-gpu-kernel-modules + execute_with_retries make -j$(nproc) modules \ + > kernel-open/build.log \ + 2> kernel-open/build_error.log + tar czvf "${local_tarball}" ../open-gpu-kernel-modules/kernel-open + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" + make clean + popd + fi + gcloud storage cat "${gcs_tarball}" | tar xzv + } + + # Sign kernel modules if [[ -n "${PSN}" ]]; then - #configure_dkms_certs - for module in $(find kernel-open -name '*.ko'); do + for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ "${mok_key}" \ "${mok_der}" \ "${module}" done - #clear_dkms_key fi - make modules_install \ - >> /var/log/open-gpu-kernel-modules-build.log \ - 2>> /var/log/open-gpu-kernel-modules-build_error.log + # install kernel modules + modinfo nvidia > /dev/null 2>&1 || { + pushd open-gpu-kernel-modules + make modules_install \ + >> kernel-open/build.log \ + 2>> kernel-open/build_error.log + depmod -a + popd + } + popd } @@ -776,23 +953,44 @@ function build_driver_from_packages() { #clear_dkms_key } +function cache_fetched_package() { + local src_url="$1" + local gcs_fn="$2" + local local_fn="$3" + + if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then + time gcloud storage cp "${gcs_fn}" "${local_fn}" + else + time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ + gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) + fi +} + function install_nvidia_userspace_runfile() { - if test -f "${tmpdir}/userspace-complete" ; then return ; fi - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${USERSPACE_URL}" -o "${tmpdir}/userspace.run" - execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}" - rm -f "${tmpdir}/userspace.run" - touch "${tmpdir}/userspace-complete" + if test -f "${workdir}/userspace-complete" ; then return ; fi + local local_fn="${tmpdir}/userspace.run" + + cache_fetched_package "${USERSPACE_URL}" \ + "${pkg_bucket}/${USERSPACE_FILENAME}" \ + "${local_fn}" + + execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --silent --tmpdir="${tmpdir}" + rm -f "${local_fn}" + touch "${workdir}/userspace-complete" sync } function install_cuda_runfile() { - if test -f "${tmpdir}/cuda-complete" ; then return ; fi - time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run" - execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}" - rm -f "${tmpdir}/cuda.run" - touch "${tmpdir}/cuda-complete" + if test -f "${workdir}/cuda-complete" ; then return ; fi + local local_fn="${tmpdir}/cuda.run" + + cache_fetched_package "${NVIDIA_CUDA_URL}" \ + "${pkg_bucket}/${CUDA_FILENAME}" \ + "${local_fn}" + + execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" + rm -f "${local_fn}" + touch "${workdir}/cuda-complete" sync } @@ -808,12 +1006,11 @@ function install_cuda_toolkit() { if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} - sync elif is_rocky ; then # rocky9: cuda-11-[7,8], cuda-12-[1..6] execute_with_retries dnf -y -q install "${cudatk_package}" - sync fi + sync } function load_kernel_module() { @@ -830,13 +1027,30 @@ function load_kernel_module() { # TODO: if peermem is available, also modprobe nvidia-peermem } +function install_cuda(){ + if test -f "${workdir}/cuda-repo-complete" ; then return ; fi + + if ( ge_debian12 && is_src_os ) ; then + echo "installed with the driver on ${OS_NAME}" + return 0 + fi + + # The OS package distributions are unreliable + install_cuda_runfile + + # Includes cudNN packages + add_repo_cuda + + touch "${workdir}/cuda-repo-complete" +} + # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { + if test -f "${workdir}/gpu-driver-complete" ; then return ; fi if ( ge_debian12 && is_src_os ) ; then add_nonfree_components add_repo_nvidia_container_toolkit apt-get update -qq - #configure_dkms_certs apt-get -yq install \ nvidia-container-toolkit \ dkms \ @@ -845,42 +1059,38 @@ function install_nvidia_gpu_driver() { nvidia-smi \ libglvnd0 \ libcuda1 - #clear_dkms_key - elif ( le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ) ; then + echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully" + return 0 + fi - install_nvidia_userspace_runfile + # OS driver packages do not produce reliable driver ; use runfile + install_nvidia_userspace_runfile - build_driver_from_github + build_driver_from_github - install_cuda_runfile - elif is_debuntu ; then - install_cuda_keyring_pkg + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + touch "${workdir}/gpu-driver-complete" +} - build_driver_from_packages +function install_ops_agent(){ + if test -f "${workdir}/ops-agent-complete" ; then return ; fi - install_cuda_toolkit - elif is_rocky ; then - add_repo_cuda + mkdir -p /opt/google + cd /opt/google + # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation + curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install - build_driver_from_packages - - install_cuda_toolkit - else - echo "Unsupported OS: '${OS_NAME}'" - exit 1 - fi - ldconfig - if is_src_os ; then - echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully" - else - echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" - fi + touch "${workdir}/ops-agent-complete" } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_agent() { - if ! command -v pip; then - execute_with_retries "apt-get install -y -qq python-pip" + # Stackdriver GPU agent parameters +# local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' + local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics' + if ( ! command -v pip && is_debuntu ) ; then + execute_with_retries "apt-get install -y -qq python3-pip" fi local install_dir=/opt/gpu-utilization-agent mkdir -p "${install_dir}" @@ -890,7 +1100,13 @@ function install_gpu_agent() { "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" - execute_with_retries pip install -r "${install_dir}/requirements.txt" + local venv="${install_dir}/venv" + python3 -m venv "${venv}" +( + source "${venv}/bin/activate" + python3 -m pip install --upgrade pip + execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" +) sync # Generate GPU service. @@ -901,7 +1117,7 @@ Description=GPU Utilization Metric Agent [Service] Type=simple PIDFile=/run/gpu_agent.pid -ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"' +ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' User=root Group=root WorkingDirectory=/ @@ -926,8 +1142,9 @@ function set_hadoop_property() { --clobber } -function configure_yarn() { - if [[ -d "${HADOOP_CONF_DIR}" && ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then +function configure_yarn_resources() { + if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" fi set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' @@ -975,7 +1192,7 @@ function configure_gpu_exclusive_mode() { spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) if [[ ${spark_version} != 3.* ]]; then # include exclusive mode on GPU - nvsmi -c EXCLUSIVE_PROCESS + nvidia-smi -c EXCLUSIVE_PROCESS fi } @@ -1023,8 +1240,34 @@ EOF chmod a+rx "${gpus_resources_script}" local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" - if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then - echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}" + if version_ge "${SPARK_VERSION}" "3.0" ; then + local gpu_count + gpu_count="$(lspci | grep NVIDIA | wc -l)" + local executor_cores + executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" + local executor_memory + executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" + local task_cpus=2 + local gpu_amount + gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" + + cat >>"${spark_defaults_conf}" <&2 + if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' elif [[ ! -f "${nvsmi}" ]] ; then echo "nvidia-smi not installed" >&2 ; return 0 elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 else nvsmi_works="1" ; fi @@ -1077,11 +1320,18 @@ function nvsmi() { function install_dependencies() { if is_debuntu ; then execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen + if is_ubuntu22 ; then + # On ubuntu22, the default compiler does not build some kernel module versions + # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 + execute_with_retries apt-get install -y -qq gcc-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-12 + fi elif is_rocky ; then execute_with_retries dnf -y -q install pciutils gcc screen local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" - local install_log="${tmpdir}/install.log" set +e eval "${dnf_cmd}" > "${install_log}" 2>&1 local retval="$?" @@ -1109,7 +1359,7 @@ function install_dependencies() { function main() { # This configuration should be run on all nodes # regardless if they have attached GPUs - configure_yarn + configure_yarn_resources # Detect NVIDIA GPU if (lspci | grep -q NVIDIA); then @@ -1133,6 +1383,8 @@ function main() { if [[ $IS_MIG_ENABLED -eq 0 ]]; then install_nvidia_gpu_driver + install_cuda + load_kernel_module if [[ -n ${CUDNN_VERSION} ]]; then @@ -1141,7 +1393,8 @@ function main() { fi #Install GPU metrics collection in Stackdriver if needed if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - install_gpu_agent + #install_ops_agent + install_gpu_agent echo 'GPU metrics agent successfully deployed.' else echo 'GPU metrics agent will not be installed.' @@ -1316,7 +1569,7 @@ function exit_handler() { if is_debuntu ; then # Clean up OS package cache apt-get -y -qq clean - apt-get -y -qq autoremove + apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove # re-hold systemd package if ge_debian12 ; then apt-mark hold systemd libsystemd0 ; fi @@ -1333,11 +1586,17 @@ function exit_handler() { /usr/local/cuda-1?.? \ /opt/conda/miniconda3 | sort -h elif is_debian ; then - du -hs \ + du -x -hs \ /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /var/lib/{docker,mysql,} \ /usr/lib \ + /opt/nvidia/* \ /usr/local/cuda-1?.? \ - /opt/conda/miniconda3 | sort -h + /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ + /usr/bin \ + /usr \ + /var \ + / 2>/dev/null | sort -h else du -hs \ /var/lib/docker \ @@ -1382,7 +1641,11 @@ print( " samples-taken: ", scalar @siz, $/, } function set_proxy(){ - export METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy)" + METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" + + if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi + + export METADATA_HTTP_PROXY export http_proxy="${METADATA_HTTP_PROXY}" export https_proxy="${METADATA_HTTP_PROXY}" export HTTP_PROXY="${METADATA_HTTP_PROXY}" @@ -1402,6 +1665,9 @@ function mount_ramdisk(){ mkdir -p "${tmpdir}" mount -t tmpfs tmpfs "${tmpdir}" + # Download conda packages to tmpfs + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" + # Clear pip cache # TODO: make this conditional on which OSs have pip without cache purge pip cache purge || echo "unable to purge pip cache" @@ -1418,30 +1684,47 @@ function mount_ramdisk(){ } function prepare_to_install(){ + # Verify OS compatability and Secure boot state + check_os_and_secure_boot + + workdir=/opt/install-dpgce nvsmi_works="0" - readonly bdcfg="/usr/local/bin/bdconfig" tmpdir=/tmp/ - if ! is_debuntu && ! is_rocky ; then - echo "Unsupported OS: '$(os_name)'" - exit 1 - fi - - repair_old_backports - + readonly temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" + readonly uname_r=$(uname -r) + readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive + mkdir -p "${workdir}" trap exit_handler EXIT + set_proxy mount_ramdisk - install_log="${tmpdir}/install.log" + configure_dkms_certs - set_proxy + readonly install_log="${tmpdir}/install.log" + + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION_ENV}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION_ENV}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION_ENV}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi + + if test -f "${workdir}/prepare-complete" ; then return ; fi + + repair_old_backports if is_debuntu ; then clean_up_sources_lists apt-get update -qq apt-get -y clean - sleep 5s - apt-get -y -qq autoremove + apt-get -o DPkg::Lock::Timeout=60 -y autoremove if ge_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi else @@ -1453,15 +1736,41 @@ function prepare_to_install(){ time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero ) fi - configure_dkms_certs - install_dependencies # Monitor disk usage in a screen session df / > "/run/disk-usage.log" touch "/run/keep-running-df" - screen -d -m -US keep-running-df \ + screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" + + touch "${workdir}/prepare-complete" +} + +# Verify if compatible linux distros and secure boot options are used +function check_os_and_secure_boot() { + local SECURE_BOOT="disabled" + SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') + if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then + echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." + exit 1 + elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then + echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." + exit 1 + elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then + echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." + exit 1 + fi + + if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then + echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." + exit 1 + elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then + echo "Secure boot is enabled, but no signing material provided." + echo "Please either disable secure boot or provide signing material as per" + echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" + return 1 + fi } prepare_to_install diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh index 7545c1a1e..0c5b2fed1 100644 --- a/gpu/manual-test-runner.sh +++ b/gpu/manual-test-runner.sh @@ -5,8 +5,8 @@ # To run the script, the following will bootstrap # # git clone git@github.com:LLC-Technologies-Collier/initialization-actions -# git checkout gpu-20241121 # cd initialization-actions +# git checkout gpu-20241121 # cp gpu/env.json.sample env.json # vi env.json # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest . @@ -16,9 +16,7 @@ # To see a list of screen windows, press ^a " # Num Name # -# 0 monitor # 1 2.0-debian10 -# 2 sh readonly timestamp="$(date +%F-%H-%M)" diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index f8438915f..ec316b345 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -25,6 +25,12 @@ def verify_pyspark(self, name): # Verify that pyspark works self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + def verify_pytorch(self, name): + # Verify that pytorch works + self.assert_instance_command(name, "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node", 1) + #echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node + #echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python /tmp/prakasha-spark-test.py + def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -64,6 +70,7 @@ def verify_instance_spark(self): def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") @@ -82,7 +89,13 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) - if ( self.getImageOs() != 'rocky' ) or ( configuration != 'SINGLE' ) or ( configuration == 'SINGLE' and self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1") ): + if ( configuration == 'SINGLE' and \ + self.getImageOs() == 'rocky' and \ + self.getImageVersion() > pkg_resources.parse_version("2.1") ): + # Do not attempt this on single instance rocky clusters + no_op=1 + else: + # verify that pyspark from command prompt works self.verify_pyspark(machine_name) @parameterized.parameters( @@ -239,8 +252,9 @@ def test_gpu_allocation(self, configuration, master_accelerator, if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ - and configuration == 'SINGLE': + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") metadata = None @@ -273,8 +287,9 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ - and configuration == 'SINGLE': + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty") if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ From f210adf9e650a357797bac7e3fa94dbc7dc967fb Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 7 Dec 2024 15:48:56 -0800 Subject: [PATCH 002/112] correcting driver for cuda 12.4 --- gpu/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index db6d630a1..bd8ef593f 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -143,7 +143,7 @@ readonly ROLE # Rocky8: 12.0: 525.147.05 readonly -A DRIVER_FOR_CUDA=( ["11.8"]="560.35.03" - ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="560.35.03" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.54.15" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) # https://developer.nvidia.com/cudnn-downloads if is_debuntu ; then From f6ff5a3ae75f2a449f0082b2bacf6a3a244654b1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 7 Dec 2024 16:08:21 -0800 Subject: [PATCH 003/112] correcting cuda subversion. 12.4.0 instead of 12.4.1 so that driver and cuda match up --- gpu/install_gpu_driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index bd8ef593f..a273075e8 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -143,7 +143,7 @@ readonly ROLE # Rocky8: 12.0: 525.147.05 readonly -A DRIVER_FOR_CUDA=( ["11.8"]="560.35.03" - ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.54.15" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.54.14" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) # https://developer.nvidia.com/cudnn-downloads if is_debuntu ; then @@ -173,7 +173,7 @@ readonly -A NCCL_FOR_CUDA=( ) readonly -A CUDA_SUBVER=( ["11.8"]="11.8.0" - ["12.0"]="12.0.0" ["12.1"]="12.1.1" ["12.4"]="12.4.1" ["12.5"]="12.5.0" ["12.6"]="12.6.2" + ["12.0"]="12.0.0" ["12.1"]="12.1.1" ["12.4"]="12.4.0" ["12.5"]="12.5.0" ["12.6"]="12.6.2" ) # Debian 12 # 12.3.101, 12.3.52 From e36b25bd1385dc7bfcd36d6aa7bbfe7b4347767e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 7 Dec 2024 17:24:06 -0800 Subject: [PATCH 004/112] corrected cannonical 11.8 driver version ; removed extra code and comment ; added better description of what is in the runfile --- gpu/install_gpu_driver.sh | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index a273075e8..07018bc0b 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -142,7 +142,7 @@ readonly ROLE # https://developer.nvidia.com/cuda-downloads # Rocky8: 12.0: 525.147.05 readonly -A DRIVER_FOR_CUDA=( - ["11.8"]="560.35.03" + ["11.8"]="520.61.05" ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.54.14" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) # https://developer.nvidia.com/cudnn-downloads @@ -175,15 +175,6 @@ readonly -A CUDA_SUBVER=( ["11.8"]="11.8.0" ["12.0"]="12.0.0" ["12.1"]="12.1.1" ["12.4"]="12.4.0" ["12.5"]="12.5.0" ["12.6"]="12.6.2" ) -# Debian 12 -# 12.3.101, 12.3.52 -# 12.4.127, 12.4.99 -# 12.5.82, 12.5.39 -# 12.6.77, 12.6.68, 12.6.37 - -readonly -A cuda_toolkit_config_version=( - ["12.4"]="12.4.127" ["12.6"]="12.6.77" -) RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') @@ -967,6 +958,17 @@ function cache_fetched_package() { } function install_nvidia_userspace_runfile() { + + # This .run file contains NV's OpenGL implementation as well as + # nvidia optimized implementations of the gtk+ 2,3 stack(s) not + # including glib (https://docs.gtk.org/glib/), and what appears to + # be a copy of the source from the kernel-open directory of for + # example DRIVER_VERSION=560.35.03 + # + # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz + # + # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run + # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. if test -f "${workdir}/userspace-complete" ; then return ; fi local local_fn="${tmpdir}/userspace.run" From a2400a7d844da91df512ef9feb0d210a518e67fe Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 8 Dec 2024 23:20:34 -0800 Subject: [PATCH 005/112] skipping most tests ; using 11.7 from the cuda 11 line instead of the less well supported 11.8 --- gpu/test_gpu.py | 53 ++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index ec316b345..10c491fb6 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -65,11 +65,12 @@ def verify_instance_spark(self): @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), # ("STANDARD", ["m"], GPU_T4, None, None), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"), ) def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): + self.skipTest("Running only one test to build cache") if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") @@ -80,12 +81,12 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-highmem-32", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB") + boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) @@ -104,6 +105,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): + self.skipTest("Running only one test to build cache") self.skipTest("No need to regularly test not installing the agent") @@ -134,6 +136,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): + self.skipTest("Running only one test to build cache") if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") @@ -157,30 +160,30 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( -# ("SINGLE", ["m"], GPU_T4, None, "12.0"), - ("SINGLE", ["m"], GPU_T4, None, "11.8"), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), -# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), + ("SINGLE", ["m"], GPU_T4, None, "12.4"), + ("SINGLE", ["m"], GPU_T4, None, "11.7"), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.7"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") +# if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): +# self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") +# if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ +# and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): +# self.skipTest("CUDA == 12.0 not supported on debian 12") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ - and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ - ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") +# if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ +# and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ +# ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): +# self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ - and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") +# if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ +# and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ +# and self.getImageVersion() >= pkg_resources.parse_version("2.2"): +# self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -198,13 +201,14 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.verify_instance_nvcc(machine_name, cuda_version) @parameterized.parameters( - ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"), + ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.7"), # ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"), ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"), ) def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): + self.skipTest("Running only one test to build cache") self.skipTest("Test is known to fail. Skipping so that we can exercise others") @@ -249,6 +253,8 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): + self.skipTest("Running only one test to build cache") + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") @@ -274,15 +280,16 @@ def test_gpu_allocation(self, configuration, master_accelerator, self.verify_instance_spark() @parameterized.parameters( - ("SINGLE", ["m"], GPU_T4, None, "11.8"), + ("SINGLE", ["m"], GPU_T4, None, "11.7"), # ("STANDARD", ["m"], GPU_T4, None, "12.0"), ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), -# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.7"), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), ) def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): + self.skipTest("Running only one test to build cache") if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") From a137719dce56e0404e4bf67f5e5e81b0876fa2a8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 8 Dec 2024 23:21:30 -0800 Subject: [PATCH 006/112] verified that the cuda and driver versions match up --- gpu/install_gpu_driver.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 07018bc0b..5a2718291 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -142,13 +142,13 @@ readonly ROLE # https://developer.nvidia.com/cuda-downloads # Rocky8: 12.0: 525.147.05 readonly -A DRIVER_FOR_CUDA=( - ["11.8"]="520.61.05" + ["11.7"]="515.65.01" ["11.8"]="520.61.05" ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.54.14" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) # https://developer.nvidia.com/cudnn-downloads if is_debuntu ; then readonly -A CUDNN_FOR_CUDA=( - ["11.8"]="9.5.1.17" + ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" ) elif is_rocky ; then @@ -161,14 +161,14 @@ elif is_rocky ; then # 12.5: 9.2.1.18 # 12.6: 9.5.1.17 readonly -A CUDNN_FOR_CUDA=( - ["11.8"]="9.5.1.17" - ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" + ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" ) fi # https://developer.nvidia.com/nccl/nccl-download # 12.2: 2.19.3, 12.5: 2.21.5 readonly -A NCCL_FOR_CUDA=( - ["11.8"]="2.15.5" + ["11.7"]="2.21.5" ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" ) readonly -A CUDA_SUBVER=( From 693bc7fe403907c329433122695414116a418033 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 8 Dec 2024 23:36:46 -0800 Subject: [PATCH 007/112] reducing log capture --- gpu/manual-test-runner.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh index 0c5b2fed1..021528f6c 100644 --- a/gpu/manual-test-runner.sh +++ b/gpu/manual-test-runner.sh @@ -6,7 +6,7 @@ # # git clone git@github.com:LLC-Technologies-Collier/initialization-actions # cd initialization-actions -# git checkout gpu-20241121 +# git checkout gpu-20241207 # cp gpu/env.json.sample env.json # vi env.json # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest . @@ -33,7 +33,7 @@ export PROJECT_ID="$(jq -r .PROJECT_ID env.json)" export REGION="$(jq -r .REGION env.json)" export BUCKET="$(jq -r .BUCKET env.json)" -gcs_log_dir="gs://${BUCKET}/${BUILD_ID}/logs" +gcs_log_dir="gs://${BUCKET}/gpu-dpgce/builds/${BUILD_ID}/logs" function exit_handler() { RED='\\e[0;31m' @@ -44,8 +44,11 @@ function exit_handler() { # TODO: list clusters which match our BUILD_ID and clean them up # TODO: remove any test related resources in the project - echo 'Uploading local logs to GCS bucket.' - gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/" + # We allow the user to monitor the logs from within screen session. + # Logs can be archived if necessary, but won't be unless needed. + +# echo 'Uploading local logs to GCS bucket.' +# gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/" if [[ -f "${tmp_dir}/tests_success" ]]; then echo -e "${GREEN}Workflow succeeded${NC}, check logs at ${log_dir}/ or ${gcs_log_dir}/" From 4ce1efc00cadb7e4554e2d36a4e0865de3c0fb85 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 8 Dec 2024 23:41:00 -0800 Subject: [PATCH 008/112] temporarily increasing machine shape for build caching --- gpu/test_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 10c491fb6..0ae8aa8bd 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -189,12 +189,12 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-highmem-64", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=30, - boot_disk_size="50GB") + boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) From 05b3e2ba9ffe12ff418cd61795aeef9ab8f1830e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 9 Dec 2024 10:31:00 -0800 Subject: [PATCH 009/112] 64 is too many for a single T4 --- gpu/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 0ae8aa8bd..309a4ae56 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -189,7 +189,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-64", + machine_type="n1-highmem-32", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, From e2ab509fdefedcdaa0766bb073516affeaed2475 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 9 Dec 2024 10:53:19 -0800 Subject: [PATCH 010/112] added a subversion for 11.7 --- gpu/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 5a2718291..f4ee157fc 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -172,7 +172,7 @@ readonly -A NCCL_FOR_CUDA=( ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" ) readonly -A CUDA_SUBVER=( - ["11.8"]="11.8.0" + ["11.7"]="11.7.1" ["11.8"]="11.8.0" ["12.0"]="12.0.0" ["12.1"]="12.1.1" ["12.4"]="12.4.0" ["12.5"]="12.5.0" ["12.6"]="12.6.2" ) From 1a39be64424e3795640ba9e508e9982407799135 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 9 Dec 2024 10:54:34 -0800 Subject: [PATCH 011/112] add more tests to the install function --- gpu/test_gpu.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 309a4ae56..5c69ea903 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -195,10 +195,13 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, metadata=metadata, timeout_in_minutes=30, boot_disk_size="60GB") + for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) self.verify_instance_nvcc(machine_name, cuda_version) + self.verify_instance_pyspark(machine_name) + self.verify_instance_spark() @parameterized.parameters( ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.7"), From 41ae06905b269d75e2e7bf84486fbb1c0f136c25 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 9 Dec 2024 11:14:50 -0800 Subject: [PATCH 012/112] only including architectures supported by this version of CUDA --- gpu/install_gpu_driver.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index f4ee157fc..c00dcdfb9 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -359,7 +359,7 @@ function set_cuda_runfile_url() { MAX_DRIVER_VERSION="530.30.02" MAX_CUDA_VERSION="12.1.1" fi - elif ge_version "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then + elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then if le_debian10 ; then # cuda 11 is not supported for <= debian10 MAX_CUDA_VERSION="0" @@ -576,8 +576,12 @@ function install_nvidia_nccl() { # Blackwell: SM_100, compute_100 NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" + if version_ge "${CUDA_VERSION}" "11.8" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" + fi + if version_ge "${CUDA_VERSION}" "12.0" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" + fi mkdir -p "${workdir}" pushd "${workdir}" From 39ac28118239fb58c1e0f4045edc52d24295116a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 10 Dec 2024 00:47:02 -0800 Subject: [PATCH 013/112] pinning down versions better ; more caching ; more ram disks ; new pytorch and tensorflow test functions --- gpu/install_gpu_driver.sh | 200 +++++++++++++++++++++++--------------- gpu/test_gpu.py | 72 ++++++++++++-- gpu/verify_pytorch.py | 8 ++ gpu/verify_tensorflow.py | 28 ++++++ 4 files changed, 221 insertions(+), 87 deletions(-) create mode 100644 gpu/verify_pytorch.py create mode 100644 gpu/verify_tensorflow.py diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index c00dcdfb9..738960a74 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -138,18 +138,26 @@ ROLE="$(get_metadata_attribute dataproc-role)" readonly ROLE # CUDA version and Driver version +# https://docs.nvidia.com/deploy/cuda-compatibility/ # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html # https://developer.nvidia.com/cuda-downloads + +# Minimum supported version for open kernel driver is 515.43.04 +# https://github.com/NVIDIA/open-gpu-kernel-modules/tags # Rocky8: 12.0: 525.147.05 readonly -A DRIVER_FOR_CUDA=( - ["11.7"]="515.65.01" ["11.8"]="520.61.05" - ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.54.14" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["11.7"]="515.65.01" ["11.8"]="525.60.13" + ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.67" ["12.5"]="555.42.02" ["12.6"]="560.35.03" +) +readonly -A DRIVER_SUBVER=( + ["515"]="515.48.07" ["520"]="520.56.06" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" + ["545"]="545.29.06" ["550"]="550.127.05" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" ) # https://developer.nvidia.com/cudnn-downloads if is_debuntu ; then readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" - ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" + ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" + ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" ) elif is_rocky ; then # rocky: @@ -161,19 +169,19 @@ elif is_rocky ; then # 12.5: 9.2.1.18 # 12.6: 9.5.1.17 readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" - ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" + ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" ) fi # https://developer.nvidia.com/nccl/nccl-download # 12.2: 2.19.3, 12.5: 2.21.5 readonly -A NCCL_FOR_CUDA=( - ["11.7"]="2.21.5" ["11.8"]="2.21.5" - ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" + ["11.7"]="2.21.5" ["11.8"]="2.21.5" + ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" ) readonly -A CUDA_SUBVER=( - ["11.7"]="11.7.1" ["11.8"]="11.8.0" - ["12.0"]="12.0.0" ["12.1"]="12.1.1" ["12.4"]="12.4.0" ["12.5"]="12.5.0" ["12.6"]="12.6.2" + ["11.7"]="11.7.1" ["11.8"]="11.8.0" + ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" ) RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') @@ -181,15 +189,17 @@ RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') function set_cuda_version() { local cuda_url cuda_url=$(get_metadata_attribute 'cuda-url' '') - if [[ -n "${cuda_url}" ]] ; then + # if cuda-url metadata variable has been passed, extract default version from url local CUDA_URL_VERSION CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" CUDA_FULL_VERSION="${CUDA_URL_VERSION}" fi - else + fi + + if ( ! test -v DEFAULT_CUDA_VERSION ) ; then DEFAULT_CUDA_VERSION='12.4' fi readonly DEFAULT_CUDA_VERSION @@ -200,17 +210,8 @@ function set_cuda_version() { CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} fi readonly CUDA_FULL_VERSION - - if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then - echo "CUDA 12.3.0 is the minimum CUDA 12 version on Debian 12" - elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then - echo "CUDA 12.1.1 is the maximum CUDA version on ubuntu18. Requested version: ${CUDA_VERSION}" - elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then - echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" - elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then - echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" - fi } + set_cuda_version function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) @@ -235,19 +236,23 @@ function set_driver_version() { if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi # Take default from cuda-url metadata value as a backup elif [[ -n "${cuda_url}" ]] ; then - CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')" - if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" ; fi + local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')" + if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then + major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" + driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} + if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then + # use the version indicated by the cuda url as the default if it exists + DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" + elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then + # use the maximum sub-version available for the major version indicated in cuda url as the default + DEFAULT_DRIVER="${driver_max_maj_version}" + fi + fi fi if ( ! test -v DEFAULT_DRIVER ) ; then - # Otherwise attempt to make an educated guess + # If a default driver version has not been extracted, use the default for this version of CUDA DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} -# if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then -# DEFAULT_DRIVER="560.28.03" ; fi -# if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03" ; fi -# if ( is_rocky && le_cuda11 ) ; then DEFAULT_DRIVER="525.147.05" ; fi -# if ( is_ubuntu20 && le_cuda11 ) ; then DEFAULT_DRIVER="535.183.06" ; fi -# if ( is_rocky9 && ge_cuda12 ) ; then DEFAULT_DRIVER="565.57.01" ; fi fi DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") @@ -262,14 +267,6 @@ function set_driver_version() { echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" exit 1 fi - - # Verify that the requested combination is supported - readonly CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run" - cuda_url="https://developer.download.nvidia.com/compute/cuda/${CUDA_FULL_VERSION}/local_installers/${CUDA_RUNFILE}" - if ! curl -s --head "${cuda_url}" | grep -E -q '^HTTP.*200\s*$' ; then - echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${DRIVER_VERSION}, CUDA_VERSION=${CUDA_FULL_VERSION}" - exit 1 - fi } set_driver_version @@ -380,16 +377,46 @@ function set_cuda_runfile_url() { echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" fi - CUDA_FILENAME="cuda_${CUDA_FULL_VERSION}_${DRIVER_VERSION}_linux.run" + # driver version named in cuda runfile filename + # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) + readonly -A drv_for_cuda=( + ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" + ["11.8.0"]="520.61.05" + ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" + ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" + ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" + ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" + ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ + ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not + ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" + ) + + # Verify that the file with the indicated combination exists + local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]} + CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run" local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" - local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_FILENAME}" - readonly DEFAULT_NVIDIA_CUDA_URL + local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") readonly NVIDIA_CUDA_URL - CUDA_FILENAME="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" - readonly CUDA_FILENAME + CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" + readonly CUDA_RUNFILE + + if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then + echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" + exit 1 + fi + + if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then + echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" + elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then + echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then + echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" + elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then + echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" + fi } set_cuda_runfile_url @@ -469,8 +496,23 @@ function uninstall_cuda_keyring_pkg() { CUDA_KEYRING_PKG_INSTALLED="0" } -CUDA_LOCAL_REPO_INSTALLED="0" +function cache_fetched_package() { + local src_url="$1" + local gcs_fn="$2" + local local_fn="$3" + + if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then + time gcloud storage cp "${gcs_fn}" "${local_fn}" + else + time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ + gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) + fi +} + + function install_local_cuda_repo() { + if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi + if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi CUDA_LOCAL_REPO_INSTALLED="1" pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" @@ -491,20 +533,21 @@ function install_local_cuda_repo() { "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ -o /etc/apt/preferences.d/cuda-repository-pin-600 fi + + touch "${workdir}/install-local-cuda-repo-complete" } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" - CUDA_LOCAL_REPO_INSTALLED="0" + rm -f "${workdir}/install-local-cuda-repo-complete" } -CUDNN_LOCAL_REPO_INSTALLED="0" CUDNN_PKG_NAME="" function install_local_cudnn_repo() { - if [[ "${CUDNN_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi - pkgname="cudnn-local-repo-${shortname}-${CUDNN}" + if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi + pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN}/local_installers/${local_deb_fn}" + local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ @@ -514,20 +557,21 @@ function install_local_cudnn_repo() { rm -f "${tmpdir}/local-installer.deb" - cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings + cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - CUDNN_LOCAL_REPO_INSTALLED="1" + touch "${workdir}/install-local-cudnn-repo-complete" } function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" - CUDNN_LOCAL_REPO_INSTALLED="0" + rm -f "${workdir}/install-local-cudnn-repo-complete" } CUDNN8_LOCAL_REPO_INSTALLED="0" CUDNN8_PKG_NAME="" function install_local_cudnn8_repo() { - if [[ "${CUDNN8_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi + if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi + if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" else return 0 ; fi @@ -541,21 +585,31 @@ function install_local_cudnn8_repo() { deb_fn="${pkgname}_1.0-1_amd64.deb" local_deb_fn="${tmpdir}/${deb_fn}" - local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${local_deb_url}" -o "${local_deb_fn}" + local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" + + # cache the cudnn package + cache_fetched_package "${local_deb_url}" \ + "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ + "${local_deb_fn}" + + local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" + # If we are using a ram disk, mount another where we will unpack the cudnn local installer + if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then + mkdir -p "${cudnn_path}" + mount -t tmpfs tmpfs "${cudnn_path}" + fi dpkg -i "${local_deb_fn}" rm -f "${local_deb_fn}" - cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - CUDNN8_LOCAL_REPO_INSTALLED="1" + cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings + touch "${workdir}/install-local-cudnn8-repo-complete" } function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" - CUDNN8_LOCAL_REPO_INSTALLED="0" + rm -f "${workdir}/install-local-cudnn8-repo-complete" } function install_nvidia_nccl() { @@ -569,8 +623,12 @@ function install_nvidia_nccl() { # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 + + # The following architectures are suppored by open kernel driver # Volta: SM_70,SM_72, compute_70,compute_72 # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 + + # The following architectures are supported by CUDA v11.8+ # Ada: SM_89, compute_89 # Hopper: SM_90,SM_90a compute_90,compute_90a # Blackwell: SM_100, compute_100 @@ -672,7 +730,6 @@ function install_nvidia_cudnn() { if ge_debian12 && is_src_os ; then apt-get -y install nvidia-cudnn else - local CUDNN="${CUDNN_VERSION%.*}" if is_cudnn8 ; then install_local_cudnn8_repo @@ -682,6 +739,8 @@ function install_nvidia_cudnn() { apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" + + uninstall_local_cudnn8_repo sync elif is_cudnn9 ; then install_cuda_keyring_pkg @@ -948,19 +1007,6 @@ function build_driver_from_packages() { #clear_dkms_key } -function cache_fetched_package() { - local src_url="$1" - local gcs_fn="$2" - local local_fn="$3" - - if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then - time gcloud storage cp "${gcs_fn}" "${local_fn}" - else - time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ - gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) - fi -} - function install_nvidia_userspace_runfile() { # This .run file contains NV's OpenGL implementation as well as @@ -991,7 +1037,7 @@ function install_cuda_runfile() { local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ - "${pkg_bucket}/${CUDA_FILENAME}" \ + "${pkg_bucket}/${CUDA_RUNFILE}" \ "${local_fn}" execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" @@ -1562,8 +1608,8 @@ function exit_handler() { pip config unset global.cache-dir || echo "unable to unset global pip cache" # Clean up shared memory mounts - for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do - if grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ; then + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then umount -f ${shmdir} fi done diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 5c69ea903..536c7b4bf 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -6,16 +6,46 @@ from integration_tests.dataproc_test_case import DataprocTestCase +DEFAULT_TIMEOUT = 15 # minutes class NvidiaGpuDriverTestCase(DataprocTestCase): COMPONENT = "gpu" INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] GPU_L4 = "type=nvidia-l4" GPU_T4 = "type=nvidia-tesla-t4" - GPU_V100 = "type=nvidia-tesla-v100" # not available in us-central1-a + GPU_V100 = "type=nvidia-tesla-v100" GPU_A100 = "type=nvidia-tesla-a100" GPU_H100 = "type=nvidia-h100-80gb,count=8" + # Tests for PyTorch + TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py" + + # Tests for TensorFlow + TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py" + + def assert_instance_command(self, + instance, + cmd, + timeout_in_minutes=DEFAULT_TIMEOUT): + + retry_count = 5 + + ssh_cmd='gcloud compute ssh {} --zone={} --command="{}"'.format( + instance, self.cluster_zone, cmd) + + while retry_count > 0: + try: + ret_code, stdout, stderr = self.assert_command( ssh_cmd, timeout_in_minutes ) + return ret_code, stdout, stderr + except Exception as e: + print("An error occurred: ", e) + retry_count -= 1 + if retry_count > 0: + time.sleep(10) + continue + else: + raise + def verify_instance(self, name): # Verify that nvidia-smi works time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience @@ -26,10 +56,24 @@ def verify_pyspark(self, name): self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) def verify_pytorch(self, name): - # Verify that pytorch works - self.assert_instance_command(name, "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node", 1) - #echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node - #echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python /tmp/prakasha-spark-test.py + test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), + self.TORCH_TEST_SCRIPT_FILE_NAME) + self.upload_test_file(test_filename, name) + + verify_cmd = "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( + self.TORCH_TEST_SCRIPT_FILE_NAME) + self.assert_instance_command(name, verify_cmd) + self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name) + + def verify_tensorflow(self, name): + test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), + self.TF_TEST_SCRIPT_FILE_NAME) + self.upload_test_file(test_filename, name) + + verify_cmd = "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( + self.TF_TEST_SCRIPT_FILE_NAME) + self.assert_instance_command(name, verify_cmd) + self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name) def verify_mig_instance(self, name): self.assert_instance_command(name, @@ -47,6 +91,14 @@ def verify_instance_nvcc(self, name, cuda_version): self.assert_instance_command( name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) ) + def verify_instance_cuda_version(self, name, cuda_version): + self.assert_instance_command( + name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) ) + + def verify_instance_driver_version(self, name, driver_version): + self.assert_instance_command( + name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) ) + def verify_instance_spark(self): self.assert_dataproc_job( self.getClusterName(), @@ -161,9 +213,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, "12.4"), - ("SINGLE", ["m"], GPU_T4, None, "11.7"), + ("SINGLE", ["m"], GPU_T4, None, "11.8"), # ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), -# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.7"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -204,7 +256,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.verify_instance_spark() @parameterized.parameters( - ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.7"), + ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"), # ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"), ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"), ) @@ -283,10 +335,10 @@ def test_gpu_allocation(self, configuration, master_accelerator, self.verify_instance_spark() @parameterized.parameters( - ("SINGLE", ["m"], GPU_T4, None, "11.7"), + ("SINGLE", ["m"], GPU_T4, None, "11.8"), # ("STANDARD", ["m"], GPU_T4, None, "12.0"), ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), -# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.7"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), ) def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes, diff --git a/gpu/verify_pytorch.py b/gpu/verify_pytorch.py new file mode 100644 index 000000000..dd4910d97 --- /dev/null +++ b/gpu/verify_pytorch.py @@ -0,0 +1,8 @@ +import torch +print("get CUDA details : == : ") +use_cuda = torch.cuda.is_available() +if use_cuda: + print('__CUDNN VERSION:', torch.backends.cudnn.version()) + print('__Number CUDA Devices:', torch.cuda.device_count()) + print('__CUDA Device Name:',torch.cuda.get_device_name(0)) + print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9) diff --git a/gpu/verify_tensorflow.py b/gpu/verify_tensorflow.py new file mode 100644 index 000000000..2faf2c717 --- /dev/null +++ b/gpu/verify_tensorflow.py @@ -0,0 +1,28 @@ +import tensorflow as tf +print("Get GPU Details : ") +print(tf.config.list_physical_devices('GPU')) +#print(tf.test.is_gpu_available()) + +if tf.test.gpu_device_name(): + print('Default GPU Device:{}'.format(tf.test.gpu_device_name())) + print("Please install GPU version of TF") + +gpu_available = tf.config.list_physical_devices('GPU') +print("gpu_available : " + str(gpu_available)) + +#is_cuda_gpu_available = tf.config.list_physical_devices('GPU',cuda_only=True) +is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True) +print("is_cuda_gpu_available : " + str(is_cuda_gpu_available)) + +#is_cuda_gpu_min_3 = tf.config.list_physical_devices('GPU',True, (3,0)) +is_cuda_gpu_min_3 = tf.test.is_gpu_available(True, (3,0)) +print("is_cuda_gpu_min_3 : " + str(is_cuda_gpu_min_3)) + +from tensorflow.python.client import device_lib + +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] + +print("Run GPU Functions Below : ") +print(get_available_gpus()) From f116717a826f3fc3c9b1e5901bc2108bc040ad47 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 10 Dec 2024 11:31:49 -0800 Subject: [PATCH 014/112] using maximum from 8.9 series on rocky for 11.7 --- gpu/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 738960a74..911c2633a 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -169,7 +169,7 @@ elif is_rocky ; then # 12.5: 9.2.1.18 # 12.6: 9.5.1.17 readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" + ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" ) fi From 976f869df675ae606b1cdeec96b035e1052e3e35 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 10 Dec 2024 12:34:36 -0800 Subject: [PATCH 015/112] skip full build --- cloudbuild/presubmit.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index eec7adb76..882acc4db 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -70,6 +70,7 @@ determine_tests_to_run() { changed_dir="${changed_dir%%/*}/" # Run all tests if common directories modified if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then + continue echo "All tests will be run: '${changed_dir}' was changed" TESTS_TO_RUN=(":DataprocInitActionsTestSuite") return 0 From 6ef2fdba48106c8e53f9537a799192d3ca40e72a Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 10 Dec 2024 12:37:58 -0800 Subject: [PATCH 016/112] pinning to bazel-7.4.0 --- gpu/Dockerfile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gpu/Dockerfile b/gpu/Dockerfile index 1127293e1..23668a189 100644 --- a/gpu/Dockerfile +++ b/gpu/Dockerfile @@ -24,10 +24,15 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \ | dd of=/etc/apt/sources.list.d/bazel.list status=none \ && apt-get update -qq -RUN apt-get autoremove -y -qq && \ - apt-get install -y -qq default-jdk python3-setuptools bazel > /dev/null 2>&1 && \ +RUN apt-get update -y -qq && \ + apt-get autoremove -y -qq && \ + apt-get install -y -qq default-jdk python3-setuptools bazel-7.4.0 > /dev/null 2>&1 && \ apt-get clean +# Set bazel-7.4.0 as the default bazel alternative in this container +RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-7.4.0 7 && \ + update-alternatives --set bazel /usr/bin/bazel-7.4.0 + # Install here any utilities you find useful when troubleshooting RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-get clean From 1539cdbae71a132a39a242f1f0eb2e48ff1b5fb0 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 10 Dec 2024 13:37:23 -0800 Subject: [PATCH 017/112] NCCL requires gcc-11 for cuda11 --- gpu/install_gpu_driver.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 911c2633a..53c73e850 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1379,6 +1379,12 @@ function install_dependencies() { update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 update-alternatives --set gcc /usr/bin/gcc-12 + elif is_debian12 && is_cuda11 ; then + # On debian12, the default compiler does not build NCCL + execute_with_retries apt-get install -y -qq gcc-11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-11 fi elif is_rocky ; then execute_with_retries dnf -y -q install pciutils gcc screen From 9a54f4c947b7aff49a8f4260204f8c3482503498 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 10 Dec 2024 16:53:34 -0800 Subject: [PATCH 018/112] rocky8 is now building from the source in the .run file --- gpu/install_gpu_driver.sh | 73 ++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 53c73e850..ee5109241 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -147,10 +147,10 @@ readonly ROLE # Rocky8: 12.0: 525.147.05 readonly -A DRIVER_FOR_CUDA=( ["11.7"]="515.65.01" ["11.8"]="525.60.13" - ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.67" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.67" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) readonly -A DRIVER_SUBVER=( - ["515"]="515.48.07" ["520"]="520.56.06" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" + ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.127.05" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" ) # https://developer.nvidia.com/cudnn-downloads @@ -669,6 +669,7 @@ function install_nvidia_nccl() { # build and cache pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install + install_build_dependencies if is_debuntu ; then # These packages are required to build .deb packages from source execute_with_retries \ @@ -910,13 +911,8 @@ function add_repo_cuda() { } function build_driver_from_github() { - if is_ubuntu ; then - mok_key=/var/lib/shim-signed/mok/MOK.priv - mok_der=/var/lib/shim-signed/mok/MOK.der - else - mok_key=/var/lib/dkms/mok.key - mok_der=/var/lib/dkms/mok.pub - fi + # closed driver will have been built on rocky8 + if is_rocky8 ; then return 0 ; fi pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { @@ -937,6 +933,7 @@ function build_driver_from_github() { else # build and cache kernel modules pushd open-gpu-kernel-modules + install_build_dependencies execute_with_retries make -j$(nproc) modules \ > kernel-open/build.log \ 2> kernel-open/build_error.log @@ -1026,7 +1023,22 @@ function install_nvidia_userspace_runfile() { "${pkg_bucket}/${USERSPACE_FILENAME}" \ "${local_fn}" - execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --silent --tmpdir="${tmpdir}" + if is_rocky8 ; then + install_build_dependencies + + # build non-open driver + execute_with_retries bash "${local_fn}" \ + --module-signing-hash sha256 \ + --module-signing-x509-hash sha256 \ + --module-signing-secret-key "${mok_key}" \ + --module-signing-public-key "${mok_der}" \ + --module-signing-script "/lib/modules/${uname_r}/build/scripts/sign-file" \ + --no-dkms \ + --install-libglvnd --silent --tmpdir="${tmpdir}" + else + # prepare to build from github + execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --silent --tmpdir="${tmpdir}" + fi rm -f "${local_fn}" touch "${workdir}/userspace-complete" sync @@ -1369,25 +1381,17 @@ function nvsmi() { "${nvsmi}" $* } -function install_dependencies() { +function install_build_dependencies() { + if test -f "${workdir}/build-dependencies-complete" ; then return ; fi + if is_debuntu ; then - execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen - if is_ubuntu22 ; then - # On ubuntu22, the default compiler does not build some kernel module versions - # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 - execute_with_retries apt-get install -y -qq gcc-12 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 - update-alternatives --set gcc /usr/bin/gcc-12 - elif is_debian12 && is_cuda11 ; then - # On debian12, the default compiler does not build NCCL - execute_with_retries apt-get install -y -qq gcc-11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 - update-alternatives --set gcc /usr/bin/gcc-11 - fi + execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" "gcc-${gcc_ver}" screen + + update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${gcc_ver}" "${gcc_ver}" + update-alternatives --set gcc "/usr/bin/gcc-${gcc_ver}" + elif is_rocky ; then - execute_with_retries dnf -y -q install pciutils gcc screen + execute_with_retries dnf -y -q install gcc local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" set +e @@ -1412,6 +1416,13 @@ function install_dependencies() { execute_with_retries "${dnf_cmd}" fi + touch "${workdir}/build-dependencies-complete" +} + +function install_dependencies() { + pkg_list="pciutils screen" + if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} + elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi } function main() { @@ -1754,6 +1765,14 @@ function prepare_to_install(){ readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive + if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv + mok_der=/var/lib/shim-signed/mok/MOK.der + else mok_key=/var/lib/dkms/mok.key + mok_der=/var/lib/dkms/mok.pub ; fi + + if is_cuda11 ; then gcc_ver="11" + elif is_cuda12 ; then gcc_ver="12" ; fi + mkdir -p "${workdir}" trap exit_handler EXIT set_proxy From 33165186e54fa2720acef8e5ceaa0815da3cb2e1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 10 Dec 2024 17:07:16 -0800 Subject: [PATCH 019/112] reverting to previous state of only selecting a compiler version on latest releases --- gpu/install_gpu_driver.sh | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index ee5109241..2a8d8a7f8 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1385,10 +1385,20 @@ function install_build_dependencies() { if test -f "${workdir}/build-dependencies-complete" ; then return ; fi if is_debuntu ; then - execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" "gcc-${gcc_ver}" screen - - update-alternatives --install /usr/bin/gcc gcc "/usr/bin/gcc-${gcc_ver}" "${gcc_ver}" - update-alternatives --set gcc "/usr/bin/gcc-${gcc_ver}" + if is_ubuntu22 && is_cuda12 ; then + # On ubuntu22, the default compiler does not build some kernel module versions + # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 + execute_with_retries apt-get install -y -qq gcc-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-12 + elif is_debian12 && is_cuda11 ; then + # On debian12, the default compiler does not build NCCL + execute_with_retries apt-get install -y -qq gcc-11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-11 + fi elif is_rocky ; then execute_with_retries dnf -y -q install gcc From 722e4363e9fd77457c792e58f08e17d2a87d6d85 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 10 Dec 2024 18:16:23 -0800 Subject: [PATCH 020/112] replaced literal path names with variable values ; indexing builds by the signing key used --- gpu/install_gpu_driver.sh | 50 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 2a8d8a7f8..c3c50a0d8 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -793,28 +793,27 @@ function configure_dkms_certs() { echo "Private key material exists" local expected_modulus_md5sum - expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum) - if [[ -n "${expected_modulus_md5sum}" ]]; then + expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) + modulus_md5sum="${expected_modulus_md5sum}" - else - modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3" - fi - # Verify that cert md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then echo "unmatched rsa key modulus" - fi - ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key + fi + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - # Verify that key md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then echo "unmatched x509 cert modulus" + fi + else + modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" fi return fi - # Retrieve cloud secrets keys local sig_priv_secret_name sig_priv_secret_name="${PSN}" @@ -841,16 +840,14 @@ function configure_dkms_certs() { | base64 --decode \ | dd status=none of="${CA_TMPDIR}/db.der" - # symlink private key and copy public cert from volatile storage for DKMS - if is_ubuntu ; then - mkdir -p /var/lib/shim-signed/mok - ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv - cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der - else - mkdir -p /var/lib/dkms/ - ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key - cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub - fi + local mok_directory="$(dirname "${mok_key}")" + mkdir -p "${mok_directory}" + + # symlink private key and copy public cert from volatile storage to DKMS directory + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + cp -f "${CA_TMPDIR}/db.der" "${mok_der}" + + modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" } function clear_dkms_key { @@ -858,7 +855,7 @@ function clear_dkms_key { echo "No signing secret provided. skipping" >&2 return 0 fi - rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv + rm -rf "${CA_TMPDIR}" "${mok_key}" } function add_contrib_component() { @@ -926,7 +923,11 @@ function build_driver_from_github() { test -f "${workdir}/open-gpu-kernel-modules/kernel-open/nvidia.ko" || { local build_tarball="kmod-build_${_shortname}_${DRIVER_VERSION}.tar.gz" local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" + local build_dir + if [[ -n "${modulus_md5sum}" ]] ; then build_dir="${modulus_md5sum}" + else build_dir="unsigned" ; fi + + local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}" if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then echo "cache hit" @@ -959,6 +960,7 @@ function build_driver_from_github() { # install kernel modules modinfo nvidia > /dev/null 2>&1 || { pushd open-gpu-kernel-modules + install_build_dependencies make modules_install \ >> kernel-open/build.log \ 2>> kernel-open/build_error.log From f42fee6af0fea53c8376b183427165e18a41439e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 10 Dec 2024 20:53:18 -0800 Subject: [PATCH 021/112] moved variable definition to prepare function ; moved driver signing to build phase --- gpu/install_gpu_driver.sh | 40 ++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index c3c50a0d8..f16226d4d 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -777,9 +777,6 @@ function install_nvidia_cudnn() { touch "${workdir}/cudnn-complete" } -CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" -PSN="$(get_metadata_attribute private_secret_name)" -readonly PSN function configure_dkms_certs() { if [[ -z "${PSN}" ]]; then echo "No signing secret provided. skipping"; @@ -794,22 +791,22 @@ function configure_dkms_certs() { local expected_modulus_md5sum expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) - + if [[ -n "${expected_modulus_md5sum}" ]]; then modulus_md5sum="${expected_modulus_md5sum}" # Verify that cert md5sum matches expected md5sum if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched rsa key modulus" + echo "unmatched rsa key" fi - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" # Verify that key md5sum matches expected md5sum if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched x509 cert modulus" + echo "unmatched x509 cert" fi else modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" fi + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" return fi @@ -938,6 +935,15 @@ function build_driver_from_github() { execute_with_retries make -j$(nproc) modules \ > kernel-open/build.log \ 2> kernel-open/build_error.log + # Sign kernel modules + if [[ -n "${PSN}" ]]; then + for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do + "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ + "${mok_key}" \ + "${mok_der}" \ + "${module}" + done + fi tar czvf "${local_tarball}" ../open-gpu-kernel-modules/kernel-open gcloud storage cp "${local_tarball}" "${gcs_tarball}" rm "${local_tarball}" @@ -947,16 +953,6 @@ function build_driver_from_github() { gcloud storage cat "${gcs_tarball}" | tar xzv } - # Sign kernel modules - if [[ -n "${PSN}" ]]; then - for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do - "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ - "${mok_key}" \ - "${mok_der}" \ - "${module}" - done - fi - # install kernel modules modinfo nvidia > /dev/null 2>&1 || { pushd open-gpu-kernel-modules @@ -1771,11 +1767,17 @@ function prepare_to_install(){ workdir=/opt/install-dpgce nvsmi_works="0" tmpdir=/tmp/ - readonly temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + readonly temp_bucket readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" - readonly uname_r=$(uname -r) + uname_r=$(uname -r) + readonly uname_r readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive + CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" + readonly CA_TMPDIR + PSN="$(get_metadata_attribute private_secret_name)" + readonly PSN if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv mok_der=/var/lib/shim-signed/mok/MOK.der From a13122eac8d13c43e456b1fbe6a4d7b65518c1d5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 11 Dec 2024 13:37:40 -0800 Subject: [PATCH 022/112] test whether variable is defined before checking its value --- gpu/install_gpu_driver.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index f16226d4d..c2aee539a 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -778,7 +778,7 @@ function install_nvidia_cudnn() { } function configure_dkms_certs() { - if [[ -z "${PSN}" ]]; then + if test -v PSN && [[ -z "${PSN}" ]]; then echo "No signing secret provided. skipping"; return 0 fi @@ -921,8 +921,9 @@ function build_driver_from_github() { local build_tarball="kmod-build_${_shortname}_${DRIVER_VERSION}.tar.gz" local local_tarball="${workdir}/${build_tarball}" local build_dir - if [[ -n "${modulus_md5sum}" ]] ; then build_dir="${modulus_md5sum}" - else build_dir="unsigned" ; fi + if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] + then build_dir="${modulus_md5sum}" + else build_dir="unsigned" ; fi local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}" From 3b720484d8c11ea1fb7e97f130fafac64e8039a9 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 11 Dec 2024 15:52:26 -0800 Subject: [PATCH 023/112] cache only the bins and logs --- gpu/install_gpu_driver.sh | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index c2aee539a..3c10579d4 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -147,7 +147,7 @@ readonly ROLE # Rocky8: 12.0: 525.147.05 readonly -A DRIVER_FOR_CUDA=( ["11.7"]="515.65.01" ["11.8"]="525.60.13" - ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.67" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.127.05" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) readonly -A DRIVER_SUBVER=( ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" @@ -905,7 +905,7 @@ function add_repo_cuda() { } function build_driver_from_github() { - # closed driver will have been built on rocky8 + # non-GPL driver will have been built on rocky8 if is_rocky8 ; then return 0 ; fi pushd "${workdir}" @@ -917,8 +917,9 @@ function build_driver_from_github() { mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules } - test -f "${workdir}/open-gpu-kernel-modules/kernel-open/nvidia.ko" || { - local build_tarball="kmod-build_${_shortname}_${DRIVER_VERSION}.tar.gz" + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" local local_tarball="${workdir}/${build_tarball}" local build_dir if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] @@ -930,7 +931,7 @@ function build_driver_from_github() { if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then echo "cache hit" else - # build and cache kernel modules + # build the kernel modules pushd open-gpu-kernel-modules install_build_dependencies execute_with_retries make -j$(nproc) modules \ @@ -945,26 +946,23 @@ function build_driver_from_github() { "${module}" done fi - tar czvf "${local_tarball}" ../open-gpu-kernel-modules/kernel-open + make modules_install \ + >> kernel-open/build.log \ + 2>> kernel-open/build_error.log + depmod -a + # Collect build logs and installed binaries + tar czvf "${local_tarball}" \ + "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" rm "${local_tarball}" make clean popd fi - gcloud storage cat "${gcs_tarball}" | tar xzv - } - - # install kernel modules - modinfo nvidia > /dev/null 2>&1 || { - pushd open-gpu-kernel-modules - install_build_dependencies - make modules_install \ - >> kernel-open/build.log \ - 2>> kernel-open/build_error.log - depmod -a - popd + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv } + install_kernel_modules popd } From 2cc19ce34b16cb76b113bad5eaede5b73c194bf6 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 11 Dec 2024 16:25:50 -0800 Subject: [PATCH 024/112] build index of kernel modules after unpacking ; remove call to non-existent function --- gpu/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 3c10579d4..004df710b 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -960,9 +960,9 @@ function build_driver_from_github() { popd fi gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a } - install_kernel_modules popd } From 5a2d78395b11f00f4ef3b22ac49b8b03612bcc2e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 11 Dec 2024 17:37:49 -0800 Subject: [PATCH 025/112] only build module dependency index once --- gpu/install_gpu_driver.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 004df710b..8a9b75413 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -949,7 +949,6 @@ function build_driver_from_github() { make modules_install \ >> kernel-open/build.log \ 2>> kernel-open/build_error.log - depmod -a # Collect build logs and installed binaries tar czvf "${local_tarball}" \ "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ From 1cf12ab52132a911018592986e78f0f3c3e15fa4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 11 Dec 2024 21:44:47 -0800 Subject: [PATCH 026/112] skipping CUDA 11 NCCL build on debian12 --- gpu/install_gpu_driver.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 8a9b75413..ddaba8dcd 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -210,6 +210,7 @@ function set_cuda_version() { CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} fi readonly CUDA_FULL_VERSION + } set_cuda_version @@ -615,6 +616,11 @@ function uninstall_local_cudnn8_repo() { function install_nvidia_nccl() { if test -f "${workdir}/nccl-complete" ; then return ; fi + if is_cuda11 && is_debian12 ; then + echo "NCCL cannot be compiled for CUDA 11 on ${OS_NAME}" + return + fi + local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" # https://github.com/NVIDIA/nccl/blob/master/README.md @@ -1388,12 +1394,6 @@ function install_build_dependencies() { update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 update-alternatives --set gcc /usr/bin/gcc-12 - elif is_debian12 && is_cuda11 ; then - # On debian12, the default compiler does not build NCCL - execute_with_retries apt-get install -y -qq gcc-11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 - update-alternatives --set gcc /usr/bin/gcc-11 fi elif is_rocky ; then From 77a95ff5442bb97096d7ec8d779c49b264f56778 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 11 Dec 2024 22:01:02 -0800 Subject: [PATCH 027/112] skip cuda11 on debian12, rocky9 --- gpu/test_gpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 536c7b4bf..ed8e82008 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -232,10 +232,10 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, # ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): # self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") -# if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ -# and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ -# and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ + and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( From 0b2da1410f4819a643c3eb0dea19db28cdb47be6 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 11 Dec 2024 22:53:51 -0800 Subject: [PATCH 028/112] renamed verify_pyspark to verify_instance_pyspark --- gpu/test_gpu.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index ed8e82008..d154d6a55 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -51,10 +51,6 @@ def verify_instance(self, name): time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience self.assert_instance_command(name, "nvidia-smi", 1) - def verify_pyspark(self, name): - # Verify that pyspark works - self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) - def verify_pytorch(self, name): test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), self.TORCH_TEST_SCRIPT_FILE_NAME) @@ -91,6 +87,10 @@ def verify_instance_nvcc(self, name, cuda_version): self.assert_instance_command( name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) ) + def verify_instance_pyspark(self, name): + # Verify that pyspark works + self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + def verify_instance_cuda_version(self, name, cuda_version): self.assert_instance_command( name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) ) @@ -148,8 +148,8 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, # Do not attempt this on single instance rocky clusters no_op=1 else: - # verify that pyspark from command prompt works - self.verify_pyspark(machine_name) + # verify that pyspark works from command prompt + self.verify_instance_pyspark(machine_name) @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), From 0c1df7f92a5a3fe3ea237e31f602e885894922bd Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 11 Dec 2024 23:16:56 -0800 Subject: [PATCH 029/112] failing somewhat gracefully ; skipping tests that would fail --- gpu/install_gpu_driver.sh | 4 ++++ gpu/test_gpu.py | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index ddaba8dcd..b65417bf2 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -940,6 +940,10 @@ function build_driver_from_github() { # build the kernel modules pushd open-gpu-kernel-modules install_build_dependencies + if is_cuda11 && is_ubuntu22 ; then + echo "Kernel modules cannot be compiled for CUDA 11 on ${OS_NAME}" + exit 1 + fi execute_with_retries make -j$(nproc) modules \ > kernel-open/build.log \ 2> kernel-open/build_error.log diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index d154d6a55..4611ea5d6 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -233,9 +233,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, # self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest("CUDA < 12 not supported on Dataproc 2.2") metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( From ce60b035ce0c23decd405fa14c065a01ae04eae8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 11 Dec 2024 23:41:56 -0800 Subject: [PATCH 030/112] skipping single node tests for rocky8 --- gpu/test_gpu.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 4611ea5d6..de08b4827 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -232,6 +232,11 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, # ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): # self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") + if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("CUDA < 12 not supported on Dataproc 2.2") From d16e625b729b6c34c7362eda322c0a91331f62cb Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 00:03:00 -0800 Subject: [PATCH 031/112] re-enable other tests --- gpu/test_gpu.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index de08b4827..10b66c194 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -122,11 +122,6 @@ def verify_instance_spark(self): def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("Running only one test to build cache") - - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - metadata = None if driver_provider is not None: metadata = "gpu-driver-provider={}".format(driver_provider) @@ -157,13 +152,8 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("Running only one test to build cache") - self.skipTest("No need to regularly test not installing the agent") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - metadata = "install-gpu-agent=false" if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) @@ -188,9 +178,6 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("Running only one test to build cache") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") metadata = "install-gpu-agent=true" if driver_provider is not None: @@ -220,8 +207,6 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): -# if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") # if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ # and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): @@ -267,13 +252,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): - self.skipTest("Running only one test to build cache") self.skipTest("Test is known to fail. Skipping so that we can exercise others") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): self.skipTest("CUDA == 12.0 not supported on debian 12") @@ -312,10 +293,6 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("Running only one test to build cache") - - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ @@ -348,10 +325,6 @@ def test_gpu_allocation(self, configuration, master_accelerator, def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): - self.skipTest("Running only one test to build cache") - - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ From 7284ad746535d7263849bb0cb712660c2c700e04 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 00:10:22 -0800 Subject: [PATCH 032/112] Specifying bazel version with variable --- gpu/Dockerfile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/gpu/Dockerfile b/gpu/Dockerfile index 23668a189..05724eb8c 100644 --- a/gpu/Dockerfile +++ b/gpu/Dockerfile @@ -15,8 +15,10 @@ RUN apt-get -qq update \ curl jq less screen > /dev/null 2>&1 && apt-get clean # Install bazel signing key, repo and package -ENV bazel_kr_path=/usr/share/keyrings/bazel-release.pub.gpg -ENV bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8" +ENV bazel_kr_path=/usr/share/keyrings/bazel-keyring.gpg \ + bazel_version=7.4.0 \ + bazel_repo_data="http://storage.googleapis.com/bazel-apt stable jdk1.8" \ + DEBIAN_FRONTEND=noninteractive RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \ | gpg --dearmor -o "${bazel_kr_path}" \ @@ -24,14 +26,13 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg \ | dd of=/etc/apt/sources.list.d/bazel.list status=none \ && apt-get update -qq -RUN apt-get update -y -qq && \ - apt-get autoremove -y -qq && \ - apt-get install -y -qq default-jdk python3-setuptools bazel-7.4.0 > /dev/null 2>&1 && \ +RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \ + apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \ apt-get clean -# Set bazel-7.4.0 as the default bazel alternative in this container -RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-7.4.0 7 && \ - update-alternatives --set bazel /usr/bin/bazel-7.4.0 +# Set bazel-${bazel_version} as the default bazel alternative in this container +RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_version} 1 && \ + update-alternatives --set bazel /usr/bin/bazel-${bazel_version} # Install here any utilities you find useful when troubleshooting RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-get clean From 35e4ba243b5259f9cb671ded62c11c2aab371834 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 00:40:04 -0800 Subject: [PATCH 033/112] fixing up some skip logic --- gpu/test_gpu.py | 60 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 10b66c194..60d51541e 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -200,9 +200,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, "12.4"), - ("SINGLE", ["m"], GPU_T4, None, "11.8"), +# ("SINGLE", ["m"], GPU_T4, None, "11.8"), # ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), -# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -212,30 +212,30 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, # and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): # self.skipTest("CUDA == 12.0 not supported on debian 12") -# if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ -# and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ -# ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): -# self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ + and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ + ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") + + if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ + and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("CUDA < 12 not supported on Dataproc 2.2") if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Dataproc 2.2") - metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-32", + machine_type="n1-highmem-8", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=30, - boot_disk_size="60GB") + boot_disk_size="50GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) @@ -255,19 +255,18 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, self.skipTest("Test is known to fail. Skipping so that we can exercise others") - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") +# if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ +# and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): +# self.skipTest("CUDA == 12.0 not supported on debian 12") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest("CUDA < 12 not supported on Dataproc 2.2") metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version) @@ -326,24 +325,23 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator, worker_accelerator, cuda_version): - if configuration == 'SINGLE' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty") - - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") +# if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ +# and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): +# self.skipTest("CUDA == 12.0 not supported on debian 12") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest("CUDA < 12 not supported on Dataproc 2.2") + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( From be3756926037610dc4454bdbfea9e30c30b6a98b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 01:08:11 -0800 Subject: [PATCH 034/112] replaced OS_NAME with _shortname --- gpu/install_gpu_driver.sh | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index b65417bf2..82953f3cd 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -617,7 +617,7 @@ function install_nvidia_nccl() { if test -f "${workdir}/nccl-complete" ; then return ; fi if is_cuda11 && is_debian12 ; then - echo "NCCL cannot be compiled for CUDA 11 on ${OS_NAME}" + echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" return fi @@ -764,22 +764,14 @@ function install_nvidia_cudnn() { echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi fi - elif is_ubuntu ; then - local -a packages - packages=( - "libcudnn${major_version}=${cudnn_pkg_version}" - "libcudnn${major_version}-dev=${cudnn_pkg_version}") - execute_with_retries \ - apt-get install -q -y --no-install-recommends "${packages[*]}" - sync else - echo "Unsupported OS: '${OS_NAME}'" + echo "Unsupported OS: '${_shortname}'" exit 1 fi ldconfig - echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." + echo "NVIDIA cuDNN successfully installed for ${_shortname}." touch "${workdir}/cudnn-complete" } @@ -941,7 +933,7 @@ function build_driver_from_github() { pushd open-gpu-kernel-modules install_build_dependencies if is_cuda11 && is_ubuntu22 ; then - echo "Kernel modules cannot be compiled for CUDA 11 on ${OS_NAME}" + echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" exit 1 fi execute_with_retries make -j$(nproc) modules \ @@ -1101,7 +1093,7 @@ function install_cuda(){ if test -f "${workdir}/cuda-repo-complete" ; then return ; fi if ( ge_debian12 && is_src_os ) ; then - echo "installed with the driver on ${OS_NAME}" + echo "installed with the driver on ${_shortname}" return 0 fi @@ -1129,7 +1121,7 @@ function install_nvidia_gpu_driver() { nvidia-smi \ libglvnd0 \ libcuda1 - echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully" + echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully" return 0 fi From c9d1d958d2fb4447423284a1b9e300a2dc95fb7b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 09:33:29 -0800 Subject: [PATCH 035/112] skip more single instance tests for rocky8 --- gpu/test_gpu.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 60d51541e..d43071d3e 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -122,6 +122,11 @@ def verify_instance_spark(self): def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") + metadata = None if driver_provider is not None: metadata = "gpu-driver-provider={}".format(driver_provider) @@ -137,12 +142,6 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) - if ( configuration == 'SINGLE' and \ - self.getImageOs() == 'rocky' and \ - self.getImageVersion() > pkg_resources.parse_version("2.1") ): - # Do not attempt this on single instance rocky clusters - no_op=1 - else: # verify that pyspark works from command prompt self.verify_instance_pyspark(machine_name) From b63ae1704d82a4306beb59599a721abd3131e71d Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 10:30:46 -0800 Subject: [PATCH 036/112] fixing indentation ; skipping redundant test --- gpu/test_gpu.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index d43071d3e..3649a865c 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -142,8 +142,9 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) - # verify that pyspark works from command prompt - self.verify_instance_pyspark(machine_name) + self.verify_instance_nvcc(machine_name, cuda_version) + self.verify_instance_pyspark(machine_name) + self.verify_instance_spark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), @@ -177,6 +178,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): + self.skipTest("No need to regularly installing the agent on its own cluster ; this is exercised elsewhere") metadata = "install-gpu-agent=true" if driver_provider is not None: From 94c1f13f237bd7d00356974f47f72c9313bc1c0b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 10:57:31 -0800 Subject: [PATCH 037/112] remove retries of flakey tests --- cloudbuild/presubmit.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index 882acc4db..d9ae3c9bb 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -105,7 +105,6 @@ run_tests() { bazel test \ --jobs="${max_parallel_tests}" \ --local_test_jobs="${max_parallel_tests}" \ - --flaky_test_attempts=3 \ --action_env="INTERNAL_IP_SSH=true" \ --test_output="all" \ --noshow_progress \ From ac477b3ea6456e33b20e5b03c09417caf2c7602f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 11:07:14 -0800 Subject: [PATCH 038/112] oops ; need to define the cuda version to test for --- gpu/test_gpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 3649a865c..35f08f801 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -10,6 +10,7 @@ class NvidiaGpuDriverTestCase(DataprocTestCase): COMPONENT = "gpu" + DEFAULT_CUDA_VERSION = "12.4" INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] GPU_L4 = "type=nvidia-l4" GPU_T4 = "type=nvidia-tesla-t4" @@ -142,7 +143,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) - self.verify_instance_nvcc(machine_name, cuda_version) + self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) self.verify_instance_pyspark(machine_name) self.verify_instance_spark() From db7aacf301dad1e5e46e4d0fbd39758d60e324c0 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 13:25:17 -0800 Subject: [PATCH 039/112] passing -q to gcloud to generate empty passphrase if no ssh key exists ; selecting a more modern version of the 550 driver --- gpu/install_gpu_driver.sh | 78 +++++++++++++++++++++------------------ gpu/test_gpu.py | 2 +- 2 files changed, 43 insertions(+), 37 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 82953f3cd..19c578d38 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -137,52 +137,58 @@ fi ROLE="$(get_metadata_attribute dataproc-role)" readonly ROLE -# CUDA version and Driver version -# https://docs.nvidia.com/deploy/cuda-compatibility/ -# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html -# https://developer.nvidia.com/cuda-downloads - -# Minimum supported version for open kernel driver is 515.43.04 -# https://github.com/NVIDIA/open-gpu-kernel-modules/tags -# Rocky8: 12.0: 525.147.05 -readonly -A DRIVER_FOR_CUDA=( +function set_support_matrix() { + # CUDA version and Driver version + # https://docs.nvidia.com/deploy/cuda-compatibility/ + # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html + # https://developer.nvidia.com/cuda-downloads + + # Minimum supported version for open kernel driver is 515.43.04 + # https://github.com/NVIDIA/open-gpu-kernel-modules/tags + # Rocky8: 12.0: 525.147.05 + local latest + latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" + readonly -A DRIVER_FOR_CUDA=( ["11.7"]="515.65.01" ["11.8"]="525.60.13" - ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.127.05" ["12.5"]="555.42.02" ["12.6"]="560.35.03" -) -readonly -A DRIVER_SUBVER=( + ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ) + readonly -A DRIVER_SUBVER=( ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" - ["545"]="545.29.06" ["550"]="550.127.05" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" -) -# https://developer.nvidia.com/cudnn-downloads -if is_debuntu ; then -readonly -A CUDNN_FOR_CUDA=( + ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" + ) + # https://developer.nvidia.com/cudnn-downloads + if is_debuntu ; then + readonly -A CUDNN_FOR_CUDA=( ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" -) -elif is_rocky ; then -# rocky: -# 12.0: 8.8.1.3 -# 12.1: 8.9.3.28 -# 12.2: 8.9.7.29 -# 12.3: 9.0.0.312 -# 12.4: 9.1.1.17 -# 12.5: 9.2.1.18 -# 12.6: 9.5.1.17 -readonly -A CUDNN_FOR_CUDA=( + ) + elif is_rocky ; then + # rocky: + # 12.0: 8.8.1.3 + # 12.1: 8.9.3.28 + # 12.2: 8.9.7.29 + # 12.3: 9.0.0.312 + # 12.4: 9.1.1.17 + # 12.5: 9.2.1.18 + # 12.6: 9.5.1.17 + readonly -A CUDNN_FOR_CUDA=( ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" -) -fi -# https://developer.nvidia.com/nccl/nccl-download -# 12.2: 2.19.3, 12.5: 2.21.5 -readonly -A NCCL_FOR_CUDA=( + ) + fi + # https://developer.nvidia.com/nccl/nccl-download + # 12.2: 2.19.3, 12.5: 2.21.5 + readonly -A NCCL_FOR_CUDA=( ["11.7"]="2.21.5" ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" -) -readonly -A CUDA_SUBVER=( + ) + readonly -A CUDA_SUBVER=( ["11.7"]="11.7.1" ["11.8"]="11.8.0" ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" -) + ) +} + +set_support_matrix RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 35f08f801..dc0332ce9 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -31,7 +31,7 @@ def assert_instance_command(self, retry_count = 5 - ssh_cmd='gcloud compute ssh {} --zone={} --command="{}"'.format( + ssh_cmd='gcloud compute -q ssh {} --zone={} --command="{}"'.format( instance, self.cluster_zone, cmd) while retry_count > 0: From e152fd81525ac9dc6d127741d29da0cf66b9197c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 16:05:49 -0800 Subject: [PATCH 040/112] including instructions on how to create a secure-boot key pair --- gpu/create-key-pair.sh | 135 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 gpu/create-key-pair.sh diff --git a/gpu/create-key-pair.sh b/gpu/create-key-pair.sh new file mode 100644 index 000000000..8f2a42a70 --- /dev/null +++ b/gpu/create-key-pair.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This script creates a key pair and publishes to cloud secrets or +# fetches an already published key pair from cloud secrets + +set -e + +# https://github.com/glevand/secure-boot-utils + +# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#adding-shielded-image + +# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#generating-security-keys-certificates + +# https://wiki.archlinux.org/title/Unified_Extensible_Firmware_Interface/Secure_Boot#Creating_keys + +ITERATION=042 + +CURRENT_PROJECT_ID="$(gcloud config get project)" +if [[ -z "${CURRENT_PROJECT_ID}" ]]; then + echo 'project is not set. please set with `gcloud config set project ${PROJECT_ID}`' >&2 + exit -1 +fi +PROJECT_ID="${CURRENT_PROJECT_ID}" + +function create_key () { + local EFI_VAR_NAME="$1" + local CN_VAL="$2" + local PRIVATE_KEY="tls/${EFI_VAR_NAME}.rsa" + local CACERT="tls/${EFI_VAR_NAME}.pem" + local CACERT_DER="tls/${EFI_VAR_NAME}.der" + CA_KEY_SECRET_NAME="efi-${EFI_VAR_NAME}-priv-key-${ITERATION}" + CA_CERT_SECRET_NAME="efi-${EFI_VAR_NAME}-pub-key-${ITERATION}" + # If the secrets exist in secret manager, populate the tls/ directory + if [[ ! -f "${PRIVATE_KEY}" ]] && gcloud secrets describe "${CA_CERT_SECRET_NAME}" > /dev/null ; then + mkdir -p tls + + gcloud secrets versions access "1" \ + --project="${PROJECT_ID}" \ + --secret="${CA_KEY_SECRET_NAME}" \ + | dd of="${PRIVATE_KEY}" status=none + + gcloud secrets versions access "1" \ + --project="${PROJECT_ID}" \ + --secret="${CA_CERT_SECRET_NAME}" \ + | base64 --decode \ + | dd of="${CACERT_DER}" status=none + + # Create a PEM-format version of the cert + openssl x509 \ + -inform DER \ + -in "${CACERT_DER}" \ + -outform PEM \ + -out "${CACERT}" + + MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt" + curl -s -L -o "${MS_UEFI_CA}" 'https://go.microsoft.com/fwlink/p/?linkid=321194' + + echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt + echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt + modulus_md5sum="$(openssl rsa -noout -modulus -in ${PRIVATE_KEY} | openssl md5 | awk '{print $2}' | tee tls/modulus-md5sum.txt)" + return + fi + + if [[ -f "${PRIVATE_KEY}" ]]; then + modulus_md5sum="$(cat tls/modulus-md5sum.txt)" + return + fi + mkdir -p tls + + echo "generating '${CN_VAL}' '${CACERT}', '${CACERT_DER}' and '${PRIVATE_KEY}'" >&2 + # Generate new x.509 key and cert + openssl req \ + -newkey rsa:3072 \ + -nodes \ + -keyout "${PRIVATE_KEY}" \ + -new \ + -x509 \ + -sha256 \ + -days 3650 \ + -subj "/CN=${CN_VAL}/" \ + -out "${CACERT}" + + # Create a DER-format version of the cert + openssl x509 \ + -outform DER \ + -in "${CACERT}" \ + -outform DER \ + -in "${CACERT}" \ + -out "${CACERT_DER}" + + # Create a new secret containing private key + gcloud secrets create "${CA_KEY_SECRET_NAME}" \ + --project="${PROJECT_ID}" \ + --replication-policy="automatic" \ + --data-file="${PRIVATE_KEY}" + + echo "Private key secret name: '${CA_KEY_SECRET_NAME}'" >&2 + echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt + + # Create a new secret containing public key + cat "${CACERT_DER}" | base64 > "${CACERT_DER}.base64" + gcloud secrets create "${CA_CERT_SECRET_NAME}" \ + --project="${PROJECT_ID}" \ + --replication-policy="automatic" \ + --data-file="${CACERT_DER}.base64" + + modulus_md5sum="$(openssl x509 -noout -modulus -in ${CACERT} | openssl md5 | awk '{print $2}')" + echo "modulus-md5sum: ${modulus_md5sum}" >&2 + echo "${modulus_md5sum}" > tls/modulus-md5sum.txt + echo "Public key secret name: '${CA_CERT_SECRET_NAME}'" >&2 + echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt + +} + +EFI_VAR_NAME=db + +create_key "${EFI_VAR_NAME}" "Cloud Dataproc Custom Image CA ${ITERATION}" + +echo "modulus_md5sum=${modulus_md5sum}" +echo "private_secret_name=${CA_KEY_SECRET_NAME}" +echo "public_secret_name=${CA_CERT_SECRET_NAME}" +echo "secret_project=${PROJECT_ID}" +echo "secret_version=1" From f113ef8517bcd83aa3313fccef263fd99183d377 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 17:47:31 -0800 Subject: [PATCH 041/112] -e for expert, not -p for pro --- {gpu => cloudbuild}/create-key-pair.sh | 0 gpu/install_gpu_driver.sh | 30 +++++++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) rename {gpu => cloudbuild}/create-key-pair.sh (100%) diff --git a/gpu/create-key-pair.sh b/cloudbuild/create-key-pair.sh similarity index 100% rename from gpu/create-key-pair.sh rename to cloudbuild/create-key-pair.sh diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 19c578d38..c048aa5ef 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1030,18 +1030,32 @@ function install_nvidia_userspace_runfile() { if is_rocky8 ; then install_build_dependencies - # build non-open driver - execute_with_retries bash "${local_fn}" \ - --module-signing-hash sha256 \ + local signing_options + signing_options="" + if [[ -n "${PSN}" ]]; then + signing_options="--module-signing-hash sha256 \ --module-signing-x509-hash sha256 \ --module-signing-secret-key "${mok_key}" \ --module-signing-public-key "${mok_der}" \ --module-signing-script "/lib/modules/${uname_r}/build/scripts/sign-file" \ + " + fi + + # build non-open driver + execute_with_retries bash "${local_fn}" -e -q \ + ${signing_options} \ --no-dkms \ - --install-libglvnd --silent --tmpdir="${tmpdir}" + --install-libglvnd \ + --ui=none \ + --tmpdir="${tmpdir}" \ + || { + cat /var/log/nvidia-installer.log + echo "unable to build kernel modules from runfile" + exit 1 + } else # prepare to build from github - execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --silent --tmpdir="${tmpdir}" + execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --tmpdir="${tmpdir}" fi rm -f "${local_fn}" touch "${workdir}/userspace-complete" @@ -1618,12 +1632,12 @@ function clean_up_sources_lists() { } function exit_handler() { - set +ex - echo "Exit handler invoked" - # Purge private key material until next grant clear_dkms_key + set +ex + echo "Exit handler invoked" + # Clear pip cache pip cache purge || echo "unable to purge pip cache" From dfc433de40c697e4e517fac00dd5375375aeba79 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 19:08:05 -0800 Subject: [PATCH 042/112] updated 11.8 and 12.0 driver versions --- gpu/install_gpu_driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index c048aa5ef..87330d0ff 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -149,8 +149,8 @@ function set_support_matrix() { local latest latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" readonly -A DRIVER_FOR_CUDA=( - ["11.7"]="515.65.01" ["11.8"]="525.60.13" - ["12.0"]="525.60.13" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["11.7"]="515.65.01" ["11.8"]="525.147.05" + ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) readonly -A DRIVER_SUBVER=( ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" From 77fc42af5afe0c692e8e4fb6e3b86d819d19e063 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 19:09:12 -0800 Subject: [PATCH 043/112] added a signature check test which allows granular selection of platform to test, but does not yet verify signatures --- gpu/test_gpu.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index dc0332ce9..9766d804f 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -244,7 +244,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.verify_instance(machine_name) self.verify_instance_nvcc(machine_name, cuda_version) self.verify_instance_pyspark(machine_name) - self.verify_instance_spark() + self.verify_instance_spark() @parameterized.parameters( ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"), @@ -346,6 +346,39 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) + self.createCluster( + configuration, + self.INIT_ACTIONS, + machine_type="n1-highmem-8", + master_accelerator=master_accelerator, + worker_accelerator=worker_accelerator, + metadata=metadata, + timeout_in_minutes=30, + boot_disk_size="50GB", + scopes="https://www.googleapis.com/auth/monitoring.write") + + for machine_suffix in machine_suffixes: + self.verify_instance("{}-{}".format(self.getClusterName(),machine_suffix)) + self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),machine_suffix)) + + self.verify_instance_spark() + + @parameterized.parameters( +# ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''), +# ("STANDARD", ["m"], GPU_T4, None, "12.0"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), + ) + def tests_driver_signing(self, configuration, machine_suffixes, + master_accelerator, worker_accelerator, + cuda_version, image_os, image_version): + + if self.getImageOs() != image_os: + self.skipTest("This test is only run on os {}".format(image_os)) + if self.getImageVersion() != image_version: + self.skipTest("This test is only run on Dataproc Image Version {}".format(image_os)) + self.createCluster( configuration, self.INIT_ACTIONS, From 8ed498e87a39f2c7fd3784147b70419bcd15595f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 20:06:46 -0800 Subject: [PATCH 044/112] tuning the layout of arguments to userspace.run --- gpu/install_gpu_driver.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 87330d0ff..14bafaac3 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1035,9 +1035,9 @@ function install_nvidia_userspace_runfile() { if [[ -n "${PSN}" ]]; then signing_options="--module-signing-hash sha256 \ --module-signing-x509-hash sha256 \ - --module-signing-secret-key "${mok_key}" \ - --module-signing-public-key "${mok_der}" \ - --module-signing-script "/lib/modules/${uname_r}/build/scripts/sign-file" \ + --module-signing-secret-key \"${mok_key}\" \ + --module-signing-public-key \"${mok_der}\" \ + --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ " fi @@ -1045,8 +1045,8 @@ function install_nvidia_userspace_runfile() { execute_with_retries bash "${local_fn}" -e -q \ ${signing_options} \ --no-dkms \ - --install-libglvnd \ --ui=none \ + --install-libglvnd \ --tmpdir="${tmpdir}" \ || { cat /var/log/nvidia-installer.log @@ -1055,7 +1055,11 @@ function install_nvidia_userspace_runfile() { } else # prepare to build from github - execute_with_retries bash "${local_fn}" --no-kernel-modules --install-libglvnd --tmpdir="${tmpdir}" + execute_with_retries bash "${local_fn}" -e -q \ + --no-kernel-modules \ + --ui=none \ + --install-libglvnd \ + --tmpdir="${tmpdir}" fi rm -f "${local_fn}" touch "${workdir}/userspace-complete" From 842d7e5725b40f0d91a95aec2843f0ab9f798e7c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 20:56:06 -0800 Subject: [PATCH 045/112] scoping DEFAULT_CUDA_VERSION correctly ; exercising rocky including kerberos on 12.6 --- gpu/test_gpu.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 9766d804f..f4182519d 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -7,10 +7,10 @@ from integration_tests.dataproc_test_case import DataprocTestCase DEFAULT_TIMEOUT = 15 # minutes +DEFAULT_CUDA_VERSION = "12.4" class NvidiaGpuDriverTestCase(DataprocTestCase): COMPONENT = "gpu" - DEFAULT_CUDA_VERSION = "12.4" INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] GPU_L4 = "type=nvidia-l4" GPU_T4 = "type=nvidia-tesla-t4" @@ -138,7 +138,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, + timeout_in_minutes=90, # This cluster is sized and timed correctly to build the driver and nccl boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) @@ -366,7 +366,10 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf @parameterized.parameters( # ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''), # ("STANDARD", ["m"], GPU_T4, None, "12.0"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'), ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'), + ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), ) @@ -386,7 +389,7 @@ def tests_driver_signing(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=30, # this test expects driver and nccl cache to be built and stashed before its run boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: From bb35d11c98f7e4b15e2ed5f0bd0a66f946313a76 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 21:47:21 -0800 Subject: [PATCH 046/112] add a connect timeout to the ssh call instead of trying to patch around a longer than expected connection delay --- gpu/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index f4182519d..b876a2b05 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -31,7 +31,7 @@ def assert_instance_command(self, retry_count = 5 - ssh_cmd='gcloud compute -q ssh {} --zone={} --command="{}"'.format( + ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format( instance, self.cluster_zone, cmd) while retry_count > 0: From 2541a6f5b1a88d5b9bc777c260cf54fee8b97b5f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 21:51:42 -0800 Subject: [PATCH 047/112] add some entropy to the process --- gpu/test_gpu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index b876a2b05..7386e111e 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -49,7 +49,9 @@ def assert_instance_command(self, def verify_instance(self, name): # Verify that nvidia-smi works - time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience + import random + # Many failed nvidia-smi attempts have been caused by impatience and temporal collisions + time.sleep( 3 + random.randint(1, 10) ) self.assert_instance_command(name, "nvidia-smi", 1) def verify_pytorch(self, name): From ab668ffe88fa75ac8010ef1540d22ae5765a18e8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 22:38:26 -0800 Subject: [PATCH 048/112] perhaps a re-run would have fixed 2.0-rocky8 on that last run --- cloudbuild/presubmit.sh | 8 ++++++++ gpu/run-bazel-tests.sh | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index d9ae3c9bb..0139636cb 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -105,6 +105,7 @@ run_tests() { bazel test \ --jobs="${max_parallel_tests}" \ --local_test_jobs="${max_parallel_tests}" \ + --flaky_test_attempts=3 \ --action_env="INTERNAL_IP_SSH=true" \ --test_output="all" \ --noshow_progress \ @@ -115,6 +116,13 @@ run_tests() { main() { cd /init-actions + +# TODO: once service account is granted permission to access the cloud +# secrets, we can source this file and set signing material metadata +# variables from the environment in the python code. + +# eval "$(bash cloudbuild/create-key-pair.sh | sed -e 's/^/export /g')" + configure_gcloud configure_gcloud_ssh_key initialize_git_repo diff --git a/gpu/run-bazel-tests.sh b/gpu/run-bazel-tests.sh index 8e7cd663d..ae717bf5b 100644 --- a/gpu/run-bazel-tests.sh +++ b/gpu/run-bazel-tests.sh @@ -17,7 +17,6 @@ declare -a TESTS_TO_RUN=('gpu:test_gpu') time bazel test \ --jobs="${max_parallel_tests}" \ --local_test_jobs="${max_parallel_tests}" \ - --flaky_test_attempts=3 \ --action_env="INTERNAL_IP_SSH=true" \ --test_output="errors" \ --test_arg="--image_version=${IMAGE_VERSION}" \ From 934289a3d35f5b200a8b419ef24dd2a4bf506d81 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 12 Dec 2024 23:49:58 -0800 Subject: [PATCH 049/112] increasing init action timeout to account for uncached builds --- gpu/test_gpu.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 7386e111e..61f0315ad 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -167,7 +167,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB") for machine_suffix in machine_suffixes: self.verify_instance("{}-{}".format(self.getClusterName(), @@ -193,7 +193,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: @@ -238,7 +238,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB") for machine_suffix in machine_suffixes: @@ -282,7 +282,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", startup_script="gpu/mig.sh") @@ -314,7 +314,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, boot_disk_size="50GB", - timeout_in_minutes=30) + timeout_in_minutes=90) self.verify_instance_spark() @@ -355,7 +355,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") @@ -391,7 +391,7 @@ def tests_driver_signing(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, # this test expects driver and nccl cache to be built and stashed before its run + timeout_in_minutes=90, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: From e5920f8fd2c83d3cc0f0aa40fdd6346122ba6391 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Dec 2024 17:15:44 -0800 Subject: [PATCH 050/112] cache non-open kernel build results --- gpu/install_gpu_driver.sh | 84 +++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 29 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 14bafaac3..e45bf8496 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1027,40 +1027,66 @@ function install_nvidia_userspace_runfile() { "${pkg_bucket}/${USERSPACE_FILENAME}" \ "${local_fn}" + local runfile_args + runfile_args="" + local cache_hit="0" + local local_tarball + if is_rocky8 ; then - install_build_dependencies - - local signing_options - signing_options="" - if [[ -n "${PSN}" ]]; then - signing_options="--module-signing-hash sha256 \ - --module-signing-x509-hash sha256 \ - --module-signing-secret-key \"${mok_key}\" \ - --module-signing-public-key \"${mok_der}\" \ - --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ - " - fi + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" + test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" + local_tarball="${workdir}/${build_tarball}" + local build_dir + if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] + then build_dir="${modulus_md5sum}" + else build_dir="unsigned" ; fi + + local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}" + + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then + cache_hit="1" + runfile_args="--no-kernel-modules" + echo "cache hit" + else + install_build_dependencies + + local signing_options + signing_options="" + if [[ -n "${PSN}" ]]; then + signing_options="--module-signing-hash sha256 \ + --module-signing-x509-hash sha256 \ + --module-signing-secret-key \"${mok_key}\" \ + --module-signing-public-key \"${mok_der}\" \ + --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ + " + fi - # build non-open driver - execute_with_retries bash "${local_fn}" -e -q \ - ${signing_options} \ - --no-dkms \ - --ui=none \ - --install-libglvnd \ - --tmpdir="${tmpdir}" \ - || { - cat /var/log/nvidia-installer.log - echo "unable to build kernel modules from runfile" - exit 1 + runfile_args="--no-dkms ${signing_options}" + fi } else - # prepare to build from github - execute_with_retries bash "${local_fn}" -e -q \ - --no-kernel-modules \ - --ui=none \ - --install-libglvnd \ - --tmpdir="${tmpdir}" + runfile_args="--no-kernel-modules" + fi + + execute_with_retries bash "${local_fn}" -e -q \ + ${runfile_args} \ + --ui=none \ + --install-libglvnd \ + --tmpdir="${tmpdir}" + + if is_rocky8 ; then + if [[ "${cache_hit}" == "1" ]] ; then + gcloud storage cat "${gcs_tarball}" | tar -C / -xzv + depmod -a + else + tar czvf "${local_tarball}" \ + /var/log/nvidia-installer.log \ + $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + fi fi + rm -f "${local_fn}" touch "${workdir}/userspace-complete" sync From 386177d2b433064f19f5ed21a8921eabc6cd4d52 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Dec 2024 17:48:12 -0800 Subject: [PATCH 051/112] per-kernel sub-directory for kmod tarballs --- gpu/install_gpu_driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index e45bf8496..a42c7f440 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -930,7 +930,7 @@ function build_driver_from_github() { then build_dir="${modulus_md5sum}" else build_dir="unsigned" ; fi - local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then echo "cache hit" @@ -1042,7 +1042,7 @@ function install_nvidia_userspace_runfile() { then build_dir="${modulus_md5sum}" else build_dir="unsigned" ; fi - local gcs_tarball="${pkg_bucket}/${_shortname}/${build_dir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then cache_hit="1" From b9668e0ef08d0a93f637561ae166e2605a499c28 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Dec 2024 19:45:19 -0800 Subject: [PATCH 052/112] using upstream repo and branch --- gpu/manual-test-runner.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh index 021528f6c..2527d6fd9 100644 --- a/gpu/manual-test-runner.sh +++ b/gpu/manual-test-runner.sh @@ -4,9 +4,9 @@ # # To run the script, the following will bootstrap # -# git clone git@github.com:LLC-Technologies-Collier/initialization-actions +# git clone git@github.com:GoogleCloudDataproc/initialization-actions # cd initialization-actions -# git checkout gpu-20241207 +# git checkout 2024.11 # cp gpu/env.json.sample env.json # vi env.json # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest . From 2f0148a43f51ea841bf2c4d9d402197912277692 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Dec 2024 19:55:23 -0800 Subject: [PATCH 053/112] corrected grammar error --- gpu/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 61f0315ad..7c090ddea 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -181,7 +181,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("No need to regularly installing the agent on its own cluster ; this is exercised elsewhere") + self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") metadata = "install-gpu-agent=true" if driver_provider is not None: From 19b9ddb44c07f9b427cac0e5cb86b1fe93ace4a7 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Dec 2024 20:13:18 -0800 Subject: [PATCH 054/112] testing Kerberos some more --- gpu/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 7c090ddea..1f1e472fa 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -174,7 +174,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), + ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), # ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"), ) From 1e5fc0f3c2d94c40405b6a93c572c26536cbe73d Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Dec 2024 20:21:32 -0800 Subject: [PATCH 055/112] better implementation of numa node selection --- gpu/test_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 1f1e472fa..a9093a2ba 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -59,7 +59,7 @@ def verify_pytorch(self, name): self.TORCH_TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_filename, name) - verify_cmd = "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( + verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( self.TORCH_TEST_SCRIPT_FILE_NAME) self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name) @@ -69,7 +69,7 @@ def verify_tensorflow(self, name): self.TF_TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_filename, name) - verify_cmd = "echo 0 | dd of=/sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( + verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( self.TF_TEST_SCRIPT_FILE_NAME) self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name) From 4023031c0e4a3a5517119997cdf35dad8f619e68 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Dec 2024 20:27:13 -0800 Subject: [PATCH 056/112] this time with a test which is exercised --- gpu/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index a9093a2ba..404dab004 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -206,7 +206,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, ("SINGLE", ["m"], GPU_T4, None, "12.4"), # ("SINGLE", ["m"], GPU_T4, None, "11.8"), # ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), + ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, From 03f59a6ef028b5e172c51d3b31ca3f1ceecc44b5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Dec 2024 23:16:40 -0800 Subject: [PATCH 057/112] skip debian11 on Kerberos --- gpu/test_gpu.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 404dab004..940c43c25 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -125,6 +125,8 @@ def verify_instance_spark(self): def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): + self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") + if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): @@ -183,6 +185,11 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, driver_provider): self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") + if configuration == 'KERBEROS' \ + and self.getImageOs() == 'debian' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + self.skipTest("KERBEROS fails on debian11") + metadata = "install-gpu-agent=true" if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) @@ -212,6 +219,11 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): + if configuration == 'KERBEROS' \ + and self.getImageOs() == 'debian' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + self.skipTest("KERBEROS fails on debian11") + # if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ # and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): # self.skipTest("CUDA == 12.0 not supported on debian 12") @@ -379,6 +391,11 @@ def tests_driver_signing(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version, image_os, image_version): + if configuration == 'KERBEROS' \ + and self.getImageOs() == 'debian' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + self.skipTest("KERBEROS fails on debian11") + if self.getImageOs() != image_os: self.skipTest("This test is only run on os {}".format(image_os)) if self.getImageVersion() != image_version: From f2146e362d802b2e290610ebb4cb4f6cbaa31dde Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 11:55:18 -0800 Subject: [PATCH 058/112] also skipping 2.1-ubuntu20 on kerberos clusters --- gpu/test_gpu.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 940c43c25..b41efb78f 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -186,9 +186,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") if configuration == 'KERBEROS' \ - and self.getImageOs() == 'debian' \ + and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("KERBEROS fails on debian11") + self.skipTest("KERBEROS fails on 2.1 aside from rocky") metadata = "install-gpu-agent=true" if driver_provider is not None: @@ -220,9 +220,9 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, cuda_version): if configuration == 'KERBEROS' \ - and self.getImageOs() == 'debian' \ + and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("KERBEROS fails on debian11") + self.skipTest("KERBEROS fails on 2.1 aside from rocky") # if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ # and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): @@ -392,9 +392,9 @@ def tests_driver_signing(self, configuration, machine_suffixes, cuda_version, image_os, image_version): if configuration == 'KERBEROS' \ - and self.getImageOs() == 'debian' \ + and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("KERBEROS fails on debian11") + self.skipTest("KERBEROS fails on 2.1 aside from rocky") if self.getImageOs() != image_os: self.skipTest("This test is only run on os {}".format(image_os)) From 1cb99f859564918e617dcd49a73f72938e33caf2 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 12:27:36 -0800 Subject: [PATCH 059/112] re-adjusting tests to be performed ; adjusting rather than skipping known failure cases --- gpu/test_gpu.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index b41efb78f..5efce6381 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -130,7 +130,8 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") + # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty + configuration='STANDARD' metadata = None if driver_provider is not None: @@ -160,6 +161,12 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, self.skipTest("No need to regularly test not installing the agent") metadata = "install-gpu-agent=false" + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty + configuration='STANDARD' + if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) self.createCluster( @@ -188,7 +195,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, if configuration == 'KERBEROS' \ and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("KERBEROS fails on 2.1 aside from rocky") + # KERBEROS fails on 2.1 aside from rocky + configuration="STANDARD" metadata = "install-gpu-agent=true" if driver_provider is not None: @@ -210,7 +218,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, machine_suffix)) @parameterized.parameters( - ("SINGLE", ["m"], GPU_T4, None, "12.4"), + ("SINGLE", ["m"], GPU_T4, None, "12.0"), # ("SINGLE", ["m"], GPU_T4, None, "11.8"), # ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), @@ -222,7 +230,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, if configuration == 'KERBEROS' \ and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("KERBEROS fails on 2.1 aside from rocky") + # KERBEROS fails on 2.1 aside from rocky + configuration="STANDARD" # if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ # and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): @@ -240,7 +249,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") + # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty + configuration='STANDARD' metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -312,7 +322,8 @@ def test_gpu_allocation(self, configuration, master_accelerator, if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") + # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty + configuration='STANDARD' metadata = None if driver_provider is not None: @@ -357,7 +368,8 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") + # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty + configuration='STANDARD' metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -394,7 +406,8 @@ def tests_driver_signing(self, configuration, machine_suffixes, if configuration == 'KERBEROS' \ and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - self.skipTest("KERBEROS fails on 2.1 aside from rocky") + # KERBEROS fails on 2.1 aside from rocky + configuration="STANDARD" if self.getImageOs() != image_os: self.skipTest("This test is only run on os {}".format(image_os)) From 3a238d18aeeebbb6336cdb65d1eb86a6990391d6 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 12:37:47 -0800 Subject: [PATCH 060/112] more temporal variance --- gpu/test_gpu.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 5efce6381..6c3c703ec 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -51,7 +51,7 @@ def verify_instance(self, name): # Verify that nvidia-smi works import random # Many failed nvidia-smi attempts have been caused by impatience and temporal collisions - time.sleep( 3 + random.randint(1, 10) ) + time.sleep( 3 + random.randint(1, 30) ) self.assert_instance_command(name, "nvidia-smi", 1) def verify_pytorch(self, name): @@ -179,8 +179,8 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, timeout_in_minutes=90, boot_disk_size="50GB") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) @parameterized.parameters( ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), @@ -212,10 +212,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, "12.0"), From cc16aa8c9b82e4bb8d47a31f6396ab4b44506cbe Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 13:38:00 -0800 Subject: [PATCH 061/112] skipping CUDA=12.0 for ubuntu22 --- gpu/test_gpu.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 6c3c703ec..8aa955c45 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -362,7 +362,16 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + # CUDA < 12 not supported on Dataproc 2.2 self.skipTest("CUDA < 12 not supported on Dataproc 2.2") +# cuda_version="12.0" # consider this instead + + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ + and self.getImageOs() == 'ubuntu': + # CUDA <= 12 not supported on Dataproc 2.2 with ubuntu + self.skipTest("CUDA <= 12 not supported on Dataproc 2.2 with ubuntu") +# cuda_version="12.1" # consider this instead + if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ From 3ac04bc8e1ecbf5469f753d77784c0a91eb3aaf1 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 13:56:41 -0800 Subject: [PATCH 062/112] kerberos not known to succeed on 2.0-rocky8 --- gpu/test_gpu.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 8aa955c45..40bb64952 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -198,6 +198,12 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, # KERBEROS fails on 2.1 aside from rocky configuration="STANDARD" + if configuration == 'KERBEROS' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.0"): + # KERBEROS fails on 2.0 with rocky + configuration="STANDARD" + metadata = "install-gpu-agent=true" if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) @@ -232,6 +238,12 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, # KERBEROS fails on 2.1 aside from rocky configuration="STANDARD" + if configuration == 'KERBEROS' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.0"): + # KERBEROS fails on 2.0 with rocky + configuration="STANDARD" + # if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ # and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): # self.skipTest("CUDA == 12.0 not supported on debian 12") @@ -417,6 +429,12 @@ def tests_driver_signing(self, configuration, machine_suffixes, # KERBEROS fails on 2.1 aside from rocky configuration="STANDARD" + if configuration == 'KERBEROS' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.0"): + # KERBEROS fails on 2.0 with rocky + configuration="STANDARD" + if self.getImageOs() != image_os: self.skipTest("This test is only run on os {}".format(image_os)) if self.getImageVersion() != image_version: From c6bf91a1e1952ee37b5eaad3eb7b336048820162 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 15:19:29 -0800 Subject: [PATCH 063/112] 2.2 dataproc images do not support CUDA <= 12.0 --- gpu/test_gpu.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 40bb64952..c27eadb05 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -244,18 +244,14 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, # KERBEROS fails on 2.0 with rocky configuration="STANDARD" -# if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ -# and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): -# self.skipTest("CUDA == 12.0 not supported on debian 12") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Dataproc 2.2") + self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2") if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ @@ -301,9 +297,9 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Dataproc 2.2") + self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2") metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version) @@ -372,16 +368,9 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - # CUDA < 12 not supported on Dataproc 2.2 - self.skipTest("CUDA < 12 not supported on Dataproc 2.2") -# cuda_version="12.0" # consider this instead - if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ - and self.getImageOs() == 'ubuntu': - # CUDA <= 12 not supported on Dataproc 2.2 with ubuntu - self.skipTest("CUDA <= 12 not supported on Dataproc 2.2 with ubuntu") + and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2") # cuda_version="12.1" # consider this instead From d1b3d48249901e7456c99878946bb62172ef0098 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 17:32:05 -0800 Subject: [PATCH 064/112] skipping SINGLE configuration for rocky8 again --- gpu/test_gpu.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index c27eadb05..8bd132922 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -257,7 +257,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - configuration='STANDARD' + self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty") + metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -330,7 +331,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - configuration='STANDARD' + self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty") metadata = None if driver_provider is not None: @@ -378,7 +379,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - configuration='STANDARD' + self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( From 751e7a0ae961a6be518c59dce3ba67fb144a30aa Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 19:06:04 -0800 Subject: [PATCH 065/112] not testing 2.0 --- gpu/test_gpu.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 8bd132922..f9a1cfaaa 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -223,7 +223,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, self.verify_instance_gpu_agent(machine_name) @parameterized.parameters( - ("SINGLE", ["m"], GPU_T4, None, "12.0"), + ("SINGLE", ["m"], GPU_T4, None, "12.4"), # ("SINGLE", ["m"], GPU_T4, None, "11.8"), # ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), @@ -394,9 +394,9 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(),machine_suffix)) - self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(),machine_suffix)) - + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) self.verify_instance_spark() @parameterized.parameters( @@ -441,10 +441,9 @@ def tests_driver_signing(self, configuration, machine_suffixes, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) self.verify_instance_spark() From e5e3a9e0016ff81c114720341fd62c756c42729b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 19:07:41 -0800 Subject: [PATCH 066/112] trying without test retries ; retries should happen within the test, not by re-running the test --- cloudbuild/presubmit.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index 0139636cb..9ed39d0ee 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -105,7 +105,6 @@ run_tests() { bazel test \ --jobs="${max_parallel_tests}" \ --local_test_jobs="${max_parallel_tests}" \ - --flaky_test_attempts=3 \ --action_env="INTERNAL_IP_SSH=true" \ --test_output="all" \ --noshow_progress \ From c1cd1d9bc84e6f8077b2e0183b2cda084d7d7628 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 14 Dec 2024 19:51:51 -0800 Subject: [PATCH 067/112] kerberos only works on 2.2 --- gpu/test_gpu.py | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index f9a1cfaaa..cc3f4447b 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -193,15 +193,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") if configuration == 'KERBEROS' \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # KERBEROS fails on 2.1 aside from rocky - configuration="STANDARD" - - if configuration == 'KERBEROS' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.0"): - # KERBEROS fails on 2.0 with rocky + # KERBEROS fails on 2.1 configuration="STANDARD" metadata = "install-gpu-agent=true" @@ -233,15 +226,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, cuda_version): if configuration == 'KERBEROS' \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # KERBEROS fails on 2.1 aside from rocky - configuration="STANDARD" - - if configuration == 'KERBEROS' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.0"): - # KERBEROS fails on 2.0 with rocky + # KERBEROS fails on 2.1 configuration="STANDARD" if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ @@ -414,15 +400,8 @@ def tests_driver_signing(self, configuration, machine_suffixes, cuda_version, image_os, image_version): if configuration == 'KERBEROS' \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'ubuntu' ) \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # KERBEROS fails on 2.1 aside from rocky - configuration="STANDARD" - - if configuration == 'KERBEROS' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.0"): - # KERBEROS fails on 2.0 with rocky + # KERBEROS fails on 2.1 configuration="STANDARD" if self.getImageOs() != image_os: From eac2d462468383f7fb96616b79ebf929dd1a9cbb Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 15 Dec 2024 12:51:39 -0800 Subject: [PATCH 068/112] using expectedFailure instead of skipTest for tests which are known to fail --- gpu/test_gpu.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index cc3f4447b..164300f5d 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -233,18 +233,19 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") + # CUDA > 12.4 not supported on older debian/ubuntu releases + self.expectedFailure() if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2") + # CUDA <= 12.0 not supported on Dataproc 2.2 + self.expectedFailure() if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty") - + self.expectedFailure() metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -282,11 +283,13 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") + # CUDA > 12.4 not supported on older debian/ubuntu releases + self.expectedFailure() if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2") + self.expectedFailure() + # CUDA <= 12.0 not supported on Dataproc 2.2 metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version) @@ -317,7 +320,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty") + self.expectedFailure() metadata = None if driver_provider is not None: @@ -348,24 +351,25 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf # if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ # and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): -# self.skipTest("CUDA == 12.0 not supported on debian 12") +# # CUDA == 12.0 not supported on debian 12 +# self.expectedFailure() if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") + # CUDA > 12.4 not supported on older debian/ubuntu releases + self.expectedFailure() if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA <= 12.0 not supported on Dataproc 2.2") -# cuda_version="12.1" # consider this instead - + # CUDA <= 12.0 not supported on Dataproc 2.2 + self.expectedFailure() if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty") + self.expectedFailure() metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( From bf1f0c60be178dee2de11dae2ed7282bda28e470 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 15 Dec 2024 15:36:28 -0800 Subject: [PATCH 069/112] document one of the failure states --- gpu/test_gpu.py | 82 ++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 164300f5d..a07ade732 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -4,6 +4,8 @@ from absl.testing import absltest from absl.testing import parameterized +import unittest + from integration_tests.dataproc_test_case import DataprocTestCase DEFAULT_TIMEOUT = 15 # minutes @@ -15,7 +17,7 @@ class NvidiaGpuDriverTestCase(DataprocTestCase): GPU_L4 = "type=nvidia-l4" GPU_T4 = "type=nvidia-tesla-t4" GPU_V100 = "type=nvidia-tesla-v100" - GPU_A100 = "type=nvidia-tesla-a100" + GPU_A100 = "type=nvidia-tesla-a100,count=2" GPU_H100 = "type=nvidia-h100-80gb,count=8" # Tests for PyTorch @@ -120,7 +122,7 @@ def verify_instance_spark(self): @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), # ("STANDARD", ["m"], GPU_T4, None, None), -# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"), ) def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, @@ -130,8 +132,8 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - configuration='STANDARD' + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) metadata = None if driver_provider is not None: @@ -164,8 +166,8 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - configuration='STANDARD' + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) @@ -194,8 +196,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # KERBEROS fails on 2.1 - configuration="STANDARD" + # ('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) metadata = "install-gpu-agent=true" if driver_provider is not None: @@ -218,7 +220,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, "12.4"), # ("SINGLE", ["m"], GPU_T4, None, "11.8"), -# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, @@ -227,25 +229,25 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # KERBEROS fails on 2.1 - configuration="STANDARD" + # ('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - # CUDA > 12.4 not supported on older debian/ubuntu releases - self.expectedFailure() + # ('CUDA > 12.4 not supported on older debian/ubuntu releases') + unittest.expectedFailure(self) if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - # CUDA <= 12.0 not supported on Dataproc 2.2 - self.expectedFailure() + # ('CUDA <= 12.0 not supported on Dataproc 2.2') + unittest.expectedFailure(self) if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - self.expectedFailure() + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -273,23 +275,24 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): - - self.skipTest("Test is known to fail. Skipping so that we can exercise others") - -# if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ -# and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): -# self.skipTest("CUDA == 12.0 not supported on debian 12") + # Operation [projects/.../regions/.../operations/...] failed: + # Invalid value for field 'resource.machineType': \ + # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \ + # 'machineTypes/a3-highgpu-8g'. \ + # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature.. + # ('This use case not thoroughly tested') + unittest.expectedFailure(self) if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - # CUDA > 12.4 not supported on older debian/ubuntu releases - self.expectedFailure() + # ('CUDA > 12.4 not supported on older debian/ubuntu releases') + unittest.expectedFailure(self) if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.expectedFailure() - # CUDA <= 12.0 not supported on Dataproc 2.2 + # ('CUDA <= 12.0 not supported on Dataproc 2.2') + unittest.expectedFailure(self) metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version) @@ -319,8 +322,8 @@ def test_gpu_allocation(self, configuration, master_accelerator, if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - self.expectedFailure() + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) metadata = None if driver_provider is not None: @@ -349,27 +352,22 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator, worker_accelerator, cuda_version): -# if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ -# and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): -# # CUDA == 12.0 not supported on debian 12 -# self.expectedFailure() - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - # CUDA > 12.4 not supported on older debian/ubuntu releases - self.expectedFailure() + # ('CUDA > 12.4 not supported on older debian/ubuntu releases') + unittest.expectedFailure(self) if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - # CUDA <= 12.0 not supported on Dataproc 2.2 - self.expectedFailure() + # ('CUDA <= 12.0 not supported on Dataproc 2.2') + unittest.expectedFailure(self) if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail in SINGLE configuration with errors about nodes_include being empty - self.expectedFailure() + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + unittest.expectedFailure(self) metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -405,8 +403,8 @@ def tests_driver_signing(self, configuration, machine_suffixes, if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # KERBEROS fails on 2.1 - configuration="STANDARD" + # ('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) if self.getImageOs() != image_os: self.skipTest("This test is only run on os {}".format(image_os)) From 12e6de99310e54e39cd39c63f34b4df854ab46a6 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sun, 15 Dec 2024 18:36:30 -0800 Subject: [PATCH 070/112] skipping expected failures --- gpu/test_gpu.py | 81 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index a07ade732..f260d5927 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -27,9 +27,9 @@ class NvidiaGpuDriverTestCase(DataprocTestCase): TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py" def assert_instance_command(self, - instance, - cmd, - timeout_in_minutes=DEFAULT_TIMEOUT): + instance, + cmd, + timeout_in_minutes=DEFAULT_TIMEOUT): retry_count = 5 @@ -119,6 +119,22 @@ def verify_instance_spark(self): + "spark.yarn.unmanagedAM.enabled=false" ) + def verify_driver_signature(self, name): + cert_path='/var/lib/dkms/mok.pub' + if self.getImageOs() == 'ubuntu': + cert_path='/var/lib/shim-signed/mok/MOK.der' + + cert_verification_cmd = """ +perl -Mv5.10 -e ' +my $cert = ( qx{openssl x509 -inform DER -in {} -text} + =~ /Serial Number:.*? +(.+?)\s*$/ms ); +my $kmod = ( qx{modinfo nvidia} + =~ /^sig_key:\s+(\S+)/ms ); +exit 1 unless $cert eq lc $kmod +' +""" + self.assert_instance_command( name, cert_verification_cmd.format(cert_path) ) + @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), # ("STANDARD", ["m"], GPU_T4, None, None), @@ -134,6 +150,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') unittest.expectedFailure(self) + self.skipTest("known to fail") metadata = None if driver_provider is not None: @@ -168,6 +185,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') unittest.expectedFailure(self) + self.skipTest("known to fail") if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) @@ -198,6 +216,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS fails with image version <= 2.1') unittest.expectedFailure(self) + self.skipTest("known to fail") metadata = "install-gpu-agent=true" if driver_provider is not None: @@ -231,23 +250,24 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS fails with image version <= 2.1') unittest.expectedFailure(self) + self.skipTest("known to fail") if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - # ('CUDA > 12.4 not supported on older debian/ubuntu releases') - unittest.expectedFailure(self) + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - # ('CUDA <= 12.0 not supported on Dataproc 2.2') - unittest.expectedFailure(self) + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') unittest.expectedFailure(self) + self.skipTest("known to fail") + metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -282,17 +302,16 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature.. # ('This use case not thoroughly tested') unittest.expectedFailure(self) + self.skipTest("known to fail") if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - # ('CUDA > 12.4 not supported on older debian/ubuntu releases') - unittest.expectedFailure(self) + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - # ('CUDA <= 12.0 not supported on Dataproc 2.2') - unittest.expectedFailure(self) + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) metadata = "gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version) @@ -324,6 +343,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') unittest.expectedFailure(self) + self.skipTest("known to fail") metadata = None if driver_provider is not None: @@ -355,19 +375,18 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - # ('CUDA > 12.4 not supported on older debian/ubuntu releases') - unittest.expectedFailure(self) + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - # ('CUDA <= 12.0 not supported on Dataproc 2.2') - unittest.expectedFailure(self) + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') unittest.expectedFailure(self) + self.skipTest("known to fail") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -390,10 +409,10 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf @parameterized.parameters( # ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''), # ("STANDARD", ["m"], GPU_T4, None, "12.0"), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8", 'rocky', '2.0'), ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'), - ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'), +# ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), ) @@ -401,10 +420,29 @@ def tests_driver_signing(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version, image_os, image_version): + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ + and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS fails with image version <= 2.1') unittest.expectedFailure(self) + self.skipTest("known to fail") + + kvp_array=[] + import os + + if "private_secret_name" in os.environ: + for env_var in ['public_secret_name', 'private_secret_name', 'secret_project', 'secret_version' 'modulus_md5sum']: + kvp_array.append( "{}={}".format( env_var, os.environ[env_var] ) ) + + if kvp_array[0] == "public_secret_name=": + self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") + else: + self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") + + metadata = ",".join( kvp_array ) if self.getImageOs() != image_os: self.skipTest("This test is only run on os {}".format(image_os)) @@ -422,9 +460,10 @@ def tests_driver_signing(self, configuration, machine_suffixes, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: - machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - self.verify_instance_gpu_agent(machine_name) + hostname="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(hostname) + self.verify_instance_gpu_agent(hostname) +# self.verify_driver_signature(hostname) self.verify_instance_spark() From f7bf9abb2081087851260fa00325048b9f43fa8e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 16 Dec 2024 14:27:27 -0800 Subject: [PATCH 071/112] updated manual-test-runner.sh instructions --- gpu/manual-test-runner.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh index 2527d6fd9..0199d62ad 100644 --- a/gpu/manual-test-runner.sh +++ b/gpu/manual-test-runner.sh @@ -6,7 +6,7 @@ # # git clone git@github.com:GoogleCloudDataproc/initialization-actions # cd initialization-actions -# git checkout 2024.11 +# git checkout 2024.12 # cp gpu/env.json.sample env.json # vi env.json # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest . From 47a6e3b7314c11adf59369b161dcd9ce27443828 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Dec 2024 15:35:20 -0800 Subject: [PATCH 072/112] this one generated from template after refactor --- gpu/install_gpu_driver.sh | 1190 +++++++++++++++++++++---------------- 1 file changed, 682 insertions(+), 508 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index a42c7f440..8a483ad40 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +# +# This initialization action is generated from +# initialization-actions/templates/gpu/install_gpu_driver.sh.in +# +# Modifications made directly to the generated file will be lost when +# the template is re-evaluated + # # This script installs NVIDIA GPU drivers and collects GPU utilization metrics. @@ -25,25 +33,30 @@ function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $ function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) -readonly -A supported_os=( - ['debian']="10 11 12" - ['rocky']="8 9" - ['ubuntu']="18.04 20.04 22.04" -) +function define_os_comparison_functions() { + + readonly -A supported_os=( + ['debian']="10 11 12" + ['rocky']="8 9" + ['ubuntu']="18.04 20.04 22.04" + ) -# dynamically define OS version test utility functions -if [[ "$(os_id)" == "rocky" ]]; -then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') -else _os_version="$(os_version)"; fi -for os_id_val in 'rocky' 'ubuntu' 'debian' ; do - eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" - - for osver in $(echo "${supported_os["${os_id_val}"]}") ; do - eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" - eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" - eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" + # dynamically define OS version test utility functions + if [[ "$(os_id)" == "rocky" ]]; + then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') + else _os_version="$(os_version)"; fi + for os_id_val in 'rocky' 'ubuntu' 'debian' ; do + eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" + + for osver in $(echo "${supported_os["${os_id_val}"]}") ; do + eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" + eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" + eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" + done done -done +} + +define_os_comparison_functions function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) @@ -118,24 +131,346 @@ function get_metadata_attribute() ( get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" ) -OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" -readonly OS_NAME +function execute_with_retries() ( + set +x + local -r cmd="$*" -# Fetch SPARK config -SPARK_VERSION_ENV="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" -readonly SPARK_VERSION_ENV -if version_ge "${SPARK_VERSION_ENV}" "3.0" && \ - version_lt "${SPARK_VERSION_ENV}" "4.0" ; then - readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 - readonly SPARK_VERSION="3.0" # try ${SPARK_VERSION_ENV} -else - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." - exit 1 -fi + if [[ "$cmd" =~ "^apt-get install" ]] ; then + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + fi + for ((i = 0; i < 3; i++)); do + set -x + time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + set +x + if [[ $retval == 0 ]] ; then return 0 ; fi + sleep 5 + done + return 1 +) + +function cache_fetched_package() { + local src_url="$1" + local gcs_fn="$2" + local local_fn="$3" + + if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then + time gcloud storage cp "${gcs_fn}" "${local_fn}" + else + time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ + gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) + fi +} + +function add_contrib_component() { + if ! is_debuntu ; then return ; fi + if ge_debian12 ; then + # Include in sources file components on which nvidia-kernel-open-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list + fi +} + +function set_hadoop_property() { + local -r config_file=$1 + local -r property=$2 + local -r value=$3 + "${bdcfg}" set_property \ + --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ + --name "${property}" --value "${value}" \ + --clobber +} + +function configure_yarn_resources() { + if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' +} + +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' + + # Fix local dirs access permissions + local yarn_local_dirs=() + + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') + + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi +} + +function clean_up_sources_lists() { + # + # bigtop (primary) + # + local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" + + if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then + region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" + + local regional_bigtop_repo_uri + regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | + sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | + grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | + cut -d ' ' -f 2 | + head -1) + + if [[ "${regional_bigtop_repo_uri}" == */ ]]; then + local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" + else + local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" + fi + + local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" + rm -f "${bigtop_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ + "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" + + sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + fi + + # + # adoptium + # + # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu + local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" + local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" + rm -f "${adoptium_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ + | gpg --dearmor -o "${adoptium_kr_path}" + echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ + > /etc/apt/sources.list.d/adoptium.list + + + # + # docker + # + local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" + local docker_repo_file="/etc/apt/sources.list.d/docker.list" + local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" + + rm -f "${docker_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ + | gpg --dearmor -o "${docker_kr_path}" + echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ + > ${docker_repo_file} + + # + # google cloud + logging/monitoring + # + if ls /etc/apt/sources.list.d/google-cloud*.list ; then + rm -f /usr/share/keyrings/cloud.google.gpg + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg + for list in google-cloud google-cloud-logging google-cloud-monitoring ; do + list_file="/etc/apt/sources.list.d/${list}.list" + if [[ -f "${list_file}" ]]; then + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" + fi + done + fi + + # + # cran-r + # + if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" + if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi + rm -f /usr/share/keyrings/cran-r.gpg + curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ + gpg --dearmor -o /usr/share/keyrings/cran-r.gpg + sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list + fi + + # + # mysql + # + if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then + rm -f /usr/share/keyrings/mysql.gpg + curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ + gpg --dearmor -o /usr/share/keyrings/mysql.gpg + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list + fi + + if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi + +} + +function set_proxy(){ + METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" + + if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi + + export METADATA_HTTP_PROXY + export http_proxy="${METADATA_HTTP_PROXY}" + export https_proxy="${METADATA_HTTP_PROXY}" + export HTTP_PROXY="${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="${METADATA_HTTP_PROXY}" + no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" + local no_proxy_svc + for no_proxy_svc in compute secretmanager dns servicedirectory logging \ + bigquery composer pubsub bigquerydatatransfer dataflow \ + storage datafusion ; do + no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" + done + + export NO_PROXY="${no_proxy}" +} + +function mount_ramdisk(){ + local free_mem + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + + # Write to a ramdisk instead of churning the persistent disk + + tmpdir="/mnt/shm" + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" + + # Download conda packages to tmpfs + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" + + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge + pip cache purge || echo "unable to purge pip cache" + + # Download pip packages to tmpfs + pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives + else + mount -t tmpfs tmpfs /var/cache/dnf + fi +} + +function check_os() { + if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then + echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." + exit 1 + elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then + echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." + exit 1 + elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then + echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." + exit 1 + fi + + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" + readonly SPARK_VERSION + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 + fi + + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi +} + +# +# Generate repo file under /etc/apt/sources.list.d/ +# +function apt_add_repo() { + local -r repo_name="$1" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local -r include_src="${4:-yes}" + local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" + + echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" + if [[ "${include_src}" == "yes" ]] ; then + echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + fi + + apt-get update -qq +} + +# +# Generate repo file under /etc/yum.repos.d/ +# +function dnf_add_repo() { + local -r repo_name="$1" + local -r repo_url="$3" # "http(s)://host/path/filename.repo" + local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" + + curl -s -L "${repo_url}" \ + | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ + | dd of="${repo_path}" status=progress +} + +# +# Keyrings default to +# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or +# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) +# +function os_add_repo() { + local -r repo_name="$1" + local -r signing_key_url="$2" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local kr_path + if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi + + mkdir -p "$(dirname "${kr_path}")" + + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ + | gpg --import --no-default-keyring --keyring "${kr_path}" + + if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" + else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi +} + + +readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" + +# Dataproc configurations +readonly HADOOP_CONF_DIR='/etc/hadoop/conf' +readonly HIVE_CONF_DIR='/etc/hive/conf' +readonly SPARK_CONF_DIR='/etc/spark/conf' -# node role -ROLE="$(get_metadata_attribute dataproc-role)" -readonly ROLE function set_support_matrix() { # CUDA version and Driver version @@ -190,8 +525,6 @@ function set_support_matrix() { set_support_matrix -RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - function set_cuda_version() { local cuda_url cuda_url=$(get_metadata_attribute 'cuda-url' '') @@ -211,6 +544,10 @@ function set_cuda_version() { readonly DEFAULT_CUDA_VERSION CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") + if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then + CUDA_FULL_VERSION="${CUDA_VERSION}" + CUDA_VERSION="${CUDA_VERSION%.*}" + fi readonly CUDA_VERSION if ( ! test -v CUDA_FULL_VERSION ) ; then CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} @@ -309,8 +646,6 @@ readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USER USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" readonly USERSPACE_FILENAME -readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" - # Short name for urls if is_ubuntu22 ; then # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at @@ -459,33 +794,10 @@ readonly GPU_DRIVER_PROVIDER INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') readonly INSTALL_GPU_AGENT -# Dataproc configurations -readonly HADOOP_CONF_DIR='/etc/hadoop/conf' -readonly HIVE_CONF_DIR='/etc/hive/conf' -readonly SPARK_CONF_DIR='/etc/spark/conf' - NVIDIA_SMI_PATH='/usr/bin' MIG_MAJOR_CAPS=0 IS_MIG_ENABLED=0 -function execute_with_retries() ( - set +x - local -r cmd="$*" - - if [[ "$cmd" =~ "^apt-get install" ]] ; then - apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove - fi - for ((i = 0; i < 3; i++)); do - set -x - time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } - set +x - if [[ $retval == 0 ]] ; then return 0 ; fi - sleep 5 - done - return 1 -) - CUDA_KEYRING_PKG_INSTALLED="0" function install_cuda_keyring_pkg() { if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi @@ -503,20 +815,6 @@ function uninstall_cuda_keyring_pkg() { CUDA_KEYRING_PKG_INSTALLED="0" } -function cache_fetched_package() { - local src_url="$1" - local gcs_fn="$2" - local local_fn="$3" - - if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then - time gcloud storage cp "${gcs_fn}" "${local_fn}" - else - time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ - gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) - fi -} - - function install_local_cuda_repo() { if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi @@ -719,7 +1017,6 @@ function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { if test -f "${workdir}/cudnn-complete" ; then return ; fi - local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version @@ -743,132 +1040,42 @@ function install_nvidia_cudnn() { if ge_debian12 && is_src_os ; then apt-get -y install nvidia-cudnn else - if is_cudnn8 ; then - install_local_cudnn8_repo - - apt-get update -qq - - execute_with_retries \ - apt-get -y install --no-install-recommends \ - "libcudnn8=${cudnn_pkg_version}" \ - "libcudnn8-dev=${cudnn_pkg_version}" - - uninstall_local_cudnn8_repo - sync - elif is_cudnn9 ; then - install_cuda_keyring_pkg - - apt-get update -qq - - execute_with_retries \ - apt-get -y install --no-install-recommends \ - "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" - sync - else - echo "Unsupported cudnn version: [${CUDNN_VERSION}]" - fi - fi - else - echo "Unsupported OS: '${_shortname}'" - exit 1 - fi - - ldconfig - - echo "NVIDIA cuDNN successfully installed for ${_shortname}." - touch "${workdir}/cudnn-complete" -} - -function configure_dkms_certs() { - if test -v PSN && [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping"; - return 0 - fi - - mkdir -p "${CA_TMPDIR}" - - # If the private key exists, verify it - if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then - echo "Private key material exists" - - local expected_modulus_md5sum - expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) - if [[ -n "${expected_modulus_md5sum}" ]]; then - modulus_md5sum="${expected_modulus_md5sum}" - - # Verify that cert md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched rsa key" - fi - - # Verify that key md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched x509 cert" - fi - else - modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" - fi - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - - return - fi - - # Retrieve cloud secrets keys - local sig_priv_secret_name - sig_priv_secret_name="${PSN}" - local sig_pub_secret_name - sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" - local sig_secret_project - sig_secret_project="$(get_metadata_attribute secret_project)" - local sig_secret_version - sig_secret_version="$(get_metadata_attribute secret_version)" - - # If metadata values are not set, do not write mok keys - if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi - - # Write private material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_priv_secret_name}" \ - | dd status=none of="${CA_TMPDIR}/db.rsa" - - # Write public material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_pub_secret_name}" \ - | base64 --decode \ - | dd status=none of="${CA_TMPDIR}/db.der" + if is_cudnn8 ; then + install_local_cudnn8_repo - local mok_directory="$(dirname "${mok_key}")" - mkdir -p "${mok_directory}" + apt-get update -qq - # symlink private key and copy public cert from volatile storage to DKMS directory - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - cp -f "${CA_TMPDIR}/db.der" "${mok_der}" + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn8=${cudnn_pkg_version}" \ + "libcudnn8-dev=${cudnn_pkg_version}" - modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" -} + uninstall_local_cudnn8_repo + sync + elif is_cudnn9 ; then + install_cuda_keyring_pkg -function clear_dkms_key { - if [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping" >&2 - return 0 + apt-get update -qq + + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" + sync + else + echo "Unsupported cudnn version: [${CUDNN_VERSION}]" + fi + fi + else + echo "Unsupported OS: '${_shortname}'" + exit 1 fi - rm -rf "${CA_TMPDIR}" "${mok_key}" -} -function add_contrib_component() { - if ge_debian12 ; then - # Include in sources file components on which nvidia-kernel-open-dkms depends - local -r debian_sources="/etc/apt/sources.list.d/debian.sources" - local components="main contrib" + ldconfig - sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" - elif is_debian ; then - sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list - fi + echo "NVIDIA cuDNN successfully installed for ${_shortname}." + touch "${workdir}/cudnn-complete" } function add_nonfree_components() { @@ -884,20 +1091,21 @@ function add_nonfree_components() { fi } +# +# Install package signing key and add corresponding repository +# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html function add_repo_nvidia_container_toolkit() { - if is_debuntu ; then - local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg - local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list - # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html - test -f "${kr_path}" || - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ - | gpg --dearmor -o "${kr_path}" - - test -f "${sources_list_path}" || - curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ - | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \ - | tee "${sources_list_path}" - fi + local nvctk_root="https://nvidia.github.io/libnvidia-container" + local signing_key_url="${nvctk_root}/gpgkey" + local repo_data + + if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" + else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi + + os_add_repo nvidia-container-toolkit \ + "${signing_key_url}" \ + "${repo_data}" \ + "no" } function add_repo_cuda() { @@ -1150,27 +1358,44 @@ function install_cuda(){ # The OS package distributions are unreliable install_cuda_runfile - # Includes cudNN packages + # Includes CUDA packages add_repo_cuda touch "${workdir}/cuda-repo-complete" } +function install_nvidia_container_toolkit() { + local container_runtime_default + if command -v docker ; then container_runtime_default='docker' + elif command -v containerd ; then container_runtime_default='containerd' + elif command -v crio ; then container_runtime_default='crio' + else container_runtime_default='' ; fi + CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}") + + if test -z "${CONTAINER_RUNTIME}" ; then return ; fi + + add_repo_nvidia_container_toolkit + if is_debuntu ; then + execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else + execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi + nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" + systemctl restart "${CONTAINER_RUNTIME}" +} + # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { if test -f "${workdir}/gpu-driver-complete" ; then return ; fi + if ( ge_debian12 && is_src_os ) ; then add_nonfree_components - add_repo_nvidia_container_toolkit apt-get update -qq apt-get -yq install \ - nvidia-container-toolkit \ - dkms \ - nvidia-open-kernel-dkms \ - nvidia-open-kernel-support \ - nvidia-smi \ - libglvnd0 \ - libcuda1 + dkms \ + nvidia-open-kernel-dkms \ + nvidia-open-kernel-support \ + nvidia-smi \ + libglvnd0 \ + libcuda1 echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully" return 0 fi @@ -1244,60 +1469,6 @@ EOF systemctl --no-reload --now enable gpu-utilization-agent.service } -function set_hadoop_property() { - local -r config_file=$1 - local -r property=$2 - local -r value=$3 - "${bdcfg}" set_property \ - --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ - --name "${property}" --value "${value}" \ - --clobber -} - -function configure_yarn_resources() { - if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts - if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} - -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - - # Fix local dirs access permissions - local yarn_local_dirs=() - - readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ - --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ - --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - - if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" - fi -} - function configure_gpu_exclusive_mode() { # check if running spark 3, if not, enable GPU exclusive mode local spark_version @@ -1429,53 +1600,239 @@ function nvsmi() { "${nvsmi}" $* } -function install_build_dependencies() { - if test -f "${workdir}/build-dependencies-complete" ; then return ; fi +function install_build_dependencies() { + if test -f "${workdir}/build-dependencies-complete" ; then return ; fi + + if is_debuntu ; then + if is_ubuntu22 && is_cuda12 ; then + # On ubuntu22, the default compiler does not build some kernel module versions + # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 + execute_with_retries apt-get install -y -qq gcc-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-12 + fi + + elif is_rocky ; then + execute_with_retries dnf -y -q install gcc + + local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" + set +e + eval "${dnf_cmd}" > "${install_log}" 2>&1 + local retval="$?" + set -e + + if [[ "${retval}" == "0" ]] ; then return ; fi + + if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then + # this kernel-devel may have been migrated to the vault + local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" + local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" + dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ + "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" + )" + fi + + execute_with_retries "${dnf_cmd}" + fi + touch "${workdir}/build-dependencies-complete" +} + +function install_dependencies() { + pkg_list="pciutils screen" + if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} + elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi +} + +function prepare_gpu_env(){ + # Verify SPARK compatability + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + + readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 + nvsmi_works="0" + + if is_cuda11 ; then gcc_ver="11" + elif is_cuda12 ; then gcc_ver="12" ; fi +} + +# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades +# Users should run apt-mark unhold before they wish to upgrade these packages +function hold_nvidia_packages() { + apt-mark hold nvidia-* + apt-mark hold libnvidia-* + if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then + apt-mark hold xserver-xorg-video-nvidia* + fi +} + +function delete_mig_instances() ( + # delete all instances + set +e + nvidia-smi mig -dci + + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No compute instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac + + nvidia-smi mig -dgi + case "${?}" in + "0" ) echo "compute instances deleted" ;; + "2" ) echo "invalid argument" ;; + "6" ) echo "No GPU instances found to delete" ;; + * ) echo "unrecognized return code" ;; + esac +) + +# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles +function configure_mig_cgi() { + delete_mig_instances + META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" + if test -n "${META_MIG_CGI_VALUE}"; then + nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C + else + if lspci | grep -q H100 ; then + # run the following command to list placement profiles + # nvidia-smi mig -lgipp + # + # This is the result when using H100 instances on 20241220 + # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 + # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 + # GPU 0 Profile ID 14 Placements: {0,2,4}:2 + # GPU 0 Profile ID 9 Placements: {0,4}:4 + # GPU 0 Profile ID 5 Placement : {0}:4 + # GPU 0 Profile ID 0 Placement : {0}:8 + + # For H100 3D controllers, use profile 19, 7x1G instances + nvidia-smi mig -cgi 19 -C + elif lspci | grep -q A100 ; then + # Dataproc only supports A100s right now split in 2 if not specified + # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances + nvidia-smi mig -cgi 9,9 -C + else + echo "unrecognized 3D controller" + fi + fi +} + +function enable_mig() { + nvidia-smi -mig 1 +} + + +function configure_dkms_certs() { + if test -v PSN && [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping"; + return 0 + fi + + mkdir -p "${CA_TMPDIR}" + + # If the private key exists, verify it + if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then + echo "Private key material exists" + + local expected_modulus_md5sum + expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) + if [[ -n "${expected_modulus_md5sum}" ]]; then + modulus_md5sum="${expected_modulus_md5sum}" + + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched rsa key" + fi + + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched x509 cert" + fi + else + modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" + fi + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + + return + fi + + # Retrieve cloud secrets keys + local sig_priv_secret_name + sig_priv_secret_name="${PSN}" + local sig_pub_secret_name + sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" + local sig_secret_project + sig_secret_project="$(get_metadata_attribute secret_project)" + local sig_secret_version + sig_secret_version="$(get_metadata_attribute secret_version)" + + # If metadata values are not set, do not write mok keys + if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi + + # Write private material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_priv_secret_name}" \ + | dd status=none of="${CA_TMPDIR}/db.rsa" + + # Write public material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_pub_secret_name}" \ + | base64 --decode \ + | dd status=none of="${CA_TMPDIR}/db.der" + + local mok_directory="$(dirname "${mok_key}")" + mkdir -p "${mok_directory}" + + # symlink private key and copy public cert from volatile storage to DKMS directory + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + cp -f "${CA_TMPDIR}/db.der" "${mok_der}" + + modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" +} + +function clear_dkms_key { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping" >&2 + return 0 + fi + rm -rf "${CA_TMPDIR}" "${mok_key}" +} - if is_debuntu ; then - if is_ubuntu22 && is_cuda12 ; then - # On ubuntu22, the default compiler does not build some kernel module versions - # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 - execute_with_retries apt-get install -y -qq gcc-12 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 - update-alternatives --set gcc /usr/bin/gcc-12 - fi +function check_secure_boot() { + local SECURE_BOOT="disabled" + SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') - elif is_rocky ; then - execute_with_retries dnf -y -q install gcc + PSN="$(get_metadata_attribute private_secret_name)" + readonly PSN - local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" - set +e - eval "${dnf_cmd}" > "${install_log}" 2>&1 - local retval="$?" - set -e + if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then + echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." + exit 1 + elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then + echo "Secure boot is enabled, but no signing material provided." + echo "Please either disable secure boot or provide signing material as per" + echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" + return 1 + fi - if [[ "${retval}" == "0" ]] ; then return ; fi + CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" + readonly CA_TMPDIR - if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then - # this kernel-devel may have been migrated to the vault - local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" - local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" - dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ - "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" - )" - fi + if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv + mok_der=/var/lib/shim-signed/mok/MOK.der + else mok_key=/var/lib/dkms/mok.key + mok_der=/var/lib/dkms/mok.pub ; fi - execute_with_retries "${dnf_cmd}" - fi - touch "${workdir}/build-dependencies-complete" + configure_dkms_certs } -function install_dependencies() { - pkg_list="pciutils screen" - if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} - elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi -} function main() { # This configuration should be run on all nodes @@ -1503,9 +1860,8 @@ function main() { # if mig is enabled drivers would have already been installed if [[ $IS_MIG_ENABLED -eq 0 ]]; then install_nvidia_gpu_driver - + install_nvidia_container_toolkit install_cuda - load_kernel_module if [[ -n ${CUDNN_VERSION} ]]; then @@ -1556,109 +1912,11 @@ function main() { fi # Restart YARN services if they are running already - if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then - systemctl restart hadoop-yarn-resourcemanager.service - fi - if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then - systemctl restart hadoop-yarn-nodemanager.service - fi -} - -function clean_up_sources_lists() { - # - # bigtop (primary) - # - local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" - - if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then - region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" - - local regional_bigtop_repo_uri - regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | - sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | - grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | - cut -d ' ' -f 2 | - head -1) - - if [[ "${regional_bigtop_repo_uri}" == */ ]]; then - local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" - else - local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" + for svc in resourcemanager nodemanager; do + if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then + systemctl restart hadoop-yarn-${svc}.service fi - - local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" - rm -f "${bigtop_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ - "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" - - sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" - sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" - fi - - # - # adoptium - # - # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu - local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" - local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" - rm -f "${adoptium_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ - | gpg --dearmor -o "${adoptium_kr_path}" - echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ - > /etc/apt/sources.list.d/adoptium.list - - - # - # docker - # - local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" - local docker_repo_file="/etc/apt/sources.list.d/docker.list" - local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" - - rm -f "${docker_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ - | gpg --dearmor -o "${docker_kr_path}" - echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ - > ${docker_repo_file} - - # - # google cloud + logging/monitoring - # - if ls /etc/apt/sources.list.d/google-cloud*.list ; then - rm -f /usr/share/keyrings/cloud.google.gpg - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - for list in google-cloud google-cloud-logging google-cloud-monitoring ; do - list_file="/etc/apt/sources.list.d/${list}.list" - if [[ -f "${list_file}" ]]; then - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" - fi - done - fi - - # - # cran-r - # - if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then - keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" - if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi - rm -f /usr/share/keyrings/cran-r.gpg - curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ - gpg --dearmor -o /usr/share/keyrings/cran-r.gpg - sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list - fi - - # - # mysql - # - if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then - rm -f /usr/share/keyrings/mysql.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ - gpg --dearmor -o /usr/share/keyrings/mysql.gpg - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list - fi - - if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi - + done } function exit_handler() { @@ -1694,6 +1952,7 @@ function exit_handler() { # re-hold systemd package if ge_debian12 ; then apt-mark hold systemd libsystemd0 ; fi + hold_nvidia_packages else dnf clean all fi @@ -1761,55 +2020,21 @@ print( " samples-taken: ", scalar @siz, $/, return 0 } -function set_proxy(){ - METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" - - if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi - - export METADATA_HTTP_PROXY - export http_proxy="${METADATA_HTTP_PROXY}" - export https_proxy="${METADATA_HTTP_PROXY}" - export HTTP_PROXY="${METADATA_HTTP_PROXY}" - export HTTPS_PROXY="${METADATA_HTTP_PROXY}" - export no_proxy=metadata.google.internal,169.254.169.254 - export NO_PROXY=metadata.google.internal,169.254.169.254 -} - -function mount_ramdisk(){ - local free_mem - free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" - if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi - - # Write to a ramdisk instead of churning the persistent disk - - tmpdir="/mnt/shm" - mkdir -p "${tmpdir}" - mount -t tmpfs tmpfs "${tmpdir}" - - # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" - - # Clear pip cache - # TODO: make this conditional on which OSs have pip without cache purge - pip cache purge || echo "unable to purge pip cache" +function prepare_to_install(){ + # Verify OS compatability and Secure boot state + check_os + check_secure_boot - # Download pip packages to tmpfs - pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + prepare_gpu_env - # Download OS packages to tmpfs - if is_debuntu ; then - mount -t tmpfs tmpfs /var/cache/apt/archives - else - mount -t tmpfs tmpfs /var/cache/dnf - fi -} + OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" + readonly OS_NAME -function prepare_to_install(){ - # Verify OS compatability and Secure boot state - check_os_and_secure_boot + # node role + ROLE="$(get_metadata_attribute dataproc-role)" + readonly ROLE workdir=/opt/install-dpgce - nvsmi_works="0" tmpdir=/tmp/ temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" readonly temp_bucket @@ -1818,39 +2043,14 @@ function prepare_to_install(){ readonly uname_r readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive - CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" - readonly CA_TMPDIR - PSN="$(get_metadata_attribute private_secret_name)" - readonly PSN - - if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv - mok_der=/var/lib/shim-signed/mok/MOK.der - else mok_key=/var/lib/dkms/mok.key - mok_der=/var/lib/dkms/mok.pub ; fi - - if is_cuda11 ; then gcc_ver="11" - elif is_cuda12 ; then gcc_ver="12" ; fi mkdir -p "${workdir}" trap exit_handler EXIT set_proxy mount_ramdisk - configure_dkms_certs readonly install_log="${tmpdir}/install.log" - # Detect dataproc image version - if (! test -v DATAPROC_IMAGE_VERSION) ; then - if test -v DATAPROC_VERSION ; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" - else - if version_lt "${SPARK_VERSION_ENV}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" - elif version_lt "${SPARK_VERSION_ENV}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" - elif version_lt "${SPARK_VERSION_ENV}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" - else echo "Unknown dataproc image version" ; exit 1 ; fi - fi - fi - if test -f "${workdir}/prepare-complete" ; then return ; fi repair_old_backports @@ -1882,32 +2082,6 @@ function prepare_to_install(){ touch "${workdir}/prepare-complete" } -# Verify if compatible linux distros and secure boot options are used -function check_os_and_secure_boot() { - local SECURE_BOOT="disabled" - SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') - if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then - echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." - exit 1 - elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then - echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." - exit 1 - elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then - echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." - exit 1 - fi - - if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then - echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." - exit 1 - elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then - echo "Secure boot is enabled, but no signing material provided." - echo "Please either disable secure boot or provide signing material as per" - echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" - return 1 - fi -} - prepare_to_install main From 26719af037ee77ecfb8328dec04931ba5b032abd Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Dec 2024 20:26:25 -0800 Subject: [PATCH 073/112] do not point to local rpm pgp key --- gpu/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 8a483ad40..d485e19ce 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -437,8 +437,8 @@ function dnf_add_repo() { local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" curl -s -L "${repo_url}" \ - | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ | dd of="${repo_path}" status=progress +# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ } # From 74c09f4e6362b131b2e165eded7869b74c8247da Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 23 Dec 2024 23:58:40 -0800 Subject: [PATCH 074/112] re-ordering to reduce delta from master --- gpu/install_gpu_driver.sh | 1025 ++++++++++++++++++------------------- 1 file changed, 503 insertions(+), 522 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index d485e19ce..8164fc44e 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -12,13 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# -# This initialization action is generated from -# initialization-actions/templates/gpu/install_gpu_driver.sh.in -# -# Modifications made directly to the generated file will be lost when -# the template is re-evaluated - # # This script installs NVIDIA GPU drivers and collects GPU utilization metrics. @@ -33,30 +26,25 @@ function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $ function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) -function define_os_comparison_functions() { - - readonly -A supported_os=( - ['debian']="10 11 12" - ['rocky']="8 9" - ['ubuntu']="18.04 20.04 22.04" - ) +readonly -A supported_os=( + ['debian']="10 11 12" + ['rocky']="8 9" + ['ubuntu']="18.04 20.04 22.04" +) - # dynamically define OS version test utility functions - if [[ "$(os_id)" == "rocky" ]]; - then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') - else _os_version="$(os_version)"; fi - for os_id_val in 'rocky' 'ubuntu' 'debian' ; do - eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" - - for osver in $(echo "${supported_os["${os_id_val}"]}") ; do - eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" - eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" - eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" - done +# dynamically define OS version test utility functions +if [[ "$(os_id)" == "rocky" ]]; +then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') +else _os_version="$(os_version)"; fi +for os_id_val in 'rocky' 'ubuntu' 'debian' ; do + eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" + + for osver in $(echo "${supported_os["${os_id_val}"]}") ; do + eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" + eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" + eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" done -} - -define_os_comparison_functions +done function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) @@ -131,399 +119,64 @@ function get_metadata_attribute() ( get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" ) -function execute_with_retries() ( - set +x - local -r cmd="$*" - - if [[ "$cmd" =~ "^apt-get install" ]] ; then - apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove - fi - for ((i = 0; i < 3; i++)); do - set -x - time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } - set +x - if [[ $retval == 0 ]] ; then return 0 ; fi - sleep 5 - done - return 1 +OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +readonly OS_NAME + +# node role +ROLE="$(get_metadata_attribute dataproc-role)" +readonly ROLE + +# CUDA version and Driver version +# https://docs.nvidia.com/deploy/cuda-compatibility/ +# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html +# https://developer.nvidia.com/cuda-downloads + +# Minimum supported version for open kernel driver is 515.43.04 +# https://github.com/NVIDIA/open-gpu-kernel-modules/tags +# Rocky8: 12.0: 525.147.05 +latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" +readonly -A DRIVER_FOR_CUDA=( + ["11.7"]="515.65.01" ["11.8"]="525.147.05" + ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" +) +readonly -A DRIVER_SUBVER=( + ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" + ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" +) +# https://developer.nvidia.com/cudnn-downloads +if is_debuntu ; then +readonly -A CUDNN_FOR_CUDA=( + ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" + ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" +) +elif is_rocky ; then +# rocky: +# 12.0: 8.8.1.3 +# 12.1: 8.9.3.28 +# 12.2: 8.9.7.29 +# 12.3: 9.0.0.312 +# 12.4: 9.1.1.17 +# 12.5: 9.2.1.18 +# 12.6: 9.5.1.17 +readonly -A CUDNN_FOR_CUDA=( + ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" +) +fi +# https://developer.nvidia.com/nccl/nccl-download +# 12.2: 2.19.3, 12.5: 2.21.5 +readonly -A NCCL_FOR_CUDA=( + ["11.7"]="2.21.5" ["11.8"]="2.21.5" + ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" +) +readonly -A CUDA_SUBVER=( + ["11.7"]="11.7.1" ["11.8"]="11.8.0" + ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" ) -function cache_fetched_package() { - local src_url="$1" - local gcs_fn="$2" - local local_fn="$3" - - if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then - time gcloud storage cp "${gcs_fn}" "${local_fn}" - else - time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ - gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) - fi -} - -function add_contrib_component() { - if ! is_debuntu ; then return ; fi - if ge_debian12 ; then - # Include in sources file components on which nvidia-kernel-open-dkms depends - local -r debian_sources="/etc/apt/sources.list.d/debian.sources" - local components="main contrib" - - sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" - elif is_debian ; then - sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list - fi -} - -function set_hadoop_property() { - local -r config_file=$1 - local -r property=$2 - local -r value=$3 - "${bdcfg}" set_property \ - --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ - --name "${property}" --value "${value}" \ - --clobber -} - -function configure_yarn_resources() { - if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts - if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} - -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - - # Fix local dirs access permissions - local yarn_local_dirs=() - - readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ - --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ - --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - - if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" - fi -} - -function clean_up_sources_lists() { - # - # bigtop (primary) - # - local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" - - if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then - region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" - - local regional_bigtop_repo_uri - regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | - sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | - grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | - cut -d ' ' -f 2 | - head -1) - - if [[ "${regional_bigtop_repo_uri}" == */ ]]; then - local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" - else - local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" - fi - - local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" - rm -f "${bigtop_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ - "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" - - sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" - sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" - fi - - # - # adoptium - # - # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu - local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" - local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" - rm -f "${adoptium_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ - | gpg --dearmor -o "${adoptium_kr_path}" - echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ - > /etc/apt/sources.list.d/adoptium.list - - - # - # docker - # - local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" - local docker_repo_file="/etc/apt/sources.list.d/docker.list" - local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" - - rm -f "${docker_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ - | gpg --dearmor -o "${docker_kr_path}" - echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ - > ${docker_repo_file} - - # - # google cloud + logging/monitoring - # - if ls /etc/apt/sources.list.d/google-cloud*.list ; then - rm -f /usr/share/keyrings/cloud.google.gpg - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - for list in google-cloud google-cloud-logging google-cloud-monitoring ; do - list_file="/etc/apt/sources.list.d/${list}.list" - if [[ -f "${list_file}" ]]; then - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" - fi - done - fi - - # - # cran-r - # - if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then - keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" - if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi - rm -f /usr/share/keyrings/cran-r.gpg - curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ - gpg --dearmor -o /usr/share/keyrings/cran-r.gpg - sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list - fi - - # - # mysql - # - if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then - rm -f /usr/share/keyrings/mysql.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ - gpg --dearmor -o /usr/share/keyrings/mysql.gpg - sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list - fi - - if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi - -} - -function set_proxy(){ - METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" - - if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi - - export METADATA_HTTP_PROXY - export http_proxy="${METADATA_HTTP_PROXY}" - export https_proxy="${METADATA_HTTP_PROXY}" - export HTTP_PROXY="${METADATA_HTTP_PROXY}" - export HTTPS_PROXY="${METADATA_HTTP_PROXY}" - no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" - local no_proxy_svc - for no_proxy_svc in compute secretmanager dns servicedirectory logging \ - bigquery composer pubsub bigquerydatatransfer dataflow \ - storage datafusion ; do - no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" - done - - export NO_PROXY="${no_proxy}" -} - -function mount_ramdisk(){ - local free_mem - free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" - if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi - - # Write to a ramdisk instead of churning the persistent disk - - tmpdir="/mnt/shm" - mkdir -p "${tmpdir}" - mount -t tmpfs tmpfs "${tmpdir}" - - # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" - - # Clear pip cache - # TODO: make this conditional on which OSs have pip without cache purge - pip cache purge || echo "unable to purge pip cache" - - # Download pip packages to tmpfs - pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" - - # Download OS packages to tmpfs - if is_debuntu ; then - mount -t tmpfs tmpfs /var/cache/apt/archives - else - mount -t tmpfs tmpfs /var/cache/dnf - fi -} - -function check_os() { - if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then - echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." - exit 1 - elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then - echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." - exit 1 - elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then - echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." - exit 1 - fi - - SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - readonly SPARK_VERSION - if version_lt "${SPARK_VERSION}" "3.1" || \ - version_ge "${SPARK_VERSION}" "4.0" ; then - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." - exit 1 - fi - - # Detect dataproc image version - if (! test -v DATAPROC_IMAGE_VERSION) ; then - if test -v DATAPROC_VERSION ; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" - else - if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" - elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" - elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" - else echo "Unknown dataproc image version" ; exit 1 ; fi - fi - fi -} - -# -# Generate repo file under /etc/apt/sources.list.d/ -# -function apt_add_repo() { - local -r repo_name="$1" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" - local -r include_src="${4:-yes}" - local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" - - echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" - if [[ "${include_src}" == "yes" ]] ; then - echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" - fi - - apt-get update -qq -} - -# -# Generate repo file under /etc/yum.repos.d/ -# -function dnf_add_repo() { - local -r repo_name="$1" - local -r repo_url="$3" # "http(s)://host/path/filename.repo" - local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" - - curl -s -L "${repo_url}" \ - | dd of="${repo_path}" status=progress -# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ -} - -# -# Keyrings default to -# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or -# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) -# -function os_add_repo() { - local -r repo_name="$1" - local -r signing_key_url="$2" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" - local kr_path - if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi - - mkdir -p "$(dirname "${kr_path}")" - - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ - | gpg --import --no-default-keyring --keyring "${kr_path}" - - if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" - else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi -} - - -readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" - -# Dataproc configurations -readonly HADOOP_CONF_DIR='/etc/hadoop/conf' -readonly HIVE_CONF_DIR='/etc/hive/conf' -readonly SPARK_CONF_DIR='/etc/spark/conf' - - -function set_support_matrix() { - # CUDA version and Driver version - # https://docs.nvidia.com/deploy/cuda-compatibility/ - # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html - # https://developer.nvidia.com/cuda-downloads - - # Minimum supported version for open kernel driver is 515.43.04 - # https://github.com/NVIDIA/open-gpu-kernel-modules/tags - # Rocky8: 12.0: 525.147.05 - local latest - latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" - readonly -A DRIVER_FOR_CUDA=( - ["11.7"]="515.65.01" ["11.8"]="525.147.05" - ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" - ) - readonly -A DRIVER_SUBVER=( - ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" - ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" - ) - # https://developer.nvidia.com/cudnn-downloads - if is_debuntu ; then - readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" - ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" - ) - elif is_rocky ; then - # rocky: - # 12.0: 8.8.1.3 - # 12.1: 8.9.3.28 - # 12.2: 8.9.7.29 - # 12.3: 9.0.0.312 - # 12.4: 9.1.1.17 - # 12.5: 9.2.1.18 - # 12.6: 9.5.1.17 - readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" - ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" - ) - fi - # https://developer.nvidia.com/nccl/nccl-download - # 12.2: 2.19.3, 12.5: 2.21.5 - readonly -A NCCL_FOR_CUDA=( - ["11.7"]="2.21.5" ["11.8"]="2.21.5" - ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" - ) - readonly -A CUDA_SUBVER=( - ["11.7"]="11.7.1" ["11.8"]="11.8.0" - ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" - ) -} - -set_support_matrix +# Verify SPARK compatability +RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') function set_cuda_version() { local cuda_url @@ -602,7 +255,7 @@ function set_driver_version() { DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") readonly DRIVER_VERSION - readonly DRIVER="${DRIVER_VERSION%%.*}" + readonly DRIVER=${DRIVER_VERSION%%.*} export DRIVER_VERSION DRIVER @@ -653,14 +306,14 @@ if is_ubuntu22 ; then # use packages from previous release until such time as nvidia # release ubuntu2204 builds - shortname="$(os_id)$(os_vercat)" nccl_shortname="ubuntu2004" + shortname="$(os_id)$(os_vercat)" elif ge_rocky9 ; then # use packages from previous release until such time as nvidia # release rhel9 builds - shortname="rhel9" nccl_shortname="rhel8" + shortname="rhel9" elif is_rocky ; then shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" nccl_shortname="${shortname}" @@ -794,10 +447,33 @@ readonly GPU_DRIVER_PROVIDER INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') readonly INSTALL_GPU_AGENT +# Dataproc configurations +readonly HADOOP_CONF_DIR='/etc/hadoop/conf' +readonly HIVE_CONF_DIR='/etc/hive/conf' +readonly SPARK_CONF_DIR='/etc/spark/conf' + NVIDIA_SMI_PATH='/usr/bin' MIG_MAJOR_CAPS=0 IS_MIG_ENABLED=0 +function execute_with_retries() ( + set +x + local -r cmd="$*" + + if [[ "$cmd" =~ "^apt-get install" ]] ; then + apt-get -y clean + apt-get -o DPkg::Lock::Timeout=60 -y autoremove + fi + for ((i = 0; i < 3; i++)); do + set -x + time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + set +x + if [[ $retval == 0 ]] ; then return 0 ; fi + sleep 5 + done + return 1 +) + CUDA_KEYRING_PKG_INSTALLED="0" function install_cuda_keyring_pkg() { if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi @@ -818,8 +494,6 @@ function uninstall_cuda_keyring_pkg() { function install_local_cuda_repo() { if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi - if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi - CUDA_LOCAL_REPO_INSTALLED="1" pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" @@ -1068,14 +742,105 @@ function install_nvidia_cudnn() { fi fi else - echo "Unsupported OS: '${_shortname}'" + echo "Unsupported OS: '${OS_NAME}'" exit 1 fi ldconfig - echo "NVIDIA cuDNN successfully installed for ${_shortname}." touch "${workdir}/cudnn-complete" + echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." +} + +function configure_dkms_certs() { + if test -v PSN && [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping"; + return 0 + fi + + mkdir -p "${CA_TMPDIR}" + + # If the private key exists, verify it + if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then + echo "Private key material exists" + + local expected_modulus_md5sum + expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) + if [[ -n "${expected_modulus_md5sum}" ]]; then + modulus_md5sum="${expected_modulus_md5sum}" + + # Verify that cert md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched rsa key" + fi + + # Verify that key md5sum matches expected md5sum + if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then + echo "unmatched x509 cert" + fi + else + modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" + fi + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + + return + fi + + # Retrieve cloud secrets keys + local sig_priv_secret_name + sig_priv_secret_name="${PSN}" + local sig_pub_secret_name + sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" + local sig_secret_project + sig_secret_project="$(get_metadata_attribute secret_project)" + local sig_secret_version + sig_secret_version="$(get_metadata_attribute secret_version)" + + # If metadata values are not set, do not write mok keys + if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi + + # Write private material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_priv_secret_name}" \ + | dd status=none of="${CA_TMPDIR}/db.rsa" + + # Write public material to volatile storage + gcloud secrets versions access "${sig_secret_version}" \ + --project="${sig_secret_project}" \ + --secret="${sig_pub_secret_name}" \ + | base64 --decode \ + | dd status=none of="${CA_TMPDIR}/db.der" + + local mok_directory="$(dirname "${mok_key}")" + mkdir -p "${mok_directory}" + + # symlink private key and copy public cert from volatile storage to DKMS directory + ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" + cp -f "${CA_TMPDIR}/db.der" "${mok_der}" + + modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" +} + +function clear_dkms_key { + if [[ -z "${PSN}" ]]; then + echo "No signing secret provided. skipping" >&2 + return 0 + fi + rm -rf "${CA_TMPDIR}" "${mok_key}" +} + +function add_contrib_component() { + if ! is_debuntu ; then return ; fi + if ge_debian12 ; then + # Include in sources file components on which nvidia-kernel-open-dkms depends + local -r debian_sources="/etc/apt/sources.list.d/debian.sources" + local components="main contrib" + + sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" + elif is_debian ; then + sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list + fi } function add_nonfree_components() { @@ -1116,13 +881,14 @@ function add_repo_cuda() { fi } +readonly uname_r=$(uname -r) + function build_driver_from_github() { # non-GPL driver will have been built on rocky8 if is_rocky8 ; then return 0 ; fi pushd "${workdir}" - test -d "${workdir}/open-gpu-kernel-modules" || { - local tarball_fn="${DRIVER_VERSION}.tar.gz" + tarball_fn="${DRIVER_VERSION}.tar.gz" curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ | tar xz @@ -1469,6 +1235,60 @@ EOF systemctl --no-reload --now enable gpu-utilization-agent.service } +function set_hadoop_property() { + local -r config_file=$1 + local -r property=$2 + local -r value=$3 + "${bdcfg}" set_property \ + --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \ + --name "${property}" --value "${value}" \ + --clobber +} + +function configure_yarn_resources() { + if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' +} + +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' + + # Fix local dirs access permissions + local yarn_local_dirs=() + + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') + + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi +} + function configure_gpu_exclusive_mode() { # check if running spark 3, if not, enable GPU exclusive mode local spark_version @@ -1649,8 +1469,6 @@ function install_dependencies() { } function prepare_gpu_env(){ - # Verify SPARK compatability - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 nvsmi_works="0" @@ -1721,90 +1539,12 @@ function configure_mig_cgi() { fi fi } - -function enable_mig() { - nvidia-smi -mig 1 -} - - -function configure_dkms_certs() { - if test -v PSN && [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping"; - return 0 - fi - - mkdir -p "${CA_TMPDIR}" - - # If the private key exists, verify it - if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then - echo "Private key material exists" - - local expected_modulus_md5sum - expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum) - if [[ -n "${expected_modulus_md5sum}" ]]; then - modulus_md5sum="${expected_modulus_md5sum}" - - # Verify that cert md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched rsa key" - fi - - # Verify that key md5sum matches expected md5sum - if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then - echo "unmatched x509 cert" - fi - else - modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" - fi - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - - return - fi - - # Retrieve cloud secrets keys - local sig_priv_secret_name - sig_priv_secret_name="${PSN}" - local sig_pub_secret_name - sig_pub_secret_name="$(get_metadata_attribute public_secret_name)" - local sig_secret_project - sig_secret_project="$(get_metadata_attribute secret_project)" - local sig_secret_version - sig_secret_version="$(get_metadata_attribute secret_version)" - - # If metadata values are not set, do not write mok keys - if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi - - # Write private material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_priv_secret_name}" \ - | dd status=none of="${CA_TMPDIR}/db.rsa" - - # Write public material to volatile storage - gcloud secrets versions access "${sig_secret_version}" \ - --project="${sig_secret_project}" \ - --secret="${sig_pub_secret_name}" \ - | base64 --decode \ - | dd status=none of="${CA_TMPDIR}/db.der" - - local mok_directory="$(dirname "${mok_key}")" - mkdir -p "${mok_directory}" - - # symlink private key and copy public cert from volatile storage to DKMS directory - ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" - cp -f "${CA_TMPDIR}/db.der" "${mok_der}" - - modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" -} - -function clear_dkms_key { - if [[ -z "${PSN}" ]]; then - echo "No signing secret provided. skipping" >&2 - return 0 - fi - rm -rf "${CA_TMPDIR}" "${mok_key}" + +function enable_mig() { + nvidia-smi -mig 1 } + function check_secure_boot() { local SECURE_BOOT="disabled" SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') @@ -1919,6 +1659,116 @@ function main() { done } +function cache_fetched_package() { + local src_url="$1" + local gcs_fn="$2" + local local_fn="$3" + + if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then + time gcloud storage cp "${gcs_fn}" "${local_fn}" + else + time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ + gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) + fi +} + +function clean_up_sources_lists() { + # + # bigtop (primary) + # + local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" + + if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then + region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" + + local regional_bigtop_repo_uri + regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | + sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" | + grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" | + cut -d ' ' -f 2 | + head -1) + + if [[ "${regional_bigtop_repo_uri}" == */ ]]; then + local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" + else + local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" + fi + + local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" + rm -f "${bigtop_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ + "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" + + sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" + fi + + # + # adoptium + # + # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu + local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" + local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" + rm -f "${adoptium_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ + | gpg --dearmor -o "${adoptium_kr_path}" + echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ + > /etc/apt/sources.list.d/adoptium.list + + + # + # docker + # + local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" + local docker_repo_file="/etc/apt/sources.list.d/docker.list" + local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" + + rm -f "${docker_kr_path}" + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ + | gpg --dearmor -o "${docker_kr_path}" + echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ + > ${docker_repo_file} + + # + # google cloud + logging/monitoring + # + if ls /etc/apt/sources.list.d/google-cloud*.list ; then + rm -f /usr/share/keyrings/cloud.google.gpg + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg + for list in google-cloud google-cloud-logging google-cloud-monitoring ; do + list_file="/etc/apt/sources.list.d/${list}.list" + if [[ -f "${list_file}" ]]; then + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" + fi + done + fi + + # + # cran-r + # + if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" + if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi + rm -f /usr/share/keyrings/cran-r.gpg + curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ + gpg --dearmor -o /usr/share/keyrings/cran-r.gpg + sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list + fi + + # + # mysql + # + if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then + rm -f /usr/share/keyrings/mysql.gpg + curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ + gpg --dearmor -o /usr/share/keyrings/mysql.gpg + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list + fi + + if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi + +} + function exit_handler() { # Purge private key material until next grant clear_dkms_key @@ -2020,6 +1870,56 @@ print( " samples-taken: ", scalar @siz, $/, return 0 } +function set_proxy(){ + METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" + + if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi + + export METADATA_HTTP_PROXY + export http_proxy="${METADATA_HTTP_PROXY}" + export https_proxy="${METADATA_HTTP_PROXY}" + export HTTP_PROXY="${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="${METADATA_HTTP_PROXY}" + no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" + local no_proxy_svc + for no_proxy_svc in compute secretmanager dns servicedirectory logging \ + bigquery composer pubsub bigquerydatatransfer dataflow \ + storage datafusion ; do + no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" + done + + export NO_PROXY="${no_proxy}" +} + +function mount_ramdisk(){ + local free_mem + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + + # Write to a ramdisk instead of churning the persistent disk + + tmpdir="/mnt/shm" + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" + + # Download conda packages to tmpfs + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" + + # Clear pip cache + # TODO: make this conditional on which OSs have pip without cache purge + pip cache purge || echo "unable to purge pip cache" + + # Download pip packages to tmpfs + pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives + else + mount -t tmpfs tmpfs /var/cache/dnf + fi +} + function prepare_to_install(){ # Verify OS compatability and Secure boot state check_os @@ -2027,20 +1927,11 @@ function prepare_to_install(){ prepare_gpu_env - OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" - readonly OS_NAME - - # node role - ROLE="$(get_metadata_attribute dataproc-role)" - readonly ROLE - workdir=/opt/install-dpgce tmpdir=/tmp/ temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" readonly temp_bucket readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" - uname_r=$(uname -r) - readonly uname_r readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive @@ -2082,6 +1973,96 @@ function prepare_to_install(){ touch "${workdir}/prepare-complete" } +function check_os() { + if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then + echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." + exit 1 + elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then + echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." + exit 1 + elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then + echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." + exit 1 + fi + + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" + readonly SPARK_VERSION + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 + fi + + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi +} + +# +# Generate repo file under /etc/apt/sources.list.d/ +# +function apt_add_repo() { + local -r repo_name="$1" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local -r include_src="${4:-yes}" + local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" + + echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" + if [[ "${include_src}" == "yes" ]] ; then + echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + fi + + apt-get update -qq +} + +# +# Generate repo file under /etc/yum.repos.d/ +# +function dnf_add_repo() { + local -r repo_name="$1" + local -r repo_url="$3" # "http(s)://host/path/filename.repo" + local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" + + curl -s -L "${repo_url}" \ + | dd of="${repo_path}" status=progress +# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ +} + +# +# Keyrings default to +# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or +# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) +# +function os_add_repo() { + local -r repo_name="$1" + local -r signing_key_url="$2" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local kr_path + if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi + + mkdir -p "$(dirname "${kr_path}")" + + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ + | gpg --import --no-default-keyring --keyring "${kr_path}" + + if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" + else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi +} + + +readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" + prepare_to_install main From 53c1ef1c0a4ae308347078499457a7658f2cc670 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 09:29:37 -0800 Subject: [PATCH 075/112] custom image usage can come later --- cloudbuild/presubmit.sh | 132 ---------------------------------------- 1 file changed, 132 deletions(-) delete mode 100644 cloudbuild/presubmit.sh diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh deleted file mode 100644 index 9ed39d0ee..000000000 --- a/cloudbuild/presubmit.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash - -set -euxo pipefail - -# Declare global variable for passing tests between functions -declare -a TESTS_TO_RUN - -configure_gcloud() { - gcloud config set core/disable_prompts TRUE - gcloud config set compute/region us-central1 -} - -configure_gcloud_ssh_key() { - mkdir "${HOME}/.ssh" - - gcloud kms decrypt --location=global --keyring=presubmit --key=presubmit \ - --ciphertext-file=cloudbuild/ssh-key.enc \ - --plaintext-file="${HOME}/.ssh/google_compute_engine" - - gcloud kms decrypt --location=global --keyring=presubmit --key=presubmit \ - --ciphertext-file=cloudbuild/ssh-key.pub.enc \ - --plaintext-file="${HOME}/.ssh/google_compute_engine.pub" - - chmod 600 "${HOME}/.ssh/google_compute_engine" -} - -# Fetches master branch from GitHub and "resets" local changes to be relative to it, -# so we can diff what changed relatively to master branch. -initialize_git_repo() { - rm -fr .git - git config --global init.defaultBranch main - git init - - git config user.email "ia-tests@presubmit.example.com" - git config user.name "ia-tests" - - git remote add origin "https://github.com/GoogleCloudDataproc/initialization-actions.git" - git fetch origin master - # Fetch all PRs to get history for PRs created from forked repos - git fetch origin +refs/pull/*/merge:refs/remotes/origin/pr/* > /dev/null 2>&1 - - git reset --hard "${COMMIT_SHA}" - - git rebase origin/master -} - -# This function adds all changed files to git "index" and diffs them against master branch -# to determine all changed files and looks for tests in directories with changed files. -determine_tests_to_run() { - # Infer the files that changed - mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD) - mapfile -t CHANGED_FILES < <(git diff origin/master --name-only) - echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}" - echo "Changed files: ${CHANGED_FILES[*]}" - - # Run all tests if common directories modified by deleting files - if [[ "${#DELETED_BUILD_FILES[@]}" -gt 0 ]]; then - echo "All tests will be run: the following BUILD files '${DELETED_BUILD_FILES[*]}' were removed" - TESTS_TO_RUN=(":DataprocInitActionsTestSuite") - return 0 - fi - - set +x - # Determines init actions directories that were changed - declare -a changed_dirs - for changed_file in "${CHANGED_FILES[@]}"; do - local changed_dir - changed_dir="$(dirname "${changed_file}")/" - # Convert `init/internal/` dir to `init/` - changed_dir="${changed_dir%%/*}/" - # Run all tests if common directories modified - if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then - continue - echo "All tests will be run: '${changed_dir}' was changed" - TESTS_TO_RUN=(":DataprocInitActionsTestSuite") - return 0 - fi - # Hack to workaround empty array expansion on old versions of Bash. - # See: https://stackoverflow.com/a/7577209/3227693 - if [[ $changed_dir != ./ ]] && [[ ${changed_dirs[*]+" ${changed_dirs[*]} "} != *" ${changed_dir} "* ]]; then - changed_dirs+=("$changed_dir") - fi - done - echo "Changed directories: ${changed_dirs[*]}" - - # Determines test target in changed init action directories to run - for changed_dir in "${changed_dirs[@]}"; do - # NOTE: The ::-1 removes the trailing '/' - local test_name=${changed_dir::-1} - # Some of our py_tests (that has dashes in the name) are defined in the top-level directory - if [[ $test_name == *"-"* ]]; then - local test_target=":test_${test_name//-/_}" - else - local test_target="${test_name}:test_${test_name}" - fi - TESTS_TO_RUN+=("${test_target}") - done - echo "Tests: ${TESTS_TO_RUN[*]}" - - set -x -} - -run_tests() { - local -r max_parallel_tests=20 - bazel test \ - --jobs="${max_parallel_tests}" \ - --local_test_jobs="${max_parallel_tests}" \ - --action_env="INTERNAL_IP_SSH=true" \ - --test_output="all" \ - --noshow_progress \ - --noshow_loading_progress \ - --test_arg="--image_version=${IMAGE_VERSION}" \ - "${TESTS_TO_RUN[@]}" -} - -main() { - cd /init-actions - -# TODO: once service account is granted permission to access the cloud -# secrets, we can source this file and set signing material metadata -# variables from the environment in the python code. - -# eval "$(bash cloudbuild/create-key-pair.sh | sed -e 's/^/export /g')" - - configure_gcloud - configure_gcloud_ssh_key - initialize_git_repo - determine_tests_to_run - run_tests -} - -main From 97046b13b1ce48bb6a916c8fc3c68cf61af7fbdd Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 09:30:25 -0800 Subject: [PATCH 076/112] see #1283 --- cloudbuild/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index aebaffd84..2ea91e3e5 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -21,8 +21,8 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \ echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" | \ dd of="${bazel_repo_file}" status=none && \ apt-get update -qq -RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \ - apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \ +RUN apt-get autoremove -y -qq && \ + apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \ apt-get clean # Set bazel-${bazel_version} as the default bazel alternative in this container From 484308b2c4e81f19acce7a5dfa045b263192d425 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 09:32:16 -0800 Subject: [PATCH 077/112] replaced incorrectly removed presubmit.sh and removed custom image key creation script intended to be removed in 70f37b638e8309a669625844034946fc1b51037a --- cloudbuild/create-key-pair.sh | 135 ---------------------------------- cloudbuild/presubmit.sh | 125 +++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 135 deletions(-) delete mode 100644 cloudbuild/create-key-pair.sh create mode 100644 cloudbuild/presubmit.sh diff --git a/cloudbuild/create-key-pair.sh b/cloudbuild/create-key-pair.sh deleted file mode 100644 index 8f2a42a70..000000000 --- a/cloudbuild/create-key-pair.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS-IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This script creates a key pair and publishes to cloud secrets or -# fetches an already published key pair from cloud secrets - -set -e - -# https://github.com/glevand/secure-boot-utils - -# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#adding-shielded-image - -# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#generating-security-keys-certificates - -# https://wiki.archlinux.org/title/Unified_Extensible_Firmware_Interface/Secure_Boot#Creating_keys - -ITERATION=042 - -CURRENT_PROJECT_ID="$(gcloud config get project)" -if [[ -z "${CURRENT_PROJECT_ID}" ]]; then - echo 'project is not set. please set with `gcloud config set project ${PROJECT_ID}`' >&2 - exit -1 -fi -PROJECT_ID="${CURRENT_PROJECT_ID}" - -function create_key () { - local EFI_VAR_NAME="$1" - local CN_VAL="$2" - local PRIVATE_KEY="tls/${EFI_VAR_NAME}.rsa" - local CACERT="tls/${EFI_VAR_NAME}.pem" - local CACERT_DER="tls/${EFI_VAR_NAME}.der" - CA_KEY_SECRET_NAME="efi-${EFI_VAR_NAME}-priv-key-${ITERATION}" - CA_CERT_SECRET_NAME="efi-${EFI_VAR_NAME}-pub-key-${ITERATION}" - # If the secrets exist in secret manager, populate the tls/ directory - if [[ ! -f "${PRIVATE_KEY}" ]] && gcloud secrets describe "${CA_CERT_SECRET_NAME}" > /dev/null ; then - mkdir -p tls - - gcloud secrets versions access "1" \ - --project="${PROJECT_ID}" \ - --secret="${CA_KEY_SECRET_NAME}" \ - | dd of="${PRIVATE_KEY}" status=none - - gcloud secrets versions access "1" \ - --project="${PROJECT_ID}" \ - --secret="${CA_CERT_SECRET_NAME}" \ - | base64 --decode \ - | dd of="${CACERT_DER}" status=none - - # Create a PEM-format version of the cert - openssl x509 \ - -inform DER \ - -in "${CACERT_DER}" \ - -outform PEM \ - -out "${CACERT}" - - MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt" - curl -s -L -o "${MS_UEFI_CA}" 'https://go.microsoft.com/fwlink/p/?linkid=321194' - - echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt - echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt - modulus_md5sum="$(openssl rsa -noout -modulus -in ${PRIVATE_KEY} | openssl md5 | awk '{print $2}' | tee tls/modulus-md5sum.txt)" - return - fi - - if [[ -f "${PRIVATE_KEY}" ]]; then - modulus_md5sum="$(cat tls/modulus-md5sum.txt)" - return - fi - mkdir -p tls - - echo "generating '${CN_VAL}' '${CACERT}', '${CACERT_DER}' and '${PRIVATE_KEY}'" >&2 - # Generate new x.509 key and cert - openssl req \ - -newkey rsa:3072 \ - -nodes \ - -keyout "${PRIVATE_KEY}" \ - -new \ - -x509 \ - -sha256 \ - -days 3650 \ - -subj "/CN=${CN_VAL}/" \ - -out "${CACERT}" - - # Create a DER-format version of the cert - openssl x509 \ - -outform DER \ - -in "${CACERT}" \ - -outform DER \ - -in "${CACERT}" \ - -out "${CACERT_DER}" - - # Create a new secret containing private key - gcloud secrets create "${CA_KEY_SECRET_NAME}" \ - --project="${PROJECT_ID}" \ - --replication-policy="automatic" \ - --data-file="${PRIVATE_KEY}" - - echo "Private key secret name: '${CA_KEY_SECRET_NAME}'" >&2 - echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt - - # Create a new secret containing public key - cat "${CACERT_DER}" | base64 > "${CACERT_DER}.base64" - gcloud secrets create "${CA_CERT_SECRET_NAME}" \ - --project="${PROJECT_ID}" \ - --replication-policy="automatic" \ - --data-file="${CACERT_DER}.base64" - - modulus_md5sum="$(openssl x509 -noout -modulus -in ${CACERT} | openssl md5 | awk '{print $2}')" - echo "modulus-md5sum: ${modulus_md5sum}" >&2 - echo "${modulus_md5sum}" > tls/modulus-md5sum.txt - echo "Public key secret name: '${CA_CERT_SECRET_NAME}'" >&2 - echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt - -} - -EFI_VAR_NAME=db - -create_key "${EFI_VAR_NAME}" "Cloud Dataproc Custom Image CA ${ITERATION}" - -echo "modulus_md5sum=${modulus_md5sum}" -echo "private_secret_name=${CA_KEY_SECRET_NAME}" -echo "public_secret_name=${CA_CERT_SECRET_NAME}" -echo "secret_project=${PROJECT_ID}" -echo "secret_version=1" diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh new file mode 100644 index 000000000..eec7adb76 --- /dev/null +++ b/cloudbuild/presubmit.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +set -euxo pipefail + +# Declare global variable for passing tests between functions +declare -a TESTS_TO_RUN + +configure_gcloud() { + gcloud config set core/disable_prompts TRUE + gcloud config set compute/region us-central1 +} + +configure_gcloud_ssh_key() { + mkdir "${HOME}/.ssh" + + gcloud kms decrypt --location=global --keyring=presubmit --key=presubmit \ + --ciphertext-file=cloudbuild/ssh-key.enc \ + --plaintext-file="${HOME}/.ssh/google_compute_engine" + + gcloud kms decrypt --location=global --keyring=presubmit --key=presubmit \ + --ciphertext-file=cloudbuild/ssh-key.pub.enc \ + --plaintext-file="${HOME}/.ssh/google_compute_engine.pub" + + chmod 600 "${HOME}/.ssh/google_compute_engine" +} + +# Fetches master branch from GitHub and "resets" local changes to be relative to it, +# so we can diff what changed relatively to master branch. +initialize_git_repo() { + rm -fr .git + git config --global init.defaultBranch main + git init + + git config user.email "ia-tests@presubmit.example.com" + git config user.name "ia-tests" + + git remote add origin "https://github.com/GoogleCloudDataproc/initialization-actions.git" + git fetch origin master + # Fetch all PRs to get history for PRs created from forked repos + git fetch origin +refs/pull/*/merge:refs/remotes/origin/pr/* > /dev/null 2>&1 + + git reset --hard "${COMMIT_SHA}" + + git rebase origin/master +} + +# This function adds all changed files to git "index" and diffs them against master branch +# to determine all changed files and looks for tests in directories with changed files. +determine_tests_to_run() { + # Infer the files that changed + mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD) + mapfile -t CHANGED_FILES < <(git diff origin/master --name-only) + echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}" + echo "Changed files: ${CHANGED_FILES[*]}" + + # Run all tests if common directories modified by deleting files + if [[ "${#DELETED_BUILD_FILES[@]}" -gt 0 ]]; then + echo "All tests will be run: the following BUILD files '${DELETED_BUILD_FILES[*]}' were removed" + TESTS_TO_RUN=(":DataprocInitActionsTestSuite") + return 0 + fi + + set +x + # Determines init actions directories that were changed + declare -a changed_dirs + for changed_file in "${CHANGED_FILES[@]}"; do + local changed_dir + changed_dir="$(dirname "${changed_file}")/" + # Convert `init/internal/` dir to `init/` + changed_dir="${changed_dir%%/*}/" + # Run all tests if common directories modified + if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then + echo "All tests will be run: '${changed_dir}' was changed" + TESTS_TO_RUN=(":DataprocInitActionsTestSuite") + return 0 + fi + # Hack to workaround empty array expansion on old versions of Bash. + # See: https://stackoverflow.com/a/7577209/3227693 + if [[ $changed_dir != ./ ]] && [[ ${changed_dirs[*]+" ${changed_dirs[*]} "} != *" ${changed_dir} "* ]]; then + changed_dirs+=("$changed_dir") + fi + done + echo "Changed directories: ${changed_dirs[*]}" + + # Determines test target in changed init action directories to run + for changed_dir in "${changed_dirs[@]}"; do + # NOTE: The ::-1 removes the trailing '/' + local test_name=${changed_dir::-1} + # Some of our py_tests (that has dashes in the name) are defined in the top-level directory + if [[ $test_name == *"-"* ]]; then + local test_target=":test_${test_name//-/_}" + else + local test_target="${test_name}:test_${test_name}" + fi + TESTS_TO_RUN+=("${test_target}") + done + echo "Tests: ${TESTS_TO_RUN[*]}" + + set -x +} + +run_tests() { + local -r max_parallel_tests=20 + bazel test \ + --jobs="${max_parallel_tests}" \ + --local_test_jobs="${max_parallel_tests}" \ + --flaky_test_attempts=3 \ + --action_env="INTERNAL_IP_SSH=true" \ + --test_output="all" \ + --noshow_progress \ + --noshow_loading_progress \ + --test_arg="--image_version=${IMAGE_VERSION}" \ + "${TESTS_TO_RUN[@]}" +} + +main() { + cd /init-actions + configure_gcloud + configure_gcloud_ssh_key + initialize_git_repo + determine_tests_to_run + run_tests +} + +main From 61b94da8ad289fa51bba8528ab3744bf321002ac Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 09:36:14 -0800 Subject: [PATCH 078/112] revert nearly to master --- gpu/manual-test-runner.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh index 0199d62ad..3f126670b 100644 --- a/gpu/manual-test-runner.sh +++ b/gpu/manual-test-runner.sh @@ -5,18 +5,21 @@ # To run the script, the following will bootstrap # # git clone git@github.com:GoogleCloudDataproc/initialization-actions -# cd initialization-actions # git checkout 2024.12 +# cd initialization-actions # cp gpu/env.json.sample env.json # vi env.json # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest . # time docker run -it gpu-init-actions-runner:latest gpu/manual-test-runner.sh # # The bazel run(s) happen in separate screen windows. +# To create a new screen window, press ^a c # To see a list of screen windows, press ^a " # Num Name # +# 0 monitor # 1 2.0-debian10 +# 2 sh readonly timestamp="$(date +%F-%H-%M)" @@ -33,7 +36,7 @@ export PROJECT_ID="$(jq -r .PROJECT_ID env.json)" export REGION="$(jq -r .REGION env.json)" export BUCKET="$(jq -r .BUCKET env.json)" -gcs_log_dir="gs://${BUCKET}/gpu-dpgce/builds/${BUILD_ID}/logs" +gcs_log_dir="gs://${BUCKET}/${BUILD_ID}/logs" function exit_handler() { RED='\\e[0;31m' @@ -44,11 +47,8 @@ function exit_handler() { # TODO: list clusters which match our BUILD_ID and clean them up # TODO: remove any test related resources in the project - # We allow the user to monitor the logs from within screen session. - # Logs can be archived if necessary, but won't be unless needed. - -# echo 'Uploading local logs to GCS bucket.' -# gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/" + echo 'Uploading local logs to GCS bucket.' + gsutil -m rsync -r "${log_dir}/" "${gcs_log_dir}/" if [[ -f "${tmp_dir}/tests_success" ]]; then echo -e "${GREEN}Workflow succeeded${NC}, check logs at ${log_dir}/ or ${gcs_log_dir}/" From 8b4f4f8623d241d4a022a9346e893279d32cd1ce Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 09:39:58 -0800 Subject: [PATCH 079/112] can include extended test suite later --- gpu/verify_pytorch.py | 8 -------- gpu/verify_tensorflow.py | 28 ---------------------------- 2 files changed, 36 deletions(-) delete mode 100644 gpu/verify_pytorch.py delete mode 100644 gpu/verify_tensorflow.py diff --git a/gpu/verify_pytorch.py b/gpu/verify_pytorch.py deleted file mode 100644 index dd4910d97..000000000 --- a/gpu/verify_pytorch.py +++ /dev/null @@ -1,8 +0,0 @@ -import torch -print("get CUDA details : == : ") -use_cuda = torch.cuda.is_available() -if use_cuda: - print('__CUDNN VERSION:', torch.backends.cudnn.version()) - print('__Number CUDA Devices:', torch.cuda.device_count()) - print('__CUDA Device Name:',torch.cuda.get_device_name(0)) - print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9) diff --git a/gpu/verify_tensorflow.py b/gpu/verify_tensorflow.py deleted file mode 100644 index 2faf2c717..000000000 --- a/gpu/verify_tensorflow.py +++ /dev/null @@ -1,28 +0,0 @@ -import tensorflow as tf -print("Get GPU Details : ") -print(tf.config.list_physical_devices('GPU')) -#print(tf.test.is_gpu_available()) - -if tf.test.gpu_device_name(): - print('Default GPU Device:{}'.format(tf.test.gpu_device_name())) - print("Please install GPU version of TF") - -gpu_available = tf.config.list_physical_devices('GPU') -print("gpu_available : " + str(gpu_available)) - -#is_cuda_gpu_available = tf.config.list_physical_devices('GPU',cuda_only=True) -is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True) -print("is_cuda_gpu_available : " + str(is_cuda_gpu_available)) - -#is_cuda_gpu_min_3 = tf.config.list_physical_devices('GPU',True, (3,0)) -is_cuda_gpu_min_3 = tf.test.is_gpu_available(True, (3,0)) -print("is_cuda_gpu_min_3 : " + str(is_cuda_gpu_min_3)) - -from tensorflow.python.client import device_lib - -def get_available_gpus(): - local_device_protos = device_lib.list_local_devices() - return [x.name for x in local_device_protos if x.device_type == 'GPU'] - -print("Run GPU Functions Below : ") -print(get_available_gpus()) From 3bc45ff78d525ba1c562c9a0f0d7ad27d5365d7e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 09:42:45 -0800 Subject: [PATCH 080/112] order commands correctly --- gpu/manual-test-runner.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/manual-test-runner.sh b/gpu/manual-test-runner.sh index 3f126670b..37982bfe4 100644 --- a/gpu/manual-test-runner.sh +++ b/gpu/manual-test-runner.sh @@ -5,8 +5,8 @@ # To run the script, the following will bootstrap # # git clone git@github.com:GoogleCloudDataproc/initialization-actions -# git checkout 2024.12 # cd initialization-actions +# git checkout 2024.12 # cp gpu/env.json.sample env.json # vi env.json # docker build -f gpu/Dockerfile -t gpu-init-actions-runner:latest . From 6a76b4ec05bd1e55752d82b4e0d377c12bf4b8f6 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 24 Dec 2024 09:56:01 -0800 Subject: [PATCH 081/112] placing all completion files in a common directory --- gpu/install_gpu_driver.sh | 112 +++++++++----------------------------- 1 file changed, 27 insertions(+), 85 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 8164fc44e..212aa6fbe 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -492,7 +492,7 @@ function uninstall_cuda_keyring_pkg() { } function install_local_cuda_repo() { - if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi + if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" @@ -513,16 +513,16 @@ function install_local_cuda_repo() { -o /etc/apt/preferences.d/cuda-repository-pin-600 fi - touch "${workdir}/install-local-cuda-repo-complete" + touch "${workdir}/complete/install-local-cuda-repo" } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" - rm -f "${workdir}/install-local-cuda-repo-complete" + rm -f "${workdir}/complete/install-local-cuda-repo" } CUDNN_PKG_NAME="" function install_local_cudnn_repo() { - if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi + if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" @@ -538,18 +538,18 @@ function install_local_cudnn_repo() { cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/install-local-cudnn-repo-complete" + touch "${workdir}/complete/install-local-cudnn-repo" } function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" - rm -f "${workdir}/install-local-cudnn-repo-complete" + rm -f "${workdir}/complete/install-local-cudnn-repo" } CUDNN8_LOCAL_REPO_INSTALLED="0" CUDNN8_PKG_NAME="" function install_local_cudnn8_repo() { - if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi + if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" @@ -583,16 +583,16 @@ function install_local_cudnn8_repo() { rm -f "${local_deb_fn}" cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/install-local-cudnn8-repo-complete" + touch "${workdir}/complete/install-local-cudnn8-repo" } function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" - rm -f "${workdir}/install-local-cudnn8-repo-complete" + rm -f "${workdir}/complete/install-local-cudnn8-repo" } function install_nvidia_nccl() { - if test -f "${workdir}/nccl-complete" ; then return ; fi + if test -f "${workdir}/complete/nccl" ; then return ; fi if is_cuda11 && is_debian12 ; then echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" @@ -683,14 +683,14 @@ function install_nvidia_nccl() { fi popd - touch "${workdir}/nccl-complete" + touch "${workdir}/complete/nccl" } function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { - if test -f "${workdir}/cudnn-complete" ; then return ; fi + if test -f "${workdir}/complete/cudnn" ; then return ; fi local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version @@ -748,7 +748,7 @@ function install_nvidia_cudnn() { ldconfig - touch "${workdir}/cudnn-complete" + touch "${workdir}/complete/cudnn" echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." } @@ -994,7 +994,7 @@ function install_nvidia_userspace_runfile() { # # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. - if test -f "${workdir}/userspace-complete" ; then return ; fi + if test -f "${workdir}/complete/userspace" ; then return ; fi local local_fn="${tmpdir}/userspace.run" cache_fetched_package "${USERSPACE_URL}" \ @@ -1062,12 +1062,12 @@ function install_nvidia_userspace_runfile() { fi rm -f "${local_fn}" - touch "${workdir}/userspace-complete" + touch "${workdir}/complete/userspace" sync } function install_cuda_runfile() { - if test -f "${workdir}/cuda-complete" ; then return ; fi + if test -f "${workdir}/complete/cuda" ; then return ; fi local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ @@ -1076,7 +1076,7 @@ function install_cuda_runfile() { execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" rm -f "${local_fn}" - touch "${workdir}/cuda-complete" + touch "${workdir}/complete/cuda" sync } @@ -1114,7 +1114,7 @@ function load_kernel_module() { } function install_cuda(){ - if test -f "${workdir}/cuda-repo-complete" ; then return ; fi + if test -f "${workdir}/complete/cuda-repo" ; then return ; fi if ( ge_debian12 && is_src_os ) ; then echo "installed with the driver on ${_shortname}" @@ -1127,7 +1127,7 @@ function install_cuda(){ # Includes CUDA packages add_repo_cuda - touch "${workdir}/cuda-repo-complete" + touch "${workdir}/complete/cuda-repo" } function install_nvidia_container_toolkit() { @@ -1150,7 +1150,7 @@ function install_nvidia_container_toolkit() { # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if test -f "${workdir}/gpu-driver-complete" ; then return ; fi + if test -f "${workdir}/complete/gpu-driver" ; then return ; fi if ( ge_debian12 && is_src_os ) ; then add_nonfree_components @@ -1172,7 +1172,7 @@ function install_nvidia_gpu_driver() { build_driver_from_github echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" - touch "${workdir}/gpu-driver-complete" + touch "${workdir}/complete/gpu-driver" } function install_ops_agent(){ @@ -1184,7 +1184,7 @@ function install_ops_agent(){ curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install - touch "${workdir}/ops-agent-complete" + touch "${workdir}/complete/ops-agent" } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics @@ -1421,7 +1421,7 @@ function nvsmi() { } function install_build_dependencies() { - if test -f "${workdir}/build-dependencies-complete" ; then return ; fi + if test -f "${workdir}/complete/build-dependencies" ; then return ; fi if is_debuntu ; then if is_ubuntu22 && is_cuda12 ; then @@ -1459,7 +1459,7 @@ function install_build_dependencies() { execute_with_retries "${dnf_cmd}" fi - touch "${workdir}/build-dependencies-complete" + touch "${workdir}/complete/build-dependencies" } function install_dependencies() { @@ -1487,64 +1487,6 @@ function hold_nvidia_packages() { fi } -function delete_mig_instances() ( - # delete all instances - set +e - nvidia-smi mig -dci - - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No compute instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac - - nvidia-smi mig -dgi - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No GPU instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac -) - -# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles -function configure_mig_cgi() { - delete_mig_instances - META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" - if test -n "${META_MIG_CGI_VALUE}"; then - nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C - else - if lspci | grep -q H100 ; then - # run the following command to list placement profiles - # nvidia-smi mig -lgipp - # - # This is the result when using H100 instances on 20241220 - # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 - # GPU 0 Profile ID 14 Placements: {0,2,4}:2 - # GPU 0 Profile ID 9 Placements: {0,4}:4 - # GPU 0 Profile ID 5 Placement : {0}:4 - # GPU 0 Profile ID 0 Placement : {0}:8 - - # For H100 3D controllers, use profile 19, 7x1G instances - nvidia-smi mig -cgi 19 -C - elif lspci | grep -q A100 ; then - # Dataproc only supports A100s right now split in 2 if not specified - # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances - nvidia-smi mig -cgi 9,9 -C - else - echo "unrecognized 3D controller" - fi - fi -} - -function enable_mig() { - nvidia-smi -mig 1 -} - - function check_secure_boot() { local SECURE_BOOT="disabled" SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') @@ -1935,14 +1877,14 @@ function prepare_to_install(){ readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive - mkdir -p "${workdir}" + mkdir -p "${workdir}/complete" trap exit_handler EXIT set_proxy mount_ramdisk readonly install_log="${tmpdir}/install.log" - if test -f "${workdir}/prepare-complete" ; then return ; fi + if test -f "${workdir}/complete/prepare" ; then return ; fi repair_old_backports @@ -1970,7 +1912,7 @@ function prepare_to_install(){ screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" - touch "${workdir}/prepare-complete" + touch "${workdir}/complete/prepare" } function check_os() { From e59214640d65f4f807eb4865af3ddc71daea0986 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 13 Jan 2025 13:39:47 -0800 Subject: [PATCH 082/112] extend supported version list to include latest release of each minor version and their associated driver --- gpu/install_gpu_driver.sh | 89 ++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 212aa6fbe..9d6bfc135 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -137,42 +137,51 @@ readonly ROLE # Rocky8: 12.0: 525.147.05 latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" readonly -A DRIVER_FOR_CUDA=( - ["11.7"]="515.65.01" ["11.8"]="525.147.05" - ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01" + ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31" + ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03" + ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" + ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08" + ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" ) readonly -A DRIVER_SUBVER=( - ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01" - ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01" + ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" + ["430"]="430.64" ["435"]="435.21" ["440"]="440.100" + ["450"]="450.119.03" ["455"]="455.45.01" ["460"]="460.91.03" + ["465"]="465.31" ["470"]="470.256.02" ["495"]="495.46" + ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05" + ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06" + ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03" + ["565"]="565.77" ) # https://developer.nvidia.com/cudnn-downloads -if is_debuntu ; then readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17" - ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17" + ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5" + ["11.0"]="8.0.4" ["11.1"]="8.0.5" ["11.2"]="8.1.1" + ["11.3"]="8.2.1" ["11.4"]="8.2.4.15" ["11.5"]="8.3.1.22" + ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" + ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5" + ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18" + ["12.6"]="9.6.0.74" ) -elif is_rocky ; then -# rocky: -# 12.0: 8.8.1.3 -# 12.1: 8.9.3.28 -# 12.2: 8.9.7.29 -# 12.3: 9.0.0.312 -# 12.4: 9.1.1.17 -# 12.5: 9.2.1.18 -# 12.6: 9.5.1.17 -readonly -A CUDNN_FOR_CUDA=( - ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" - ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17" -) -fi # https://developer.nvidia.com/nccl/nccl-download # 12.2: 2.19.3, 12.5: 2.21.5 readonly -A NCCL_FOR_CUDA=( - ["11.7"]="2.21.5" ["11.8"]="2.21.5" - ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4" + ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3" + ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4" + ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12" + ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3" + ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4" + ["12.5"]="2.22.3" ["12.6"]="2.23.4" ) readonly -A CUDA_SUBVER=( - ["11.7"]="11.7.1" ["11.8"]="11.8.0" - ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2" + ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89" + ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2" + ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2" + ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0" + ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" + ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" + ["12.6"]="12.6.3" ) # Verify SPARK compatability @@ -375,15 +384,25 @@ function set_cuda_runfile_url() { # driver version named in cuda runfile filename # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) readonly -A drv_for_cuda=( - ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" - ["11.8.0"]="520.61.05" - ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" - ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" - ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" - ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" - ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ - ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not - ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" + ["10.0.130"]="410.48" + ["10.1.234"]="418.87.00" + ["10.2.89"]="440.33.01" + ["11.0.3"]="450.51.06" + ["11.1.1"]="455.42.00" + ["11.2.2"]="460.32.03" + ["11.3.1"]="465.19.01" + ["11.4.4"]="470.82.01" + ["11.5.2"]="495.29.05" + ["11.6.2"]="510.47.03" + ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" + ["11.8.0"]="520.61.05" + ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" + ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" + ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" + ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" + ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ + ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not + ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ) # Verify that the file with the indicated combination exists @@ -1890,7 +1909,7 @@ function prepare_to_install(){ if is_debuntu ; then clean_up_sources_lists - apt-get update -qq + apt-get --allow-releaseinfo-change update apt-get -y clean apt-get -o DPkg::Lock::Timeout=60 -y autoremove if ge_debian12 ; then From 4559ecc1ce4f1979658d73d4302bf3e45d856012 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 13 Jan 2025 13:41:14 -0800 Subject: [PATCH 083/112] tested with CUDA 11.6.2/510.108.03 * nccl build completes successfully on debian10 * account for nvidia-smi ABI change post 11.6 --- gpu/install_gpu_driver.sh | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 9d6bfc135..71bef8293 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -345,7 +345,7 @@ function set_cuda_runfile_url() { local MAX_DRIVER_VERSION local MAX_CUDA_VERSION - local MIN_OPEN_DRIVER_VER="515.48.07" + MIN_OPEN_DRIVER_VER="515.43.04" local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER @@ -904,7 +904,7 @@ readonly uname_r=$(uname -r) function build_driver_from_github() { # non-GPL driver will have been built on rocky8 - if is_rocky8 ; then return 0 ; fi + if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then return 0 ; fi pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { tarball_fn="${DRIVER_VERSION}.tar.gz" @@ -1025,7 +1025,7 @@ function install_nvidia_userspace_runfile() { local cache_hit="0" local local_tarball - if is_rocky8 ; then + if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" @@ -1039,7 +1039,9 @@ function install_nvidia_userspace_runfile() { if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then cache_hit="1" - runfile_args="--no-kernel-modules" + if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then + runfile_args="${runfile_args} --no-kernel-modules" + fi echo "cache hit" else install_build_dependencies @@ -1054,11 +1056,13 @@ function install_nvidia_userspace_runfile() { --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ " fi - - runfile_args="--no-dkms ${signing_options}" + runfile_args="${signing_options}" + if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then + runfile_args="${runfile_args} --no-dkms" + fi fi } - else + elif version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then runfile_args="--no-kernel-modules" fi @@ -1499,8 +1503,8 @@ function prepare_gpu_env(){ # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades # Users should run apt-mark unhold before they wish to upgrade these packages function hold_nvidia_packages() { - apt-mark hold nvidia-* - apt-mark hold libnvidia-* +# apt-mark hold nvidia-* +# apt-mark hold libnvidia-* if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then apt-mark hold xserver-xorg-video-nvidia* fi From 16c8485b54e8ed1fe5fa4c5610f4ff29768a4761 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 13 Jan 2025 18:35:06 -0800 Subject: [PATCH 084/112] exercised with cuda 11.1 * cleaned up nccl build and pack code a bit * no longer installing cudnn from local debian repo * unpacking nccl from cache immediately rather than waiting until later in the code * determine cudnn version by what is available in the repo * less noise from apt-mark hold * nccl build tested on 11.1 and 11.6 * account for abi change in nvidia-smi --- gpu/install_gpu_driver.sh | 87 ++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 71bef8293..373fe664a 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -277,7 +277,8 @@ function set_driver_version() { set_driver_version -readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" +readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39" +readonly DEFAULT_CUDNN8_VERSION="8.3.1.22" readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" # Parameters for NVIDIA-provided cuDNN library @@ -285,9 +286,9 @@ readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) -# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} -if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" +# The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION} +if is_rocky && (version_lt "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}") ; then + CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}" elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then # cuDNN v8 is not distribution for ubuntu20+, debian12 CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" @@ -620,30 +621,6 @@ function install_nvidia_nccl() { local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" - # https://github.com/NVIDIA/nccl/blob/master/README.md - # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Fermi: SM_20, compute_30 - # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 - # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 - # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 - - # The following architectures are suppored by open kernel driver - # Volta: SM_70,SM_72, compute_70,compute_72 - # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 - - # The following architectures are supported by CUDA v11.8+ - # Ada: SM_89, compute_89 - # Hopper: SM_90,SM_90a compute_90,compute_90a - # Blackwell: SM_100, compute_100 - NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" - if version_ge "${CUDA_VERSION}" "11.8" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" - fi - if version_ge "${CUDA_VERSION}" "12.0" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" - fi - mkdir -p "${workdir}" pushd "${workdir}" @@ -668,11 +645,37 @@ function install_nvidia_nccl() { if echo "${output}" | grep -q "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" + gcloud storage cat "${gcs_tarball}" | tar xvz else # build and cache pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install install_build_dependencies + + # https://github.com/NVIDIA/nccl/blob/master/README.md + # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Fermi: SM_20, compute_30 + # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 + # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 + # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 + + # The following architectures are suppored by open kernel driver + # Volta: SM_70,SM_72, compute_70,compute_72 + # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 + + # The following architectures are supported by CUDA v11.8+ + # Ada: SM_89, compute_89 + # Hopper: SM_90,SM_90a compute_90,compute_90a + # Blackwell: SM_100, compute_100 + NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86" + if version_gt "${CUDA_VERSION}" "11.6" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi + if version_ge "${CUDA_VERSION}" "11.8" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi + if version_ge "${CUDA_VERSION}" "12.0" ; then + NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi + if is_debuntu ; then # These packages are required to build .deb packages from source execute_with_retries \ @@ -686,13 +689,13 @@ function install_nvidia_nccl() { export NVCC_GENCODE execute_with_retries make -j$(nproc) pkg.redhat.build fi - tar czvf "/${local_tarball}" "../${build_path}" - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - rm "${local_tarball}" + tar czvf "${local_tarball}" "../${build_path}" make clean popd + tar xzvf "${local_tarball}" + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + rm "${local_tarball}" fi - gcloud storage cat "${gcs_tarball}" | tar xz } if is_debuntu ; then @@ -734,16 +737,17 @@ function install_nvidia_cudnn() { apt-get -y install nvidia-cudnn else if is_cudnn8 ; then - install_local_cudnn8_repo + add_repo_cuda apt-get update -qq + # Ignore version requested and use the latest version in the package index + cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)" execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" - uninstall_local_cudnn8_repo sync elif is_cudnn9 ; then install_cuda_keyring_pkg @@ -1503,8 +1507,10 @@ function prepare_gpu_env(){ # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades # Users should run apt-mark unhold before they wish to upgrade these packages function hold_nvidia_packages() { -# apt-mark hold nvidia-* -# apt-mark hold libnvidia-* + if ! is_debuntu ; then return ; fi + + apt-mark hold nvidia-* > /dev/null 2>&1 + apt-mark hold libnvidia-* > /dev/null 2>&1 if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then apt-mark hold xserver-xorg-video-nvidia* fi @@ -1587,17 +1593,22 @@ function main() { rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" done - MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")" if test -n "$(nvsmi -L)" ; then # cache the result of the gpu query ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt" + chmod a+r "/var/run/nvidia-gpu-index.txt" fi + MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e V100 -e A100 -e H100 || echo -n "")" NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")" if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then # enable MIG on every GPU for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do - nvsmi -i "${GPU_ID}" --multi-instance-gpu 1 + if version_le "${CUDA_VERSION}" "11.6" ; then + nvsmi -i "${GPU_ID}" --multi-instance-gpu=1 + else + nvsmi -i "${GPU_ID}" --multi-instance-gpu 1 + fi done NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' From afd5f2f4f15cb1edcf0d6c7e9e0a1b94e08700f3 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 14 Jan 2025 14:33:27 -0800 Subject: [PATCH 085/112] reverting cloudbuild/Dockerfile to master --- cloudbuild/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index 2ea91e3e5..aebaffd84 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -21,8 +21,8 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \ echo "deb [arch=amd64 signed-by=${bazel_kr_path}] ${bazel_repo_data}" | \ dd of="${bazel_repo_file}" status=none && \ apt-get update -qq -RUN apt-get autoremove -y -qq && \ - apt-get install -y -qq openjdk-8-jdk python3-setuptools bazel >/dev/null 2>&1 && \ +RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \ + apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \ apt-get clean # Set bazel-${bazel_version} as the default bazel alternative in this container From 2272f97cb1c9ebbd29491311a289247ae33720d5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 14 Jan 2025 16:10:57 -0800 Subject: [PATCH 086/112] nvidia is 404ing for download.nvidia.com ; using us.download.nvidia.com --- gpu/install_gpu_driver.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 373fe664a..f93992cb4 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -246,10 +246,10 @@ function set_driver_version() { if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} - if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then + if curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then # use the version indicated by the cuda url as the default if it exists DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" - elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then + elif curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then # use the maximum sub-version available for the major version indicated in cuda url as the default DEFAULT_DRIVER="${driver_max_maj_version}" fi @@ -268,7 +268,7 @@ function set_driver_version() { export DRIVER_VERSION DRIVER - gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + gpu_driver_url="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" exit 1 @@ -302,7 +302,7 @@ readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) # Parameters for NVIDIA-provided Debian GPU driver -readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" +readonly DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") @@ -383,7 +383,7 @@ function set_cuda_runfile_url() { fi # driver version named in cuda runfile filename - # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) + # (these may not be actual driver versions - see https://us.download.nvidia.com/XFree86/Linux-x86_64/) readonly -A drv_for_cuda=( ["10.0.130"]="410.48" ["10.1.234"]="418.87.00" @@ -401,7 +401,7 @@ function set_cuda_runfile_url() { ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" - ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ + ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/ ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ) @@ -1599,11 +1599,11 @@ function main() { echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt" chmod a+r "/var/run/nvidia-gpu-index.txt" fi - MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e V100 -e A100 -e H100 || echo -n "")" + MIG_GPU_LIST="$(nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n "")" NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")" if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then # enable MIG on every GPU - for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do + for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do if version_le "${CUDA_VERSION}" "11.6" ; then nvsmi -i "${GPU_ID}" --multi-instance-gpu=1 else From 3b2dc66fdd366d43a9b769c6769308e608870de4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 14 Jan 2025 21:45:20 -0800 Subject: [PATCH 087/112] skipping rocky9 --- gpu/test_gpu.py | 54 ++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index f260d5927..e1ced1f41 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -4,8 +4,6 @@ from absl.testing import absltest from absl.testing import parameterized -import unittest - from integration_tests.dataproc_test_case import DataprocTestCase DEFAULT_TIMEOUT = 15 # minutes @@ -18,7 +16,7 @@ class NvidiaGpuDriverTestCase(DataprocTestCase): GPU_T4 = "type=nvidia-tesla-t4" GPU_V100 = "type=nvidia-tesla-v100" GPU_A100 = "type=nvidia-tesla-a100,count=2" - GPU_H100 = "type=nvidia-h100-80gb,count=8" + GPU_H100 = "type=nvidia-h100-80gb,count=2" # Tests for PyTorch TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py" @@ -56,12 +54,20 @@ def verify_instance(self, name): time.sleep( 3 + random.randint(1, 30) ) self.assert_instance_command(name, "nvidia-smi", 1) + def verify_pyspark(self, name): + # Verify that pyspark works + self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + def verify_pytorch(self, name): test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), self.TORCH_TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_filename, name) - verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( + conda_env="dpgce" + verify_cmd = \ + "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \ + "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ + "${envpath}/bin/python {}".format( self.TORCH_TEST_SCRIPT_FILE_NAME) self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name) @@ -70,8 +76,11 @@ def verify_tensorflow(self, name): test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), self.TF_TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_filename, name) - - verify_cmd = "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 | dd of=${f} ; done ; /opt/conda/miniconda3/envs/pytorch/bin/python {}".format( + # all on a single numa node + verify_cmd = \ + "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format("dpgce") + \ + "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ + "${envpath}/bin/python {}".format( self.TF_TEST_SCRIPT_FILE_NAME) self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name) @@ -149,7 +158,6 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = None @@ -184,7 +192,6 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") if driver_provider is not None: @@ -215,7 +222,6 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS fails with image version <= 2.1') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = "install-gpu-agent=true" @@ -246,10 +252,12 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS fails with image version <= 2.1') - unittest.expectedFailure(self) self.skipTest("known to fail") if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ @@ -265,7 +273,6 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") @@ -298,10 +305,9 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, # Operation [projects/.../regions/.../operations/...] failed: # Invalid value for field 'resource.machineType': \ # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \ - # 'machineTypes/a3-highgpu-8g'. \ + # 'machineTypes/a3-highgpu-2g'. \ # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature.. # ('This use case not thoroughly tested') - unittest.expectedFailure(self) self.skipTest("known to fail") if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ @@ -318,7 +324,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - master_machine_type="a3-highgpu-8g", + master_machine_type="a3-highgpu-2g", worker_machine_type="a2-highgpu-2g", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, @@ -338,11 +344,17 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ + and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = None @@ -372,6 +384,9 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator, worker_accelerator, cuda_version): + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): @@ -385,7 +400,6 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) @@ -416,18 +430,16 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), ) - def tests_driver_signing(self, configuration, machine_suffixes, + def untested_driver_signing(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version, image_os, image_version): - if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ - and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS fails with image version <= 2.1') - unittest.expectedFailure(self) self.skipTest("known to fail") kvp_array=[] From 0c420b70c6a5dc4ded7b50d29b50fbe198827d70 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 14 Jan 2025 21:45:58 -0800 Subject: [PATCH 088/112] * adding version 12.6 to the support matrix * changing layout of gcs package folder * install_pytorch function created and called when cuDNN is being installed --- gpu/install_gpu_driver.sh | 71 ++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index f93992cb4..b91046422 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -142,7 +142,7 @@ readonly -A DRIVER_FOR_CUDA=( ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03" ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08" - ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03" + ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142" ) readonly -A DRIVER_SUBVER=( ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" @@ -403,7 +403,7 @@ function set_cuda_runfile_url() { ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/ ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not - ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" + ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05" ) # Verify that the file with the indicated combination exists @@ -413,16 +413,20 @@ function set_cuda_runfile_url() { local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") - readonly NVIDIA_CUDA_URL - - CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" - readonly CUDA_RUNFILE if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" + if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then + echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead" + fi exit 1 fi + readonly NVIDIA_CUDA_URL + + CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" + readonly CUDA_RUNFILE + if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then @@ -588,7 +592,7 @@ function install_local_cudnn8_repo() { # cache the cudnn package cache_fetched_package "${local_deb_url}" \ - "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ + "${pkg_bucket}/nvidia/cudnn/${CUDNN8_CUDA_VER}/${deb_fn}" \ "${local_deb_fn}" local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" @@ -639,7 +643,7 @@ function install_nvidia_nccl() { test -d "${workdir}/nccl/build" || { local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}" output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') if echo "${output}" | grep -q "${gcs_tarball}" ; then @@ -775,6 +779,48 @@ function install_nvidia_cudnn() { echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." } +function install_pytorch() { + if test -f "${workdir}/complete/pytorch" ; then return ; fi + local env + env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce') + local mc3=/opt/conda/miniconda3 + local envpath="${mc3}/envs/${env}" + # Set numa node to 0 for all GPUs + for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done + local verb=create + if test -d "${envpath}" ; then verb=install ; fi + + readonly USE_PYTORCH=$(get_metadata_attribute 'use-pytorch' 'no') + case "${USE_PYTORCH^^}" in + "1" | "YES" | "TRUE" ) + local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" + + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + if echo "${output}" | grep -q "${gcs_tarball}" ; then + # cache hit - unpack from cache + echo "cache hit" + mkdir -p "${envpath}" + gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz + else + cudart_spec="cuda-cudart" + if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi + "${mc3}/bin/mamba" "${verb}" -n "${env}" \ + -c conda-forge -c nvidia -c rapidsai \ + numba pytorch tensorflow[and-cuda] rapids pyspark \ + "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" + pushd "${envpath}" + tar czf "${local_tarball}" . + popd + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + fi + ;; + * ) echo "skip pytorch install" ;; + esac + touch "${workdir}/complete/pytorch" +} + function configure_dkms_certs() { if test -v PSN && [[ -z "${PSN}" ]]; then echo "No signing secret provided. skipping"; @@ -927,7 +973,7 @@ function build_driver_from_github() { then build_dir="${modulus_md5sum}" else build_dir="unsigned" ; fi - local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then echo "cache hit" @@ -1021,7 +1067,7 @@ function install_nvidia_userspace_runfile() { local local_fn="${tmpdir}/userspace.run" cache_fetched_package "${USERSPACE_URL}" \ - "${pkg_bucket}/${USERSPACE_FILENAME}" \ + "${pkg_bucket}/nvidia/${USERSPACE_FILENAME}" \ "${local_fn}" local runfile_args @@ -1039,7 +1085,7 @@ function install_nvidia_userspace_runfile() { then build_dir="${modulus_md5sum}" else build_dir="unsigned" ; fi - local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then cache_hit="1" @@ -1098,7 +1144,7 @@ function install_cuda_runfile() { local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ - "${pkg_bucket}/${CUDA_RUNFILE}" \ + "${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \ "${local_fn}" execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" @@ -1578,6 +1624,7 @@ function main() { if [[ -n ${CUDNN_VERSION} ]]; then install_nvidia_nccl install_nvidia_cudnn + install_pytorch fi #Install GPU metrics collection in Stackdriver if needed if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then From f69d071f68b93888e93260b269d7f652a5e6f282 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 15 Jan 2025 08:40:55 -0800 Subject: [PATCH 089/112] incorrect version check removed --- gpu/test_gpu.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index e1ced1f41..6ee2fb845 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -347,10 +347,6 @@ def test_gpu_allocation(self, configuration, master_accelerator, if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ - and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) - if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): From 73ffce5d29b6dce7d7c7392eff99e5485fcdad84 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 21 Jan 2025 12:04:00 -0800 Subject: [PATCH 090/112] only install pytorch if include-pytorch metadata set to true --- gpu/install_gpu_driver.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index b91046422..41a489447 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -790,8 +790,7 @@ function install_pytorch() { local verb=create if test -d "${envpath}" ; then verb=install ; fi - readonly USE_PYTORCH=$(get_metadata_attribute 'use-pytorch' 'no') - case "${USE_PYTORCH^^}" in + case "${INCLUDE_PYTORCH^^}" in "1" | "YES" | "TRUE" ) local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz" local local_tarball="${workdir}/${build_tarball}" @@ -1548,6 +1547,9 @@ function prepare_gpu_env(){ if is_cuda11 ; then gcc_ver="11" elif is_cuda12 ; then gcc_ver="12" ; fi + + INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no') + readonly INCLUDE_PYTORCH } # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades @@ -1624,8 +1626,10 @@ function main() { if [[ -n ${CUDNN_VERSION} ]]; then install_nvidia_nccl install_nvidia_cudnn - install_pytorch fi + case "${INCLUDE_PYTORCH^^}" in + "1" | "YES" | "TRUE" ) install_pytorch ;; + esac #Install GPU metrics collection in Stackdriver if needed if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then #install_ops_agent From 521df6288f6a4935639249c745e409fa95117ce8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 21 Jan 2025 13:40:57 -0800 Subject: [PATCH 091/112] since call to install_pytorch is protected by metadata check, skip metadata check within the function ; create new function harden_sshd_config and call it --- gpu/install_gpu_driver.sh | 76 +++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 41a489447..63dbf493b 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -787,36 +787,31 @@ function install_pytorch() { local envpath="${mc3}/envs/${env}" # Set numa node to 0 for all GPUs for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done - local verb=create - if test -d "${envpath}" ; then verb=install ; fi - - case "${INCLUDE_PYTORCH^^}" in - "1" | "YES" | "TRUE" ) - local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" - - output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') - if echo "${output}" | grep -q "${gcs_tarball}" ; then - # cache hit - unpack from cache - echo "cache hit" - mkdir -p "${envpath}" - gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz - else - cudart_spec="cuda-cudart" - if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi - "${mc3}/bin/mamba" "${verb}" -n "${env}" \ - -c conda-forge -c nvidia -c rapidsai \ - numba pytorch tensorflow[and-cuda] rapids pyspark \ - "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" - pushd "${envpath}" - tar czf "${local_tarball}" . - popd - gcloud storage cp "${local_tarball}" "${gcs_tarball}" - fi - ;; - * ) echo "skip pytorch install" ;; - esac + + local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz" + local local_tarball="${workdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" + + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') + if echo "${output}" | grep -q "${gcs_tarball}" ; then + # cache hit - unpack from cache + echo "cache hit" + mkdir -p "${envpath}" + gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz + else + local verb=create + if test -d "${envpath}" ; then verb=install ; fi + cudart_spec="cuda-cudart" + if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi + "${mc3}/bin/mamba" "${verb}" -n "${env}" \ + -c conda-forge -c nvidia -c rapidsai \ + numba pytorch tensorflow[and-cuda] rapids pyspark \ + "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" + pushd "${envpath}" + tar czf "${local_tarball}" . + popd + gcloud storage cp "${local_tarball}" "${gcs_tarball}" + fi touch "${workdir}/complete/pytorch" } @@ -1947,6 +1942,24 @@ function mount_ramdisk(){ fi } +function harden_sshd_config() { + # disable sha1 and md5 use in kex and kex-gss features + declare -rA feature_map=(["kex"]="kexalgorithms" ["kex-gss"]="gssapikexalgorithms") + for ftr in "${!feature_map[@]}" ; do + export feature=${feature_map[$ftr]} + sshd_config_line=$( + (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g'; + ssh -Q "${ftr}" ) \ + | sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}; + print("$ENV{feature} ",join(q",",map{ chomp; $_ }@a), $/) if "@a"') + grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new + echo "$sshd_config_line" >> /tmp/sshd_config_new + # TODO: test whether sshd will reload with this change before mv + mv /tmp/sshd_config_new /etc/ssh/sshd_config + done + systemctl reload ssh +} + function prepare_to_install(){ # Verify OS compatability and Secure boot state check_os @@ -1971,9 +1984,10 @@ function prepare_to_install(){ if test -f "${workdir}/complete/prepare" ; then return ; fi - repair_old_backports + harden_sshd_config if is_debuntu ; then + repair_old_backports clean_up_sources_lists apt-get --allow-releaseinfo-change update apt-get -y clean From c0b60b2b1e34576f0489664012c9ea0b2cf46d47 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 21 Jan 2025 13:47:55 -0800 Subject: [PATCH 092/112] increasing timeout and machine shape to reduce no-cache build time --- gpu/test_gpu.py | 14 +++++++------- integration_tests/dataproc_test_case.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 6ee2fb845..e9c2d92ad 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -6,7 +6,7 @@ from integration_tests.dataproc_test_case import DataprocTestCase -DEFAULT_TIMEOUT = 15 # minutes +DEFAULT_TIMEOUT = 45 # minutes DEFAULT_CUDA_VERSION = "12.4" class NvidiaGpuDriverTestCase(DataprocTestCase): @@ -199,7 +199,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-standard-16", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, @@ -230,7 +230,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-standard-16", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, @@ -280,7 +280,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-standard-16", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, @@ -361,7 +361,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, configuration, self.INIT_ACTIONS, metadata=metadata, - machine_type="n1-highmem-8", + machine_type="n1-standard-16", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, boot_disk_size="50GB", @@ -402,7 +402,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-standard-16", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, @@ -460,7 +460,7 @@ def untested_driver_signing(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-standard-16", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index 936718498..314603ea1 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -23,7 +23,7 @@ INTERNAL_IP_SSH = os.getenv("INTERNAL_IP_SSH", "false").lower() == "true" -DEFAULT_TIMEOUT = 15 # minutes +DEFAULT_TIMEOUT = 45 # minutes class DataprocTestCase(parameterized.TestCase): From 30c97c4ccfc76921258474075c7197bb7ed6a496 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 21 Jan 2025 14:01:06 -0800 Subject: [PATCH 093/112] skip full test run due to edits to integration_tests directory --- cloudbuild/presubmit.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index eec7adb76..8f5a0a4b1 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -70,6 +70,7 @@ determine_tests_to_run() { changed_dir="${changed_dir%%/*}/" # Run all tests if common directories modified if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then + continue # to be removed before merge echo "All tests will be run: '${changed_dir}' was changed" TESTS_TO_RUN=(":DataprocInitActionsTestSuite") return 0 From 84b1fb9dee4d21949549256a9a7bb0e7907d21a4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 21 Jan 2025 14:09:37 -0800 Subject: [PATCH 094/112] ubuntu18 does not know about kex-gss ; use correct driver version number for cuda 11.1.1 url generation --- gpu/install_gpu_driver.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 63dbf493b..b98a5c9f2 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -389,7 +389,7 @@ function set_cuda_runfile_url() { ["10.1.234"]="418.87.00" ["10.2.89"]="440.33.01" ["11.0.3"]="450.51.06" - ["11.1.1"]="455.42.00" + ["11.1.1"]="455.32.00" ["11.2.2"]="460.32.03" ["11.3.1"]="465.19.01" ["11.4.4"]="470.82.01" @@ -1944,7 +1944,8 @@ function mount_ramdisk(){ function harden_sshd_config() { # disable sha1 and md5 use in kex and kex-gss features - declare -rA feature_map=(["kex"]="kexalgorithms" ["kex-gss"]="gssapikexalgorithms") + declare -A feature_map=(["kex"]="kexalgorithms") + if ( ! is_ubuntu || ge_ubuntu20 ) ; then feature_map["kex-gss"]="gssapikexalgorithms" ; fi for ftr in "${!feature_map[@]}" ; do export feature=${feature_map[$ftr]} sshd_config_line=$( From 11cbe953dd46bafea6bfb0bb04a6f50e0626ef79 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 21 Jan 2025 19:48:17 -0800 Subject: [PATCH 095/112] on rocky9 sshd service is called sshd instead of ssh as the rest of the platforms call it --- gpu/install_gpu_driver.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index b98a5c9f2..f7b5900f1 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1958,7 +1958,9 @@ function harden_sshd_config() { # TODO: test whether sshd will reload with this change before mv mv /tmp/sshd_config_new /etc/ssh/sshd_config done - systemctl reload ssh + local svc=ssh + if ge_rocky9 ; then svc="sshd" ; fi + systemctl reload "${svc}" } function prepare_to_install(){ From 56fe50cf4a5e9ed10cf41ab3d47734f7e02948bc Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 21 Jan 2025 21:05:54 -0800 Subject: [PATCH 096/112] kex-gss is new in debian11 --- gpu/install_gpu_driver.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index f7b5900f1..7a0801081 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1945,7 +1945,8 @@ function mount_ramdisk(){ function harden_sshd_config() { # disable sha1 and md5 use in kex and kex-gss features declare -A feature_map=(["kex"]="kexalgorithms") - if ( ! is_ubuntu || ge_ubuntu20 ) ; then feature_map["kex-gss"]="gssapikexalgorithms" ; fi + if ( is_rocky || version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ) ; then + feature_map["kex-gss"]="gssapikexalgorithms" ; fi for ftr in "${!feature_map[@]}" ; do export feature=${feature_map[$ftr]} sshd_config_line=$( From b1cd1d0c5864233b2f81f6b7cabba72b226782c4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 21 Jan 2025 21:22:22 -0800 Subject: [PATCH 097/112] all rocky call it sshd it seems --- gpu/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 7a0801081..dcda8154a 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1960,7 +1960,7 @@ function harden_sshd_config() { mv /tmp/sshd_config_new /etc/ssh/sshd_config done local svc=ssh - if ge_rocky9 ; then svc="sshd" ; fi + if is_rocky ; then svc="sshd" ; fi systemctl reload "${svc}" } From ca94393c8555e15ccceaaacbad0d3813e41c7b9e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 21 Jan 2025 22:02:35 -0800 Subject: [PATCH 098/112] cudnn no longer available on debian10 --- gpu/install_gpu_driver.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index dcda8154a..188ffcd7b 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -717,6 +717,7 @@ function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { if test -f "${workdir}/complete/cudnn" ; then return ; fi + if le_debian10 ; then return ; fi local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version From 1d2166c53e4919a9e1bb34c22ad72373d5f5d83b Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 22 Jan 2025 16:31:42 -0800 Subject: [PATCH 099/112] compared with #1282 ; this change matches parity more closely --- gpu/install_gpu_driver.sh | 231 ++++++++++++++++++++++---------------- 1 file changed, 135 insertions(+), 96 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 188ffcd7b..b79c67d6b 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -134,8 +134,7 @@ readonly ROLE # Minimum supported version for open kernel driver is 515.43.04 # https://github.com/NVIDIA/open-gpu-kernel-modules/tags -# Rocky8: 12.0: 525.147.05 -latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" +latest="$(curl -s https://us.download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" readonly -A DRIVER_FOR_CUDA=( ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01" ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31" @@ -165,7 +164,6 @@ readonly -A CUDNN_FOR_CUDA=( ["12.6"]="9.6.0.74" ) # https://developer.nvidia.com/nccl/nccl-download -# 12.2: 2.19.3, 12.5: 2.21.5 readonly -A NCCL_FOR_CUDA=( ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3" ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4" @@ -184,10 +182,16 @@ readonly -A CUDA_SUBVER=( ["12.6"]="12.6.3" ) -# Verify SPARK compatability -RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - function set_cuda_version() { + case "${DATAPROC_IMAGE_VERSION}" in + "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) + "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; + "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; + * ) + echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" + exit 1 + ;; + esac local cuda_url cuda_url=$(get_metadata_attribute 'cuda-url' '') if [[ -n "${cuda_url}" ]] ; then @@ -195,14 +199,9 @@ function set_cuda_version() { local CUDA_URL_VERSION CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then - DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}" - CUDA_FULL_VERSION="${CUDA_URL_VERSION}" + DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION}" fi fi - - if ( ! test -v DEFAULT_CUDA_VERSION ) ; then - DEFAULT_CUDA_VERSION='12.4' - fi readonly DEFAULT_CUDA_VERSION CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") @@ -215,7 +214,6 @@ function set_cuda_version() { CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} fi readonly CUDA_FULL_VERSION - } set_cuda_version @@ -264,7 +262,7 @@ function set_driver_version() { DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") readonly DRIVER_VERSION - readonly DRIVER=${DRIVER_VERSION%%.*} + readonly DRIVER="${DRIVER_VERSION%%.*}" export DRIVER_VERSION DRIVER @@ -498,25 +496,24 @@ function execute_with_retries() ( return 1 ) -CUDA_KEYRING_PKG_INSTALLED="0" function install_cuda_keyring_pkg() { - if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi + is_complete cuda-keyring-installed && return local kr_ver=1.1 curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ -o "${tmpdir}/cuda-keyring.deb" dpkg -i "${tmpdir}/cuda-keyring.deb" rm -f "${tmpdir}/cuda-keyring.deb" - CUDA_KEYRING_PKG_INSTALLED="1" + mark_complete cuda-keyring-installed } function uninstall_cuda_keyring_pkg() { apt-get purge -yq cuda-keyring - CUDA_KEYRING_PKG_INSTALLED="0" + mark_incomplete cuda-keyring-installed } function install_local_cuda_repo() { - if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi + is_complete install-local-cuda-repo && return pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" @@ -537,16 +534,15 @@ function install_local_cuda_repo() { -o /etc/apt/preferences.d/cuda-repository-pin-600 fi - touch "${workdir}/complete/install-local-cuda-repo" + mark_complete install-local-cuda-repo } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cuda-repo" + mark_incomplete install-local-cuda-repo } -CUDNN_PKG_NAME="" function install_local_cudnn_repo() { - if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi + is_complete install-local-cudnn-repo && return pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" @@ -562,18 +558,16 @@ function install_local_cudnn_repo() { cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/complete/install-local-cudnn-repo" + mark_complete install-local-cudnn-repo } function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cudnn-repo" + mark_incomplete install-local-cudnn-repo } -CUDNN8_LOCAL_REPO_INSTALLED="0" -CUDNN8_PKG_NAME="" function install_local_cudnn8_repo() { - if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi + is_complete install-local-cudnn8-repo && return if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" @@ -607,16 +601,16 @@ function install_local_cudnn8_repo() { rm -f "${local_deb_fn}" cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/complete/install-local-cudnn8-repo" + mark_complete install-local-cudnn8-repo } function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cudnn8-repo" + mark_incomplete install-local-cudnn8-repo } function install_nvidia_nccl() { - if test -f "${workdir}/complete/nccl" ; then return ; fi + is_complete nccl && return if is_cuda11 && is_debian12 ; then echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" @@ -709,14 +703,14 @@ function install_nvidia_nccl() { fi popd - touch "${workdir}/complete/nccl" + mark_complete nccl } function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { - if test -f "${workdir}/complete/cudnn" ; then return ; fi + is_complete cudnn && return if le_debian10 ; then return ; fi local major_version major_version="${CUDNN_VERSION%%.*}" @@ -764,6 +758,7 @@ function install_nvidia_cudnn() { "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" + sync else echo "Unsupported cudnn version: [${CUDNN_VERSION}]" @@ -776,8 +771,8 @@ function install_nvidia_cudnn() { ldconfig - touch "${workdir}/complete/cudnn" echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." + mark_complete cudnn } function install_pytorch() { @@ -948,7 +943,7 @@ function add_repo_cuda() { readonly uname_r=$(uname -r) function build_driver_from_github() { - # non-GPL driver will have been built on rocky8 + # non-GPL driver will have been built on rocky8 or if driver version is prior to open kernel version if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then return 0 ; fi pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { @@ -976,7 +971,7 @@ function build_driver_from_github() { # build the kernel modules pushd open-gpu-kernel-modules install_build_dependencies - if is_cuda11 && is_ubuntu22 ; then + if ( is_cuda11 && is_ubuntu22 ) ; then echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" exit 1 fi @@ -985,12 +980,14 @@ function build_driver_from_github() { 2> kernel-open/build_error.log # Sign kernel modules if [[ -n "${PSN}" ]]; then + configure_dkms_certs for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ "${mok_key}" \ "${mok_der}" \ "${module}" done + clear_dkms_key fi make modules_install \ >> kernel-open/build.log \ @@ -1030,12 +1027,12 @@ function build_driver_from_packages() { add_contrib_component apt-get update -qq execute_with_retries apt-get install -y -qq --no-install-recommends dkms - #configure_dkms_certs + configure_dkms_certs execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" sync elif is_rocky ; then - #configure_dkms_certs + configure_dkms_certs if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else @@ -1043,7 +1040,7 @@ function build_driver_from_packages() { fi sync fi - #clear_dkms_key + clear_dkms_key } function install_nvidia_userspace_runfile() { @@ -1058,7 +1055,7 @@ function install_nvidia_userspace_runfile() { # # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. - if test -f "${workdir}/complete/userspace" ; then return ; fi + is_complete userspace && return local local_fn="${tmpdir}/userspace.run" cache_fetched_package "${USERSPACE_URL}" \ @@ -1090,7 +1087,7 @@ function install_nvidia_userspace_runfile() { echo "cache hit" else install_build_dependencies - + configure_dkms_certs local signing_options signing_options="" if [[ -n "${PSN}" ]]; then @@ -1117,11 +1114,12 @@ function install_nvidia_userspace_runfile() { --install-libglvnd \ --tmpdir="${tmpdir}" - if is_rocky8 ; then + if ( is_rocky8 || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ) ; then if [[ "${cache_hit}" == "1" ]] ; then gcloud storage cat "${gcs_tarball}" | tar -C / -xzv depmod -a else + clear_dkms_key tar czvf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') @@ -1130,21 +1128,22 @@ function install_nvidia_userspace_runfile() { fi rm -f "${local_fn}" - touch "${workdir}/complete/userspace" + mark_complete userspace sync } function install_cuda_runfile() { - if test -f "${workdir}/complete/cuda" ; then return ; fi + is_complete cuda && return + local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ - "${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \ + "${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \ "${local_fn}" execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" rm -f "${local_fn}" - touch "${workdir}/complete/cuda" + mark_complete cuda sync } @@ -1170,7 +1169,9 @@ function install_cuda_toolkit() { function load_kernel_module() { # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + ( set +e + rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + ) done depmod -a @@ -1182,7 +1183,8 @@ function load_kernel_module() { } function install_cuda(){ - if test -f "${workdir}/complete/cuda-repo" ; then return ; fi + is_complete cuda-repo && return + if [[ "${gpu_count}" == "0" ]] ; then return ; fi if ( ge_debian12 && is_src_os ) ; then echo "installed with the driver on ${_shortname}" @@ -1195,10 +1197,12 @@ function install_cuda(){ # Includes CUDA packages add_repo_cuda - touch "${workdir}/complete/cuda-repo" + mark_complete cuda-repo } function install_nvidia_container_toolkit() { + is_complete install-nvtk && return + local container_runtime_default if command -v docker ; then container_runtime_default='docker' elif command -v containerd ; then container_runtime_default='containerd' @@ -1214,11 +1218,14 @@ function install_nvidia_container_toolkit() { execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}" systemctl restart "${CONTAINER_RUNTIME}" + + mark_complete install-nvtk } # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if test -f "${workdir}/complete/gpu-driver" ; then return ; fi + is_complete gpu-driver && return + if [[ "${gpu_count}" == "0" ]] ; then return ; fi if ( ge_debian12 && is_src_os ) ; then add_nonfree_components @@ -1240,11 +1247,11 @@ function install_nvidia_gpu_driver() { build_driver_from_github echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" - touch "${workdir}/complete/gpu-driver" + mark_complete gpu-driver } function install_ops_agent(){ - if test -f "${workdir}/ops-agent-complete" ; then return ; fi + is_complete ops-agent && return mkdir -p /opt/google cd /opt/google @@ -1252,7 +1259,7 @@ function install_ops_agent(){ curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install - touch "${workdir}/complete/ops-agent" + mark_complete ops-agent } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics @@ -1272,7 +1279,7 @@ function install_gpu_agent() { | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" - python3 -m venv "${venv}" + /opt/conda/miniconda3/bin/python3 -m venv "${venv}" ( source "${venv}/bin/activate" python3 -m pip install --upgrade pip @@ -1329,11 +1336,12 @@ function configure_yarn_resources() { # This configuration should be applied only if GPU is attached to the node function configure_yarn_nodemanager() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH + 'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}" set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' set_hadoop_property 'yarn-site.xml' \ @@ -1358,13 +1366,12 @@ function configure_yarn_nodemanager() { } function configure_gpu_exclusive_mode() { - # check if running spark 3, if not, enable GPU exclusive mode - local spark_version - spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) - if [[ ${spark_version} != 3.* ]]; then - # include exclusive mode on GPU - nvidia-smi -c EXCLUSIVE_PROCESS - fi + if [[ "${gpu_count}" == "0" ]] ; then return ; fi + # only run this function when spark < 3.0 + if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi + # include exclusive mode on GPU + nvsmi -c EXCLUSIVE_PROCESS + clear_nvsmi_cache } function fetch_mig_scripts() { @@ -1376,6 +1383,7 @@ function fetch_mig_scripts() { } function configure_gpu_script() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # Download GPU discovery script local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' mkdir -p ${spark_gpu_script_dir} @@ -1402,6 +1410,7 @@ function configure_gpu_script() { # See the License for the specific language governing permissions and # limitations under the License. # +# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]} ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') @@ -1411,18 +1420,18 @@ EOF chmod a+rx "${gpus_resources_script}" local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" - if version_ge "${SPARK_VERSION}" "3.0" ; then - local gpu_count - gpu_count="$(lspci | grep NVIDIA | wc -l)" - local executor_cores - executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" - local executor_memory - executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" - local task_cpus=2 - local gpu_amount - gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" - - cat >>"${spark_defaults_conf}" <>"${spark_defaults_conf}" < /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 else nvsmi_works="1" ; fi - if [[ "$1" == "-L" ]] ; then + if test -v 1 && [[ "$1" == "-L" ]] ; then local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi @@ -1489,7 +1499,7 @@ function nvsmi() { } function install_build_dependencies() { - if test -f "${workdir}/complete/build-dependencies" ; then return ; fi + is_complete build-dependencies && return if is_debuntu ; then if is_ubuntu22 && is_cuda12 ; then @@ -1527,25 +1537,57 @@ function install_build_dependencies() { execute_with_retries "${dnf_cmd}" fi - touch "${workdir}/complete/build-dependencies" + mark_complete build-dependencies +} + +function is_complete() { + phase="$1" + test -f "${workdir}/complete/${phase}" +} + +function mark_complete() { + phase="$1" + touch "${workdir}/complete/${phase}" +} + +function mark_incomplete() { + phase="$1" + rm -f "${workdir}/complete/${phase}" } function install_dependencies() { + is_complete install-dependencies && return 0 + pkg_list="pciutils screen" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi + mark_complete install-dependencies } function prepare_gpu_env(){ + set +e + gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" + set -e + readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 nvsmi_works="0" if is_cuda11 ; then gcc_ver="11" elif is_cuda12 ; then gcc_ver="12" ; fi - INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no') - readonly INCLUDE_PYTORCH + if ! test -v DEFAULT_RAPIDS_RUNTIME ; then + readonly DEFAULT_RAPIDS_RUNTIME='SPARK' + fi + + # Set variables from metadata + RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') + INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")" + INCLUDE_PYTORCH="$(get_metadata_attribute 'include-pytorch' 'no')" + readonly RAPIDS_RUNTIME INCLUDE_GPUS INCLUDE_PYTORCH + + # determine whether we have nvidia-smi installed and working + nvsmi } # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades @@ -1584,8 +1626,6 @@ function check_secure_boot() { mok_der=/var/lib/shim-signed/mok/MOK.der else mok_key=/var/lib/dkms/mok.key mok_der=/var/lib/dkms/mok.pub ; fi - - configure_dkms_certs } @@ -1836,15 +1876,12 @@ function exit_handler() { /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ /usr/lib \ /opt/nvidia/* \ - /usr/local/cuda-1?.? \ /opt/conda/miniconda3 | sort -h elif is_debian ; then du -x -hs \ - /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \ /var/lib/{docker,mysql,} \ - /usr/lib \ /opt/nvidia/* \ - /usr/local/cuda-1?.? \ /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ /usr/bin \ /usr \ @@ -1853,11 +1890,9 @@ function exit_handler() { else du -hs \ /var/lib/docker \ - /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \ /usr/lib64/google-cloud-sdk \ - /usr/lib \ /opt/nvidia/* \ - /usr/local/cuda-1?.? \ /opt/conda/miniconda3 fi @@ -1874,11 +1909,12 @@ function exit_handler() { perl -e '@siz=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } ); -$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; +$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting}; print( " samples-taken: ", scalar @siz, $/, - "maximum-disk-used: $max", $/, - "minimum-disk-used: $min", $/, - " increased-by: $inc", $/ )' < "/run/disk-usage.log" + "starting-disk-used: $starting", $/, + "maximum-disk-used: $max", $/, + "minimum-disk-used: $min", $/, + " increased-by: $inc", $/ )' < "/run/disk-usage.log" echo "exit_handler has completed" @@ -1987,18 +2023,21 @@ function prepare_to_install(){ readonly install_log="${tmpdir}/install.log" - if test -f "${workdir}/complete/prepare" ; then return ; fi + is_complete prepare.common && return harden_sshd_config if is_debuntu ; then repair_old_backports clean_up_sources_lists - apt-get --allow-releaseinfo-change update + apt-get update -qq --allow-releaseinfo-change apt-get -y clean apt-get -o DPkg::Lock::Timeout=60 -y autoremove if ge_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi + if is_ubuntu ; then + while ! command -v gcloud ; do sleep 5s ; done + fi else dnf clean all fi @@ -2016,7 +2055,7 @@ function prepare_to_install(){ screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" - touch "${workdir}/complete/prepare" + mark_complete prepare.common } function check_os() { From 50142f6ee1b8ece3bfc168dcb6aeef2d23bb6824 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 22 Jan 2025 17:21:04 -0800 Subject: [PATCH 100/112] slightly better variable declaration ordering ; it is better still in the templates/ directory from #1282 --- gpu/install_gpu_driver.sh | 95 ++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index b79c67d6b..a48e624e9 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -216,8 +216,6 @@ function set_cuda_version() { readonly CUDA_FULL_VERSION } -set_cuda_version - function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; ) function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; ) @@ -273,39 +271,27 @@ function set_driver_version() { fi } -set_driver_version - -readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39" -readonly DEFAULT_CUDNN8_VERSION="8.3.1.22" -readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" - -# Parameters for NVIDIA-provided cuDNN library -readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} -CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") -function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) -function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) -# The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION} -if is_rocky && (version_lt "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}" -elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then - # cuDNN v8 is not distribution for ubuntu20+, debian12 - CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" -elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then - # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 - CUDNN_VERSION="8.8.0.121" -fi -readonly CUDNN_VERSION - -readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} -readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) - -# Parameters for NVIDIA-provided Debian GPU driver -readonly DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - -readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") +function set_cudnn_version() { + readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39" + readonly DEFAULT_CUDNN8_VERSION="8.3.1.22" + readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" + + # Parameters for NVIDIA-provided cuDNN library + readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} + CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") + # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION} + if ( is_rocky && version_lt "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then + CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}" + elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then + # cuDNN v8 is not distribution for ubuntu20+, debian12 + CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" + elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then + # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 + CUDNN_VERSION="8.8.0.121" + fi + readonly CUDNN_VERSION +} -USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" -readonly USERSPACE_FILENAME # Short name for urls if is_ubuntu22 ; then @@ -330,15 +316,14 @@ else nccl_shortname="${shortname}" fi -# Parameters for NVIDIA-provided package repositories -readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' -readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" +function set_nv_urls() { + # Parameters for NVIDIA-provided package repositories + readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' + readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" -# Parameters for NVIDIA-provided NCCL library -readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb" -NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") -readonly NCCL_REPO_URL -readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub + # Parameter for NVIDIA-provided Rocky Linux GPU driver + readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" +} function set_cuda_runfile_url() { local MAX_DRIVER_VERSION @@ -436,11 +421,7 @@ function set_cuda_runfile_url() { fi } -set_cuda_runfile_url - -# Parameter for NVIDIA-provided Rocky Linux GPU driver -readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" - +function set_cudnn_tarball_url() { CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then @@ -460,6 +441,7 @@ if ( version_ge "${CUDA_VERSION}" "12.0" ); then fi readonly CUDNN_TARBALL readonly CUDNN_TARBALL_URL +} # Whether to install NVIDIA-provided or OS-provided GPU driver GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') @@ -610,6 +592,9 @@ function uninstall_local_cudnn8_repo() { } function install_nvidia_nccl() { + readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} + readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) + is_complete nccl && return if is_cuda11 && is_debian12 ; then @@ -1044,6 +1029,13 @@ function build_driver_from_packages() { } function install_nvidia_userspace_runfile() { + # Parameters for NVIDIA-provided Debian GPU driver + readonly DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + + readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") + + USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" + readonly USERSPACE_FILENAME # This .run file contains NV's OpenGL implementation as well as # nvidia optimized implementations of the gtk+ 2,3 stack(s) not @@ -1565,6 +1557,10 @@ function install_dependencies() { } function prepare_gpu_env(){ + #set_support_matrix + + set_cuda_version + set_driver_version set +e gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" @@ -1588,6 +1584,11 @@ function prepare_gpu_env(){ # determine whether we have nvidia-smi installed and working nvsmi + + set_nv_urls + set_cuda_runfile_url + set_cudnn_version + set_cudnn_tarball_url } # Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades From 6363203f7ee077b2e26eb1cebe41fa7d0f43bb63 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 22 Jan 2025 18:34:33 -0800 Subject: [PATCH 101/112] install spark rapids --- gpu/install_gpu_driver.sh | 40 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index a48e624e9..f9df64b31 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1566,7 +1566,6 @@ function prepare_gpu_env(){ gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" set -e - readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1 nvsmi_works="0" if is_cuda11 ; then gcc_ver="11" @@ -1708,6 +1707,8 @@ function main() { fi configure_yarn_nodemanager + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + install_spark_rapids ; fi configure_gpu_script configure_gpu_isolation elif [[ "${ROLE}" == "Master" ]]; then @@ -2149,6 +2150,43 @@ function os_add_repo() { readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')" +function install_spark_rapids() { + # Update SPARK RAPIDS config + local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 + + # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu + local -r scala_ver="2.12" + + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then + local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 + fi + + readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) + readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) + + local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' + + local jar_basename + + jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" + + jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" + + jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" + cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" +} + prepare_to_install main From dba00dfef9b292681b5e75ad15e348ae7bbafc8e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 22 Jan 2025 20:57:37 -0800 Subject: [PATCH 102/112] cache the results of nvidia-smi --query-gpu --- gpu/install_gpu_driver.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index f9df64b31..18e694d73 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1404,9 +1404,15 @@ function configure_gpu_script() { # # Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]} +set -e +resources_json="/dev/shm/nvidia/gpusResources.json" +if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi + +mkdir -p "$(dirname ${resources_json})" + ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') -echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} +echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}" EOF chmod a+rx "${gpus_resources_script}" From 96a8d6d01a4ff525c99f153f04fc8330df9edfdb Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 23 Jan 2025 06:50:15 -0800 Subject: [PATCH 103/112] reduce development time --- cloudbuild/presubmit.sh | 1 - integration_tests/dataproc_test_case.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index 8f5a0a4b1..f796dd1f8 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -105,7 +105,6 @@ run_tests() { bazel test \ --jobs="${max_parallel_tests}" \ --local_test_jobs="${max_parallel_tests}" \ - --flaky_test_attempts=3 \ --action_env="INTERNAL_IP_SSH=true" \ --test_output="all" \ --noshow_progress \ diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index 314603ea1..4e4848523 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -180,7 +180,7 @@ def createCluster(self, if not FLAGS.skip_cleanup: args.append("--max-age=60m") - args.append("--max-idle=25m") + args.append("--max-idle=45m") cmd = "{} dataproc clusters create {} {}".format( "gcloud beta" if beta else "gcloud", self.name, " ".join(args)) From 11f099c804f33f5b63e69767584f735b55bf815d Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 23 Jan 2025 07:56:06 -0800 Subject: [PATCH 104/112] exercising more CUDA variants ; testing whether tests fail on long runs --- gpu/test_gpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index e9c2d92ad..3ec053e0e 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -370,10 +370,10 @@ def test_gpu_allocation(self, configuration, master_accelerator, self.verify_instance_spark() @parameterized.parameters( - ("SINGLE", ["m"], GPU_T4, None, "11.8"), -# ("STANDARD", ["m"], GPU_T4, None, "12.0"), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), -# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), +# ("SINGLE", ["m"], GPU_T4, None, "11.8"), + ("STANDARD", ["m"], GPU_T4, None, "12.0"), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), + ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), ) def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes, From 8ae2c0a2ecd995c49f87e458272b8f5cb8b3e4fe Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 23 Jan 2025 11:26:10 -0800 Subject: [PATCH 105/112] try to reduce concurrent builds ; extend build time further ; only enable spark rapids on images >= 2.1 --- gpu/install_gpu_driver.sh | 59 ++++++++++++++++++++++++- integration_tests/dataproc_test_case.py | 4 +- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 18e694d73..c0da65e34 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -624,6 +624,14 @@ function install_nvidia_nccl() { local local_tarball="${workdir}/${build_tarball}" local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}" + if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then + # do not build in tests with < 32 cores + sleep $(( ( RANDOM % 11 ) + 10 )) + while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do + sleep 5m + done + fi + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') if echo "${output}" | grep -q "${gcs_tarball}" ; then # cache hit - unpack from cache @@ -631,6 +639,8 @@ function install_nvidia_nccl() { gcloud storage cat "${gcs_tarball}" | tar xvz else # build and cache + touch "${local_tarball}.building" + gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install install_build_dependencies @@ -677,6 +687,7 @@ function install_nvidia_nccl() { popd tar xzvf "${local_tarball}" gcloud storage cp "${local_tarball}" "${gcs_tarball}" + gcloud storage rm "${gcs_tarball}.building" rm "${local_tarball}" fi } @@ -773,6 +784,14 @@ function install_pytorch() { local local_tarball="${workdir}/${build_tarball}" local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" + if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then + # do not build in tests with < 32 cores + sleep $(( ( RANDOM % 11 ) + 10 )) + while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do + sleep 5m + done + fi + output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') if echo "${output}" | grep -q "${gcs_tarball}" ; then # cache hit - unpack from cache @@ -780,6 +799,8 @@ function install_pytorch() { mkdir -p "${envpath}" gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz else + touch "${local_tarball}.building" + gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" local verb=create if test -d "${envpath}" ; then verb=install ; fi cudart_spec="cuda-cudart" @@ -792,6 +813,7 @@ function install_pytorch() { tar czf "${local_tarball}" . popd gcloud storage cp "${local_tarball}" "${gcs_tarball}" + gcloud storage rm "${gcs_tarball}.building" fi touch "${workdir}/complete/pytorch" } @@ -950,10 +972,20 @@ function build_driver_from_github() { local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then + # do not build in tests with < 32 cores + sleep $(( ( RANDOM % 11 ) + 10 )) + while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do + sleep 5m + done + fi + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then echo "cache hit" else # build the kernel modules + touch "${local_tarball}.building" + gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" pushd open-gpu-kernel-modules install_build_dependencies if ( is_cuda11 && is_ubuntu22 ) ; then @@ -982,6 +1014,7 @@ function build_driver_from_github() { "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" + gcloud storage rm "${gcs_tarball}.building" rm "${local_tarball}" make clean popd @@ -1071,6 +1104,14 @@ function install_nvidia_userspace_runfile() { local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then + # do not build in tests with < 32 cores + sleep $(( ( RANDOM % 11 ) + 10 )) + while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do + sleep 5m + done + fi + if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then cache_hit="1" if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then @@ -1078,6 +1119,9 @@ function install_nvidia_userspace_runfile() { fi echo "cache hit" else + # build the kernel modules + touch "${local_tarball}.building" + gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" install_build_dependencies configure_dkms_certs local signing_options @@ -1116,6 +1160,7 @@ function install_nvidia_userspace_runfile() { /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" + gcloud storage rm "${gcs_tarball}.building" fi fi @@ -1429,6 +1474,13 @@ EOF # gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" gpu_amount="$(perl -e "print 1 / ${executor_cores}")" + plugin_line="" + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + if version_ge "${DATAPROC_IMAGE_VERSION}" 2.1 ; then + plugin_line="spark.plugins=com.nvidia.spark.SQLPlugin" + fi + fi + cat >>"${spark_defaults_conf}" < Date: Thu, 23 Jan 2025 17:23:48 -0800 Subject: [PATCH 106/112] fixed bug with spark rapids version assignment ; more conservative about requirements for ramdisk ; roll back spark.SQLPlugin change --- gpu/install_gpu_driver.sh | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index c0da65e34..a419be423 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1374,6 +1374,7 @@ function configure_yarn_resources() { # This configuration should be applied only if GPU is attached to the node function configure_yarn_nodemanager() { if [[ "${gpu_count}" == "0" ]] ; then return ; fi + set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' @@ -1474,13 +1475,6 @@ EOF # gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" gpu_amount="$(perl -e "print 1 / ${executor_cores}")" - plugin_line="" - if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then - if version_ge "${DATAPROC_IMAGE_VERSION}" 2.1 ; then - plugin_line="spark.plugins=com.nvidia.spark.SQLPlugin" - fi - fi - cat >>"${spark_defaults_conf}" < Date: Thu, 23 Jan 2025 18:48:49 -0800 Subject: [PATCH 107/112] * gpu does not work on capacity scheduler on dataproc 2.0 ; use fair * protect against race condition on removing the .building files * add logic for pre-11.7 cuda package repo back in * clean up and verify yarn config --- gpu/install_gpu_driver.sh | 48 +++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index a419be423..e2d5c6591 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -687,7 +687,7 @@ function install_nvidia_nccl() { popd tar xzvf "${local_tarball}" gcloud storage cp "${local_tarball}" "${gcs_tarball}" - gcloud storage rm "${gcs_tarball}.building" + if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi rm "${local_tarball}" fi } @@ -813,7 +813,7 @@ function install_pytorch() { tar czf "${local_tarball}" . popd gcloud storage cp "${local_tarball}" "${gcs_tarball}" - gcloud storage rm "${gcs_tarball}.building" + if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi fi touch "${workdir}/complete/pytorch" } @@ -941,7 +941,16 @@ function add_repo_nvidia_container_toolkit() { function add_repo_cuda() { if is_debuntu ; then - install_cuda_keyring_pkg # 11.7+, 12.0+ + if version_le "${CUDA_VERSION}" 11.6 ; then + local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg + local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" + echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ + | sudo tee "${sources_list_path}" + curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \ + -o "${kr_path}" + else + install_cuda_keyring_pkg # 11.7+, 12.0+ + fi elif is_rocky ; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" fi @@ -1014,7 +1023,7 @@ function build_driver_from_github() { "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" - gcloud storage rm "${gcs_tarball}.building" + if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi rm "${local_tarball}" make clean popd @@ -1160,7 +1169,7 @@ function install_nvidia_userspace_runfile() { /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" - gcloud storage rm "${gcs_tarball}.building" + if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi fi fi @@ -1369,13 +1378,32 @@ function configure_yarn_resources() { 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' + + # Older CapacityScheduler does not permit use of gpu resources ; switch to FairScheduler on 2.0 and below + if version_lt "${DATAPROC_IMAGE_VERSION}" "2.1" ; then + fs_xml="$HADOOP_CONF_DIR/fair-scheduler.xml" + set_hadoop_property 'yarn-site.xml' \ + 'yarn.resourcemanager.scheduler.class' 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler' + set_hadoop_property 'yarn-site.xml' \ + "yarn.scheduler.fair.user-as-default-queue" "false" + set_hadoop_property 'yarn-site.xml' \ + "yarn.scheduler.fair.allocation.file" "${fs_xml}" + set_hadoop_property 'yarn-site.xml' \ + 'yarn.scheduler.fair.resource-calculator' 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + cat > "${fs_xml}" < + + 1 + +EOF + fi } # This configuration should be applied only if GPU is attached to the node function configure_yarn_nodemanager() { if [[ "${gpu_count}" == "0" ]] ; then return ; fi - - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' set_hadoop_property 'yarn-site.xml' \ @@ -1387,9 +1415,9 @@ function configure_yarn_nodemanager() { set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' + 'yarn.nodemanager.container-executor.class' 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.group' 'yarn' # Fix local dirs access permissions local yarn_local_dirs=() From cc5abca91c4170fa600cf659f881092637eddb0c Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 23 Jan 2025 22:50:21 -0800 Subject: [PATCH 108/112] revert test_install_gpu_cuda_nvidia_with_spark_job cuda versions --- gpu/test_gpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 3ec053e0e..e9c2d92ad 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -370,10 +370,10 @@ def test_gpu_allocation(self, configuration, master_accelerator, self.verify_instance_spark() @parameterized.parameters( -# ("SINGLE", ["m"], GPU_T4, None, "11.8"), - ("STANDARD", ["m"], GPU_T4, None, "12.0"), -# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), - ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), + ("SINGLE", ["m"], GPU_T4, None, "11.8"), +# ("STANDARD", ["m"], GPU_T4, None, "12.0"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), ) def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes, From 8936442e1b2f9546d2a49e58e9754afdbf9d8c67 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 28 Jan 2025 13:56:52 -0800 Subject: [PATCH 109/112] configure for use with JupyterLab --- gpu/install_gpu_driver.sh | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index e2d5c6591..acf3e21db 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -772,15 +772,18 @@ function install_nvidia_cudnn() { } function install_pytorch() { - if test -f "${workdir}/complete/pytorch" ; then return ; fi + is_complete pytorch && return + local env env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce') local mc3=/opt/conda/miniconda3 local envpath="${mc3}/envs/${env}" + if [[ "${env}" == "base" ]]; then + echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi # Set numa node to 0 for all GPUs for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done - local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz" + local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz" local local_tarball="${workdir}/${build_tarball}" local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" @@ -805,17 +808,28 @@ function install_pytorch() { if test -d "${envpath}" ; then verb=install ; fi cudart_spec="cuda-cudart" if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi + + # Install pytorch and company to this environment "${mc3}/bin/mamba" "${verb}" -n "${env}" \ -c conda-forge -c nvidia -c rapidsai \ numba pytorch tensorflow[and-cuda] rapids pyspark \ "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" + + # Install jupyter kernel in this environment + "${envpath}/bin/python3" -m pip install ipykernel + + # package environment and cache in GCS pushd "${envpath}" tar czf "${local_tarball}" . popd gcloud storage cp "${local_tarball}" "${gcs_tarball}" if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi fi - touch "${workdir}/complete/pytorch" + + # register the environment as a selectable kernel + "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})" + + mark_complete pytorch } function configure_dkms_certs() { @@ -2067,11 +2081,11 @@ function harden_sshd_config() { feature_map["kex-gss"]="gssapikexalgorithms" ; fi for ftr in "${!feature_map[@]}" ; do export feature=${feature_map[$ftr]} - sshd_config_line=$( + sshd_config_line="${feature} $( (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g'; ssh -Q "${ftr}" ) \ - | sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}; - print("$ENV{feature} ",join(q",",map{ chomp; $_ }@a), $/) if "@a"') + | sort -u | grep -v -ie sha1 -e md5 | paste -sd "," -)" + grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new echo "$sshd_config_line" >> /tmp/sshd_config_new # TODO: test whether sshd will reload with this change before mv From 0bc3c1f29876a30758d1cb3db22d2ff965fc75c4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Jan 2025 14:31:36 -0800 Subject: [PATCH 110/112] 2.2 should use 12.6.3 (latest) --- gpu/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index acf3e21db..917816bd1 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -186,7 +186,7 @@ function set_cuda_version() { case "${DATAPROC_IMAGE_VERSION}" in "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; - "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;; + "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;; * ) echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" exit 1 From e56ddd0fbef897c5fb2ab2d2397e5a4f3a72b330 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Sat, 1 Feb 2025 15:32:48 -0800 Subject: [PATCH 111/112] Addressing review from cnauroth gpu/install_gpu_driver.sh: * use the same retry arguments in all calls to curl * correct 12.3's driver and sub-version * improve logic for pause as other workers perform build * remove call to undefined clear_nvsmi_cache * move closing "fi" to line of its own * added comments for unclear logic * removed commented code * remove unused curl for latest driver version gpu/test_gpu.py * removed excess test * added comment about numa node selection * removed skips of rocky9 ; 2.2.44-rocky9 build succeeds --- gpu/install_gpu_driver.sh | 192 +++++++++++++++++++++++++------------- gpu/test_gpu.py | 56 ++--------- 2 files changed, 137 insertions(+), 111 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 917816bd1..6fc243fd2 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -61,9 +61,9 @@ function repair_old_backports { # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 debdists="https://deb.debian.org/debian/dists" - oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); - oldstable=$( curl -s "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); - stable=$( curl -s "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); + oldoldstable=$(curl ${curl_retry_args} "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); + oldstable=$( curl ${curl_retry_args} "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); + stable=$( curl ${curl_retry_args} "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) @@ -134,13 +134,12 @@ readonly ROLE # Minimum supported version for open kernel driver is 515.43.04 # https://github.com/NVIDIA/open-gpu-kernel-modules/tags -latest="$(curl -s https://us.download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')" readonly -A DRIVER_FOR_CUDA=( ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01" ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31" ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03" ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" - ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.23.08" + ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.29.06" ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142" ) readonly -A DRIVER_SUBVER=( @@ -231,6 +230,8 @@ function set_driver_version() { local cuda_url cuda_url=$(get_metadata_attribute 'cuda-url' '') + local nv_xf86_x64_base="https://us.download.nvidia.com/XFree86/Linux-x86_64" + local DEFAULT_DRIVER # Take default from gpu-driver-url metadata value if [[ -n "${gpu_driver_url}" ]] ; then @@ -242,12 +243,12 @@ function set_driver_version() { if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} - if curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then + if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then # use the version indicated by the cuda url as the default if it exists - DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" - elif curl -s --head "https://us.download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then + DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" + elif curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then # use the maximum sub-version available for the major version indicated in cuda url as the default - DEFAULT_DRIVER="${driver_max_maj_version}" + DEFAULT_DRIVER="${driver_max_maj_version}" fi fi fi @@ -264,8 +265,8 @@ function set_driver_version() { export DRIVER_VERSION DRIVER - gpu_driver_url="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then + gpu_driver_url="${nv_xf86_x64_base}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + if ! curl ${curl_retry_args} --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" exit 1 fi @@ -397,7 +398,7 @@ function set_cuda_runfile_url() { NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") - if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then + if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead" @@ -481,7 +482,7 @@ function execute_with_retries() ( function install_cuda_keyring_pkg() { is_complete cuda-keyring-installed && return local kr_ver=1.1 - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + curl ${curl_retry_args} \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ -o "${tmpdir}/cuda-keyring.deb" dpkg -i "${tmpdir}/cuda-keyring.deb" @@ -503,7 +504,7 @@ function install_local_cuda_repo() { readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" readonly DIST_KEYRING_DIR="/var/${pkgname}" - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + curl ${curl_retry_args} \ "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" @@ -511,7 +512,7 @@ function install_local_cuda_repo() { cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + curl ${curl_retry_args} \ "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ -o /etc/apt/preferences.d/cuda-repository-pin-600 fi @@ -531,7 +532,7 @@ function install_local_cudnn_repo() { local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz - curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ + curl ${curl_retry_args} \ "${local_deb_url}" -o "${tmpdir}/local-installer.deb" dpkg -i "${tmpdir}/local-installer.deb" @@ -609,7 +610,7 @@ function install_nvidia_nccl() { test -d "${workdir}/nccl" || { local tarball_fn="v${NCCL_VERSION}-1.tar.gz" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + curl ${curl_retry_args} \ "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ | tar xz mv "nccl-${NCCL_VERSION}-1" nccl @@ -625,11 +626,22 @@ function install_nvidia_nccl() { local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}" if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then - # do not build in tests with < 32 cores + # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do - sleep 5m - done + if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then + local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" + local build_start_epoch="$(date -d "${build_start_time}" +%s)" + local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes + while gsutil ls -L "${gcs_tarball}.building" ; do + local now_epoch="$(date -u +%s)" + if (( now_epoch > timeout_epoch )) ; then + # detect unexpected build failure after 45m + gsutil rm "${gcs_tarball}.building" + break + fi + sleep 5m + done + fi fi output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') @@ -641,6 +653,7 @@ function install_nvidia_nccl() { # build and cache touch "${local_tarball}.building" gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" + building_file="${gcs_tarball}.building" pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install install_build_dependencies @@ -688,6 +701,7 @@ function install_nvidia_nccl() { tar xzvf "${local_tarball}" gcloud storage cp "${local_tarball}" "${gcs_tarball}" if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi + building_file="" rm "${local_tarball}" fi } @@ -735,17 +749,17 @@ function install_nvidia_cudnn() { add_repo_cuda apt-get update -qq - # Ignore version requested and use the latest version in the package index - cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)" + # Ignore version requested and use the latest version in the package index + cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)" execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" - sync + sync elif is_cudnn9 ; then - install_cuda_keyring_pkg + install_cuda_keyring_pkg apt-get update -qq @@ -755,7 +769,7 @@ function install_nvidia_cudnn() { "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" - sync + sync else echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi @@ -788,11 +802,22 @@ function install_pytorch() { local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then - # do not build in tests with < 32 cores + # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do - sleep 5m - done + if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then + local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" + local build_start_epoch="$(date -d "${build_start_time}" +%s)" + local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes + while gsutil ls -L "${gcs_tarball}.building" ; do + local now_epoch="$(date -u +%s)" + if (( now_epoch > timeout_epoch )) ; then + # detect unexpected build failure after 45m + gsutil rm "${gcs_tarball}.building" + break + fi + sleep 5m + done + fi fi output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') @@ -804,6 +829,7 @@ function install_pytorch() { else touch "${local_tarball}.building" gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" + building_file="${gcs_tarball}.building" local verb=create if test -d "${envpath}" ; then verb=install ; fi cudart_spec="cuda-cudart" @@ -824,6 +850,7 @@ function install_pytorch() { popd gcloud storage cp "${local_tarball}" "${gcs_tarball}" if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi + building_file="" fi # register the environment as a selectable kernel @@ -960,7 +987,7 @@ function add_repo_cuda() { local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ | sudo tee "${sources_list_path}" - curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \ + curl ${curl_retry_args} "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \ -o "${kr_path}" else install_cuda_keyring_pkg # 11.7+, 12.0+ @@ -978,7 +1005,7 @@ function build_driver_from_github() { pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { tarball_fn="${DRIVER_VERSION}.tar.gz" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + curl ${curl_retry_args} \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ | tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules @@ -996,11 +1023,22 @@ function build_driver_from_github() { local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then - # do not build in tests with < 32 cores + # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do - sleep 5m - done + if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then + local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" + local build_start_epoch="$(date -d "${build_start_time}" +%s)" + local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes + while gsutil ls -L "${gcs_tarball}.building" ; do + local now_epoch="$(date -u +%s)" + if (( now_epoch > timeout_epoch )) ; then + # detect unexpected build failure after 45m + gsutil rm "${gcs_tarball}.building" + break + fi + sleep 5m + done + fi fi if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then @@ -1009,6 +1047,7 @@ function build_driver_from_github() { # build the kernel modules touch "${local_tarball}.building" gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" + building_file="${gcs_tarball}.building" pushd open-gpu-kernel-modules install_build_dependencies if ( is_cuda11 && is_ubuntu22 ) ; then @@ -1038,6 +1077,7 @@ function build_driver_from_github() { $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi + building_file="" rm "${local_tarball}" make clean popd @@ -1128,11 +1168,22 @@ function install_nvidia_userspace_runfile() { local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then - # do not build in tests with < 32 cores + # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do - sleep 5m - done + if gcloud storage ls -j "${gcs_tarball}.building" > "${local_tarball}.building.json" ; then + local build_start_time="$(jq -r .[0].metadata.timeCreated "${local_tarball}.building.json")" + local build_start_epoch="$(date -d "${build_start_time}" +%s)" + local timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes + while gsutil ls -L "${gcs_tarball}.building" ; do + local now_epoch="$(date -u +%s)" + if (( now_epoch > timeout_epoch )) ; then + # detect unexpected build failure after 45m + gsutil rm "${gcs_tarball}.building" + break + fi + sleep 5m + done + fi fi if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then @@ -1145,6 +1196,7 @@ function install_nvidia_userspace_runfile() { # build the kernel modules touch "${local_tarball}.building" gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building" + building_file="${gcs_tarball}.building" install_build_dependencies configure_dkms_certs local signing_options @@ -1184,6 +1236,7 @@ function install_nvidia_userspace_runfile() { $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi + building_file="" fi fi @@ -1316,7 +1369,7 @@ function install_ops_agent(){ mkdir -p /opt/google cd /opt/google # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation - curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + curl ${curl_retry_args} -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install mark_complete ops-agent @@ -1332,9 +1385,9 @@ function install_gpu_agent() { fi local install_dir=/opt/gpu-utilization-agent mkdir -p "${install_dir}" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + curl ${curl_retry_args} \ "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + curl ${curl_retry_args} \ "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" @@ -1451,7 +1504,6 @@ function configure_gpu_exclusive_mode() { if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi # include exclusive mode on GPU nvsmi -c EXCLUSIVE_PROCESS - clear_nvsmi_cache } function fetch_mig_scripts() { @@ -1653,6 +1705,9 @@ function install_dependencies() { function prepare_gpu_env(){ #set_support_matrix + # if set, this variable includes a gcs path to a build-in-progress indicator + building_file="" + set_cuda_version set_driver_version @@ -1763,7 +1818,7 @@ function main() { #Install GPU metrics collection in Stackdriver if needed if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then #install_ops_agent - install_gpu_agent + install_gpu_agent echo 'GPU metrics agent successfully deployed.' else echo 'GPU metrics agent will not be installed.' @@ -1775,22 +1830,22 @@ function main() { done if test -n "$(nvsmi -L)" ; then - # cache the result of the gpu query + # cache the result of the gpu query ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt" - chmod a+r "/var/run/nvidia-gpu-index.txt" + chmod a+r "/var/run/nvidia-gpu-index.txt" fi MIG_GPU_LIST="$(nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n "")" NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")" if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then # enable MIG on every GPU - for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do + for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' '{print $2}') ; do if version_le "${CUDA_VERSION}" "11.6" ; then nvsmi -i "${GPU_ID}" --multi-instance-gpu=1 else - nvsmi -i "${GPU_ID}" --multi-instance-gpu 1 + nvsmi -i "${GPU_ID}" --multi-instance-gpu 1 fi - done + done NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)" @@ -1825,7 +1880,7 @@ function cache_fetched_package() { if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then time gcloud storage cp "${gcs_fn}" "${local_fn}" else - time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \ + time ( curl ${curl_retry_args} "${src_url}" -o "${local_fn}" && \ gcloud storage cp "${local_fn}" "${gcs_fn}" ; ) fi } @@ -1854,7 +1909,7 @@ function clean_up_sources_lists() { local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" rm -f "${bigtop_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ + curl ${curl_retry_args} \ "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" @@ -1868,7 +1923,7 @@ function clean_up_sources_lists() { local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" rm -f "${adoptium_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ + curl ${curl_retry_args} "${key_url}" \ | gpg --dearmor -o "${adoptium_kr_path}" echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ > /etc/apt/sources.list.d/adoptium.list @@ -1882,7 +1937,7 @@ function clean_up_sources_lists() { local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" rm -f "${docker_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ + curl ${curl_retry_args} "${docker_key_url}" \ | gpg --dearmor -o "${docker_kr_path}" echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ > ${docker_repo_file} @@ -1892,7 +1947,7 @@ function clean_up_sources_lists() { # if ls /etc/apt/sources.list.d/google-cloud*.list ; then rm -f /usr/share/keyrings/cloud.google.gpg - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg + curl ${curl_retry_args} https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg for list in google-cloud google-cloud-logging google-cloud-monitoring ; do list_file="/etc/apt/sources.list.d/${list}.list" if [[ -f "${list_file}" ]]; then @@ -1908,7 +1963,7 @@ function clean_up_sources_lists() { keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi rm -f /usr/share/keyrings/cran-r.gpg - curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ + curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ gpg --dearmor -o /usr/share/keyrings/cran-r.gpg sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list fi @@ -1918,7 +1973,7 @@ function clean_up_sources_lists() { # if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then rm -f /usr/share/keyrings/mysql.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ + curl ${curl_retry_args} 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ gpg --dearmor -o /usr/share/keyrings/mysql.gpg sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list fi @@ -1931,6 +1986,11 @@ function exit_handler() { # Purge private key material until next grant clear_dkms_key + # clean up incomplete build indicators + if test -n "${building_file}" ; then + if gcloud storage ls "${building_file}" ; then gcloud storage rm "${building_file}" || true ; fi + fi + set +ex echo "Exit handler invoked" @@ -2078,9 +2138,11 @@ function harden_sshd_config() { # disable sha1 and md5 use in kex and kex-gss features declare -A feature_map=(["kex"]="kexalgorithms") if ( is_rocky || version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ) ; then - feature_map["kex-gss"]="gssapikexalgorithms" ; fi + feature_map["kex-gss"]="gssapikexalgorithms" + fi for ftr in "${!feature_map[@]}" ; do - export feature=${feature_map[$ftr]} + local feature=${feature_map[$ftr]} + local sshd_config_line sshd_config_line="${feature} $( (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g'; ssh -Q "${ftr}" ) \ @@ -2089,7 +2151,7 @@ function harden_sshd_config() { grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new echo "$sshd_config_line" >> /tmp/sshd_config_new # TODO: test whether sshd will reload with this change before mv - mv /tmp/sshd_config_new /etc/ssh/sshd_config + mv -f /tmp/sshd_config_new /etc/ssh/sshd_config done local svc=ssh if is_rocky ; then svc="sshd" ; fi @@ -2101,6 +2163,8 @@ function prepare_to_install(){ check_os check_secure_boot + curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30" + prepare_gpu_env workdir=/opt/install-dpgce @@ -2178,6 +2242,9 @@ function check_os() { if test -v DATAPROC_VERSION ; then DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" else + # When building custom-images, neither of the above variables + # are defined and we need to make a reasonable guess + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" @@ -2213,9 +2280,8 @@ function dnf_add_repo() { local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" - curl -s -L "${repo_url}" \ + curl ${curl_retry_args} "${repo_url}" \ | dd of="${repo_path}" status=progress -# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ } # @@ -2233,7 +2299,7 @@ function os_add_repo() { mkdir -p "$(dirname "${kr_path}")" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ + curl ${curl_retry_args} "${signing_key_url}" \ | gpg --import --no-default-keyring --keyring "${kr_path}" if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index e9c2d92ad..3d6dbd416 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -64,6 +64,12 @@ def verify_pytorch(self, name): self.upload_test_file(test_filename, name) conda_env="dpgce" + + # until the numa node is selected, every time the GPU is accessed + # from pytorch, log noise about numa node not being selected is + # printed to the console. Selecting numa node before the python is + # executed improves readability of the diagnostic information. + verify_cmd = \ "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \ "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ @@ -77,8 +83,9 @@ def verify_tensorflow(self, name): self.TF_TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_filename, name) # all on a single numa node + conda_env="dpgce" verify_cmd = \ - "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format("dpgce") + \ + "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \ "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ "${envpath}/bin/python {}".format( self.TF_TEST_SCRIPT_FILE_NAME) @@ -144,41 +151,6 @@ def verify_driver_signature(self, name): """ self.assert_instance_command( name, cert_verification_cmd.format(cert_path) ) - @parameterized.parameters( - ("SINGLE", ["m"], GPU_T4, None, None), -# ("STANDARD", ["m"], GPU_T4, None, None), - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "NVIDIA"), - ) - def test_install_gpu_default_agent(self, configuration, machine_suffixes, - master_accelerator, worker_accelerator, - driver_provider): - self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") - - if configuration == 'SINGLE' \ - and self.getImageOs() == 'rocky' \ - and self.getImageVersion() <= pkg_resources.parse_version("2.1"): - # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - self.skipTest("known to fail") - - metadata = None - if driver_provider is not None: - metadata = "gpu-driver-provider={}".format(driver_provider) - self.createCluster( - configuration, - self.INIT_ACTIONS, - machine_type="n1-highmem-32", - master_accelerator=master_accelerator, - worker_accelerator=worker_accelerator, - metadata=metadata, - timeout_in_minutes=90, # This cluster is sized and timed correctly to build the driver and nccl - boot_disk_size="60GB") - for machine_suffix in machine_suffixes: - machine_name="{}-{}".format(self.getClusterName(),machine_suffix) - self.verify_instance(machine_name) - self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) - self.verify_instance_pyspark(machine_name) - self.verify_instance_spark() - @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), ) @@ -252,9 +224,6 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS fails with image version <= 2.1') @@ -344,9 +313,6 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): @@ -380,9 +346,6 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator, worker_accelerator, cuda_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): @@ -430,9 +393,6 @@ def untested_driver_signing(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version, image_os, image_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS fails with image version <= 2.1') From 3384a4de73b9e10105d894331f28b8ee19bb263f Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 6 Feb 2025 11:48:25 -0800 Subject: [PATCH 112/112] reverting changes to presubmit.sh --- cloudbuild/presubmit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh index f796dd1f8..eec7adb76 100644 --- a/cloudbuild/presubmit.sh +++ b/cloudbuild/presubmit.sh @@ -70,7 +70,6 @@ determine_tests_to_run() { changed_dir="${changed_dir%%/*}/" # Run all tests if common directories modified if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then - continue # to be removed before merge echo "All tests will be run: '${changed_dir}' was changed" TESTS_TO_RUN=(":DataprocInitActionsTestSuite") return 0 @@ -105,6 +104,7 @@ run_tests() { bazel test \ --jobs="${max_parallel_tests}" \ --local_test_jobs="${max_parallel_tests}" \ + --flaky_test_attempts=3 \ --action_env="INTERNAL_IP_SSH=true" \ --test_output="all" \ --noshow_progress \