Commit 989b445

changes from testing PR GoogleCloudDataproc#1275

cjac committed Jan 29, 2025
1 parent 07949a9 commit 989b445
Showing 5 changed files with 98 additions and 42 deletions.
7 changes: 3 additions & 4 deletions templates/common/util_functions
@@ -7,9 +7,9 @@ function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | c
# ( version_ge 2.0 2.1 ) evaluates to false
# ( version_ge 2.2 2.1 ) evaluates to true
function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge "$1" "$2" ; )
function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le "$1" "$2" ; )
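
As a quick sanity check of the sort -V ordering these helpers rely on (a hypothetical session, assuming the functions above are sourced):

    version_ge "12.6.3" "12.6.2" && echo "ge holds"      # sort -V ranks 12.6.3 last
    version_gt "2.0" "2.0"       || echo "gt is strict"  # equal versions are not greater
    version_lt "550.54.14" "560.28.03" && echo "lt holds"

The added quoting matters when an argument is empty or contains whitespace: unquoted, version_ge $1 $2 would word-split and compare the wrong tokens.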

function define_os_comparison_functions() {

@@ -500,8 +500,7 @@ function harden_sshd_config() {
sshd_config_line=$(
(sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g';
ssh -Q "${ftr}" ) \
| sort -u | perl -e '@a=grep{!/(sha1|md5)/ig}<STDIN>;
print("$ENV{feature} ",join(",",map{ chomp; $_ }@a), $/) if "@a"')
| sort -u | grep -v -ie sha1 -e md5 | paste -sd "," -)
grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new
echo "$sshd_config_line" >> /tmp/sshd_config_new
# TODO: test whether sshd will reload with this change before mv
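
The rewrite swaps the perl filter for plain coreutils with the same effect: drop any sha1/md5 algorithm, then join the survivors into the comma-separated list sshd_config expects. A standalone sketch with hypothetical algorithm names:

    printf '%s\n' aes128-ctr hmac-sha1 hmac-md5 hmac-sha2-256 \
      | grep -v -ie sha1 -e md5 | paste -sd "," -
    # -> aes128-ctr,hmac-sha2-256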
96 changes: 63 additions & 33 deletions templates/gpu/install_functions
@@ -119,7 +119,7 @@ function set_cuda_runfile_url() {
["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05"
)

# Verify that the file with the indicated combination exists
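
For reference, a sketch of how a version-to-driver map like this is typically consumed (the array name drv_for_cuda is hypothetical; the real declaration sits above this hunk):

    declare -A drv_for_cuda=( ["12.6.3"]="560.35.05" ["12.4.1"]="550.54.15" )
    DRIVER_VERSION="${drv_for_cuda[${CUDA_VERSION}]}"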
@@ -200,6 +200,7 @@ function uninstall_local_cuda_repo(){
}

function install_local_cudnn_repo() {
# https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
is_complete install-local-cudnn-repo && return

pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
@@ -368,6 +369,7 @@ function install_nvidia_nccl() {
mark_complete nccl
}

# https://docs.nvidia.com/deeplearning/cudnn/sla/index.html
function install_nvidia_cudnn() {
if le_debian10 ; then return ; fi
is_complete cudnn && return
@@ -435,45 +437,64 @@ function install_nvidia_cudnn() {
}

function install_pytorch() {
if test -f "${workdir}/complete/pytorch" ; then return ; fi
is_complete pytorch && return

local env
env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')
local mc3=/opt/conda/miniconda3
local envpath="${mc3}/envs/${env}"
if [[ "${env}" == "base" ]]; then
echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi
# Set numa node to 0 for all GPUs
for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do echo 0 > "${f}" ; done

readonly INCLUDE_PYTORCH=$(get_metadata_attribute 'include-pytorch' 'no')
case "${INCLUDE_PYTORCH^^}" in
"1" | "YES" | "TRUE" )
local build_tarball="pytorch_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
local local_tarball="${workdir}/${build_tarball}"
local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
local local_tarball="${workdir}/${build_tarball}"
local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"

output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
if echo "${output}" | grep -q "${gcs_tarball}" ; then
# cache hit - unpack from cache
echo "cache hit"
mkdir -p "${envpath}"
gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
else
local verb=create
if test -d "${envpath}" ; then verb=install ; fi
cudart_spec="cuda-cudart"
if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi
"${mc3}/bin/mamba" "${verb}" -n "${env}" \
-c conda-forge -c nvidia -c rapidsai \
numba pytorch tensorflow[and-cuda] rapids pyspark \
"cuda-version<=${CUDA_VERSION}" "${cudart_spec}"
pushd "${envpath}"
tar czf "${local_tarball}" .
popd
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
fi
;;
* ) echo "skip pytorch install" ;;
esac
touch "${workdir}/complete/pytorch"
if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then
# do not build in tests with < 32 cores
sleep $(( ( RANDOM % 11 ) + 10 ))
while gsutil ls "${gcs_tarball}.building" 2>&1 | grep -q "${gcs_tarball}.building" ; do
sleep 5m
done
fi

output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
if echo "${output}" | grep -q "${gcs_tarball}" ; then
# cache hit - unpack from cache
echo "cache hit"
mkdir -p "${envpath}"
gcloud storage cat "${gcs_tarball}" | tar -C "${envpath}" -xz
else
touch "${local_tarball}.building"
gcloud storage cp "${local_tarball}.building" "${gcs_tarball}.building"
local verb=create
if test -d "${envpath}" ; then verb=install ; fi
cudart_spec="cuda-cudart"
if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi

# Install pytorch and company to this environment
"${mc3}/bin/mamba" "${verb}" -n "${env}" \
-c conda-forge -c nvidia -c rapidsai \
numba pytorch tensorflow[and-cuda] rapids pyspark \
"cuda-version<=${CUDA_VERSION}" "${cudart_spec}"

# Install jupyter kernel in this environment
"${envpath}/bin/python3" -m pip install ipykernel

# package environment and cache in GCS
pushd "${envpath}"
tar czf "${local_tarball}" .
popd
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" || true ; fi
fi

# register the environment as a selectable kernel
"${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})"

mark_complete pytorch
}
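
The .building object above acts as a crude GCS-based build lock: the builder publishes a sentinel before the long mamba solve, and low-core test nodes poll until it disappears before consuming the cache. The pattern in isolation (bucket and object names hypothetical; note that check-then-create is not atomic, so two builders starting at the same moment can still race):

    sentinel="gs://example-bucket/conda/env.tar.gz.building"
    if gsutil ls "${sentinel}" > /dev/null 2>&1 ; then
      # another node is building; wait for the sentinel to vanish
      while gsutil ls "${sentinel}" > /dev/null 2>&1 ; do sleep 300 ; done
    else
      touch /tmp/sentinel
      gcloud storage cp /tmp/sentinel "${sentinel}"
      # ... long build, then upload of the real artifact ...
      gcloud storage rm "${sentinel}"
    fi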

function add_nonfree_components() {
@@ -508,7 +529,16 @@ function add_repo_nvidia_container_toolkit() {

function add_repo_cuda() {
if is_debuntu ; then
install_cuda_keyring_pkg # 11.7+, 12.0+
if version_le "${CUDA_VERSION}" 11.6 ; then
local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
| sudo tee "${sources_list_path}"
curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
-o "${kr_path}"
else
install_cuda_keyring_pkg # 11.7+, 12.0+
fi
elif is_rocky ; then
execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
fi
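
With shortname=debian12 (a hypothetical value), the pre-11.7 branch above writes an apt source of the form:

    # /etc/apt/sources.list.d/cuda-debian12-x86_64.list
    deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /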
5 changes: 3 additions & 2 deletions templates/gpu/spark_functions
@@ -7,14 +7,15 @@ function download_spark_jar() {

function install_spark_rapids() {
# Update SPARK RAPIDS config
local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
local DEFAULT_SPARK_RAPIDS_VERSION
DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3

# https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
local -r scala_ver="2.12"

if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
fi

readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
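
Because local is function-scoped in bash, the second local in the 2.0 branch re-declared a variable the function already owned; declaring once and assigning per branch states the intent plainly. Separating declaration from assignment is also the usual fix for ShellCheck SC2155 whenever the right-hand side is a command substitution (get_version here is hypothetical):

    f() {
      local ver
      ver="$(get_version)" || return  # exit status of get_version is preserved
    }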
2 changes: 1 addition & 1 deletion templates/gpu/util_functions
@@ -60,7 +60,7 @@ function set_cuda_version() {
case "${DATAPROC_IMAGE_VERSION}" in
"2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
"2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
"2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
"2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
* )
echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
exit 1
30 changes: 28 additions & 2 deletions templates/gpu/yarn_functions
@@ -12,6 +12,25 @@ function configure_yarn_gpu_resources() {
'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'

set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'

# Older CapacityScheduler versions do not permit use of GPU resources; switch to FairScheduler on 2.0 and below
if version_lt "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
fs_xml="$HADOOP_CONF_DIR/fair-scheduler.xml"
set_hadoop_property 'yarn-site.xml' \
'yarn.resourcemanager.scheduler.class' 'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler'
set_hadoop_property 'yarn-site.xml' \
"yarn.scheduler.fair.user-as-default-queue" "false"
set_hadoop_property 'yarn-site.xml' \
"yarn.scheduler.fair.allocation.file" "${fs_xml}"
set_hadoop_property 'yarn-site.xml' \
'yarn.scheduler.fair.resource-calculator' 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
cat > "${fs_xml}" <<EOF
<!-- ${fs_xml} -->
<allocations>
<queueMaxAppsDefault>1</queueMaxAppsDefault>
</allocations>
EOF
fi
}
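
Each set_hadoop_property call above lands as a <property> entry in the named file; the scheduler switch, for example, renders in yarn-site.xml roughly as:

    <property>
      <name>yarn.resourcemanager.scheduler.class</name>
      <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
    </property>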

function configure_gpu_script() {
@@ -44,9 +63,15 @@ function configure_gpu_script() {
#
# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}

set -e
resources_json="/dev/shm/nvidia/gpusResources.json"
if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi

mkdir -p "$(dirname ${resources_json})"

ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')

echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}"
EOF

chmod a+rx "${gpus_resources_script}"
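
To see the perl join in isolation (assuming a hypothetical two-GPU machine where nvidia-smi reports indexes 0 and 1):

    printf '0\n1\n' | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))'
    # -> "0","1"
    # so the script emits: {"name": "gpu", "addresses":["0","1"]}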
@@ -78,14 +103,14 @@ EOF
# having AQE enabled gives the user the best performance.
spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
spark.executor.resource.gpu.amount=${gpu_count}
spark.plugins=com.nvidia.spark.SQLPlugin
spark.executor.cores=${executor_cores}
spark.executor.memory=${executor_memory_gb}G
spark.dynamicAllocation.enabled=false
# please update this config according to your application
spark.task.resource.gpu.amount=${gpu_amount}
spark.task.cpus=2
spark.yarn.unmanagedAM.enabled=false
spark.plugins=com.nvidia.spark.SQLPlugin
###### END : RAPIDS properties for Spark ${SPARK_VERSION} ######
EOF
}
@@ -97,6 +122,7 @@ function configure_yarn_nodemanager_gpu() {
'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
set_hadoop_property 'yarn-site.xml' \
'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"

configure_yarn_nodemanager
}
