Skip to content

Commit

Permalink
GPU does not work with the CapacityScheduler on Dataproc 2.0; use the FairScheduler; …
Browse files Browse the repository at this point in the history
…also protect against a race condition when removing the .building files
  • Loading branch information
cjac committed Jan 24, 2025
1 parent 02732e1 commit 3b3e90e
Showing 1 changed file with 10 additions and 5 deletions.
15 changes: 10 additions & 5 deletions gpu/install_gpu_driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -687,7 +687,7 @@ function install_nvidia_nccl() {
popd
tar xzvf "${local_tarball}"
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
gcloud storage rm "${gcs_tarball}.building"
if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" ; fi
rm "${local_tarball}"
fi
}
Expand Down Expand Up @@ -813,7 +813,7 @@ function install_pytorch() {
tar czf "${local_tarball}" .
popd
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
gcloud storage rm "${gcs_tarball}.building"
if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" ; fi
fi
touch "${workdir}/complete/pytorch"
}
Expand Down Expand Up @@ -1014,7 +1014,7 @@ function build_driver_from_github() {
"${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
$(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
gcloud storage rm "${gcs_tarball}.building"
if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" ; fi
rm "${local_tarball}"
make clean
popd
Expand Down Expand Up @@ -1160,7 +1160,7 @@ function install_nvidia_userspace_runfile() {
/var/log/nvidia-installer.log \
$(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
gcloud storage cp "${local_tarball}" "${gcs_tarball}"
gcloud storage rm "${gcs_tarball}.building"
if gcloud storage ls "${gcs_tarball}.building" ; then gcloud storage rm "${gcs_tarball}.building" ; fi
fi
fi

Expand Down Expand Up @@ -1374,7 +1374,12 @@ function configure_yarn_resources() {
# This configuration should be applied only if GPU is attached to the node
function configure_yarn_nodemanager() {
if [[ "${gpu_count}" == "0" ]] ; then return ; fi

# On Dataproc images older than 2.1, CapacityScheduler does not permit use of GPU resources; switch to FairScheduler
if version_lt "${DATAPROC_IMAGE_VERSION}" 2.1 ; then
set_hadoop_property 'yarn-site.xml' \
'yarn.resourcemanager.scheduler.class' \
'org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler'
fi
set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
set_hadoop_property 'yarn-site.xml' \
'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
Expand Down

0 comments on commit 3b3e90e

Please sign in to comment.