Skip to content

Commit

Permalink
Move parallel_gpu_execute.sh to build_tools/ci
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 718115722
  • Loading branch information
ddunl authored and Google-ML-Automation committed Jan 23, 2025
1 parent 427aaaa commit 153ff32
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 5 deletions.
5 changes: 5 additions & 0 deletions build_tools/ci/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,8 @@ diff_test(
file2 = ":generated_build_commands",
tags = ["not_run:arm"],
)

sh_binary(
name = "parallel_gpu_execute",
srcs = ["parallel_gpu_execute.sh"],
)
2 changes: 1 addition & 1 deletion build_tools/ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def nvidia_gpu_build_with_compute_capability(
+ extra_gpu_tags,
build_tag_filters=("-no_oss", "requires-gpu-nvidia", "gpu", "-rocm-only"),
options={
"run_under": "//tools/ci_build/gpu_build:parallel_gpu_execute",
"run_under": "//build_tools/ci:parallel_gpu_execute",
"repo_env": f"TF_CUDA_COMPUTE_CAPABILITIES={compute_capability/10}",
"@cuda_driver//:enable_forward_compatibility": "true",
**_DEFAULT_BAZEL_OPTIONS,
Expand Down
4 changes: 2 additions & 2 deletions build_tools/ci/golden_commands.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ $KOKORO_ARTIFACTS_DIR/github/xla/.kokoro/generate_index_html.sh index.html
nvidia-smi
parallel --ungroup --retries 3 --delay 15 --nonall -- docker pull us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest
docker run --detach --name=xla_ci --rm --interactive --tty --volume=./github:/github --workdir=/github/xla us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest bash
docker exec xla_ci parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
docker exec xla_ci bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
docker exec xla_ci parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
docker exec xla_ci bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
docker exec xla_ci bazel analyze-profile profile.json.gz
docker stop xla_ci
# END BuildType.GPU
Expand Down
83 changes: 83 additions & 0 deletions build_tools/ci/parallel_gpu_execute.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
#
# A script to run multiple GPU tests in parallel controlled with an environment
# variable.
#
# Required environment variables:
# TF_GPU_COUNT = Number of GPUs available.

TF_GPU_COUNT=${TF_GPU_COUNT:-4}
TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-8}

# This function is used below in rlocation to check that a path is absolute
function is_absolute {
[[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
}

export TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB:-2048}

# *******************************************************************
# This section of the script is needed to
# make things work on windows under msys.
# *******************************************************************
RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST"
function rlocation() {
if is_absolute "$1" ; then
# If the file path is already fully specified, simply return it.
echo "$1"
elif [[ -e "$TEST_SRCDIR/$1" ]]; then
# If the file exists in the $TEST_SRCDIR then just use it.
echo "$TEST_SRCDIR/$1"
elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then
# If a runfiles manifest file exists then use it.
echo "$(grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //')"
fi
}

TEST_BINARY="$(rlocation $TEST_WORKSPACE/${1#./})"
shift
# *******************************************************************

mkdir -p /var/lock
# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU
# slots to run a test at.
#
# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU.
# So, we iterate over TF_TESTS_PER_GPU first.
for j in `seq 0 $((TF_TESTS_PER_GPU-1))`; do
for i in `seq 0 $((TF_GPU_COUNT-1))`; do
exec {lock_fd}>/var/lock/gpulock${i}_${j} || exit 1
if flock -n "$lock_fd";
then
(
# This export only works within the brackets, so it is isolated to one
# single command.
export CUDA_VISIBLE_DEVICES=$i
export HIP_VISIBLE_DEVICES=$i
echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES"
"$TEST_BINARY" $@
)
return_code=$?
flock -u "$lock_fd"
exit $return_code
fi
done
done

echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
exit 1
2 changes: 1 addition & 1 deletion build_tools/rocm/run_xla.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,5 +71,5 @@ bazel \
--action_env=TF_ROCM_AMDGPU_TARGETS=gfx90a \
--action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
--action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
--run_under=//tools/ci_build/gpu_build:parallel_gpu_execute \
--run_under=//build_tools/ci:parallel_gpu_execute \
-- //xla/...
2 changes: 1 addition & 1 deletion build_tools/rocm/run_xla_multi_gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# //xla/pjrt/distributed:topology_util_test
# //xla/pjrt/distributed:client_server_test
# ```
# Also these tests do not use `--run_under=//tools/ci_build/gpu_build:parallel_gpu_execute` with bazel which
# Also these tests do not use `--run_under=//build_tools/ci:parallel_gpu_execute` with bazel which
# locks down individual gpus thus making multi gpu tests impossible to run

set -e
Expand Down

0 comments on commit 153ff32

Please sign in to comment.