diff --git a/build_tools/ci/BUILD b/build_tools/ci/BUILD
index d5568177b4bd11..5f452ad103a97a 100644
--- a/build_tools/ci/BUILD
+++ b/build_tools/ci/BUILD
@@ -41,3 +41,8 @@ diff_test(
     file2 = ":generated_build_commands",
     tags = ["not_run:arm"],
 )
+
+sh_binary(
+    name = "parallel_gpu_execute",
+    srcs = ["parallel_gpu_execute.sh"],
+)
diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py
index 8f0ff3fda6d4da..45bddabc94da73 100755
--- a/build_tools/ci/build.py
+++ b/build_tools/ci/build.py
@@ -267,7 +267,7 @@ def nvidia_gpu_build_with_compute_capability(
     + extra_gpu_tags,
     build_tag_filters=("-no_oss", "requires-gpu-nvidia", "gpu", "-rocm-only"),
     options={
-        "run_under": "//tools/ci_build/gpu_build:parallel_gpu_execute",
+        "run_under": "//build_tools/ci:parallel_gpu_execute",
         "repo_env": f"TF_CUDA_COMPUTE_CAPABILITIES={compute_capability/10}",
         "@cuda_driver//:enable_forward_compatibility": "true",
         **_DEFAULT_BAZEL_OPTIONS,
diff --git a/build_tools/ci/golden_commands.txt b/build_tools/ci/golden_commands.txt
index a2061701ffb915..56395824ca3a1d 100644
--- a/build_tools/ci/golden_commands.txt
+++ b/build_tools/ci/golden_commands.txt
@@ -21,8 +21,8 @@ $KOKORO_ARTIFACTS_DIR/github/xla/.kokoro/generate_index_html.sh index.html
 nvidia-smi
 parallel --ungroup --retries 3 --delay 15 --nonall -- docker pull us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest
 docker run --detach --name=xla_ci --rm --interactive --tty --volume=./github:/github --workdir=/github/xla us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest bash
-docker exec xla_ci parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
-docker exec xla_ci bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
+docker exec xla_ci parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
+docker exec xla_ci bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
 docker exec xla_ci bazel analyze-profile profile.json.gz
 docker stop xla_ci
 # END BuildType.GPU
diff --git a/build_tools/ci/parallel_gpu_execute.sh b/build_tools/ci/parallel_gpu_execute.sh
new file mode 100755
index 00000000000000..a00dcbc3f3404a
--- /dev/null
+++ b/build_tools/ci/parallel_gpu_execute.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+#
+# A script to run multiple GPU tests in parallel controlled with an environment
+# variable.
+#
+# Required environment variables:
+#  TF_GPU_COUNT = Number of GPUs available.
+
+TF_GPU_COUNT=${TF_GPU_COUNT:-4}
+TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-8}
+
+# This function is used below in rlocation to check that a path is absolute
+function is_absolute {
+  [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
+}
+
+export TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB:-2048}
+
+# *******************************************************************
+#         This section of the script is needed to
+#         make things work on windows under msys.
+# *******************************************************************
+RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST"
+function rlocation() {
+  if is_absolute "$1" ; then
+    # If the file path is already fully specified, simply return it.
+    echo "$1"
+  elif [[ -e "$TEST_SRCDIR/$1" ]]; then
+    # If the file exists in the $TEST_SRCDIR then just use it.
+    echo "$TEST_SRCDIR/$1"
+  elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then
+    # If a runfiles manifest file exists then use it.
+    echo "$(grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //')"
+  fi
+}
+
+TEST_BINARY="$(rlocation $TEST_WORKSPACE/${1#./})"
+shift
+# *******************************************************************
+
+mkdir -p /var/lock
+# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU
+# slots to run a test at.
+#
+# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU.
+# So, we iterate over TF_TESTS_PER_GPU first.
+for j in `seq 0 $((TF_TESTS_PER_GPU-1))`; do
+  for i in `seq 0 $((TF_GPU_COUNT-1))`; do
+    exec {lock_fd}>/var/lock/gpulock${i}_${j} || exit 1
+    if flock -n "$lock_fd";
+    then
+      (
+        # This export only works within the brackets, so it is isolated to one
+        # single command.
+        export CUDA_VISIBLE_DEVICES=$i
+        export HIP_VISIBLE_DEVICES=$i
+        echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES"
+        "$TEST_BINARY" $@
+      )
+      return_code=$?
+      flock -u "$lock_fd"
+      exit $return_code
+    fi
+  done
+done
+
+echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
+exit 1
diff --git a/build_tools/rocm/run_xla.sh b/build_tools/rocm/run_xla.sh
index 2ed5dc2d317acc..ef556b5b850ede 100755
--- a/build_tools/rocm/run_xla.sh
+++ b/build_tools/rocm/run_xla.sh
@@ -71,5 +71,5 @@ bazel \
     --action_env=TF_ROCM_AMDGPU_TARGETS=gfx90a \
     --action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
     --action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
-    --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute \
+    --run_under=//build_tools/ci:parallel_gpu_execute \
     -- //xla/...
diff --git a/build_tools/rocm/run_xla_multi_gpu.sh b/build_tools/rocm/run_xla_multi_gpu.sh
index 4d7d67d2fe8475..5cca78c8a70992 100755
--- a/build_tools/rocm/run_xla_multi_gpu.sh
+++ b/build_tools/rocm/run_xla_multi_gpu.sh
@@ -26,7 +26,7 @@
 #   //xla/pjrt/distributed:topology_util_test
 #   //xla/pjrt/distributed:client_server_test
 # ```
-# Also these tests do not use `--run_under=//tools/ci_build/gpu_build:parallel_gpu_execute` with bazel which
+# Also these tests do not use `--run_under=//build_tools/ci:parallel_gpu_execute` with bazel which
 # locks down individual gpus thus making multi gpu tests impossible to run
 
 set -e
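With `--run_under` pointing at `//build_tools/ci:parallel_gpu_execute`, Bazel builds the wrapper and prepends it to each test command line: the script receives the test binary as its first argument, claims one of the TF_GPU_COUNT * TF_TESTS_PER_GPU flock slots under /var/lock, and pins the test to the chosen device via CUDA_VISIBLE_DEVICES/HIP_VISIBLE_DEVICES. A minimal sketch of running a single test under the relocated label follows; the test target and the GPU/slot counts are illustrative assumptions, not part of this change.

bazel test \
  --run_under=//build_tools/ci:parallel_gpu_execute \
  --test_env=TF_GPU_COUNT=2 --test_env=TF_TESTS_PER_GPU=1 \
  //xla/tests:some_gpu_test  # hypothetical GPU test target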