From 4ea5b6f4b8d008c6c335e4051bc83bf0083b6d6e Mon Sep 17 00:00:00 2001
From: Harsha HS
Date: Thu, 10 Oct 2024 10:33:59 -0700
Subject: [PATCH] Add multigpu script and disable triton tests

---
 build_tools/rocm/run_xla_multi_gpu.sh         | 82 +++++++++++++++++++
 xla/service/gpu/BUILD                         |  7 +-
 xla/service/gpu/tests/BUILD                   |  1 -
 xla/tests/collective_ops_test_e2e.cc          |  1 +
 .../functional_hlo_runner_test.cc             |  2 +
 5 files changed, 87 insertions(+), 6 deletions(-)
 create mode 100755 build_tools/rocm/run_xla_multi_gpu.sh

diff --git a/build_tools/rocm/run_xla_multi_gpu.sh b/build_tools/rocm/run_xla_multi_gpu.sh
new file mode 100755
index 0000000000000..d030b3e3f20b0
--- /dev/null
+++ b/build_tools/rocm/run_xla_multi_gpu.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+# If rocm-smi exists locally (it should) use it to find
+# out how many GPUs we have to test with.
+# The probe runs as an `if` condition so that a missing or failing rocm-smi
+# does not trip `set -e` before the single-GPU fallback can be applied.
+if rocm-smi -i; then
+    TF_GPU_COUNT=$(rocm-smi -i|grep 'Device ID' |grep 'GPU' |wc -l)
+else
+    TF_GPU_COUNT=1
+fi
+if [[ $TF_GPU_COUNT -lt 4 ]]; then
+    echo "Found only ${TF_GPU_COUNT} gpus, multi-gpu tests need atleast 4 gpus."
+    exit
+fi
+
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
+
+echo ""
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
+echo ""
+
+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+else
+    if [[ -z "${ROCM_PATH}" ]]; then
+        ROCM_INSTALL_DIR=/opt/rocm-6.0.2
+    else
+        ROCM_INSTALL_DIR=$ROCM_PATH
+    fi
+fi
+
+export PYTHON_BIN_PATH=`which python3`
+export TF_NEED_ROCM=1
+export ROCM_PATH=$ROCM_INSTALL_DIR
+TAGS_FILTER="-oss_excluded,-oss_serial"
+UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
+TAGS_FILTER="${TAGS_FILTER},${UNSUPPORTED_GPU_TAGS// /,}"
+
+# Both XLA flags are passed in a single --action_env=XLA_FLAGS value: when the
+# same variable is given to --action_env more than once, only the last one wins.
+bazel \
+    test \
+    --config=rocm \
+    --build_tag_filters=${TAGS_FILTER} \
+    --test_tag_filters=${TAGS_FILTER} \
+    --test_timeout=920,2400,7200,9600 \
+    --test_sharding_strategy=disabled \
+    --test_output=errors \
+    --flaky_test_attempts=3 \
+    --keep_going \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --action_env=XLA_FLAGS="--xla_gpu_force_compilation_parallelism=16 --xla_gpu_enable_llvm_module_compilation_parallelism=true" \
+    -- //xla/tests:collective_ops_test_e2e_gpu_amd_any \
+       //xla/tests:collective_ops_test_gpu_amd_any \
+       //xla/tests:replicated_io_feed_test_gpu_amd_any \
+       //xla/tools/multihost_hlo_runner:functional_hlo_runner_test_gpu_amd_any \
+       //xla/pjrt/distributed:topology_util_test \
+       //xla/pjrt/distributed:client_server_test
diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD
index c1164551aaa2e..8997fc9e44cc9 100644
--- a/xla/service/gpu/BUILD
+++ b/xla/service/gpu/BUILD
@@ -752,7 +752,6 @@ xla_test(
     backends = [
         "gpu_a100",
         "gpu_h100",
-        "gpu_amd_any",
     ],
     shard_count = 10,
     tags = ["nomac"],
@@ -1255,7 +1254,6 @@ xla_test(
     backends = [
         "gpu_a100",
         "gpu_h100",
-        "gpu_amd_any",
     ],
     deps = [
         ":gpu_device_info_for_tests",
@@ -1263,7 +1261,7 @@ xla_test(
         ":triton_fusion_analysis",
         ":triton_support",
         ":triton_test_utils",
-        "//third_party/protobuf",
+        "third_party/protobuf",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
@@ -1284,7 +1282,6 @@ xla_test(
     backends = [
         "gpu_a100",
         "gpu_h100",
-        "gpu_amd_any",
     ],
     tags = ["nomac"],
     deps = [
@@ -6165,7 +6162,7 @@ xla_test(
     backend_tags = {"gpu": [
         "requires-gpu-sm80",
     ]},
-    backends = ["gpu"],
+    backends = ["gpu_a100", "gpu_h100"],
     deps = [
         ":autotuner_compile_util",
         ":autotuner_util",
diff --git a/xla/service/gpu/tests/BUILD b/xla/service/gpu/tests/BUILD
index fdf73310efd16..8f8317f88114b 100644
--- a/xla/service/gpu/tests/BUILD
+++ b/xla/service/gpu/tests/BUILD
@@ -469,7 +469,6 @@ xla_test(
     backends = [
         "gpu_a100",
         "gpu_v100",
-        "gpu_amd_any",
     ],
     deps = [
         ":gpu_codegen_test",
diff --git a/xla/tests/collective_ops_test_e2e.cc b/xla/tests/collective_ops_test_e2e.cc
index 2c7d53d59d2ee..a9449af7804e2 100644
--- a/xla/tests/collective_ops_test_e2e.cc
+++ b/xla/tests/collective_ops_test_e2e.cc
@@ -766,6 +766,7 @@ ENTRY main.12 {
 
 TEST_F(CollectiveOpsTestE2EWindowedNonWindowed,
        WindowedEinsumE2EAllGatherAndReduceScatterF8) {
+  GTEST_SKIP() << "F8E4M3 not supported";
   absl::string_view kModuleReplicatedStr = R"(
 HloModule pjit__unnamed_wrapped_function_, entry_computation_layout={(<>[2,16,48]{2,1,0}, <>[48,192]{1,0}, <>[192,48]{1,0}, bf16[], bf16[], bf16[], bf16[], bf16[])->bf16[2,16,48]{2,1,0}}, allow_spmd_sharding_propagation_to_parameters={false,false,false,false}, num_partitions=4
 
diff --git a/xla/tools/multihost_hlo_runner/functional_hlo_runner_test.cc b/xla/tools/multihost_hlo_runner/functional_hlo_runner_test.cc
index 8486ba21d4ec2..4018460bc90e3 100644
--- a/xla/tools/multihost_hlo_runner/functional_hlo_runner_test.cc
+++ b/xla/tools/multihost_hlo_runner/functional_hlo_runner_test.cc
@@ -263,6 +263,8 @@ TEST_F(FunctionalHloRunnerTest, ShardedAutotuningWorks) {
     GTEST_SKIP() << "GPU-only test.";
   }
 
+  GTEST_SKIP() << "Triton is not enabled.";
+
   tsl::SubProcess child[kNumNodes];
   for (int node_id = 0; node_id < kNumNodes; ++node_id) {
     std::vector argv;