diff --git a/build_tools/ci/BUILD b/build_tools/ci/BUILD
index d5568177b4bd11..5f452ad103a97a 100644
--- a/build_tools/ci/BUILD
+++ b/build_tools/ci/BUILD
@@ -41,3 +41,8 @@ diff_test(
     file2 = ":generated_build_commands",
     tags = ["not_run:arm"],
 )
+
+sh_binary(
+    name = "parallel_gpu_execute",
+    srcs = ["parallel_gpu_execute.sh"],
+)
diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py
index 8f0ff3fda6d4da..45bddabc94da73 100755
--- a/build_tools/ci/build.py
+++ b/build_tools/ci/build.py
@@ -267,7 +267,7 @@ def nvidia_gpu_build_with_compute_capability(
     + extra_gpu_tags,
     build_tag_filters=("-no_oss", "requires-gpu-nvidia", "gpu", "-rocm-only"),
     options={
-        "run_under": "//tools/ci_build/gpu_build:parallel_gpu_execute",
+        "run_under": "//build_tools/ci:parallel_gpu_execute",
         "repo_env": f"TF_CUDA_COMPUTE_CAPABILITIES={compute_capability/10}",
         "@cuda_driver//:enable_forward_compatibility": "true",
         **_DEFAULT_BAZEL_OPTIONS,
diff --git a/build_tools/ci/golden_commands.txt b/build_tools/ci/golden_commands.txt
index a2061701ffb915..56395824ca3a1d 100644
--- a/build_tools/ci/golden_commands.txt
+++ b/build_tools/ci/golden_commands.txt
@@ -21,8 +21,8 @@ $KOKORO_ARTIFACTS_DIR/github/xla/.kokoro/generate_index_html.sh index.html
 nvidia-smi
 parallel --ungroup --retries 3 --delay 15 --nonall -- docker pull us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest
 docker run --detach --name=xla_ci --rm --interactive --tty --volume=./github:/github --workdir=/github/xla us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest bash
-docker exec xla_ci parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
-docker exec xla_ci bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
+docker exec xla_ci parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
+docker exec xla_ci bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
 docker exec xla_ci bazel analyze-profile profile.json.gz
 docker stop xla_ci
 # END BuildType.GPU
diff --git a/build_tools/ci/parallel_gpu_execute.sh b/build_tools/ci/parallel_gpu_execute.sh
new file mode 100755
index 00000000000000..a00dcbc3f3404a
--- /dev/null
+++ b/build_tools/ci/parallel_gpu_execute.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+#
+# A script to run multiple GPU tests in parallel controlled with an environment
+# variable.
+#
+# Required environment variables:
+#  TF_GPU_COUNT = Number of GPUs available.
+
+TF_GPU_COUNT=${TF_GPU_COUNT:-4}
+TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-8}
+
+# This function is used below in rlocation to check that a path is absolute
+function is_absolute {
+  [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
+}
+
+export TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB:-2048}
+
+# *******************************************************************
+#         This section of the script is needed to
+#         make things work on windows under msys.
+# *******************************************************************
+RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST"
+function rlocation() {
+  if is_absolute "$1" ; then
+    # If the file path is already fully specified, simply return it.
+    echo "$1"
+  elif [[ -e "$TEST_SRCDIR/$1" ]]; then
+    # If the file exists in the $TEST_SRCDIR then just use it.
+    echo "$TEST_SRCDIR/$1"
+  elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then
+    # If a runfiles manifest file exists then use it.
+    echo "$(grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //')"
+  fi
+}
+
+TEST_BINARY="$(rlocation $TEST_WORKSPACE/${1#./})"
+shift
+# *******************************************************************
+
+mkdir -p /var/lock
+# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU
+# slots to run a test at.
+#
+# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU.
+# So, we iterate over TF_TESTS_PER_GPU first.
+for j in `seq 0 $((TF_TESTS_PER_GPU-1))`; do
+  for i in `seq 0 $((TF_GPU_COUNT-1))`; do
+    exec {lock_fd}>/var/lock/gpulock${i}_${j} || exit 1
+    if flock -n "$lock_fd";
+    then
+      (
+        # This export only works within the brackets, so it is isolated to one
+        # single command.
+        export CUDA_VISIBLE_DEVICES=$i
+        export HIP_VISIBLE_DEVICES=$i
+        echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES"
+        "$TEST_BINARY" $@
+      )
+      return_code=$?
+      flock -u "$lock_fd"
+      exit $return_code
+    fi
+  done
+done
+
+echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
+exit 1
diff --git a/build_tools/rocm/run_xla.sh b/build_tools/rocm/run_xla.sh
index 2ed5dc2d317acc..ef556b5b850ede 100755
--- a/build_tools/rocm/run_xla.sh
+++ b/build_tools/rocm/run_xla.sh
@@ -71,5 +71,5 @@ bazel \
     --action_env=TF_ROCM_AMDGPU_TARGETS=gfx90a \
     --action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
     --action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
-    --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute \
+    --run_under=//build_tools/ci:parallel_gpu_execute \
     -- //xla/...
diff --git a/build_tools/rocm/run_xla_multi_gpu.sh b/build_tools/rocm/run_xla_multi_gpu.sh
index 4d7d67d2fe8475..5cca78c8a70992 100755
--- a/build_tools/rocm/run_xla_multi_gpu.sh
+++ b/build_tools/rocm/run_xla_multi_gpu.sh
@@ -26,7 +26,7 @@
 #   //xla/pjrt/distributed:topology_util_test
 #   //xla/pjrt/distributed:client_server_test
 # ```
-# Also these tests do not use `--run_under=//tools/ci_build/gpu_build:parallel_gpu_execute` with bazel which
+# Also these tests do not use `--run_under=//build_tools/ci:parallel_gpu_execute` with bazel which
 # locks down individual gpus thus making multi gpu tests impossible to run
 
 set -e
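With `--run_under` pointing at `//build_tools/ci:parallel_gpu_execute`, Bazel builds the wrapper and prepends it to each test command line: the script receives the test binary as its first argument, claims one of the TF_GPU_COUNT * TF_TESTS_PER_GPU flock slots under /var/lock, and pins the test to the chosen device via CUDA_VISIBLE_DEVICES/HIP_VISIBLE_DEVICES. A minimal sketch of running a single test under the relocated label follows; the test target and the GPU/slot counts are illustrative assumptions, not part of this change.

bazel test \
  --run_under=//build_tools/ci:parallel_gpu_execute \
  --test_env=TF_GPU_COUNT=2 --test_env=TF_TESTS_PER_GPU=1 \
  //xla/tests:some_gpu_test  # hypothetical GPU test target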