Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix ucx against inference branch #1230

Merged
merged 2 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 11 additions & 97 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake)
set(FLEXFLOW_ROOT ${CMAKE_CURRENT_LIST_DIR})
set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC -UNDEBUG")

# set std 17
#set(CMAKE_CXX_STANDARD 17)
#set(CMAKE_CUDA_STANDARD 17)

option(INFERENCE_TESTS "Run inference tests" OFF)
set(LIBTORCH_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../libtorch" CACHE STRING "LibTorch Path")
if (INFERENCE_TESTS)
Expand Down Expand Up @@ -69,106 +73,15 @@ option(FF_USE_PREBUILT_NCCL "Enable use of NCCL pre-compiled library, if availab
option(FF_USE_PREBUILT_LEGION "Enable use of Legion pre-compiled library, if available" ON)
option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF)

# option for using Python
set(FF_GASNET_CONDUITS aries udp mpi ibv ucx)
# option for using network
set(FF_GASNET_CONDUITS aries udp mpi ibv)
set(FF_GASNET_CONDUIT "mpi" CACHE STRING "Select GASNet conduit ${FF_GASNET_CONDUITS}")
set_property(CACHE FF_GASNET_CONDUIT PROPERTY STRINGS ${FF_GASNET_CONDUITS})
set(FF_LEGION_NETWORKS "" CACHE STRING "Network backend(s) to use")

if ((FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") OR FF_LEGION_NETWORKS STREQUAL "ucx")
# Pick the UCX source archive: FF_UCX_URL when the user supplied one,
# otherwise the pinned upstream release tarball.
set(UCX_URL "${FF_UCX_URL}")
if("${FF_UCX_URL}" STREQUAL "")
  set(UCX_URL "https://github.com/openucx/ucx/releases/download/v1.14.0-rc1/ucx-1.14.0.tar.gz")
endif()

# Paths used by the UCX download/build steps; everything lives in the build tree.
set(UCX_DIR ${CMAKE_CURRENT_BINARY_DIR}/ucx)
set(UCX_CONFIG_FILE ${UCX_DIR}/config.txt)
set(UCX_BUILD_OUTPUT ${UCX_DIR}/build.log)
get_filename_component(UCX_COMPRESSED_FILE_NAME "${UCX_URL}" NAME)
set(UCX_COMPRESSED_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${UCX_COMPRESSED_FILE_NAME}")

# Decide whether UCX has to be (re)built. UCX_CONFIG_FILE acts as a stamp that
# stores the URL of the last successful build: a missing stamp or a different
# URL triggers a build, a matching URL skips it.
if(NOT EXISTS ${UCX_CONFIG_FILE})
  message(STATUS "Configuring and building UCX...")
  set(UCX_BUILD_NEEDED ON)
else()
  file(READ ${UCX_CONFIG_FILE} PREV_UCX_CONFIG)
  if("${UCX_URL}" STREQUAL "${PREV_UCX_CONFIG}")
    # Stamp matches the requested URL - nothing to do.
    set(UCX_BUILD_NEEDED OFF)
  else()
    message(STATUS "UCX configuration has changed - rebuilding...")
    set(UCX_BUILD_NEEDED ON)
  endif()
endif()

# Download (unless the archive is already present), unpack, configure, build and
# install UCX at configure time. Runs only when UCX_BUILD_NEEDED is ON.
# On success the URL is written to UCX_CONFIG_FILE so later configure runs can
# detect that the same UCX has already been built.
if(UCX_BUILD_NEEDED)
  if(NOT EXISTS "${UCX_COMPRESSED_FILE_PATH}")
    message(STATUS "Downloading openucx/ucx from: ${UCX_URL}")
    file(
      DOWNLOAD
      "${UCX_URL}" "${UCX_COMPRESSED_FILE_PATH}"
      SHOW_PROGRESS
      STATUS status
      LOG log
    )

    # STATUS is a two-element list: numeric code followed by a message.
    list(GET status 0 status_code)
    list(GET status 1 status_string)

    if(status_code EQUAL 0)
      message(STATUS "Downloading... done")
    else()
      # A failed download can leave an empty or partial file behind. Remove it,
      # otherwise the next configure run would take the "already exists" branch
      # and try to unpack a broken archive instead of retrying the download.
      file(REMOVE "${UCX_COMPRESSED_FILE_PATH}")
      message(FATAL_ERROR "error: downloading '${UCX_URL}' failed
status_code: ${status_code}
status_string: ${status_string}
log:
--- LOG BEGIN ---
${log}
--- LOG END ---"
      )
    endif()
  else()
    message(STATUS "${UCX_COMPRESSED_FILE_NAME} already exists")
  endif()

  # Portable replacement for `mkdir -p`.
  file(MAKE_DIRECTORY "${UCX_DIR}")

  # Unpack the sources directly into UCX_DIR (dropping the top-level directory
  # of the tarball) and stop early if extraction fails, rather than letting the
  # build step fail later with a less helpful error.
  execute_process(
    COMMAND tar xzf "${UCX_COMPRESSED_FILE_PATH}" -C "${UCX_DIR}" --strip-components 1
    RESULT_VARIABLE UCX_EXTRACT_STATUS
  )
  if(UCX_EXTRACT_STATUS)
    message(FATAL_ERROR "Extracting ${UCX_COMPRESSED_FILE_PATH} into ${UCX_DIR} failed (result = ${UCX_EXTRACT_STATUS})")
  endif()

  message(STATUS "Building UCX...")
  # stdout and stderr of the configure/make/install pipeline both go to
  # UCX_BUILD_OUTPUT.
  execute_process(
    COMMAND sh -c "cd ${UCX_DIR} && ${UCX_DIR}/contrib/configure-release --prefix=${UCX_DIR}/install --enable-mt && make -j8 && make install"
    RESULT_VARIABLE UCX_BUILD_STATUS
    OUTPUT_FILE ${UCX_BUILD_OUTPUT}
    ERROR_FILE ${UCX_BUILD_OUTPUT}
  )

  if(UCX_BUILD_STATUS)
    message(FATAL_ERROR "UCX build result = ${UCX_BUILD_STATUS} - see ${UCX_BUILD_OUTPUT} for more details")
  endif()

  # Currently, we use default build configurations for UCX and therefore only save URL as configuration settings
  file(WRITE ${UCX_CONFIG_FILE} "${UCX_URL}")
endif()

# GASNet with the UCX conduit: expose the UCX install prefix through the
# UCX_HOME environment variable and install the UCX bin/include/lib/share trees
# alongside FlexFlow. set(ENV{...}) only affects the currently running CMake
# process.
# NOTE(review): presumably UCX_HOME is consumed by the GASNet build - confirm.
if (FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx")
set(ENV{UCX_HOME} "${UCX_DIR}/install")
install(DIRECTORY ${UCX_DIR}/install/bin/ DESTINATION bin)
install(DIRECTORY ${UCX_DIR}/install/include/ DESTINATION include)
install(DIRECTORY ${UCX_DIR}/install/lib/ DESTINATION lib)
install(DIRECTORY ${UCX_DIR}/install/share/ DESTINATION share)
endif()

# Native UCX network: set ucx_DIR to ${UCX_DIR}/cmake and request the "ucx"
# network through the Legion_NETWORKS environment variable, then log it.
# NOTE(review): presumably ucx_DIR is meant as the find_package(ucx) hint; UCX is
# installed under ${UCX_DIR}/install, so confirm ${UCX_DIR}/cmake is the right
# location for the package config files.
if (FF_LEGION_NETWORKS STREQUAL "ucx")
set(ucx_DIR ${UCX_DIR}/cmake)
set(ENV{Legion_NETWORKS} "ucx")
message(STATUS "Legion_NETWORKS: $ENV{Legion_NETWORKS}")
endif()
else()
message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}")
message(STATUS "FF_LEGION_NETWORKS: ${FF_LEGION_NETWORKS}")
if (FF_LEGION_NETWORKS STREQUAL "gasnet")
message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}")
endif()

set(FF_GPU_BACKENDS cuda hip_cuda hip_rocm intel)
Expand Down Expand Up @@ -213,6 +126,7 @@ list(APPEND CC_FLAGS
list(APPEND NVCC_FLAGS
-std=c++17)


add_compile_options(${CC_FLAGS})
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS})
link_libraries(${LD_FLAGS})
Expand Down Expand Up @@ -524,7 +438,7 @@ if(NOT BUILD_LEGION_ONLY)
endif()

# build binary
option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" ON)
option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" OFF)
option(FF_BUILD_RESNET "build resnet example" OFF)
option(FF_BUILD_RESNEXT "build resnext example" OFF)
option(FF_BUILD_ALEXNET "build alexnet example" OFF)
Expand Down
57 changes: 52 additions & 5 deletions MULTI-NODE.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,33 @@ Source: Custom (use the security group ID)

You can also use your own GPU cluster, as long as all machines are interconnected with a low-latency network.

## 2. Configure and build FlexFlow
## 2. Configure and build UCX

Follow steps 1 to 5 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**.
Find the latest UCX source release at https://github.com/openucx/ucx/releases. At the time of writing, the latest release was 1.15.0, available at https://github.com/openucx/ucx/releases/download/v1.15.0/ucx-1.15.0.tar.gz. Download and extract the archive, change into the extracted UCX source directory, and run:

```
CUDA_PATH=/usr/local/cuda
PREFIX=$PWD/install
./contrib/configure-release-mt --prefix="$PREFIX" --without-go --enable-mt --with-cuda="$CUDA_PATH"
make -j install
echo "$PREFIX"
```

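Set `CUDA_PATH` to the path of your CUDA installation (the example above uses `/usr/local/cuda`). If you don't know the path, try `which nvcc`; the CUDA installation is typically the parent of the `bin` directory that contains `nvcc`. Take note of the UCX installation path, which is echoed by the last command.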

## 3. Configure and build FlexFlow

Follow steps 1 to 5 in [INSTALL.md](INSTALL.md#1-download-the-source-code) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. Alternatively, share the home directory across all instances over NFS so that only a single build is necessary.

You can skip step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI, which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance.

For step 4 (Configuring the FlexFlow build), make sure to specify a network using the `FF_LEGION_NETWORKS` parameter. We recommend using `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT=ucx`. Other configurations are optional.
For step 4 (Configuring the FlexFlow build), here are the parameters that need to be configured:
* Set `FF_LEGION_NETWORKS=ucx`
* Set `UCX_DIR` to the UCX installation path mentioned in [Configure and build UCX](#2-configure-and-build-ucx)

## 3. Configure MPI
Other configuration options are optional.

## 4. Configure MPI

MPI is an easy way to launch FlexFlow across all instances simultaneously and set up communication between them.

Expand Down Expand Up @@ -64,8 +82,37 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su

5. Test MPI by running `mpirun -N 1 --hostfile ~/hostfile hostname`. It should display the hostname of all your nodes. If you encounter any errors like `WARNING: Open MPI accepted a TCP connection from what appears to be another Open MPI process but cannot find a corresponding process entry for that peer.`, add the parameter `--mca btl_tcp_if_include` in the `mpirun` command (refer to [this Stack Overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts)).

## 4. Test FlexFlow
## 5. Test FlexFlow

Follow step 6 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) (also available as [INSTALL.md](INSTALL.md#6-test-flexflow)) to set environment variables.

A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 4) or [`srun`](https://slurm.schedmd.com/srun.html).

Alternatively, save the following script as `mnist_mlp_run.sh` and make sure to change `FLEXFLOW_DIR` and `UCX_DIR` to appropriate paths:

```bash
#!/bin/bash
eval "$(conda shell.bash hook)"
conda activate flexflow
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib

# Path to your FlexFlow build
FLEXFLOW_DIR=/home/ubuntu/FlexFlow/build

# Path to your UCX installation
UCX_DIR=/home/ubuntu/ucx-1.15.0/install

export REALM_UCP_BOOTSTRAP_PLUGIN=$FLEXFLOW_DIR/deps/legion/lib/realm_ucp_bootstrap_mpi.so
export LD_LIBRARY_PATH=$FLEXFLOW_DIR/deps/legion/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$FLEXFLOW_DIR:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$UCX_DIR/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/conda/envs/flexflow/lib:$LD_LIBRARY_PATH

mpiexec -x REALM_UCP_BOOTSTRAP_PLUGIN -x PATH -x LD_LIBRARY_PATH --hostfile ~/hostfile --mca btl_tcp_if_include ens5 -np 2 "$FLEXFLOW_DIR"/flexflow_python "$FLEXFLOW_DIR"/../examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize 8000 -ll:zsize 8000
```

Run the script to test FlexFlow on MNIST MLP training. You can adjust the script to run any other program.
2 changes: 2 additions & 0 deletions cmake/cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ if(CUDA_FOUND)
endforeach()
string(REGEX REPLACE "([0-9]+)" "-gencode arch=compute_\\1,code=sm_\\1" CUDA_GENCODE "${CUDA_GENCODE}")

set(CMAKE_CUDA_COMPILER "${CUDA_NVCC_EXECUTABLE}")
#output
message( STATUS "CUDA_VERSION: ${CUDA_VERSION}")
message( STATUS "CUDA root path : ${CUDA_TOOLKIT_ROOT_DIR}" )
Expand All @@ -80,6 +81,7 @@ if(CUDA_FOUND)
message( STATUS "CURAND libraries : ${CUDA_curand_LIBRARY}" )
message( STATUS "CUDA Arch : ${FF_CUDA_ARCH}" )
message( STATUS "CUDA_GENCODE: ${CUDA_GENCODE}")
message( STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}")

list(APPEND FLEXFLOW_INCLUDE_DIRS
${CUDA_INCLUDE_DIRS})
Expand Down
4 changes: 4 additions & 0 deletions cmake/legion.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ else()
set(Legion_EMBED_GASNet_VERSION "GASNet-2022.3.0" CACHE STRING "GASNet version")
set(Legion_NETWORKS "gasnetex" CACHE STRING "GASNet conduit")
set(GASNet_CONDUIT ${FF_GASNET_CONDUIT})
elseif("${FF_LEGION_NETWORKS}" STREQUAL "ucx")
set(ucx_ROOT ${UCX_PATH}/lib/cmake)
message(STATUS "Find ucx: ${UCX_PATH}")
set(Legion_NETWORKS "ucx" CACHE STRING "Enable UCX")
endif()
message(STATUS "GASNET ROOT: $ENV{GASNet_ROOT_DIR}")
set(Legion_MAX_DIM ${FF_MAX_DIM} CACHE STRING "Maximum number of dimensions")
Expand Down
3 changes: 2 additions & 1 deletion cmake/nccl.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,9 @@ else()
message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" )
message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" )
add_library(nccl SHARED IMPORTED)

# Build NCCL from source
else()
# Build NCCL from source
message(STATUS "Building NCCL from source")
list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE)

Expand Down
9 changes: 5 additions & 4 deletions config/config.inc
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,13 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=mpi"
elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=udp"
elif [ "$FF_GASNET_CONDUIT" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ucx"
SET_LEGION_NETWORKS+=" -DFF_UCX_URL=$FF_UCX_URL"
fi
elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx"
# set ucx dir
if [ -n "$UCX_DIR" ]; then
SET_UCX="-DUCX_PATH=${UCX_DIR}"
fi
fi

# build C++ examples
Expand Down Expand Up @@ -227,7 +228,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then
fi
fi

CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"

function run_cmake() {
SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../}
Expand Down
8 changes: 5 additions & 3 deletions config/config.linux
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
# set CUDA dir in case cmake cannot autodetect a path
CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"}

#set NCCL dir
# if FF_USE_PREBUILT_NCCL is not enabled, you can set NCCL_DIR to use an external NCCL library;
# otherwise, NCCL will be built from source
NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"}

# enable Python
Expand All @@ -54,8 +55,8 @@ FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-}
# select GASNET conduit
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv}

# set UCX URL
FF_UCX_URL=${FF_UCX_URL:-""}
# set UCX dir if Legion networks is set to ucx
UCX_DIR=${UCX_DIR:-""}

# build C++ examples
FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF}
Expand All @@ -67,6 +68,7 @@ FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF}
# use precompiled NCCL and Legion libraries, where available
FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL:-OFF}
FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION:-OFF}

# use the flag below to use both the NCCL and Legion pre-built libraries.
# when the flag below is set to ON, the two flags above are ignored.
FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES:-OFF}
Expand Down