Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ucx build #938

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 8 additions & 103 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
STRING "Choose the type of build." FORCE)
endif()

# set std 11
set (CMAKE_CXX_STANDARD 11)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lockshaw Have we reached an agreement that FlexFlow will use C++17 moving forward?


# do not disable assertions even if in release mode
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG")

Expand All @@ -43,106 +46,15 @@ option(FF_USE_PREBUILT_NCCL "Enable use of NCCL pre-compiled library, if availab
option(FF_USE_PREBUILT_LEGION "Enable use of Legion pre-compiled library, if available" ON)
option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF)

# option for using Python
set(FF_GASNET_CONDUITS aries udp mpi ibv ucx)
# option for using network
set(FF_GASNET_CONDUITS aries udp mpi ibv)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we remove ucx as an option?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is preferred to use the Realm UCX module over the GASNet UCX conduit, so I don't think it is necessary to provide the ucx conduit option.

set(FF_GASNET_CONDUIT "mpi" CACHE STRING "Select GASNet conduit ${FF_GASNET_CONDUITS}")
set_property(CACHE FF_GASNET_CONDUIT PROPERTY STRINGS ${FF_GASNET_CONDUITS})
set(FF_LEGION_NETWORKS "" CACHE STRING "Network backend(s) to use")

if ((FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") OR FF_LEGION_NETWORKS STREQUAL "ucx")
if("${FF_UCX_URL}" STREQUAL "")
set(UCX_URL "https://github.com/openucx/ucx/releases/download/v1.14.0-rc1/ucx-1.14.0.tar.gz")
else()
set(UCX_URL "${FF_UCX_URL}")
endif()

set(UCX_DIR ${CMAKE_CURRENT_BINARY_DIR}/ucx)
get_filename_component(UCX_COMPRESSED_FILE_NAME "${UCX_URL}" NAME)
# message(STATUS "UCX_URL: ${UCX_URL}")
# message(STATUS "UCX_COMPRESSED_FILE_NAME: ${UCX_COMPRESSED_FILE_NAME}")
set(UCX_COMPRESSED_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${UCX_COMPRESSED_FILE_NAME}")
set(UCX_BUILD_NEEDED OFF)
set(UCX_CONFIG_FILE ${UCX_DIR}/config.txt)
set(UCX_BUILD_OUTPUT ${UCX_DIR}/build.log)

if(EXISTS ${UCX_CONFIG_FILE})
file(READ ${UCX_CONFIG_FILE} PREV_UCX_CONFIG)
# message(STATUS "PREV_UCX_CONFIG: ${PREV_UCX_CONFIG}")
if("${UCX_URL}" STREQUAL "${PREV_UCX_CONFIG}")
# configs match - no build needed
set(UCX_BUILD_NEEDED OFF)
else()
message(STATUS "UCX configuration has changed - rebuilding...")
set(UCX_BUILD_NEEDED ON)
endif()
else()
message(STATUS "Configuring and building UCX...")
set(UCX_BUILD_NEEDED ON)
endif()

if(UCX_BUILD_NEEDED)
if(NOT EXISTS "${UCX_COMPRESSED_FILE_PATH}")
message(STATUS "Downloading openucx/ucx from: ${UCX_URL}")
file(
DOWNLOAD
"${UCX_URL}" "${UCX_COMPRESSED_FILE_PATH}"
SHOW_PROGRESS
STATUS status
LOG log
)

list(GET status 0 status_code)
list(GET status 1 status_string)

if(status_code EQUAL 0)
message(STATUS "Downloading... done")
else()
message(FATAL_ERROR "error: downloading '${UCX_URL}' failed
status_code: ${status_code}
status_string: ${status_string}
log:
--- LOG BEGIN ---
${log}
--- LOG END ---"
)
endif()
else()
message(STATUS "${UCX_COMPRESSED_FILE_NAME} already exists")
endif()

execute_process(COMMAND mkdir -p ${UCX_DIR})
execute_process(COMMAND tar xzf ${UCX_COMPRESSED_FILE_PATH} -C ${UCX_DIR} --strip-components 1)
message(STATUS "Building UCX...")
execute_process(
COMMAND sh -c "cd ${UCX_DIR} && ${UCX_DIR}/contrib/configure-release --prefix=${UCX_DIR}/install --enable-mt && make -j8 && make install"
RESULT_VARIABLE UCX_BUILD_STATUS
OUTPUT_FILE ${UCX_BUILD_OUTPUT}
ERROR_FILE ${UCX_BUILD_OUTPUT}
)

if(UCX_BUILD_STATUS)
message(FATAL_ERROR "UCX build result = ${UCX_BUILD_STATUS} - see ${UCX_BUILD_OUTPUT} for more details")
endif()

# Currently, we use default build configurations for UCX and therefore only save URL as configuration settings
file(WRITE ${UCX_CONFIG_FILE} "${UCX_URL}")
endif()

if (FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx")
set(ENV{UCX_HOME} "${UCX_DIR}/install")
install(DIRECTORY ${UCX_DIR}/install/bin/ DESTINATION bin)
install(DIRECTORY ${UCX_DIR}/install/include/ DESTINATION include)
install(DIRECTORY ${UCX_DIR}/install/lib/ DESTINATION lib)
install(DIRECTORY ${UCX_DIR}/install/share/ DESTINATION share)
endif()

if (FF_LEGION_NETWORKS STREQUAL "ucx")
set(ucx_DIR ${UCX_DIR}/cmake)
set(ENV{Legion_NETWORKS} "ucx")
message(STATUS "Legion_NETWORKS: $ENV{Legion_NETWORKS}")
endif()
else()
message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}")
message(STATUS "FF_LEGION_NETWORKS: ${FF_LEGION_NETWORKS}")
if (FF_LEGION_NETWORKS STREQUAL "gasnet")
message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}")
endif()

set(FF_GPU_BACKENDS cuda hip_cuda hip_rocm intel)
Expand Down Expand Up @@ -179,13 +91,6 @@ set(CC_FLAGS $ENV{CC_FLAGS})
set(NVCC_FLAGS $ENV{NVCC_FLAGS})
set(LD_FLAGS $ENV{LD_FLAGS})

# Set global FLAGS
list(APPEND CC_FLAGS
-std=c++11)

list(APPEND NVCC_FLAGS
-std=c++11)

add_compile_options(${CC_FLAGS})
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS})
link_libraries(${LD_FLAGS})
Expand Down
55 changes: 48 additions & 7 deletions MULTI-NODE.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,33 @@ Source: Custom (use the security group ID)

You can also use your own GPU cluster, as long as all machines are interconnected with a low-latency network.

## 2. Configure and build FlexFlow
## 2. Configure and build UCX

Follow steps 1 to 5 in [INSTALL.md](INSTALL.md) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**.
Find the latest source code release for UCX at https://github.com/openucx/ucx/releases. At the time of writing, the latest UCX release is 1.15.0, available at https://github.com/openucx/ucx/releases/download/v1.15.0/ucx-1.15.0.tar.gz. Extract the archive, switch to the directory containing the UCX source code, and run:

```
CUDA_PATH=/usr/local/cuda
PREFIX=$PWD/install
./contrib/configure-release-mt --prefix="$PREFIX" --without-go --enable-mt --with-cuda="$CUDA_PATH"
make -j install
echo "$PREFIX"
```

Replace the value of `CUDA_PATH` with the path of your CUDA installation. If you don't know the path, try `which nvcc`; the CUDA installation path is the directory that contains `bin/nvcc`. Take note of the UCX installation path, which is echoed by the last command.

## 3. Configure and build FlexFlow

Follow steps 1 to 5 in [INSTALL.md](INSTALL.md#1-download-the-source-code) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. Alternatively, you can use NFS to mount a shared home directory on every instance so that only a single build is necessary.

You can skip step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI, which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance.

For step 4 (Configuring the FlexFlow build), make sure to specify a network using the `FF_LEGION_NETWORKS` parameter. We recommend using `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT=ucx`. Other configurations are optional.
For step 4 (Configuring the FlexFlow build), here are the parameters that need to be configured:
* Set `FF_LEGION_NETWORKS=ucx`
* Set `UCX_DIR` to the UCX installation path mentioned in [Configure and build UCX](#2-configure-and-build-ucx)

Other configuration options are optional.

## 3. Configure MPI
## 4. Configure MPI

MPI is an easy way to launch FlexFlow across all instances simultaneously and set up communication between them.

Expand Down Expand Up @@ -64,8 +82,31 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su

5. Test MPI by running `mpirun -N 1 --hostfile ~/hostfile hostname`. It should display the hostname of all your nodes. If you encounter any errors like `WARNING: Open MPI accepted a TCP connection from what appears to be another Open MPI process but cannot find a corresponding process entry for that peer.`, add the parameter `--mca btl_tcp_if_include` in the `mpirun` command (refer to [this Stack Overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts)).

## 4. Test FlexFlow
## 5. Test FlexFlow

Follow step 6 in [INSTALL.md](INSTALL.md) to set environment variables.
Follow step 6 in [INSTALL.md](INSTALL.md#6-test-flexflow) to set environment variables.

Save the following script as `mnist_mlp_run.sh` and make sure to change `FLEXFLOW_DIR` and `UCX_DIR` to appropriate paths:

```bash
#!/bin/bash
eval "$(conda shell.bash hook)"
conda activate flexflow
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib

# Path to your FlexFlow build
FLEXFLOW_DIR=/home/ubuntu/FlexFlow/build

# Path to your UCX installation
UCX_DIR=/home/ubuntu/ucx-1.15.0/install

export REALM_UCP_BOOTSTRAP_PLUGIN=$FLEXFLOW_DIR/deps/legion/lib/realm_ucp_bootstrap_mpi.so
export LD_LIBRARY_PATH=$FLEXFLOW_DIR/deps/legion/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$FLEXFLOW_DIR:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$UCX_DIR/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/conda/envs/flexflow/lib:$LD_LIBRARY_PATH

mpiexec -x REALM_UCP_BOOTSTRAP_PLUGIN -x PATH -x LD_LIBRARY_PATH --hostfile ~/hostfile --mca btl_tcp_if_include ens5 -np 2 "$FLEXFLOW_DIR"/flexflow_python "$FLEXFLOW_DIR"/../examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize 8000 -ll:zsize 8000
```

A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. You can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) (if you configured it in step 3) or [`srun`](https://slurm.schedmd.com/srun.html).
Run the script to test FlexFlow by training an MLP on MNIST. You can adjust the script to run any other program.
4 changes: 4 additions & 0 deletions cmake/legion.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ else()
set(Legion_EMBED_GASNet_VERSION "GASNet-2022.3.0" CACHE STRING "GASNet version")
set(Legion_NETWORKS "gasnetex" CACHE STRING "GASNet conduit")
set(GASNet_CONDUIT ${FF_GASNET_CONDUIT})
elseif("${FF_LEGION_NETWORKS}" STREQUAL "ucx")
set(ucx_ROOT ${UCX_PATH}/lib/cmake)
message(STATUS "Find ucx: ${UCX_PATH}")
set(Legion_NETWORKS "ucx" CACHE STRING "Enable UCX")
endif()
message(STATUS "GASNET ROOT: $ENV{GASNet_ROOT_DIR}")
set(Legion_MAX_DIM ${FF_MAX_DIM} CACHE STRING "Maximum number of dimensions")
Expand Down
3 changes: 2 additions & 1 deletion cmake/nccl.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,9 @@ else()
message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" )
message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" )
add_library(nccl SHARED IMPORTED)

# Build NCCL from source
else()
# Build NCCL from source
message(STATUS "Building NCCL from source")
list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE)

Expand Down
7 changes: 6 additions & 1 deletion config/config.inc
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@ if [ -n "$CUDNN_DIR" ]; then
SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}"
fi

# set ucx dir
if [ -n "$UCX_DIR" ]; then
SET_UCX="-DUCX_PATH=${UCX_DIR}"
fi

# enable Python
if [ "$FF_USE_PYTHON" = "ON" ]; then
SET_PYTHON="-DFF_USE_PYTHON=ON"
Expand Down Expand Up @@ -188,7 +193,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then
fi
fi

CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_UCX} ${SET_PYTHON} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"

function run_cmake() {
SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../}
Expand Down
8 changes: 5 additions & 3 deletions config/config.linux
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
# set CUDA dir in case cmake cannot autodetect a path
CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"}

#set NCCL dir
# if FF_USE_PREBUILT_NCCL is not used, you can set NCCL_DIR to use an external NCCL library;
# otherwise, NCCL will be built from source
NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"}

# enable Python
Expand All @@ -40,8 +41,8 @@ FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-}
# select GASNET conduit
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv}

# set UCX URL
FF_UCX_URL=${FF_UCX_URL:-""}
# set UCX dir if Legion networks is set to ucx
UCX_DIR=${UCX_DIR:-""}

# build C++ examples
FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF}
Expand All @@ -52,6 +53,7 @@ FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF}
# use precompiled NCCL and Legion libraries, where available
FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL:-OFF}
FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION:-OFF}

# use the flag below to use both the NCCL and Legion pre-built libraries.
# when the flag below is set to ON, the two flags above are ignored.
FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES:-OFF}
Expand Down
Loading