Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

draft, RDMA via thallium #69

Draft
wants to merge 24 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .devcontainer/thallium-container/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"name": "thallium",
"image": "interactem/operator-rdma:latest",
"features": {
"ghcr.io/devcontainers/features/git:1": {}
},
"customizations": {
"vscode": {
"extensions": [
"charliermarsh.ruff",
"ms-azuretools.vscode-docker",
"ms-python.black-formatter",
"ms-python.debugpy",
"ms-python.isort",
"ms-python.python",
"tamasfe.even-better-toml",
"ms-vscode.cpptools",
"ms-vscode.cmake-tools",
"ms-vscode.cpptools-extension-pack"
]
}
}
}
18 changes: 18 additions & 0 deletions .vscode/c_cpp_properties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**",
"/opt/views/view/include",
"/usr/local/include/python3.10/",
"/usr/local/lib/python3.10/site-packages/nanobind/include"
],
"defines": [],
"compilerPath": "/usr/bin/gcc",
"cStandard": "c17",
"intelliSenseMode": "linux-gcc-arm64"
}
],
"version": 4
}
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,10 @@
"editor.formatOnSave": true,
"editor.formatOnSaveMode": "modifications",
"editor.defaultFormatter": "charliermarsh.ruff",
"[cpp]": {
"editor.defaultFormatter": "ms-vscode.cpptools",
"editor.formatOnSave": true
},
"C_Cpp.clang_format_fallbackStyle": "{ BasedOnStyle: llvm }",
"C_Cpp.formatting": "clangFormat",
}
72 changes: 72 additions & 0 deletions backend/operators/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Build stage: Spack image used to compile the native dependencies.
# The runtime stage below copies the results out of this stage.
FROM spack/ubuntu-jammy:develop AS builder

# What we want to install and how we want to install it
# is specified in a manifest file (spack.yaml).
# `set -o noclobber` makes the `>` redirection fail instead of silently
# overwriting a manifest that already exists.
RUN mkdir -p /opt/spack-environment && \
    set -o noclobber \
    && (echo spack: \
    && echo ' specs:' \
    && echo ' - cmake' \
    && echo ' - pkgconfig' \
    && echo ' - mochi-thallium ^mercury~boostsys~checksum ^libfabric fabrics=tcp,rxm' \
    && echo ' - nlohmann-json' \
    && echo ' - spdlog' \
    && echo ' - fmt' \
    && echo ' - tclap' \
    && echo ' - flatbuffers' \
    && echo ' concretizer:' \
    && echo ' unify: true' \
    && echo ' reuse: true' \
    && echo ' packages:' \
    && echo ' mochi-margo:' \
    && echo ' require: '"'"'@0.18.3:'"'"'' \
    && echo ' mochi-thallium:' \
    && echo ' require: '"'"'@0.14.6:'"'"'' \
    && echo ' config:' \
    && echo ' install_tree: /opt/software' \
    && echo ' view: /opt/views/view') > /opt/spack-environment/spack.yaml

# Install the software (stop at the first failing package), then
# garbage-collect dependencies that are no longer needed
RUN cd /opt/spack-environment && spack env activate . && spack install --fail-fast && spack gc -y

# Strip all the binaries: resolve every file reachable through the view,
# keep only executables, static archives and shared libraries (by MIME type),
# and run `strip` on them
RUN find -L /opt/views/view/* -type f -exec readlink -f '{}' \; | \
    xargs file -i | \
    grep 'charset=binary' | \
    grep 'x-executable\|x-archive\|x-sharedlib' | \
    awk -F: '{print $1}' | xargs strip

# Modifications to the environment that are necessary to run:
# dump the activation commands as a shell script (activate.sh) that the
# runtime image's entrypoint sources
RUN cd /opt/spack-environment && \
    spack env activate --sh -d . > activate.sh


# Runtime stage: start from the operator image and add the Spack-built
# software from the builder stage
FROM interactem/operator

# Build tools and pip; the apt package lists are removed in the same layer
RUN apt-get update && apt-get install -y \
    git gcc automake pkgconf \
    python3-pip && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer
RUN pip install --no-cache-dir \
    nanobind

COPY --from=builder /opt/spack-environment /opt/spack-environment
COPY --from=builder /opt/software /opt/software

# paths.view is a symlink, so copy the parent to avoid dereferencing and duplicating it
COPY --from=builder /opt/views /opt/views

# Generate /entrypoint.sh: it sources the Spack activation script and then
# exec's whatever command was passed to the container. Also expose the view
# under the shorter /opt/view path.
RUN { \
    echo '#!/bin/sh' \
    && echo '.' /opt/spack-environment/activate.sh \
    && echo 'exec "$@"'; \
    } > /entrypoint.sh \
    && chmod a+x /entrypoint.sh \
    && ln -s /opt/views/view /opt/view


ENTRYPOINT [ "/entrypoint.sh" ]
CMD [ "/bin/bash" ]

51 changes: 51 additions & 0 deletions backend/operators/thallium/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
cmake_minimum_required(VERSION 3.15...3.26)

# NOTE(review): the project name still looks like the nanobind example
# template's name -- confirm whether it should be renamed.
project(nanobind_example LANGUAGES CXX)

# Emit compile_commands.json for editor / tooling support
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Warn when this file is configured outside of scikit-build
if (NOT SKBUILD)
  message(WARNING "\
This CMake file is meant to be executed using 'scikit-build'. Running
it directly will almost certainly not produce the desired result. If
you are a user trying to install this package, please use the command
below, which will install all necessary build dependencies, compile
the package in an isolated environment, and then install it.
=====================================================================
$ pip install .
=====================================================================
If you are a software developer, and this is your own package, then
it is usually much more efficient to install the build dependencies
in your environment once and use the following command that avoids
a costly creation of a new virtual environment at every compilation:
=====================================================================
$ pip install nanobind scikit-build-core[pyproject]
$ pip install --no-build-isolation -ve .
=====================================================================
You may optionally add -Ceditable.rebuild=true to auto-rebuild when
the package is imported. Otherwise, you need to re-run the above
after editing C++ files.")
endif()

# Try to import all Python components potentially needed by nanobind
find_package(Python 3.8
  REQUIRED COMPONENTS Interpreter Development.Module
  OPTIONAL_COMPONENTS Development.SABIModule)

# Import nanobind through CMake's find_package mechanism:
# ask the installed nanobind Python package where its CMake config lives
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
  OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE nanobind_ROOT)
find_package(nanobind CONFIG REQUIRED)

# Thallium
find_package(thallium REQUIRED)

# Serialization
find_package(CapnProto REQUIRED)
find_package(FlatBuffers REQUIRED)

# Subdirectories are given relative to this file's directory (rather than
# ${CMAKE_SOURCE_DIR}) so the build also works when this project is included
# from a parent project.
add_subdirectory(src/proto)

# Generate bindings
add_subdirectory(src)
10 changes: 10 additions & 0 deletions backend/operators/thallium/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Thallium python bindings
========================

Notes
-----

RMA is zero-copy, but RPC payloads are always copied, including over the shared-memory (sm) transport. From a mercury-hpc developer:

> RMA is zero-copy but there's always a copy currently for RPC payloads. We might be able to optimize some more by doing what we call a multi-recv optimization where there's a single block allocated on the server where processes can directly memcpy into but that's not something that's been implemented for the sm transport yet. It's not possible however to just pass around buffers, there's always some memcpy involved.

42 changes: 42 additions & 0 deletions backend/operators/thallium/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
[build-system]
requires = ["scikit-build-core >=0.10", "nanobind >=1.3.2", "typing_extensions"]
build-backend = "scikit_build_core.build"

[project]
name = "thallium"
version = "0.0.1"
description = "Python bindings for mochi-thallium"
readme = "README.md"
requires-python = ">=3.8"
authors = [
{ name = "Sam Welborn", email = "[email protected]" },
]

[project.urls]
Homepage = "https://github.com/NERSC/interactEM"


[tool.scikit-build]
# Protect the configuration against future changes in scikit-build-core
minimum-version = "build-system.requires"

# Setuptools-style build caching in a local directory
build-dir = "build/{wheel_tag}"

# Build stable ABI wheels for CPython 3.12+
wheel.py-api = "cp312"

[tool.cibuildwheel]
# Necessary to see build output from the actual compilation
build-verbosity = 1

# Run pytest to ensure that the package was correctly built
test-command = "pytest {project}/tests"
test-requires = "pytest"

# Don't test Python 3.8 wheels on macOS/arm64
test-skip="cp38-macosx_*:arm64"

# Needed for full C++17 support
[tool.cibuildwheel.macos.environment]
MACOSX_DEPLOYMENT_TARGET = "10.14"
3 changes: 3 additions & 0 deletions backend/operators/thallium/src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.capnp.c++
*.capnp.h
*/gen/*
50 changes: 50 additions & 0 deletions backend/operators/thallium/src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Bindings

# Create the extension module
nanobind_add_module(
  # Name of the extension
  _thallium

  # Target the stable ABI for Python 3.12+, which reduces
  # the number of binary wheels that must be built. This
  # does nothing on older Python versions
  STABLE_ABI

  # Build libnanobind statically and merge it into the
  # extension (which itself remains a shared library)
  #
  # If your project builds multiple extensions, you can
  # replace this flag by NB_SHARED to conserve space by
  # reusing a shared libnanobind across libraries
  NB_STATIC

  # Source code goes here
  bindings.cpp
)

# Project headers, scoped to the extension target instead of the whole directory
target_include_directories(_thallium PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)

# Make a dep for generated schemas
add_dependencies(_thallium ${THALLIUM_FBS_SCHEMAS_TARGET})

target_link_libraries(_thallium PRIVATE thallium CapnProto::capnp)

# Figured this out, and then found this discussion as a good ref:
# https://github.com/wjakob/nanobind/discussions/537
nanobind_add_stub(
  thallium_stub
  MODULE _thallium
  OUTPUT __init__.pyi
  MARKER_FILE py.typed
  PYTHON_PATH $<TARGET_FILE_DIR:_thallium>
  DEPENDS _thallium
)


# Install directive for scikit-build-core.
# The stub and marker files are taken from this directory's binary dir, so the
# paths do not hard-code the "src" directory name.
install(TARGETS _thallium LIBRARY DESTINATION thallium)
install(FILES
  ${CMAKE_CURRENT_BINARY_DIR}/py.typed
  ${CMAKE_CURRENT_BINARY_DIR}/__init__.pyi
  DESTINATION thallium
)
101 changes: 101 additions & 0 deletions backend/operators/thallium/src/bindings.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#include <memory>
#include <nanobind/ndarray.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unique_ptr.h>

#include <thallium.hpp>

#include "QueueClient.hpp"
#include "QueueProvider.hpp"

namespace nb = nanobind;
namespace tl = thallium;
using namespace nb::literals;

namespace interactEM {

/// Mode a thallium engine is started in; the enumerators carry the values of
/// thallium's THALLIUM_SERVER_MODE / THALLIUM_CLIENT_MODE macros.
enum class EngineMode : int {
  SERVER = THALLIUM_SERVER_MODE,
  CLIENT = THALLIUM_CLIENT_MODE,
};

/// Owns a `tl::engine` and exposes the small subset of its API that the
/// Python bindings need.
class PyEngine {
public:
  /// Construct the underlying engine for `protocol`, passing `mode` to
  /// thallium as its integer value.
  PyEngine(const std::string &protocol, const EngineMode mode)
      : m_engine(protocol, static_cast<int>(mode)) {}

  /// Margo instance handle of the wrapped engine.
  auto get_id() const -> margo_instance_id {
    return m_engine.get_margo_instance();
  }

  /// The engine's own address, converted to a string.
  auto get_address() const -> std::string {
    std::string address = m_engine.self();
    return address;
  }

  /// Forwards to `tl::engine::wait_for_finalize()`.
  auto wait_for_finalize() -> void { m_engine.wait_for_finalize(); }

private:
  tl::engine m_engine;
};

/// Owning pointer to a PyMessage that uses nanobind's deleter, used to move
/// messages across the Python/C++ boundary.
using PyMessagePtrWithDeleter =
    std::unique_ptr<PyMessage, nb::deleter<PyMessage>>;

class PyQueueClient {
public:
PyQueueClient(const PyEngine &engine, const std::string &server_addr,
const uint16_t provider_id = 1) {
m_client = QueueClient::create(engine.get_id(), server_addr, provider_id);
}

void push_rdma(PyMessagePtrWithDeleter msg) {
m_client->push_rdma(msg->header, msg->data);
}

private:
std::unique_ptr<QueueClient> m_client;
};

/// Python-facing wrapper around a `QueueProvider`.
class PyQueueProvider {
public:
  /// Create a provider with id `provider_id` on `engine`'s margo instance.
  PyQueueProvider(const PyEngine &engine, const uint16_t provider_id)
      : m_provider(QueueProvider::create(engine.get_id(), provider_id)) {}

  /// Pull one message from the provider and move it into a newly allocated
  /// PyMessage whose ownership is handed to the caller.
  /// NOTE(review): the result of `pull()` is dereferenced unchecked --
  /// confirm it can never be empty.
  PyMessagePtrWithDeleter pull() {
    auto pulled = m_provider->pull();
    PyMessagePtrWithDeleter owned(new PyMessage(std::move(*pulled)));
    return owned;
  }

private:
  std::unique_ptr<QueueProvider> m_provider;
};

// Python module `_thallium`: registers EngineMode, Engine, Message,
// QueueClient and QueueProvider with nanobind.
NB_MODULE(_thallium, m) {
  nb::enum_<interactEM::EngineMode>(m, "EngineMode")
      .value("SERVER", interactEM::EngineMode::SERVER)
      .value("CLIENT", interactEM::EngineMode::CLIENT);

  nb::class_<interactEM::PyEngine>(m, "Engine")
      .def(nb::init<const std::string &, const interactEM::EngineMode>(),
           "protocol"_a, "mode"_a)
      .def_prop_ro("address", &interactEM::PyEngine::get_address)
      .def("wait_for_finalize", &interactEM::PyEngine::wait_for_finalize);

  nb::class_<PyMessage>(m, "Message")
      .def(nb::init<const std::string, nb::ndarray<nb::numpy>>(), "header"_a,
           "data"_a)
      .def_rw("header", &PyMessage::header)
      // TODO: fix so we see that it is ndarray in python stubs
      // TODO: understand the rv policy better. This works (no leak warnings
      // emitted), but unsure if it is the correct thing to do...
      .def_rw("data", &PyMessage::data, nb::rv_policy::take_ownership);

  // NOTE(review): QueueClient / QueueProvider are built from the engine's
  // margo instance id; presumably the Engine must outlive them -- confirm
  // whether an nb::keep_alive annotation is needed on these constructors.
  nb::class_<interactEM::PyQueueClient>(m, "QueueClient")
      // `provider_id` defaults to 1, matching the default of the C++
      // PyQueueClient constructor, so Python callers may omit it.
      .def(nb::init<const interactEM::PyEngine &, const std::string &,
                    const uint16_t>(),
           "engine"_a, "server_addr"_a, "provider_id"_a = 1)
      .def("push_rdma", &interactEM::PyQueueClient::push_rdma, "msg"_a);

  nb::class_<interactEM::PyQueueProvider>(m, "QueueProvider")
      .def(nb::init<const interactEM::PyEngine &, const uint16_t>(), "engine"_a,
           "provider_id"_a)
      .def("pull", &interactEM::PyQueueProvider::pull);
}
} // namespace interactEM
Loading
Loading