diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index bfbfdbe..b8ba9f6 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -7,6 +7,8 @@ on:
   pull_request:
     branches:
       - '**'
+env:
+  PROJECT_ROOT: /app/
 
 jobs:
   setup-and-test:
@@ -47,6 +49,12 @@ jobs:
       - name: Build
         run: go build -v ./...
         working-directory: go
+
+      - name: Set Project Root
+        run: echo "PROJECT_ROOT=$(pwd)" >> $GITHUB_ENV
+
+      - name: Use Project Root
+        run: echo "The project root is $PROJECT_ROOT"
 
       - name: Test
         run: go test -v ./...
diff --git a/Dockerfile b/Dockerfile
index 534a642..4e2fad3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,8 @@
-# Use the official Golang image to create a build artifact.
-# This is based on Debian and includes the Go toolset.
-FROM golang
-#:1.19
-#FROM golang:1.22
-#FROM debian:buster
-# as builder
+# Declare the CONFIG argument at the beginning
+ARG CONFIG=base
+
+# Base stage for common setup
+FROM golang as base
 
 RUN apt update
 RUN apt install -y protobuf-compiler
@@ -35,13 +33,24 @@ RUN python3 -m pip install grpcio-tools --break-system-packages
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.28
 RUN go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.2
-
 # now that we've installed pre-reqs, build everything
-RUN make clean && make all
+RUN make clean && make go && make py && make build
+
+# just to test things out
+RUN apt update && apt install -y iputils-ping
+
+ENV PROJECT_ROOT=/app
+
+FROM base as driver
+
+# install necessary Python packages to run anything
+RUN python3 -m pip install dill --break-system-packages
+RUN cd python && python3 -m pip install -e . --break-system-packages
 
-# expose all the ports we may use
-#EXPOSE 50000-69999
+# install basic necessities to actually do driver stuff
+RUN apt install -y nano
 
-# placeholder commands
-#CMD ["./bin/localobjstore", "&", "./bin/localscheduler", "&", "./bin/worker"]
+# take in a CONFIG argument which will tell us what to target (GCS, global scheduler, or worker)
+# using multi-stage builds: https://chat.openai.com/share/a5eb4076-e36a-4a1e-b4c8-9d56ea7a604e
+FROM ${CONFIG} as final
diff --git a/Makefile b/Makefile
index 3346c43..0cf3c7a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
-.PHONY: all go py clean build servers
+.PHONY: all go py clean build servers docker
 
-all: go py build
+all: go py build docker
 
 go: proto
 	@echo "Generating Go gRPC code..."
@@ -13,7 +13,6 @@ py: proto
 	# below line is now compatible with both MacOS (BSD) and GNU sed
 	sed -i'' -e 's/import rayclient_pb2 as rayclient__pb2/from . import rayclient_pb2 as rayclient__pb2/' python/babyray/rayclient_pb2_grpc.py
-
 build: servers
 
 servers: gcsfunctable gcsobjtable globalscheduler localobjstore localscheduler worker
@@ -42,6 +41,10 @@ worker:
 	@echo "Building Worker Server..."
 	cd go && go build -o bin/worker cmd/worker/main.go
 
+docker:
+	docker build --build-arg CONFIG=base -t ray-node:base .
+	docker build --build-arg CONFIG=driver -t ray-node:driver .
+
 clean:
 	@echo "Cleaning up..."
 	rm -f go/pkg/*.pb.go
diff --git a/README.md b/README.md
index 25a81d4..fba9ff1 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,79 @@
 # baby🦈
 
+[Baby Ray](https://www.youtube.com/watch?v=WkCecpH2GAo) is a minimal yet fully functioning (fault-tolerance included!) implementation of [Ray](https://arxiv.org/abs/1712.05889) in Go.
+
+Currently a work in progress.
+By the end of this project, you should be able to simulate a full Ray cluster using Docker Compose and launch CPU jobs from Python using exactly the same API exposed by the real Ray Core library.
+
+## Automatic deployment - shorthand version
+```bash
+make all
+```
+
+```bash
+docker-compose up
+```
+
+Separate terminal:
+```bash
+./log_into_driver.sh
+```
+
 ## Automatic deployment
-You can automatically deploy the worker + GCS + global scheduler nodes by doing:
+You can automatically deploy the entire cluster (workers, GCS, global scheduler nodes) by following these instructions.
+
+First, build everything:
+
+```bash
+make all
+```
+
+This will generate the Go and Python gRPC stub files, compile the Go server code, and build the Docker images we need to deploy the cluster.
+
+Next, spin up the cluster:
 
 ```bash
 docker-compose up
 ```
 
-Reset by doing ^C and then running `docker-compose down` to make sure all the containers are killed.
+Now your cluster should be running! Let's test whether one node can talk to another. Note that these commands are run-specific, since Docker assigns container IDs randomly. First, list the container IDs of our cluster with `docker ps`. We'll get something like:
+
+```
+CONTAINER ID   IMAGE      COMMAND                  CREATED         STATUS         PORTS                      NAMES
+c20387bbd534   ray-node   "/bin/sh -c './go/bi…"   3 seconds ago   Up 2 seconds   0.0.0.0:50004->50004/tcp   babyray_worker3_1
+dce69f377b01   ray-node   "/bin/sh -c ./go/bin…"   3 seconds ago   Up 2 seconds   0.0.0.0:50001->50001/tcp   babyray_global_scheduler_1
+e06583f90acd   ray-node   "/bin/sh -c './go/bi…"   3 seconds ago   Up 2 seconds   0.0.0.0:50002->50002/tcp   babyray_worker1_1
+1d2559b0bb90   ray-node   "/bin/sh -c './go/bi…"   3 seconds ago   Up 2 seconds   0.0.0.0:50003->50003/tcp   babyray_worker2_1
+c3781cf7bbce   ray-node   "/bin/sh -c './go/bi…"   3 seconds ago   Up 2 seconds   0.0.0.0:50000->50000/tcp   babyray_gcs_1
+```
+
+Now pick one of those container IDs (here we pick worker 1) and run an interactive shell on it:
+
+```bash
+docker exec -it e06583f90acd /bin/bash
+```
+
+Then run `ping node0` and you'll get something like:
+
+```
+docker exec -it e06583f90acd /bin/bash
+root@e06583f90acd:/app# ping node0
+PING node0 (172.24.0.2) 56(84) bytes of data.
+64 bytes from babyray_gcs_1.babyray_mynetwork (172.24.0.2): icmp_seq=1 ttl=64 time=3.11 ms
+64 bytes from babyray_gcs_1.babyray_mynetwork (172.24.0.2): icmp_seq=2 ttl=64 time=0.499 ms
+64 bytes from babyray_gcs_1.babyray_mynetwork (172.24.0.2): icmp_seq=3 ttl=64 time=0.470 ms
+64 bytes from babyray_gcs_1.babyray_mynetwork (172.24.0.2): icmp_seq=4 ttl=64 time=0.348 ms
+64 bytes from babyray_gcs_1.babyray_mynetwork (172.24.0.2): icmp_seq=5 ttl=64 time=0.224 ms
+64 bytes from babyray_gcs_1.babyray_mynetwork (172.24.0.2): icmp_seq=6 ttl=64 time=0.268 ms
+64 bytes from babyray_gcs_1.babyray_mynetwork (172.24.0.2): icmp_seq=7 ttl=64 time=0.173 ms
+```
+
+Packets are flowing between worker 1 and the GCS!
+
+### Shut down the cluster
+Shut down by pressing ^C in the `docker-compose up` session and then running `docker-compose down` to make sure all the containers are killed and the network is deleted.
 
 ## Run a single node
diff --git a/docker-compose.yml b/docker-compose.yml
index fd60065..616bb91 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,32 +2,61 @@ version: '3'
 
 services:
   gcs:
-    image: ray-node
+    image: ray-node:base
     command: ["/bin/sh", "-c", "./go/bin/gcsfunctable & ./go/bin/gcsobjtable"]
+    networks:
+      mynetwork:
+        aliases:
+          - node0
     ports:
       - "50000:50000"
 
   global_scheduler:
-    image: ray-node
-    command: ["/bin/sh", "-c", "./go/bin/globalscheduler &"]
+    image: ray-node:base
+    command: ["/bin/sh", "-c", "./go/bin/globalscheduler"]
+    networks:
+      mynetwork:
+        aliases:
+          - node1
     ports:
       - "50001:50001"
 
   worker1:
-    image: ray-node
+    image: ray-node:driver
     command: ["/bin/sh", "-c", "./go/bin/localscheduler & ./go/bin/localobjstore"]
+    networks:
+      mynetwork:
+        aliases:
+          - node2
     ports:
       - "50002:50002"
+    environment:
+      NODE_ID: 2
 
   worker2:
-    image: ray-node
+    image: ray-node:driver
     command: ["/bin/sh", "-c", "./go/bin/localscheduler & ./go/bin/localobjstore"]
+    networks:
+      mynetwork:
+        aliases:
+          - node3
     ports:
       - "50003:50003"
+    environment:
+      NODE_ID: 3
 
   worker3:
-    image: ray-node
+    image: ray-node:driver
     command: ["/bin/sh", "-c", "./go/bin/localscheduler & ./go/bin/localobjstore"]
+    networks:
+      mynetwork:
+        aliases:
+          - node4
     ports:
       - "50004:50004"
+    environment:
+      NODE_ID: 4
+
+networks:
+  mynetwork:
+    driver: bridge
diff --git a/go/config/config.go b/go/config/config.go
index f2c613a..2206be3 100644
--- a/go/config/config.go
+++ b/go/config/config.go
@@ -34,16 +34,11 @@ type Config struct {
 func LoadConfig() *Config {
 	var config Config
 
-	// Get the working directory of the executable. Key assumption here is that
-	// the executable is located at go/cmd/*/[executable]. Otherwise this will
-	// break.
-	cwd, err := os.Getwd()
-	if err != nil {
-		log.Fatalf("Failed to get current working directory: %v", err)
-	}
-
-	// Construct the path to the configuration file
-	configFile := filepath.Join(cwd, "..", "..", "..", "config", "app_config.yaml")
+	// Construct the path to the configuration file. PROJECT_ROOT should somehow
+	// be set prior to any Go code execution (i.e., in GitHub Actions workflow
+	// before unit tests are run or in Dockerfile)
+	rootPath := os.Getenv("PROJECT_ROOT")
+	configFile := filepath.Join(rootPath, "config", "app_config.yaml")
 
 	yamlFile, err := ioutil.ReadFile(configFile)
 	if err != nil {
diff --git a/log_into_driver.sh b/log_into_driver.sh
new file mode 100755
index 0000000..dd4e2a7
--- /dev/null
+++ b/log_into_driver.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# start an interactive /bin/bash session on the first container returned by "docker ps" that
+# uses the "ray-node:driver" image
+docker exec -it $(docker ps | grep 'ray-node:driver' | awk '{print $1}' | head -1) /bin/bash
diff --git a/python/babyray/__init__.py b/python/babyray/__init__.py
index f98b581..82510a0 100644
--- a/python/babyray/__init__.py
+++ b/python/babyray/__init__.py
@@ -47,7 +47,7 @@ class Future:
     def get(self):
         # make a request to local object store
-        out = local_object_store_gRPC.Get(rayclient_pb2.GetRequest(uid=self.uid))
+        out = pickle.loads(local_object_store_gRPC.Get(rayclient_pb2.GetRequest(uid=self.uid)).objectBytes)
 
         return out
 
@@ -64,7 +64,7 @@ def remote(self, *args, **kwargs):
         rayclient_pb2.ScheduleRequest(
             name=self.name, args=pickle.dumps(args), kwargs=pickle.dumps(kwargs)
         )
-    )
+    ).uid
     return Future(uid)
 
 def register(self):
@@ -73,7 +73,7 @@ def register(self):
         rayclient_pb2.RegisterRequest(
             serializedFunc=pickle.dumps(self.func)
         )
-    )
+    ).name
 
 def init():
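
For context on how the `python/babyray/__init__.py` changes fit together, here is a minimal driver-side sketch (not part of the diff above) of the workflow the README promises, assuming babyray mirrors Ray Core's interface. The `@babyray.remote` decorator name and the `add` function are illustrative assumptions; the diff only shows the underlying `remote()`, `register()`, and `Future.get()` plumbing.

```python
# Hypothetical usage sketch, assuming babyray exposes init() and a Ray-style
# remote decorator; only the Future/remote/register internals appear in the diff.
import babyray

babyray.init()  # connect to the local scheduler / local object store

@babyray.remote          # assumed decorator name (mirrors Ray Core's @ray.remote)
def add(x, y):
    return x + y

fut = add.remote(1, 2)   # remote() pickles args, sends a ScheduleRequest, returns Future(uid)
print(fut.get())         # get() fetches objectBytes from the object store and pickle.loads() them
```

A script like this would be run from inside a driver container (e.g., via `./log_into_driver.sh`), since only the `ray-node:driver` stage of the Dockerfile pip-installs the `python/` package.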