Commit: Setup ci (#6)
* verify

Signed-off-by: Jiafu Zhang <jiafu.zhang@intel.com>

---------

Signed-off-by: Jiafu Zhang <jiafu.zhang@intel.com>
jiafuzha authored Dec 20, 2023
1 parent 83cb052 commit c5738a1
Showing 8 changed files with 91 additions and 29 deletions.
30 changes: 20 additions & 10 deletions .github/workflows/workflow_finetune.yml
@@ -19,30 +19,44 @@ jobs:
         model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b-chat, huggyllama/llama-7b ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
 
         exclude:
           - { isPR: true }
         include:
           - { model: "EleutherAI/gpt-j-6b"}
           - { model: "meta-llama/Llama-2-7b-chat-hf"}
 
     runs-on: self-hosted
 
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }}
+      env:
+        http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }}
+        https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }}
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+        - ${{ vars.ACTIONS_RUNNER_CONFIG_PATH }}:/root/actions-runner-config
+
     steps:
       - name: Checkout
         uses: actions/checkout@v2
 
       - name: Load environment variables
-        run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV
+        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
 
       - name: Build Docker Image
-        run: docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_IMAGE_BUILD }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_IMAGE_BUILD }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest && yes | docker container prune && yes | docker image prune
+        run: |
+          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest && yes | docker container prune && yes
+          docker image prune -f
       - name: Start Docker Container
         run: |
           cid=$(docker ps -q --filter "name=finetune")
           if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
-          docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/llm-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER_RUN }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER_RUN }} --name="finetune" --hostname="finetune-container" finetune:latest
+          docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="finetune" --hostname="finetune-container" finetune:latest
       - name: Run Finetune Test
         run: |
           docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
@@ -76,7 +90,6 @@ jobs:
           )
           docker exec "finetune" python -c "$CMD"
           docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
-
       - name: Run PEFT-LoRA Test
         run: |
           docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
@@ -96,7 +109,6 @@ jobs:
           )
           docker exec "finetune" python -c "$CMD"
           docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
-
       - name: Run Deltatuner Test on DENAS-LoRA Model
         run: |
           if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf)$ ]]; then
@@ -126,7 +138,6 @@ jobs:
           docker exec "finetune" python -c "$CMD"
           docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
           fi
-
       - name: Stop Ray
         run: |
           cid=$(docker ps -q --filter "name=finetune")
@@ -139,6 +150,5 @@ jobs:
         run: |
           cid=$(docker ps -q --filter "name=finetune")
           if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
-
       - name: Test Summary
-        run: echo "to be continued"
+        run: echo "to be continued"
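
A note on the matrix above: isPR is a single-value axis bound to the workflow's ci_type input, so exclude: - { isPR: true } drops every model combination on PR runs, and the include entries then add back only the two models worth exercising per PR, while nightly runs keep the full list. A standalone sketch of the same pruning trick (workflow, job, and model names here are illustrative, not from this repo):

name: matrix-prune-sketch

on:
  workflow_call:
    inputs:
      ci_type:
        type: string
        default: 'pr'

jobs:
  demo:
    strategy:
      matrix:
        model: [ model-a, model-b, model-c ]
        # Single-value axis: true on PR runs, false otherwise.
        isPR:
          - ${{ inputs.ci_type == 'pr' }}
        # On PR runs this excludes every (model, isPR) combination...
        exclude:
          - { isPR: true }
        # ...and these entries add back the short per-PR list.
        include:
          - { model: model-a }
          - { model: model-b }
    runs-on: ubuntu-latest
    steps:
      - run: echo "testing ${{ matrix.model }}"
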
19 changes: 15 additions & 4 deletions .github/workflows/workflow_finetune_gpu.yml
@@ -10,16 +10,27 @@ jobs:
       matrix:
         model: [ pythia-6.9b, gpt-j-6b ]
     runs-on: self-hosted
 
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }}
+      env:
+        http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }}
+        https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }}
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+
     steps:
       - name: Checkout
         uses: actions/checkout@v2
 
       - name: Running task on Intel GPU
         run: |
-          rm ~/borealis-runner/llm-ray.tar.gz -f
-          tar zcf ~/borealis-runner/llm-ray.tar.gz -C ~/actions-runner/_work/llm-ray .
+          rm ~/borealis-runner/llm-on-ray.tar.gz -f
+          tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray .
           cd ~/borealis-runner/
           python3 finetune_on_pvc.py --base_model "${{ matrix.model }}"
       - name: Test Summary
-        run: echo "to be continued"
+        run: echo "to be continued"
24 changes: 20 additions & 4 deletions .github/workflows/workflow_inference.yml
@@ -32,10 +32,22 @@ jobs:
             model: mpt-7b
 
     runs-on: self-hosted
+
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }}
+      env:
+        http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }}
+        https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }}
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+
     steps:
       - name: Checkout
         uses: actions/checkout@v2
 
       - name: Set Name Prefix
         id: "prefix"
         run: |
@@ -54,14 +66,15 @@ jobs:
             DF_SUFFIX=".cpu_and_deepspeed"
           fi
           PREFIX=${{steps.prefix.outputs.prefix}}
-          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_IMAGE_BUILD }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_IMAGE_BUILD }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes | docker image prune
+          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes
+          docker image prune -f
       - name: Start Docker Container
         run: |
           PREFIX=${{steps.prefix.outputs.prefix}}
           cid=$(docker ps -q --filter "name=${PREFIX}")
           if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
-          docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/llm-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER_RUN }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER_RUN }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest
+          docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest
       - name: Start Ray Cluster
         run: |
@@ -126,4 +139,7 @@ jobs:
           if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
       - name: Test Summary
-        run: echo "to be continued"
+        run: echo "to be continued"
+
+
+
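The same container block appears in all three workflows: steps run inside a runner container, yet they still call docker build and docker run. Mounting /var/run/docker.sock is what makes that work; the Docker CLI inside the runner container talks to the host daemon, so the finetune and inference containers are started as siblings on the host rather than nested inside the runner. A one-line way to observe this (a sketch, assuming a Docker-enabled host):

# A container given the host's Docker socket controls the host daemon,
# so "docker ps" inside it lists the host's containers, itself included.
docker run --rm -v /var/run/docker.sock:/var/run/docker.sock docker:cli docker ps
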
23 changes: 23 additions & 0 deletions .github/workflows/workflow_orders_on_merge.yml
@@ -0,0 +1,23 @@
+name: llm-ray inference & finetune
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - '.github/**'
+      - 'docker/**'
+      - 'common/**'
+      - 'dev/docker/**'
+      - 'finetune/**'
+      - 'inference/**'
+      - 'rlhf/**'
+      - 'tools/**'
+
+jobs:
+
+  call-inference:
+    uses: ./.github/workflows/workflow_inference.yml
+
+  call-finetune:
+    uses: ./.github/workflows/workflow_finetune.yml
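
These uses: ./.github/workflows/... entries only resolve if each called workflow declares a workflow_call trigger. A minimal sketch of the callee side (names are illustrative; the real workflow_inference.yml and workflow_finetune.yml define their own inputs, and ci_type mirrors the input passed by the nightly caller below):

name: callee-sketch

on:
  workflow_call:
    inputs:
      ci_type:
        type: string
        default: 'pr'

jobs:
  run-tests:
    runs-on: ubuntu-latest
    steps:
      - run: echo "ci_type=${{ inputs.ci_type }}"
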
6 changes: 3 additions & 3 deletions .github/workflows/workflow_orders_nightly.yml
@@ -1,8 +1,8 @@
-name: llm-ray inference & finetune
+name: llm-ray inference & finetune nightly
 
 on:
   schedule:
-    - cron: "0 21 * * *"
+    - cron: "0 16 * * *"
 
 jobs:

@@ -17,4 +17,4 @@ jobs:
       ci_type: nightly
 
   call-finetune-on-intel-gpu:
-    uses: ./.github/workflows/workflow_finetune_gpu.yml
+    uses: ./.github/workflows/workflow_finetune_gpu.yml
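
A detail worth remembering when editing these schedules: GitHub evaluates cron expressions in UTC, so moving from "0 21 * * *" to "0 16 * * *" shifts the nightly run five hours earlier; 16:00 UTC is, for example, midnight in UTC+8. An inline comment keeps that intent visible (the time-zone intent here is an assumption):

on:
  schedule:
    # 16:00 UTC (00:00 UTC+8); GitHub cron is always UTC
    - cron: "0 16 * * *"
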
File renamed without changes.
9 changes: 5 additions & 4 deletions dev/docker/Dockerfile.bigdl-cpu
@@ -1,10 +1,11 @@
+# syntax=docker/dockerfile:1
 FROM ubuntu:22.04
 
 ENV LANG C.UTF-8
 
-WORKDIR /root/llm-ray
+WORKDIR /root/llm-on-ray
 
-RUN apt-get update -y \
+RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
     && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
@@ -17,7 +18,7 @@ ENV PATH $CONDA_DIR/bin:$PATH
 # setup env
 SHELL ["/bin/bash", "--login", "-c"]
 
-RUN conda init bash && \
+RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
     unset -f conda && \
     export PATH=$CONDA_DIR/bin/:${PATH} && \
     conda config --add channels intel && \
@@ -27,7 +28,7 @@ COPY ./pyproject.toml .
 
 RUN mkdir ./finetune && mkdir ./inference
 
-RUN pip install -e .[bigdl-cpu] -f https://developer.intel.com/ipex-whl-stable-cpu \
+RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[bigdl-cpu] -f https://developer.intel.com/ipex-whl-stable-cpu \
     -f https://download.pytorch.org/whl/torch_stable.html
 
 # Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
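
The pattern in both Dockerfiles: RUN --mount=type=cache keeps a named directory (apt archives, conda packages, pip wheels) on the build host across builds without baking it into the image, and the # syntax=docker/dockerfile:1 directive added at the top is what enables the feature. A minimal self-contained sketch of the same idea (assumes a BuildKit-enabled Docker; package choices are illustrative):

# syntax=docker/dockerfile:1
FROM ubuntu:22.04

# The cache mounts persist between builds on the build host, so repeated
# CI builds can skip re-downloading packages; nothing from the mounted
# directories ends up in the final image layers.
RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y && apt-get install -y python3-pip

RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install requests
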
9 changes: 5 additions & 4 deletions dev/docker/Dockerfile.cpu_and_deepspeed
@@ -1,10 +1,11 @@
+# syntax=docker/dockerfile:1
 FROM ubuntu:22.04
 
 ENV LANG C.UTF-8
 
-WORKDIR /root/llm-ray
+WORKDIR /root/llm-on-ray
 
-RUN apt-get update -y \
+RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
     && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
@@ -17,7 +18,7 @@ ENV PATH $CONDA_DIR/bin:$PATH
 # setup env
 SHELL ["/bin/bash", "--login", "-c"]
 
-RUN conda init bash && \
+RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
     unset -f conda && \
     export PATH=$CONDA_DIR/bin/:${PATH} && \
     conda config --add channels intel && \
@@ -27,7 +28,7 @@ COPY ./pyproject.toml .
 
 RUN mkdir ./finetune && mkdir ./inference
 
-RUN pip install -e .[cpu,deepspeed] -f https://developer.intel.com/ipex-whl-stable-cpu \
+RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu,deepspeed] -f https://developer.intel.com/ipex-whl-stable-cpu \
     -f https://download.pytorch.org/whl/torch_stable.html
 
 RUN ds_report
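
Separately from the cache mounts, the CACHEBUST idiom referenced in the Dockerfile comment above works because every RUN after an ARG declaration sees the ARG as part of its build environment, so passing a fresh value invalidates those layers even when nothing else changed. Sketch (the clone URL is a placeholder, not this project's layout):

ARG CACHEBUST=1
# Rebuilding with a new value forces the layers below to rerun:
#   docker build --build-arg CACHEBUST=$(date +%s) .
RUN echo "bust=$CACHEBUST" && git clone https://github.com/example/repo.git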
