Commit: Setup ci (#6)
* verify

Signed-off-by: Jiafu Zhang <jiafu.zhang@intel.com>

---------

Signed-off-by: Jiafu Zhang <jiafu.zhang@intel.com>
jiafuzha authored Dec 20, 2023
1 parent 83cb052 commit c5738a1
Showing 8 changed files with 91 additions and 29 deletions.
30 changes: 20 additions & 10 deletions .github/workflows/workflow_finetune.yml
@@ -19,30 +19,44 @@ jobs:
         model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b-chat, huggyllama/llama-7b ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
 
         exclude:
           - { isPR: true }
         include:
           - { model: "EleutherAI/gpt-j-6b"}
           - { model: "meta-llama/Llama-2-7b-chat-hf"}
 
     runs-on: self-hosted
 
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }}
+      env:
+        http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }}
+        https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }}
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+        - ${{ vars.ACTIONS_RUNNER_CONFIG_PATH }}:/root/actions-runner-config
+
     steps:
       - name: Checkout
         uses: actions/checkout@v2
 
       - name: Load environment variables
-        run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV
+        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
 
       - name: Build Docker Image
-        run: docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_IMAGE_BUILD }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_IMAGE_BUILD }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest && yes | docker container prune && yes | docker image prune
+        run: |
+          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest && yes | docker container prune && yes
+          docker image prune -f
       - name: Start Docker Container
         run: |
           cid=$(docker ps -q --filter "name=finetune")
           if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
-          docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/llm-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER_RUN }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER_RUN }} --name="finetune" --hostname="finetune-container" finetune:latest
+          docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="finetune" --hostname="finetune-container" finetune:latest
       - name: Run Finetune Test
         run: |
           docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
@@ -76,7 +90,6 @@ jobs:
           )
           docker exec "finetune" python -c "$CMD"
           docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
-
       - name: Run PEFT-LoRA Test
         run: |
           docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
@@ -96,7 +109,6 @@ jobs:
           )
           docker exec "finetune" python -c "$CMD"
           docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
-
       - name: Run Deltatuner Test on DENAS-LoRA Model
         run: |
           if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf)$ ]]; then
@@ -126,7 +138,6 @@ jobs:
           docker exec "finetune" python -c "$CMD"
           docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf"
           fi
-
       - name: Stop Ray
         run: |
           cid=$(docker ps -q --filter "name=finetune")
@@ -139,6 +150,5 @@ jobs:
         run: |
           cid=$(docker ps -q --filter "name=finetune")
           if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
-
       - name: Test Summary
-        run: echo "to be continued"
+        run: echo "to be continued"
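
A note on the matrix above: isPR is a single-value axis bound to the workflow's ci_type input, so exclude: - { isPR: true } drops every model combination on PR runs, and the include entries then add back only the two models worth exercising per PR, while nightly runs keep the full list. A standalone sketch of the same pruning trick (workflow, job, and model names here are illustrative, not from this repo):

name: matrix-prune-sketch

on:
  workflow_call:
    inputs:
      ci_type:
        type: string
        default: 'pr'

jobs:
  demo:
    strategy:
      matrix:
        model: [ model-a, model-b, model-c ]
        # Single-value axis: true on PR runs, false otherwise.
        isPR:
          - ${{ inputs.ci_type == 'pr' }}
        # On PR runs this excludes every (model, isPR) combination...
        exclude:
          - { isPR: true }
        # ...and these entries add back the short per-PR list.
        include:
          - { model: model-a }
          - { model: model-b }
    runs-on: ubuntu-latest
    steps:
      - run: echo "testing ${{ matrix.model }}"
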
19 changes: 15 additions & 4 deletions .github/workflows/workflow_finetune_gpu.yml
@@ -10,16 +10,27 @@ jobs:
       matrix:
         model: [ pythia-6.9b, gpt-j-6b ]
     runs-on: self-hosted
 
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }}
+      env:
+        http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }}
+        https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }}
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+
     steps:
       - name: Checkout
         uses: actions/checkout@v2
 
       - name: Running task on Intel GPU
         run: |
-          rm ~/borealis-runner/llm-ray.tar.gz -f
-          tar zcf ~/borealis-runner/llm-ray.tar.gz -C ~/actions-runner/_work/llm-ray .
+          rm ~/borealis-runner/llm-on-ray.tar.gz -f
+          tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray .
           cd ~/borealis-runner/
           python3 finetune_on_pvc.py --base_model "${{ matrix.model }}"
       - name: Test Summary
-        run: echo "to be continued"
+        run: echo "to be continued"
24 changes: 20 additions & 4 deletions .github/workflows/workflow_inference.yml
@@ -32,10 +32,22 @@ jobs:
             model: mpt-7b
 
     runs-on: self-hosted
+
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }}
+      env:
+        http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }}
+        https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }}
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+
     steps:
       - name: Checkout
         uses: actions/checkout@v2
 
       - name: Set Name Prefix
         id: "prefix"
         run: |
@@ -54,14 +66,15 @@ jobs:
             DF_SUFFIX=".cpu_and_deepspeed"
           fi
           PREFIX=${{steps.prefix.outputs.prefix}}
-          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_IMAGE_BUILD }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_IMAGE_BUILD }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes | docker image prune
+          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes
+          docker image prune -f
       - name: Start Docker Container
         run: |
           PREFIX=${{steps.prefix.outputs.prefix}}
           cid=$(docker ps -q --filter "name=${PREFIX}")
           if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
-          docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/llm-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER_RUN }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER_RUN }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest
+          docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest
       - name: Start Ray Cluster
         run: |
@@ -126,4 +139,7 @@ jobs:
           if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
       - name: Test Summary
-        run: echo "to be continued"
+        run: echo "to be continued"
+
+
+
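The same container block appears in all three workflows: steps run inside a runner container, yet they still call docker build and docker run. Mounting /var/run/docker.sock is what makes that work; the Docker CLI inside the runner container talks to the host daemon, so the finetune and inference containers are started as siblings on the host rather than nested inside the runner. A one-line way to observe this (a sketch, assuming a Docker-enabled host):

# A container given the host's Docker socket controls the host daemon,
# so "docker ps" inside it lists the host's containers, itself included.
docker run --rm -v /var/run/docker.sock:/var/run/docker.sock docker:cli docker ps
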
23 changes: 23 additions & 0 deletions .github/workflows/workflow_orders_on_merge.yml
@@ -0,0 +1,23 @@
+name: llm-ray inference & finetune
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - '.github/**'
+      - 'docker/**'
+      - 'common/**'
+      - 'dev/docker/**'
+      - 'finetune/**'
+      - 'inference/**'
+      - 'rlhf/**'
+      - 'tools/**'
+
+jobs:
+
+  call-inference:
+    uses: ./.github/workflows/workflow_inference.yml
+
+  call-finetune:
+    uses: ./.github/workflows/workflow_finetune.yml
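
These uses: ./.github/workflows/... entries only resolve if each called workflow declares a workflow_call trigger. A minimal sketch of the callee side (names are illustrative; the real workflow_inference.yml and workflow_finetune.yml define their own inputs, and ci_type mirrors the input passed by the nightly caller below):

name: callee-sketch

on:
  workflow_call:
    inputs:
      ci_type:
        type: string
        default: 'pr'

jobs:
  run-tests:
    runs-on: ubuntu-latest
    steps:
      - run: echo "ci_type=${{ inputs.ci_type }}"
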
6 changes: 3 additions & 3 deletions .github/workflows/workflow_orders_nightly.yml
@@ -1,8 +1,8 @@
-name: llm-ray inference & finetune
+name: llm-ray inference & finetune nightly
 
 on:
   schedule:
-    - cron: "0 21 * * *"
+    - cron: "0 16 * * *"
 
 jobs:

@@ -17,4 +17,4 @@ jobs:
       ci_type: nightly
 
   call-finetune-on-intel-gpu:
-    uses: ./.github/workflows/workflow_finetune_gpu.yml
+    uses: ./.github/workflows/workflow_finetune_gpu.yml
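
A detail worth remembering when editing these schedules: GitHub evaluates cron expressions in UTC, so moving from "0 21 * * *" to "0 16 * * *" shifts the nightly run five hours earlier; 16:00 UTC is, for example, midnight in UTC+8. An inline comment keeps that intent visible (the time-zone intent here is an assumption):

on:
  schedule:
    # 16:00 UTC (00:00 UTC+8); GitHub cron is always UTC
    - cron: "0 16 * * *"
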
File renamed without changes.
9 changes: 5 additions & 4 deletions dev/docker/Dockerfile.bigdl-cpu
@@ -1,10 +1,11 @@
+# syntax=docker/dockerfile:1
 FROM ubuntu:22.04
 
 ENV LANG C.UTF-8
 
-WORKDIR /root/llm-ray
+WORKDIR /root/llm-on-ray
 
-RUN apt-get update -y \
+RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
     && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
@@ -17,7 +18,7 @@ ENV PATH $CONDA_DIR/bin:$PATH
 # setup env
 SHELL ["/bin/bash", "--login", "-c"]
 
-RUN conda init bash && \
+RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
     unset -f conda && \
     export PATH=$CONDA_DIR/bin/:${PATH} && \
     conda config --add channels intel && \
@@ -27,7 +28,7 @@ COPY ./pyproject.toml .
 
 RUN mkdir ./finetune && mkdir ./inference
 
-RUN pip install -e .[bigdl-cpu] -f https://developer.intel.com/ipex-whl-stable-cpu \
+RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[bigdl-cpu] -f https://developer.intel.com/ipex-whl-stable-cpu \
     -f https://download.pytorch.org/whl/torch_stable.html
 
 # Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
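
The pattern in both Dockerfiles: RUN --mount=type=cache keeps a named directory (apt archives, conda packages, pip wheels) on the build host across builds without baking it into the image, and the # syntax=docker/dockerfile:1 directive added at the top is what enables the feature. A minimal self-contained sketch of the same idea (assumes a BuildKit-enabled Docker; package choices are illustrative):

# syntax=docker/dockerfile:1
FROM ubuntu:22.04

# The cache mounts persist between builds on the build host, so repeated
# CI builds can skip re-downloading packages; nothing from the mounted
# directories ends up in the final image layers.
RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y && apt-get install -y python3-pip

RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install requests
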
9 changes: 5 additions & 4 deletions dev/docker/Dockerfile.cpu_and_deepspeed
@@ -1,10 +1,11 @@
+# syntax=docker/dockerfile:1
 FROM ubuntu:22.04
 
 ENV LANG C.UTF-8
 
-WORKDIR /root/llm-ray
+WORKDIR /root/llm-on-ray
 
-RUN apt-get update -y \
+RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
     && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
@@ -17,7 +18,7 @@ ENV PATH $CONDA_DIR/bin:$PATH
 # setup env
 SHELL ["/bin/bash", "--login", "-c"]
 
-RUN conda init bash && \
+RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
     unset -f conda && \
     export PATH=$CONDA_DIR/bin/:${PATH} && \
     conda config --add channels intel && \
@@ -27,7 +28,7 @@ COPY ./pyproject.toml .
 
 RUN mkdir ./finetune && mkdir ./inference
 
-RUN pip install -e .[cpu,deepspeed] -f https://developer.intel.com/ipex-whl-stable-cpu \
+RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu,deepspeed] -f https://developer.intel.com/ipex-whl-stable-cpu \
     -f https://download.pytorch.org/whl/torch_stable.html
 
 RUN ds_report
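
Separately from the cache mounts, the CACHEBUST idiom referenced in the Dockerfile comment above works because every RUN after an ARG declaration sees the ARG as part of its build environment, so passing a fresh value invalidates those layers even when nothing else changed. Sketch (the clone URL is a placeholder, not this project's layout):

ARG CACHEBUST=1
# Rebuilding with a new value forces the layers below to rerun:
#   docker build --build-arg CACHEBUST=$(date +%s) .
RUN echo "bust=$CACHEBUST" && git clone https://github.com/example/repo.git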
