workflow_inference_gaudi2.yml
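# Reusable workflow: runs inference tests for a matrix of models on a Gaudi2
# (Habana HPU) self-hosted runner. It builds a Habana CI image, starts a test
# container and a Ray cluster inside it, serves each model with llm_on_ray-serve,
# and issues a streaming query against the OpenAI-compatible example server.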
name: Inference

on:
  workflow_call:
    inputs:
      ci_type:
        type: string
        default: 'pr'
      runner_container_image:
        type: string
        default: '127.0.0.1:5000/llmray-build'
      runner_config_path:
        type: string
        default: '/home/ci/llm-ray-actions-runner'
      code_checkout_path:
        type: string
        default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
      model_cache_path:
        type: string
        default: '/scratch-2/huggingface/cache'
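# This workflow is only triggered via `workflow_call`. A caller workflow would
# invoke it roughly as sketched below (hypothetical caller job name and file
# path; the inputs shown are the ones declared above, all of which have defaults):
#
#   jobs:
#     inference-gaudi2:
#       uses: ./.github/workflows/workflow_inference_gaudi2.yml
#       with:
#         ci_type: pr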
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-gaudi2
  cancel-in-progress: true

permissions: # added using https://github.com/step-security/secure-repo
  contents: read
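# A single job fans out over the model matrix. The isPR dimension, combined with
# the exclude/include lists below, determines which model configurations are
# generated depending on whether the ci_type input is 'pr'.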
jobs:
  inference:
    name: inference
    strategy:
      matrix:
        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, mistral-7b-v0.1, mpt-7b, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
        isPR:
          - ${{inputs.ci_type == 'pr'}}
        exclude:
          - { isPR: true }
        include:
          - { model: "bloom-7b1"}
          - { model: "CodeLlama-7b-hf"}
          - { model: "falcon-7b"}
          - { model: "falcon-40b"}
          - { model: "gemma-2b"}
          - { model: "gpt-j-6b"}
          - { model: "gpt2"}
          - { model: "llama-2-7b-chat-hf"}
          - { model: "llama-2-70b-chat-hf"}
          - { model: "meta-llama-3-8b-instruct"}
          - { model: "meta-llama-3-70b-instruct"}
          - { model: "mpt-7b"}
          - { model: "mistral-7b-v0.1"}
          - { model: "Qwen2-7B-Instruct"}
          - { model: "llama-2-7b-chat-hf-vllm"}
    runs-on: gaudi2
    defaults:
      run:
        shell: bash
    container:
      image: ${{ inputs.runner_container_image }}
      env:
        SHELL: bash -eo pipefail
        http_proxy:
        https_proxy:
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
        - ${{ inputs.runner_config_path }}:/root/actions-runner-config

    steps:
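      # Derive a per-model target name: "inference_gaudi2" for most models,
      # "inference_vllm_gaudi2" for the vLLM variant. It is exposed as a step
      # output so later steps can reuse it for the image and container names.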
      - name: Determine Target
        id: "target"
        run: |
          target="inference"
          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
            target="${target}_vllm_gaudi2"
          else
            target="${target}_gaudi2"
          fi
          echo "target is ${target}"
          echo "target=$target" >> $GITHUB_OUTPUT
      - name: Checkout
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
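      # Build the Habana CI image; the vLLM model variant uses a dedicated Dockerfile.
      # Pruning keeps stopped containers and dangling images from piling up on the
      # self-hosted runner.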
      - name: Build Docker Image
        run: |
          DF_SUFFIX=".gaudi2"
          TARGET=${{steps.target.outputs.target}}
          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
            dockerfile="dev/docker/ci/Dockerfile.habana_vllm"
          else
            dockerfile="dev/docker/ci/Dockerfile.habana"
          fi
          docker build --build-arg CACHEBUST=1 -f ${dockerfile} -t ${TARGET}:habana .
          docker container prune -f
          docker image prune -f
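      # Remove any leftover container with the same name (running or exited), then
      # start a fresh one with the Habana runtime, host networking/IPC, and the code
      # checkout plus the HuggingFace model cache mounted in.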
      - name: Start Docker Container
        run: |
          TARGET=${{steps.target.outputs.target}}
          cid=$(docker ps -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
          # check and remove exited container
          cid=$(docker ps -a -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then docker rm $cid; fi
          docker run -tid --name="${TARGET}" --hostname="${TARGET}-container" --runtime=habana -v /home/yizhong/Model-References:/root/Model-References -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub/ -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --cap-add sys_ptrace --net=host --ipc=host ${TARGET}:habana
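      # Bring up a Ray cluster inside the test container; model serving and the
      # test query below run against this cluster.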
      - name: Start Ray Cluster
        run: |
          TARGET=${{steps.target.outputs.target}}
          docker exec "${TARGET}" bash -c "./dev/scripts/start-ray-cluster.sh"
      - name: Run Inference Test
        run: |
          TARGET=${{steps.target.outputs.target}}
          CMD=$(cat << EOF
          import yaml
          conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml"
          if ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
              conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
          with open(conf_path, encoding="utf-8") as reader:
              result = yaml.load(reader, Loader=yaml.FullLoader)
          with open(conf_path, 'w') as output:
              yaml.dump(result, output, sort_keys=False)
          EOF
          )
          docker exec "${TARGET}" python -c "$CMD"
          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then
            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml --keep_serve_terminal"
          elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
          elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
          else
            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/${{ matrix.model }}-hpu.yaml --keep_serve_terminal"
          fi
          echo Streaming query:
          docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response"
      - name: Stop Ray
        run: |
          TARGET=${{steps.target.outputs.target}}
          cid=$(docker ps -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then
            docker exec "${TARGET}" bash -c "ray stop"
          fi
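      # Always remove the test container, whether the previous steps passed or failed.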
      - name: Stop Container
        if: success() || failure()
        run: |
          TARGET=${{steps.target.outputs.target}}
          cid=$(docker ps -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi