From b6c28ff4dd4ad0002ea4abdcd85cc097d4d226fb Mon Sep 17 00:00:00 2001 From: Robert Dower Date: Sun, 12 Nov 2023 21:08:24 -0800 Subject: [PATCH 01/14] add code_of_conduct, contributing agreement, and security.md file --- CODE_OF_CONDUCT.md | 131 +++++++++++++++++++++++++++++++++++++++++++++ CONTRIBUTING.md | 57 ++++++++++++++++++++ security.md | 5 ++ 3 files changed, 193 insertions(+) create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 security.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..58dba18db --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,131 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +CommunityCodeOfConduct AT intel DOT com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. 
+ +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..f682f4e4c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,57 @@ +# Contributing + +### License + + is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. + +### Sign your work + +Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify +the below (from [developercertificate.org](http://developercertificate.org/)): + +``` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 
+660 York Street, Suite 102, +San Francisco, CA 94110 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +Then you just add a line to every git commit message: + + Signed-off-by: Joe Smith + +Use your real name (sorry, no pseudonyms or anonymous contributions.) + +If you set your `user.name` and `user.email` git configs, you can sign your +commit automatically with `git commit -s`. diff --git a/security.md b/security.md new file mode 100644 index 000000000..cb59eb893 --- /dev/null +++ b/security.md @@ -0,0 +1,5 @@ +# Security Policy +Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. + +## Reporting a Vulnerability +Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). From d17a473f1f56ff69c83de4f7710c2a10848b720f Mon Sep 17 00:00:00 2001 From: Carson Wang Date: Thu, 7 Dec 2023 19:30:23 +0800 Subject: [PATCH 02/14] Remove duplicate Security.md --- Security.md | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 Security.md diff --git a/Security.md b/Security.md deleted file mode 100644 index d85d4358b..000000000 --- a/Security.md +++ /dev/null @@ -1,5 +0,0 @@ -# Security Policy -Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. - -## Reporting a Vulnerability -Please report any security vulnerabilities in this project [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 
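The CONTRIBUTING.md added above asks for a `Signed-off-by` trailer on every commit and notes that `git commit -s` can add it automatically once `user.name` and `user.email` are set. A minimal sketch of that workflow, assuming placeholder identity values (substitute your real name and email):

```bash
# One-time identity setup; the name and address below are placeholders.
git config --global user.name "Joe Smith"
git config --global user.email "joe.smith@example.com"

# -s / --signoff appends "Signed-off-by: Joe Smith <joe.smith@example.com>"
# to the commit message automatically.
git commit -s -m "Describe your change here"

# Verify the trailer is present before pushing.
git log -1 --format=%B
```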
From cbb9898b34a81ab9e5814e92df94dde8ac9dd959 Mon Sep 17 00:00:00 2001 From: Carson Wang Date: Thu, 7 Dec 2023 19:43:36 +0800 Subject: [PATCH 03/14] Update image link in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 55198574f..8e38da957 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ LLM-on-Ray's modular workflow structure is designed to comprehensively cater to * **Interactive Web UI for Enhanced Usability**: Except for command line, LLM-on-Ray introduces a Web UI, allowing users to easily finetune and deploy LLMs through a user-friendly interface. Additionally, the UI includes a chatbot application, enabling users to immediately test and refine the models. -![image](https://github.com/intel-sandbox/llm-ray/assets/9278199/addd7a7f-83ef-43ae-b3ac-dd81cc2570e4) +![llm-on-ray](https://github.com/intel/llm-on-ray/assets/9278199/68017c14-c0be-4b91-8d71-4b74ab89bd81) ## Getting Started From 83cb052ee9cc032139d6349db845cfbc37d1bc22 Mon Sep 17 00:00:00 2001 From: Carson Wang Date: Fri, 15 Dec 2023 13:59:24 +0800 Subject: [PATCH 04/14] Update web_ui.md Signed-off-by: Carson Wang --- docs/web_ui.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/web_ui.md b/docs/web_ui.md index 3b0578643..da92b7f66 100644 --- a/docs/web_ui.md +++ b/docs/web_ui.md @@ -3,7 +3,7 @@ LLM-on-Ray introduces a Web UI, allowing users to easily finetune and deploy LLMs through a user-friendly interface. Additionally, the UI includes a chatbot application, enabling users to immediately test and refine the models. ## Setup -Please follow [docs/setup.md](docs/setup.md) to setup the environment first. +Please follow [setup.md](setup.md) to setup the environment first. ## Start Web UI @@ -17,17 +17,19 @@ python -u inference/start_ui.py --node_user_name $user --conda_env_name $conda_e ## Finetune LLMs On the `Finetune` tab, you can configure the base model, finetuning parameters, the dataset path and the new model name. Click `Start To Finetune` to start finetuning. -![image](https://github.com/carsonwang/llm-ray/assets/9278199/38cb6f1f-b5de-495e-a4db-741eb1e15980) +![webui1](https://github.com/intel/llm-on-ray/assets/9278199/895be765-13d3-455e-a00d-c9ba67ac6781) + ## Deploy and Serve LLM On the `Deployment` tab, you can choose a model to deploy, configure parameter `Model Replica Number`, `Cpus per Worker` and `Gpus per Worker`. Click `Deploy` and you will get a model endpoint. -![image](https://github.com/carsonwang/llm-ray/assets/9278199/937613ad-951c-4543-9e2d-e5b8e7f38d1b) +![webui2](https://github.com/intel/llm-on-ray/assets/9278199/2a1fb8f2-a2a8-4868-9d1c-418c5c2a6180) + ## Chatbot On the `Inferenc` tab, you can now test the model by asking questions. 
-![image](https://github.com/carsonwang/llm-ray/assets/9278199/5aa3dace-238a-4b34-9ce2-b3abbd6de2ba) +![webui3](https://github.com/intel/llm-on-ray/assets/9278199/f7b9dc79-92fe-4e75-85fa-2cf7f36bb58d) From c5738a1e2dd3bf9bf9a04c01e7cbd4b7dd66159a Mon Sep 17 00:00:00 2001 From: jiafu zhang Date: Wed, 20 Dec 2023 07:55:32 +0000 Subject: [PATCH 05/14] Setup ci (#6) * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang --------- Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 30 ++++++++++++------- .github/workflows/workflow_finetune_gpu.yml | 19 +++++++++--- .github/workflows/workflow_inference.yml | 24 ++++++++++++--- .../workflows/workflow_orders on_merge.yml | 23 ++++++++++++++ .github/workflows/workflow_orders_nightly.yml | 6 ++-- ...w_orders.yml => workflow_orders_on_pr.yml} | 0 dev/docker/Dockerfile.bigdl-cpu | 9 +++--- dev/docker/Dockerfile.cpu_and_deepspeed | 9 +++--- 8 files changed, 91 insertions(+), 29 deletions(-) create mode 100644 .github/workflows/workflow_orders on_merge.yml rename .github/workflows/{workflow_orders.yml => workflow_orders_on_pr.yml} (100%) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 1aaaf4df5..bb7c99326 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -19,7 +19,7 @@ jobs: model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b-chat, huggyllama/llama-7b ] isPR: - ${{inputs.ci_type == 'pr'}} - + exclude: - { isPR: true } include: @@ -27,22 +27,36 @@ jobs: - { model: "meta-llama/Llama-2-7b-chat-hf"} runs-on: self-hosted + + defaults: + run: + shell: bash + container: + image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }} + env: + http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }} + https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ${{ vars.ACTIONS_RUNNER_CONFIG_PATH }}:/root/actions-runner-config + steps: - name: Checkout uses: actions/checkout@v2 - name: Load environment variables - run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV + run: cat /root/actions-runner-config/.env >> $GITHUB_ENV - name: Build Docker Image - run: docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_IMAGE_BUILD }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_IMAGE_BUILD }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest && yes | docker container prune && yes | docker image prune + run: | + docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest && yes | docker container prune && yes + docker image prune -f - name: Start Docker Container run: | cid=$(docker ps -q --filter 
"name=finetune") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi - docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/llm-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER_RUN }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER_RUN }} --name="finetune" --hostname="finetune-container" finetune:latest - + docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="finetune" --hostname="finetune-container" finetune:latest - name: Run Finetune Test run: | docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external" @@ -76,7 +90,6 @@ jobs: ) docker exec "finetune" python -c "$CMD" docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" - - name: Run PEFT-LoRA Test run: | docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*" @@ -96,7 +109,6 @@ jobs: ) docker exec "finetune" python -c "$CMD" docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" - - name: Run Deltatuner Test on DENAS-LoRA Model run: | if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf)$ ]]; then @@ -126,7 +138,6 @@ jobs: docker exec "finetune" python -c "$CMD" docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" fi - - name: Stop Ray run: | cid=$(docker ps -q --filter "name=finetune") @@ -139,6 +150,5 @@ jobs: run: | cid=$(docker ps -q --filter "name=finetune") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi - - name: Test Summary - run: echo "to be continued" \ No newline at end of file + run: echo "to be continued" diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml index e3adb7923..f18e4eaf5 100644 --- a/.github/workflows/workflow_finetune_gpu.yml +++ b/.github/workflows/workflow_finetune_gpu.yml @@ -10,16 +10,27 @@ jobs: matrix: model: [ pythia-6.9b, gpt-j-6b ] runs-on: self-hosted + + defaults: + run: + shell: bash + container: + image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }} + env: + http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }} + https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + steps: - name: Checkout uses: actions/checkout@v2 - name: Running task on Intel GPU run: | - rm ~/borealis-runner/llm-ray.tar.gz -f - tar zcf ~/borealis-runner/llm-ray.tar.gz -C ~/actions-runner/_work/llm-ray . + rm ~/borealis-runner/llm-on-ray.tar.gz -f + tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray . 
cd ~/borealis-runner/ python3 finetune_on_pvc.py --base_model "${{ matrix.model }}" - - name: Test Summary - run: echo "to be continued" + run: echo "to be continued" \ No newline at end of file diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index 4662ee5eb..d4e8d21b6 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -32,10 +32,22 @@ jobs: model: mpt-7b runs-on: self-hosted + + defaults: + run: + shell: bash + container: + image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }} + env: + http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }} + https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + steps: - name: Checkout uses: actions/checkout@v2 - + - name: Set Name Prefix id: "prefix" run: | @@ -54,14 +66,15 @@ jobs: DF_SUFFIX=".cpu_and_deepspeed" fi PREFIX=${{steps.prefix.outputs.prefix}} - docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_IMAGE_BUILD }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_IMAGE_BUILD }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes | docker image prune + docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes + docker image prune -f - name: Start Docker Container run: | PREFIX=${{steps.prefix.outputs.prefix}} cid=$(docker ps -q --filter "name=${PREFIX}") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi - docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/llm-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER_RUN }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER_RUN }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest + docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest - name: Start Ray Cluster run: | @@ -126,4 +139,7 @@ jobs: if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid; fi - name: Test Summary - run: echo "to be continued" \ No newline at end of file + run: echo "to be continued" + + + \ No newline at end of file diff --git a/.github/workflows/workflow_orders on_merge.yml b/.github/workflows/workflow_orders on_merge.yml new file mode 100644 index 000000000..e453f242b --- /dev/null +++ b/.github/workflows/workflow_orders on_merge.yml @@ -0,0 +1,23 @@ +name: llm-ray inference & finetune + +on: + push: + branches: + - main + paths: + - '.github/**' + - 'docker/**' + - 'common/**' + - 'dev/docker/**' + - 'finetune/**' + - 'inference/**' + - 'rlhf/**' + - 'tools/**' + +jobs: + + call-inference: + uses: ./.github/workflows/workflow_inference.yml + + call-finetune: + uses: ./.github/workflows/workflow_finetune.yml diff --git a/.github/workflows/workflow_orders_nightly.yml b/.github/workflows/workflow_orders_nightly.yml index 2ba24db1a..9ee0fd202 100644 --- a/.github/workflows/workflow_orders_nightly.yml +++ b/.github/workflows/workflow_orders_nightly.yml @@ -1,8 +1,8 @@ -name: llm-ray inference & finetune +name: llm-ray inference & finetune nightly on: schedule: - - cron: "0 21 * * *" + - cron: "0 16 * * *" jobs: @@ -17,4 +17,4 @@ jobs: ci_type: nightly call-finetune-on-intel-gpu: - uses: ./.github/workflows/workflow_finetune_gpu.yml + uses: ./.github/workflows/workflow_finetune_gpu.yml \ No newline at end of file diff --git a/.github/workflows/workflow_orders.yml b/.github/workflows/workflow_orders_on_pr.yml similarity index 100% rename from .github/workflows/workflow_orders.yml rename to .github/workflows/workflow_orders_on_pr.yml diff --git a/dev/docker/Dockerfile.bigdl-cpu b/dev/docker/Dockerfile.bigdl-cpu index 449a456b4..403848876 100644 --- a/dev/docker/Dockerfile.bigdl-cpu +++ b/dev/docker/Dockerfile.bigdl-cpu @@ -1,10 +1,11 @@ +# syntax=docker/dockerfile:1 FROM ubuntu:22.04 ENV LANG C.UTF-8 -WORKDIR /root/llm-ray +WORKDIR /root/llm-on-ray -RUN apt-get update -y \ +RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \ && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -17,7 +18,7 @@ ENV PATH $CONDA_DIR/bin:$PATH # setup env SHELL ["/bin/bash", "--login", "-c"] -RUN conda init bash && \ +RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ unset -f conda && \ export PATH=$CONDA_DIR/bin/:${PATH} && \ conda config --add channels intel && \ @@ -27,7 +28,7 @@ COPY ./pyproject.toml . 
RUN mkdir ./finetune && mkdir ./inference -RUN pip install -e .[bigdl-cpu] -f https://developer.intel.com/ipex-whl-stable-cpu \ +RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[bigdl-cpu] -f https://developer.intel.com/ipex-whl-stable-cpu \ -f https://download.pytorch.org/whl/torch_stable.html # Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s) diff --git a/dev/docker/Dockerfile.cpu_and_deepspeed b/dev/docker/Dockerfile.cpu_and_deepspeed index a84ed7bdc..c907d775f 100644 --- a/dev/docker/Dockerfile.cpu_and_deepspeed +++ b/dev/docker/Dockerfile.cpu_and_deepspeed @@ -1,10 +1,11 @@ +# syntax=docker/dockerfile:1 FROM ubuntu:22.04 ENV LANG C.UTF-8 -WORKDIR /root/llm-ray +WORKDIR /root/llm-on-ray -RUN apt-get update -y \ +RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \ && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -17,7 +18,7 @@ ENV PATH $CONDA_DIR/bin:$PATH # setup env SHELL ["/bin/bash", "--login", "-c"] -RUN conda init bash && \ +RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ unset -f conda && \ export PATH=$CONDA_DIR/bin/:${PATH} && \ conda config --add channels intel && \ @@ -27,7 +28,7 @@ COPY ./pyproject.toml . RUN mkdir ./finetune && mkdir ./inference -RUN pip install -e .[cpu,deepspeed] -f https://developer.intel.com/ipex-whl-stable-cpu \ +RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu,deepspeed] -f https://developer.intel.com/ipex-whl-stable-cpu \ -f https://download.pytorch.org/whl/torch_stable.html RUN ds_report From 3498b4df3dd3888a46467dee19ed48f507b30ffc Mon Sep 17 00:00:00 2001 From: jiafu zhang Date: Thu, 21 Dec 2023 06:05:27 +0000 Subject: [PATCH 06/14] Sync with internal (cherry-picked from 644488 to 25118e3) (#9) * merged [common] unified conf to yaml Signed-off-by: Jiafu Zhang * reconstruct config by moving ipex and precision to ipex struct (#168) * reconstruct config by moving ipex and precision to ipex struct Signed-off-by: Jiafu Zhang * reconstruct config by moving ipex and precision to ipex struct Signed-off-by: Jiafu Zhang --------- Signed-off-by: Jiafu Zhang * [Inference] Add Neural-chat inference support (#149) * add neural chat inference * change transformers from 4.31 to 4.35 * update prompt * nit * trigger ci * remove from ci * add auth token to all models * revert * merged [Inference] Add Neural-chat inference support Signed-off-by: Jiafu Zhang * remove llama-2-7b in inferencce ci since ipex failed to optimize it Signed-off-by: Jiafu Zhang * Add the HFTonkenizer patch for Model-References (#169) Add the HFTokenizer patch Add the pretrain_module to invoke different pretrain module Signed-off-by: yuanwu * Abstract common features into Predictor (#166) * fix bug of precess config; use tokenizer.__call__ Signed-off-by: Zhi Lin * init Signed-off-by: Zhi Lin * create utils & move tokenizer to predictor Signed-off-by: Zhi Lin * fix Signed-off-by: Zhi Lin * change inferCfg to infer_conf; simplify code Signed-off-by: Zhi Lin * fix Signed-off-by: Zhi Lin * replace inferenceConfig with infer_conf Signed-off-by: Zhi Lin * fix deepspeed Signed-off-by: Zhi Lin * further simplify Signed-off-by: Zhi Lin * move actor_options to utils Signed-off-by: Zhi Lin * fix Signed-off-by: Zhi Lin * remove input len Signed-off-by: Zhi Lin * remove input len follow-up Signed-off-by: Zhi Lin --------- Signed-off-by: Zhi Lin * Update the Dockerfile.optimum.habana (#184) Updathe the dockerfile Fix the 
HABANA_VISIBLE_MODULES envs issue Signed-off-by: yuanwu * Update the Pretrain ReadME (#186) Signed-off-by: yuanwu * Move the dockerfiles of pretrain into pretrain/docker (#187) Delete the useless dp dockerfile Change the nvidia GPU dockerfile name, because it use the same dockerfile for both megatron-deepspeed and huggingface trainer refactor the folder path of pretrain Signed-off-by: yuanwu * renamed workflow_orders to workflow_orders_on_pr Signed-off-by: Jiafu Zhang --------- Signed-off-by: Jiafu Zhang Signed-off-by: yuanwu Signed-off-by: Zhi Lin Co-authored-by: harborn Co-authored-by: Yizhong Zhang Co-authored-by: yuanwu2017 Co-authored-by: Zhi Lin --- .github/workflows/config/mpt_deltatuner.yaml | 5 +- .../config/mpt_deltatuner_deepspeed.yaml | 5 +- .../update_finetune_config_on_intel_gpu.py | 7 +- .../config/update_inference_config.py | 27 +++ .github/workflows/workflow_finetune.yml | 31 +-- .github/workflows/workflow_inference.yml | 5 +- ...merge.yml => workflow_orders_on_merge.yml} | 0 README.md | 2 +- common/config.py | 19 +- dev/scripts/head_node_monitor.sh | 2 +- docs/pretrain.md | 89 ++++++-- docs/rlhf.md | 6 +- .../finetune/dolly1/dolly_1_finetune.conf | 35 --- .../finetune/dolly1/dolly_1_finetune.yaml | 29 +++ .../finetune/dolly2/dolly_2_finetune.conf | 35 --- .../finetune/dolly2/dolly_2_finetune.yaml | 29 +++ .../finetune/gpt_j_6b/finetune_intel_gpu.conf | 41 ---- .../finetune/gpt_j_6b/finetune_intel_gpu.yaml | 30 +++ .../open_assistant_finetune.conf | 35 --- .../open_assistant_finetune.yaml | 29 +++ finetune/finetune.conf | 40 ---- finetune/finetune.py | 24 +- finetune/finetune.yaml | 29 +++ finetune/finetune_config.py | 93 ++++++++ finetune/models/bloom-560m.yaml | 29 +++ finetune/models/finetune_config_template.yaml | 29 +++ finetune/models/gpt-j-6b.yaml | 29 +++ finetune/models/gpt2.yaml | 29 +++ finetune/models/llama-2-7b-chat-hf.yaml | 29 +++ finetune/models/llama-7b.yaml | 29 +++ finetune/models/mistral-7b-v0.1.yaml | 38 ++++ finetune/models/mpt-7b-chat.yaml | 29 +++ finetune/models/opt-125m.yaml | 29 +++ inference/deepspeed_predictor.py | 122 +++++++---- inference/inference_config.py | 31 ++- .../models/bigdl/mistral-7b-v0.1-bigdl.yaml | 5 +- inference/models/bigdl/mpt-7b-bigdl.yaml | 5 +- inference/models/bloom-560m.yaml | 5 +- inference/models/gpt-j-6b.yaml | 6 +- inference/models/gpt2.yaml | 5 +- inference/models/llama-2-7b-chat-hf.yaml | 5 +- inference/models/mistral-7b-v0.1.yaml | 5 +- inference/models/mpt-7b.yaml | 5 +- inference/models/neural-chat-7b-v3-1.yaml | 24 ++ inference/models/opt-125m.yaml | 5 +- .../template/inference_config_template.yaml | 5 +- inference/predictor.py | 129 +++++++---- inference/run_model_batch_predict.py | 5 - inference/run_model_infer.py | 2 +- inference/run_model_serve.py | 207 ++++-------------- inference/transformer_predictor.py | 52 +++-- inference/utils.py | 76 +++++++ ...ain.conf => llama_7b_8Guadi_pretrain.conf} | 35 +-- ...train.conf => llama_7b_8gpu_pretrain.conf} | 1 + ...egatron_deepspeed_zs0_8Gaudi_pretrain.conf | 17 +- .../docker}/Dockerfile.megatron.habana | 4 +- .../docker/Dockerfile.nvidia | 0 pretrain/docker/Dockerfile.optimum.habana | 12 + .../docker}/build-image.sh | 6 +- pretrain/megatron_deepspeed_pretrain.py | 32 ++- .../0001-Add-the-Huggingface-tokenizer.patch | 145 ++++++++++++ pretrain/plugin/megatron_dataset.py | 4 +- pretrain/plugin/megtron_initializer.py | 2 +- pretrain/pretrain.py | 5 +- pretrain/requirements.optimum-habana.txt | 22 ++ pyproject.toml | 4 +- rlhf/ppo.conf | 25 --- rlhf/ppo.yaml | 18 ++ 
rlhf/reward.conf | 24 -- rlhf/reward.yaml | 18 ++ tools/workload_in_containers/Dockerfile.dp | 34 --- .../Dockerfile.optimum.habana | 23 -- 72 files changed, 1353 insertions(+), 694 deletions(-) create mode 100644 .github/workflows/config/update_inference_config.py rename .github/workflows/{workflow_orders on_merge.yml => workflow_orders_on_merge.yml} (100%) delete mode 100644 examples/finetune/dolly1/dolly_1_finetune.conf create mode 100644 examples/finetune/dolly1/dolly_1_finetune.yaml delete mode 100644 examples/finetune/dolly2/dolly_2_finetune.conf create mode 100644 examples/finetune/dolly2/dolly_2_finetune.yaml delete mode 100644 examples/finetune/gpt_j_6b/finetune_intel_gpu.conf create mode 100644 examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml delete mode 100644 examples/finetune/open_assistant/open_assistant_finetune.conf create mode 100644 examples/finetune/open_assistant/open_assistant_finetune.yaml delete mode 100644 finetune/finetune.conf create mode 100644 finetune/finetune.yaml create mode 100644 finetune/finetune_config.py create mode 100644 finetune/models/bloom-560m.yaml create mode 100644 finetune/models/finetune_config_template.yaml create mode 100644 finetune/models/gpt-j-6b.yaml create mode 100644 finetune/models/gpt2.yaml create mode 100644 finetune/models/llama-2-7b-chat-hf.yaml create mode 100644 finetune/models/llama-7b.yaml create mode 100644 finetune/models/mistral-7b-v0.1.yaml create mode 100644 finetune/models/mpt-7b-chat.yaml create mode 100644 finetune/models/opt-125m.yaml create mode 100644 inference/models/neural-chat-7b-v3-1.yaml create mode 100644 inference/utils.py rename pretrain/config/{llama2_7b_8Guadi_pretrain.conf => llama_7b_8Guadi_pretrain.conf} (80%) rename pretrain/config/{llama2_7b_8gpu_pretrain.conf => llama_7b_8gpu_pretrain.conf} (98%) rename {tools/workload_in_containers => pretrain/docker}/Dockerfile.megatron.habana (83%) rename tools/workload_in_containers/Dockerfile.megatron.gpu => pretrain/docker/Dockerfile.nvidia (100%) create mode 100644 pretrain/docker/Dockerfile.optimum.habana rename {tools/workload_in_containers => pretrain/docker}/build-image.sh (75%) create mode 100644 pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch create mode 100644 pretrain/requirements.optimum-habana.txt delete mode 100644 rlhf/ppo.conf create mode 100644 rlhf/ppo.yaml delete mode 100644 rlhf/reward.conf create mode 100644 rlhf/reward.yaml delete mode 100755 tools/workload_in_containers/Dockerfile.dp delete mode 100644 tools/workload_in_containers/Dockerfile.optimum.habana diff --git a/.github/workflows/config/mpt_deltatuner.yaml b/.github/workflows/config/mpt_deltatuner.yaml index 7a801045f..d9a41398d 100644 --- a/.github/workflows/config/mpt_deltatuner.yaml +++ b/.github/workflows/config/mpt_deltatuner.yaml @@ -1,13 +1,14 @@ port: 8000 name: mpt-7b route_prefix: /mpt-7b -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b diff --git a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml index c0aca37f8..227f79cc1 100644 --- a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml +++ b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml @@ -1,13 +1,14 @@ port: 8000 name: mpt-7b route_prefix: /mpt-7b -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: true 
workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: false + precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b diff --git a/.github/workflows/config/update_finetune_config_on_intel_gpu.py b/.github/workflows/config/update_finetune_config_on_intel_gpu.py index fd1f579eb..f0e2a715e 100644 --- a/.github/workflows/config/update_finetune_config_on_intel_gpu.py +++ b/.github/workflows/config/update_finetune_config_on_intel_gpu.py @@ -1,10 +1,11 @@ +import yaml import argparse def update_finetune_config(base_model): - conf_file = "finetune/finetune.conf" + conf_file = "finetune/finetune.yaml" with open(conf_file) as f: - config = eval(f.read()) + config = yaml.load(f, Loader=yaml.FullLoader) # due to compute node can't connect network # base models are downloaded as local files in directory ~/models/ # avaiable base models are: @@ -29,7 +30,7 @@ def update_finetune_config(base_model): config["Training"]["accelerate_mode"] = "GPU_DDP" with open(conf_file, "w") as f: - f.write(str(config)) + yaml.dump(config, f, sort_keys=False) def get_parser(): diff --git a/.github/workflows/config/update_inference_config.py b/.github/workflows/config/update_inference_config.py new file mode 100644 index 000000000..c1c700cdd --- /dev/null +++ b/.github/workflows/config/update_inference_config.py @@ -0,0 +1,27 @@ +import yaml +import argparse + + +def update_inference_config(config_file: str, output_file: str, deepspeed: bool, ipex: bool): + with open(config_file) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + config["deepspeed"] = deepspeed + config["ipex"]["enabled"] = ipex + + with open(output_file, "w") as f: + yaml.dump(config, f, sort_keys=False) + + +def get_parser(): + parser = argparse.ArgumentParser(description="Adjust Inference Config File") + parser.add_argument("--config_file", type=str, required=True) + parser.add_argument("--output_file", type=str, required=True) + parser.add_argument("--deepspeed", action='store_true') + parser.add_argument("--ipex", action='store_true') + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + update_inference_config(args.config_file, args.output_file, args.deepspeed, args.ipex) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index bb7c99326..4749750e4 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -61,9 +61,10 @@ jobs: run: | docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external" CMD=$(cat << EOF - conf_path = "finetune/finetune.conf" + import yaml + conf_path = "finetune/finetune.yaml" with open(conf_path, encoding="utf-8") as reader: - result = eval(reader.read()) + result = yaml.load(reader, Loader=yaml.FullLoader) result['General']['base_model'] = "${{ matrix.model }}" if "${{ matrix.model }}" == "mosaicml/mpt-7b-chat": result['General']['config']['trust_remote_code'] = True @@ -85,18 +86,20 @@ jobs: result['Training']['num_training_workers'] = 1 result['General']['lora_config'] = None with open(conf_path, 'w') as output: - print(result, file=output) + yaml.dump(result, output, sort_keys=False) EOF ) docker exec "finetune" python -c 
"$CMD" - docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" + docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml" + - name: Run PEFT-LoRA Test run: | docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*" CMD=$(cat << EOF - conf_path = "finetune/finetune.conf" + import yaml + conf_path = "finetune/finetune.yaml" with open(conf_path, encoding="utf-8") as reader: - result = eval(reader.read()) + result = yaml.load(reader, Loader=yaml.FullLoader) result['General']['lora_config'] = { "task_type": "CAUSAL_LM", "r": 8, @@ -104,11 +107,12 @@ jobs: "lora_dropout": 0.1 } with open(conf_path, 'w') as output: - print(result, file=output) + yaml.dump(result, output, sort_keys=False) EOF ) docker exec "finetune" python -c "$CMD" - docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" + docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml" + - name: Run Deltatuner Test on DENAS-LoRA Model run: | if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf)$ ]]; then @@ -117,10 +121,11 @@ jobs: docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*" CMD=$(cat << EOF import os + import yaml os.system("cp -r $(python -m pip show deltatuner | grep Location | cut -d: -f2)/deltatuner/conf/best_structure examples/") - conf_path = "finetune/finetune.conf" + conf_path = "finetune/finetune.yaml" with open(conf_path, encoding="utf-8") as reader: - result = eval(reader.read()) + result = yaml.load(reader, Loader=yaml.FullLoader) result['General']['lora_config'] = { "task_type": "CAUSAL_LM", "r": 8, @@ -133,10 +138,10 @@ jobs: "best_model_structure": f"examples/best_structure/${{ matrix.model }}-best_structure.jsonl", } with open(conf_path, 'w') as output: - print(result, file=output) + yaml.dump(result, output, sort_keys=False) EOF) docker exec "finetune" python -c "$CMD" - docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" + docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml" fi - name: Stop Ray run: | @@ -144,7 +149,7 @@ jobs: if [[ ! -z "$cid" ]]; then docker exec "finetune" bash -c "ray stop" fi - + - name: Stop Container if: success() || failure() run: | diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index d4e8d21b6..eb3d978b9 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -17,7 +17,7 @@ jobs: strategy: matrix: # for mistral-7b-v0.1, we use bigdl-cpu to verify - model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl ] + model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1 ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -106,7 +106,8 @@ jobs: if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then echo ${{ matrix.model }} is not supported! 
else - docker exec "${PREFIX}" bash -c "KEEP_SERVE_TERMINAL='false' MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/run_model_serve.py --deepspeed" + docker exec "${PREFIX}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed" + docker exec "${PREFIX}" bash -c "KEEP_SERVE_TERMINAL='false' python inference/run_model_serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed" docker exec "${PREFIX}" bash -c "python inference/run_model_infer.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" docker exec "${PREFIX}" bash -c "python inference/run_model_infer.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" fi diff --git a/.github/workflows/workflow_orders on_merge.yml b/.github/workflows/workflow_orders_on_merge.yml similarity index 100% rename from .github/workflows/workflow_orders on_merge.yml rename to .github/workflows/workflow_orders_on_merge.yml diff --git a/README.md b/README.md index 8e38da957..258f66102 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ ray start --head Use the following command to finetune a model using an example dataset and default configurations. The finetuned model will be stored in `/tmp/llm-ray/output` by default. To customize the base model, dataset and configurations, please see the [finetuning document](#finetune): ```bash -python finetune/finetune.py --config_path finetune/finetune.conf +python finetune/finetune.py --config_file finetune/finetune.yaml ``` ### Serving diff --git a/common/config.py b/common/config.py index 002189f55..801e48f72 100644 --- a/common/config.py +++ b/common/config.py @@ -1,11 +1,12 @@ import os +import yaml import argparse from typing import Dict def parse_args(): parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") parser.add_argument( - "--config_path", + "--config_file", type=str, required=False, default=None, @@ -14,14 +15,18 @@ def parse_args(): args, unparsed = parser.parse_known_args() return args -def parse_config(config_path=None): - if config_path is None: +def parse_config(config_file=None): + if config_file is None: args = parse_args() - config_path = args.config_path - if config_path is None: + config_file = args.config_file + if config_file is None: return {} - with open(config_path) as f: - config = eval(f.read()) + if config_file.endswith("yaml"): + with open(config_file) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + else: + with open(config_file) as f: + config = eval(f.read()) assert isinstance(config, dict) return config diff --git a/dev/scripts/head_node_monitor.sh b/dev/scripts/head_node_monitor.sh index b17892ac4..0f3d6a385 100644 --- a/dev/scripts/head_node_monitor.sh +++ b/dev/scripts/head_node_monitor.sh @@ -85,7 +85,7 @@ do echo "Restart ray cluster on head node: ${head_node}, worker nodes: ${worker_node[*]}" ray_cluster up echo "Resubmit job." 
- RAY_ADDRESS="http://${head_node}:8265" ray job submit --no-wait --working-dir $CURRENT_DIR/Finetune/ -- python $CURRENT_DIR/Finetune/main.py --config_path $CURRENT_DIR/Finetune/llm_finetune_template.conf + RAY_ADDRESS="http://${head_node}:8265" ray job submit --no-wait --working-dir $CURRENT_DIR/Finetune/ -- python $CURRENT_DIR/Finetune/main.py --config_file $CURRENT_DIR/Finetune/llm_finetune_template.conf else echo "Report: head node ${head_node} works well" diff --git a/docs/pretrain.md b/docs/pretrain.md index 639afa8f3..2b3667523 100644 --- a/docs/pretrain.md +++ b/docs/pretrain.md @@ -10,12 +10,19 @@ git checkout main ``` #### 2. build Docker images for pretrain ```bash -cd llm-on-ray/tools/workload_in_containers -./build-image.sh megatron-habana # for Gaudi2 platform +cd llm-on-ray/pretrain/docker ``` -or +Build the habana docker image for Megatron-DeepSpeed. ```bash -./build-image.sh megatron-nvidia # for Nvidia GPU platform +./build-image.sh megatron-habana +``` +Build the habana docker image for Huggingface trainer +```bash +./build-image.sh optimum-habana +``` +Build the Nvidia docker image for both Megatron-DeepSpeed and Huggingface trainer +```bash +./build-image.sh nvidia ``` #### 3. Run the docker containers on head node and worker nodes for pretrain. @@ -24,12 +31,19 @@ make the logs directory for saving the ray logs. mkdir ~/workspace/logs ``` Gaudi2: + +##### Megatron-DeepSpeed ```bash docker run -it --name megatron-habana --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v ~/workspace:/home/user/workspace -v ~/workspace/logs:/tmp --cap-add=sys_nice --net=host --ipc=host llm-ray:megatron-habana ``` + +##### Huggingface trainer +```bash +docker run -it --name megatron-habana --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v ~/workspace:/home/user/workspace -v ~/workspace/logs:/tmp --cap-add=sys_nice --net=host --ipc=host llm-ray:optimum-habana +``` Nvidia GPU: ```bash -docker run --gpus all -it --ulimit memlock=-1 --ulimit stack=67108864 --network host --name megatron-nvidia --shm-size=64g -v ~/workspace/logs:/tmp -v ~/workspace:/home/user/workspace llm-ray:megatron-gpu /bin/bash +docker run --gpus all -it --ulimit memlock=-1 --ulimit stack=67108864 --network host --name megatron-nvidia --shm-size=64g -v ~/workspace/logs:/tmp -v ~/workspace:/home/user/workspace llm-ray:nvidia /bin/bash ``` #### 4. Launch ray cluster @@ -46,22 +60,54 @@ RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' If deploying a ray cluster on multiple nodes, please download the workflow repository on each node. More information about ray cluster, please refer to https://www.ray.io/ ### Pretrain Workflow -This workflow integrates the Megatron-DeepSpeed and Ray for pretrain. +This workflow integrates two different pretrain solutions. +#### Megatron-DeepSpeed For GPU version, we use the [Microsoft Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed). For Gaudi2 version, we use the [HabanaAI Megatron-DeepSpeed](https://github.com/HabanaAI/Model-References/tree/master/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed) +#### Huggingface Trainer +It integrates the megatron dataloader for pretrain. For habana support, it uses the [optimum-habana](https://github.com/huggingface/optimum-habana). It can use deepspeed ZeRO stage3 to train medium and large language models #### 1. 
Generate megatron datasets -Please refer to [this tutorial](../tools/redpajama_data_processing/README.md). Copy the datasets bin and idx files into ~/workspace/data +Please refer to [this tutorial](https://github.com/intel/e2eAIOK/tree/main/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save). Copy the datasets bin and idx files into ~/workspace/data -#### 2. Download the vocab file and merge table. -Download the GPT [vocab file](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json) and [merge table](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt) into ~/workspace/data. +#### 2. Tokenizer +##### Download the vocab file and merge table. +If using the tokenizer files for Megatron_DeepSpeed pretrain, Download the GPT [vocab file](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json) and [merge table](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt) into ~/workspace/data. ```bash cd ~/workspace/data/ wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt ``` - - +Modify the vocab_file and merge_file of megatron_config in config files +```bash +#llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +"megatron_config": { + "vocab_file": "megatron-data/gpt2-vocab.json", + "merge_file": "megatron-data/gpt2-merges.txt", +} +``` +##### Huggingface Tokenizer +For Huggingface trainer, the Huggingface tokenizer is preferred. +Modify the tokenizer_type and tokenizer_model of megatron_config for megatron dataset. +```bash +#llama_7b_8Guadi_pretrain.conf +"megatron_config": { + "tokenizer_type": "HFTokenizer", + "tokenizer_model": "huggyllama/llama-7b", +} +``` +Modify the tokenizer parameters of trainer. The tokenizer of trainer and megatron dataset should be consistent +```bash +#llama_7b_8Guadi_pretrain.conf +"tokenizer": { + # The type of dataset, now only HuggingfaceTokenizer is supported. + "type": "HuggingFaceTokenizer", + # The name/path of tokenizer in huggingface. + "name": "huggyllama/llama-7b", + # Config of tokenizer, all items will be transfered to transformers.AutoTokenizer.from_pretrained(). + "config": {} +} +``` #### 3. Pretrain Command @@ -70,19 +116,34 @@ Please ensure that you check and modify the configuration files located in ~/wor After your environment configuration are properly set up, you can use the following instructions to pretrain the language model: ##### Gaudi2: +###### Megatron-DeepSpeed Set up `megatron_deepspeed_path` in the configuration. 
```bash cd /home/user/workspace/llm-on-ray #Bloom-7B -python pretrain/megatron_deepspeed_pretrain.py --config_path pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +#llama-7B +python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +``` + +##### Huggingface Trainer +```bash +cd /home/user/workspace/llm-on-ray #llama-7B -python pretrain/megatron_deepspeed_pretrain.py --config_path pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +python pretrain/pretrain.py --config_file pretrain/config/llama_7b_8Guadi_pretrain.conf ``` ##### Nvidia GPU: +###### Megatron-DeepSpeed ```bash cd /home/user/workspace/llm-on-ray #llama2-7B -python pretrain/megatron_deepspeed_pretrain.py --config_path pretrain/config/llama2_3b_megatron_deepspeed_zs0_8gpus_pretrain.conf +python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/llama2_3b_megatron_deepspeed_zs0_8gpus_pretrain.conf ``` +##### Huggingface Trainer +```bash +cd /home/user/workspace/llm-on-ray +#llama-7B +python pretrain/pretrain.py --config_file pretrain/config/llama_7b_8gpu_pretrain.conf +``` \ No newline at end of file diff --git a/docs/rlhf.md b/docs/rlhf.md index 1f18d03ba..5f3fd65b0 100755 --- a/docs/rlhf.md +++ b/docs/rlhf.md @@ -30,7 +30,7 @@ Additionally, we provide a simple data example at the path `examples/data/sample You can run the following command to start a SFT model training: ```bash -python Finetune/finetune.py --config_path Finetune/finetune.conf +python Finetune/finetune.py --config_file Finetune/finetune.conf ``` Once the model training is completed, based on your settings in the configuration file (such as `"checkpoint_dir": "/tmp/llm-ray/checkpoint/sft"`), you will obtain the final trained model in the `/tmp/llm-ray/checkpoint/sft` directory. This trained model will be utilized in the final stage of the PPO training process. @@ -67,7 +67,7 @@ Additionally, we provide a simple data example at the path `examples/data/sample You can run the following command to start a reward model training: ```bash -python rlhf/reward_trainer.py --config_path rlhf/reward.conf +python rlhf/reward_trainer.py --config_file rlhf/reward.conf ``` Once the reward model training is complete, based on your settings in the configuration file (such as `"checkpoint_dir": "/tmp/llm-ray/checkpoint/rm"`), you will obtain the final trained model in the `/tmp/llm-ray/checkpoint/rm` directory. This trained reward model will also be utilized in the final stage of the PPO training process. @@ -151,7 +151,7 @@ Additionally, we provide a simple data example at the path `examples/data/sample You can run the following command to start a ppo training: ```bash -python rlhf/ppo_trainer.py --config_path rlhf/ppo.conf +python rlhf/ppo_trainer.py --config_file rlhf/ppo.conf ``` It is important to note that before training, we need to configure the corresponding settings in `ppo.conf` based on the saved paths of the SFT (Structured Fine-Tuning) model and the RM (Reward Model) model. 
Assuming that we are using pre-trained models of `EleutherAI/gpt2` and considering the previous model save settings, we should configure the following in the `ppo.conf`: diff --git a/examples/finetune/dolly1/dolly_1_finetune.conf b/examples/finetune/dolly1/dolly_1_finetune.conf deleted file mode 100644 index f23e2b955..000000000 --- a/examples/finetune/dolly1/dolly_1_finetune.conf +++ /dev/null @@ -1,35 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt-j-6b", - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "examples/finetune/dolly1/data/train/train.jsonl", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32 - }, - }, -} diff --git a/examples/finetune/dolly1/dolly_1_finetune.yaml b/examples/finetune/dolly1/dolly_1_finetune.yaml new file mode 100644 index 000000000..eb9a93b93 --- /dev/null +++ b/examples/finetune/dolly1/dolly_1_finetune.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/finetune/dolly1/data/train/train.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/examples/finetune/dolly2/dolly_2_finetune.conf b/examples/finetune/dolly2/dolly_2_finetune.conf deleted file mode 100644 index 63d937eb5..000000000 --- a/examples/finetune/dolly2/dolly_2_finetune.conf +++ /dev/null @@ -1,35 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/pythia-6.9b", - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "databricks/databricks-dolly-15k", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32 - }, - }, -} \ No newline at end of file diff --git a/examples/finetune/dolly2/dolly_2_finetune.yaml b/examples/finetune/dolly2/dolly_2_finetune.yaml new file mode 100644 index 000000000..95dd0dd86 --- /dev/null +++ b/examples/finetune/dolly2/dolly_2_finetune.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/pythia-6.9b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 
+Dataset: + train_file: databricks/databricks-dolly-15k + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.conf b/examples/finetune/gpt_j_6b/finetune_intel_gpu.conf deleted file mode 100644 index 970544011..000000000 --- a/examples/finetune/gpt_j_6b/finetune_intel_gpu.conf +++ /dev/null @@ -1,41 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt-j-6b", - # fix issue: https://github.com/huggingface/transformers/issues/22482 - # tranformers version 4.26.0 is required for gpt2, gpt-j-6B, pythia... - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "examples/data/sample_finetune_data.jsonl", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 4, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "device": "GPU", - "num_training_workers": 2, - "accelerate_mode": "GPU_DDP", - "resources_per_worker": { - "CPU": 1, - "GPU": 1, - }, - }, -} - diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml new file mode 100644 index 000000000..41303f615 --- /dev/null +++ b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml @@ -0,0 +1,30 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 4 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: GPU + num_training_workers: 2 + accelerate_mode: GPU_DDP + resources_per_worker: + CPU: 1 + GPU: 1 diff --git a/examples/finetune/open_assistant/open_assistant_finetune.conf b/examples/finetune/open_assistant/open_assistant_finetune.conf deleted file mode 100644 index d6913df44..000000000 --- a/examples/finetune/open_assistant/open_assistant_finetune.conf +++ /dev/null @@ -1,35 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt-j-6b", - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "examples/finetune/open_assistant/data/train/train.jsonl", - "validation_file": "examples/finetune/open_assistant/data/validation/validation.jsonl", - "validation_split_percentage": 0 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32 - }, - }, -} \ No newline at end of file diff 
--git a/examples/finetune/open_assistant/open_assistant_finetune.yaml b/examples/finetune/open_assistant/open_assistant_finetune.yaml new file mode 100644 index 000000000..96a355cb4 --- /dev/null +++ b/examples/finetune/open_assistant/open_assistant_finetune.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/finetune/open_assistant/data/train/train.jsonl + validation_file: examples/finetune/open_assistant/data/validation/validation.jsonl + validation_split_percentage: 0 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/finetune.conf b/finetune/finetune.conf deleted file mode 100644 index 794decce1..000000000 --- a/finetune/finetune.conf +++ /dev/null @@ -1,40 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt-j-6b", - # fix issue: https://github.com/huggingface/transformers/issues/22482 - # tranformers version 4.26.0 is required for gpt2, gpt-j-6B, pythia... - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "examples/data/sample_finetune_data_small.jsonl", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "device": "CPU", - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32, - # "GPU": 1, - }, - "accelerate_mode": "CPU_DDP", - }, -} diff --git a/finetune/finetune.py b/finetune/finetune.py index 1e7b878d9..088ef89f1 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -2,6 +2,7 @@ import os import time +import argparse import traceback from typing import Any, Dict @@ -13,6 +14,8 @@ from ray.air.config import ScalingConfig from ray.air import RunConfig, FailureConfig +from pydantic_yaml import parse_yaml_raw_as + from accelerate import FullyShardedDataParallelPlugin from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig @@ -20,6 +23,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '..')) import common +from finetune_config import FinetuneConfig def get_accelerate_environment_variable(mode: str) -> dict: @@ -140,8 +144,26 @@ def train_func(config: Dict[str, Any]): exit(1) common.logger.info(f"train finish") + +def get_finetune_config(): + parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") + parser.add_argument( + "--config_file", + type=str, + required=True, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + args = parser.parse_args() + config_file = args.config_file + + with open(config_file) as f: + finetune_config = parse_yaml_raw_as(FinetuneConfig, f) + return finetune_config.dict() + + def main(external_config = None): - config = common.Config() + config = 
get_finetune_config() if external_config is not None: config.merge(external_config) diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml new file mode 100644 index 000000000..44bcadcf4 --- /dev/null +++ b/finetune/finetune.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py new file mode 100644 index 000000000..fad9d751b --- /dev/null +++ b/finetune/finetune_config.py @@ -0,0 +1,93 @@ +from pydantic import BaseModel, validator +from typing import Optional, List + + +class GeneralConfig(BaseModel): + trust_remote_code: bool + use_auth_token: Optional[str] + + +class LoraConfig(BaseModel): + task_type: str + r: int + lora_alpha: int + lora_dropout: float + target_modules: Optional[List[str]] = None + + +class DeltatunerConfig(BaseModel): + algo: str + denas: bool + best_model_structure: str + + +class General(BaseModel): + base_model: str + gpt_base_model: bool + output_dir: str + checkpoint_dir: str + config: GeneralConfig + lora_config: Optional[LoraConfig] = None + deltatuner_config: Optional[DeltatunerConfig] = None + + +class Dataset(BaseModel): + train_file: str + validation_file: Optional[str] + validation_split_percentage: int + + +class RayResourceConfig(BaseModel): + CPU: int + GPU: int = 0 + + +class Training(BaseModel): + optimizer: str + batch_size: int + epochs: int + learning_rate: float + lr_scheduler: str + weight_decay: float + device: str + num_training_workers: int + resources_per_worker: RayResourceConfig + accelerate_mode: str + + @validator("device") + def check_device(cls, v: str): + devices = ["CPU", "GPU"] + if v not in devices: + raise ValueError(f"device must be one of {devices}") + return v + + @validator("accelerate_mode") + def check_accelerate_mode(cls, v: str): + modes = ["CPU_DDP", "GPU_DDP", "GPU_FSDP"] + if v not in modes: + raise ValueError(f"accelerate_mode must be one of {modes}") + return v + + # @model_validator(mode='after') + # def check_device_and_accelerate_mode(self) -> "Training": + # dev = self.device + # res = self.resources_per_worker + # mode = self.accelerate_mode + # if dev == "CPU": + # if res.GPU is not None and res.GPU > 0: + # raise ValueError("Please not specified GPU resource when use CPU only in Ray.") + # if mode != "CPU_DDP": + # raise ValueError("Please specified CPU related accelerate mode when use CPU only in Ray.") + # elif dev == "GPU": + # if res.GPU is None or res.GPU == 0: + # raise ValueError("Please specified GPU resource when use GPU to fine tune in Ray.") + # if mode not in ["GPU_DDP", "GPU_FSDP"]: + # raise ValueError("Please speicifed GPU related accelerate mode when use GPU to fine tune in Ray.") + + # return self + + +class FinetuneConfig(BaseModel): + General: General + Dataset: Dataset + Training: Training diff --git a/finetune/models/bloom-560m.yaml b/finetune/models/bloom-560m.yaml new file mode 100644 index 000000000..4c361fc55 --- 
/dev/null +++ b/finetune/models/bloom-560m.yaml @@ -0,0 +1,29 @@ +General: + base_model: bigscience/bloom-560m + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/finetune_config_template.yaml b/finetune/models/finetune_config_template.yaml new file mode 100644 index 000000000..44bcadcf4 --- /dev/null +++ b/finetune/models/finetune_config_template.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml new file mode 100644 index 000000000..44bcadcf4 --- /dev/null +++ b/finetune/models/gpt-j-6b.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml new file mode 100644 index 000000000..a0887b324 --- /dev/null +++ b/finetune/models/gpt2.yaml @@ -0,0 +1,29 @@ +General: + base_model: gpt2 + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml new file mode 100644 index 000000000..c7e7430f4 --- /dev/null +++ b/finetune/models/llama-2-7b-chat-hf.yaml @@ -0,0 +1,29 @@ +General: + base_model: meta-llama/Llama-2-7b-chat-hf + gpt_base_model: false + output_dir: /tmp/llm-ray/output + 
checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/llama-7b.yaml b/finetune/models/llama-7b.yaml new file mode 100644 index 000000000..3bd823253 --- /dev/null +++ b/finetune/models/llama-7b.yaml @@ -0,0 +1,29 @@ +General: + base_model: meta-llama/Llama-2-7b + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml new file mode 100644 index 000000000..46b05a5a8 --- /dev/null +++ b/finetune/models/mistral-7b-v0.1.yaml @@ -0,0 +1,38 @@ +General: + base_model: mistralai/Mistral-7B-v0.1 + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 + target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + - lm_head +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml new file mode 100644 index 000000000..149514c07 --- /dev/null +++ b/finetune/models/mpt-7b-chat.yaml @@ -0,0 +1,29 @@ +General: + base_model: mosaicml/mpt-7b-chat + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: true + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/opt-125m.yaml b/finetune/models/opt-125m.yaml new file mode 100644 index 000000000..4d8dc7e13 --- /dev/null +++ b/finetune/models/opt-125m.yaml @@ -0,0 +1,29 @@ +General: + base_model: facebook/opt-125m + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + 
use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/inference/deepspeed_predictor.py b/inference/deepspeed_predictor.py index 3ff008c71..98fbed0aa 100644 --- a/inference/deepspeed_predictor.py +++ b/inference/deepspeed_predictor.py @@ -13,37 +13,39 @@ from typing import List import os from predictor import Predictor -from peft import PeftModel -from deltatuner import DeltaTunerModel -from inference_config import InferenceConfig +from utils import get_torch_dtype + + +from inference_config import InferenceConfig, DEVICE_CPU, DEVICE_XPU, IPEX_PRECISION_BF16 class DSPipeline: def __init__( self, - inferenceConfig: InferenceConfig, + infer_conf: InferenceConfig, pad_token_id, - stopping_criteria, - dtype + stopping_criteria ): - - self.dtype = dtype - self.device = torch.device(inferenceConfig.device) + self.device = torch.device(infer_conf.device) self.pad_token_id = pad_token_id self.stopping_criteria = stopping_criteria - model_desc = inferenceConfig.model_description + model_desc = infer_conf.model_description model_config = model_desc.config - config = AutoConfig.from_pretrained(model_desc.model_id_or_path, torchscript=True, trust_remote_code=model_config.trust_remote_code) + hf_config = AutoConfig.from_pretrained(model_desc.model_id_or_path, torchscript=True, trust_remote_code=model_config.trust_remote_code) + # get correct torch type for loading HF model + torch_dtype = get_torch_dtype(infer_conf, hf_config) self.model = AutoModelForCausalLM.from_pretrained(model_desc.model_id_or_path, - torch_dtype=dtype, - config=config, + config=hf_config, + torch_dtype=torch_dtype, low_cpu_mem_usage=True, **model_config.dict()) if model_desc.peft_model_id_or_path: + from peft import PeftModel self.model = PeftModel.from_pretrained(self.model, model_desc.peft_model_id_or_path) if model_desc.peft_type == "deltatuner": + from deltatuner import DeltaTunerModel self.model = DeltaTunerModel.from_pretrained(self.model, model_desc.peft_model_id_or_path) self.model = self.model.merge_and_unload() @@ -53,7 +55,7 @@ def __init__( self.model.eval() def streaming_generate(self, inputs, streamer, **generate_kwargs): - self.model.generate(**inputs, + self.model.generate(inputs, pad_token_id=self.pad_token_id, stopping_criteria=self.stopping_criteria, streamer=streamer, @@ -61,7 +63,7 @@ def streaming_generate(self, inputs, streamer, **generate_kwargs): def generate(self, inputs, **config): gen_tokens = self.model.generate( - **inputs, + inputs, pad_token_id=self.pad_token_id, stopping_criteria=self.stopping_criteria, **config @@ -75,19 +77,18 @@ class PredictionWorker(TorchDistributedWorker): Multiple PredictionWorkers of the same WorkerGroup form a PyTorch DDP process group and work together under the orchestration of DeepSpeed. 
""" - def __init__(self, world_size: int, inferenceConfig: InferenceConfig, amp_dtype, pad_token_id, stopping_criteria): + def __init__(self, world_size: int, infer_conf: InferenceConfig, pad_token_id, stopping_criteria): self.world_size = world_size - self.inferenceConfig = inferenceConfig - self.amp_dtype = amp_dtype + self.infer_conf = infer_conf self.pad_token_id = pad_token_id self.stopping_criteria = stopping_criteria def init_model(self, local_rank: int): """Initialize model for inference.""" - if self.inferenceConfig.device == 'cpu': + if self.infer_conf.device == DEVICE_CPU: replace_with_kernel_inject = False - elif self.inferenceConfig.device == 'xpu': + elif self.infer_conf.device == DEVICE_XPU: replace_with_kernel_inject = False else: replace_with_kernel_inject = True @@ -96,24 +97,26 @@ def init_model(self, local_rank: int): os.environ['WORLD_SIZE'] = str(self.world_size) pipe = DSPipeline( - self.inferenceConfig, + self.infer_conf, pad_token_id=self.pad_token_id, stopping_criteria=self.stopping_criteria, - dtype=self.amp_dtype ) pipe.model = deepspeed.init_inference( pipe.model, - dtype=self.amp_dtype, mp_size=self.world_size, + dtype=torch.bfloat16, replace_with_kernel_inject=replace_with_kernel_inject ) - if self.ipex_enabled: + if self.infer_conf.ipex.enabled: import intel_extension_for_pytorch as ipex try: ipex._C.disable_jit_linear_repack() except: pass - pipe.model = ipex.optimize_transformers(pipe.model.eval(), dtype=self.amp_dtype, inplace=True) + pipe.model = ipex.optimize_transformers( + pipe.model.eval(), + dtype=torch.bfloat16 if self.infer_conf.ipex.precision == IPEX_PRECISION_BF16 else torch.float32, + inplace=True) self.generator = pipe @@ -124,21 +127,18 @@ def generate(self, inputs, **config): return self.generator.generate(inputs, **config) class DeepSpeedPredictor(Predictor): - def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, pad_token_id, stopping_criteria) -> None: - self.inferenceConfig = inferenceConfig - self.amp_dtype = amp_dtype - self.pad_token_id = pad_token_id - self.stopping_criteria = stopping_criteria - - use_gpu = True if (inferenceConfig.device == "cuda") else False - + def __init__(self, infer_conf: InferenceConfig) -> None: + super().__init__(infer_conf) + # TODO this should be removed later + self.pad_token_id = self.tokenizer.pad_token_id # Scaling config for one worker group. - resource = {"CPU": inferenceConfig.cpus_per_worker} - if inferenceConfig.device == "cuda": - resource["GPU"] = inferenceConfig.gpus_per_worker + resource = {"CPU": infer_conf.cpus_per_worker} + use_gpu = True if (infer_conf.device == "cuda") else False + if use_gpu: + resource["GPU"] = infer_conf.gpus_per_worker scaling_conf = ScalingConfig( use_gpu=use_gpu, - num_workers=inferenceConfig.workers_per_group, + num_workers=infer_conf.workers_per_group, resources_per_worker=resource ) @@ -181,13 +181,13 @@ def _init_worker_group(self, scaling_config: ScalingConfig): # Create the prediction workers. self.prediction_workers = [ - prediction_worker_cls.remote(scaling_config.num_workers, self.inferenceConfig, self.amp_dtype, + prediction_worker_cls.remote(scaling_config.num_workers, self.infer_conf, self.pad_token_id, self.stopping_criteria) for i in range(scaling_config.num_workers) ] # Initialize torch distributed process group for the workers. 
- local_ranks = init_torch_dist_process_group(self.prediction_workers, backend="ccl" if self.inferenceConfig.device != "cuda" else "nccl") + local_ranks = init_torch_dist_process_group(self.prediction_workers, backend="ccl" if self.infer_conf.device != "cuda" else "nccl") # Initialize the model on each worker. ray.get([ @@ -195,21 +195,53 @@ def _init_worker_group(self, scaling_config: ScalingConfig): for worker, local_rank in zip(self.prediction_workers, local_ranks) ]) - def streaming_generate(self, inputs, streamer, **config): - inputs_ref = ray.put(inputs) + def streaming_generate(self, prompt, streamer, **config): + input_ids = self.tokenize_inputs(prompt) + inputs_ref = ray.put(input_ids) self.prediction_workers[0].streaming_generate.remote(inputs_ref, streamer, **config) for worker in self.prediction_workers[1:]: worker.streaming_generate.remote(inputs_ref, self._create_dummy_streamer(), **config) - def generate(self, inputs, **config): - inputs_ref = ray.put(inputs) - prediction = ray.get( + def generate(self, prompt, **config): + input_ids = self.tokenize_inputs(prompt) + inputs_ref = ray.put(input_ids) + gen_tokens = ray.get( [ worker.generate.remote(inputs_ref, **config) for worker in self.prediction_workers ] )[0] - return prediction + return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] + + def get_streamer(self): + from transformers import TextStreamer + from typing import Optional + from ray.util.queue import Queue + + class RayTextIteratorStreamer(TextStreamer): + def __init__( + self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs + ): + super().__init__(tokenizer, skip_prompt, **decode_kwargs) + self.text_queue = Queue() + self.stop_signal = None + self.timeout = timeout + + def on_finalized_text(self, text: str, stream_end: bool = False): + self.text_queue.put(text, timeout=self.timeout) + if stream_end: + self.text_queue.put(self.stop_signal, timeout=self.timeout) + + def __iter__(self): + return self + + def __next__(self): + value = self.text_queue.get(timeout=self.timeout) + if value == self.stop_signal: + raise StopIteration() + else: + return value + return RayTextIteratorStreamer(self.tokenizer, skip_special_tokens=True) def predict( self, diff --git a/inference/inference_config.py b/inference/inference_config.py index 661d83765..9b2a434df 100644 --- a/inference/inference_config.py +++ b/inference/inference_config.py @@ -3,6 +3,14 @@ from pydantic_yaml import parse_yaml_raw_as from typing import List, Dict +IPEX_PRECISION_BF16 = 'bf16' +IPEX_PRECISION_FP32 = 'fp32' + +DEVICE_CPU = "cpu" +DEVICE_HPU = "hpu" +DEVICE_XPU = "xpu" +DEVICE_CUDA = "cuda" + class Prompt(BaseModel): intro: str = "" human_id: str = "" @@ -14,6 +22,16 @@ class ModelConfig(BaseModel): use_auth_token: str = None load_in_4bit: bool = False +class Ipex(BaseModel): + enabled: bool = True + precision: str = 'bf16' + + @validator('precision') + def _check_precision(cls, v: str): + if v: + assert v in [IPEX_PRECISION_BF16, IPEX_PRECISION_FP32] + return v + # for bigdl model class BigDLModelConfig(BaseModel): load_in_low_bit: str = "" @@ -61,14 +79,13 @@ class InferenceConfig(BaseModel): port: int = 8000 name: str = None route_prefix: str = None - precision: str = 'bf16' cpus_per_worker: int = 24 gpus_per_worker: int = 0 hpus_per_worker: int = 0 deepspeed: bool = False workers_per_group: int = 2 - ipex: bool = False - device: str = "cpu" + device: str = DEVICE_CPU + ipex: Ipex = Ipex() model_description: ModelDescription 
= ModelDescription() # prevent warning of protected namespaces @@ -89,13 +106,7 @@ def _check_port(cls, v: int): @validator('device') def _check_device(cls, v: str): if v: - assert v in ['cpu', 'xpu', 'cuda', 'hpu'] - return v - - @validator('precision') - def _check_precision(cls, v: str): - if v: - assert v in ['bf16', 'fp32'] + assert v in [DEVICE_CPU, DEVICE_XPU, DEVICE_CUDA, DEVICE_HPU] return v @validator('workers_per_group') diff --git a/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml b/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml index b562dfb3c..6da907c24 100644 --- a/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml +++ b/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml @@ -1,13 +1,14 @@ port: 8000 name: mistral-7b-v0.1-bigdl route_prefix: /mistral-7b-v0.1-bigdl -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: false + precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 bigdl: true diff --git a/inference/models/bigdl/mpt-7b-bigdl.yaml b/inference/models/bigdl/mpt-7b-bigdl.yaml index bc05487ab..f306c0507 100644 --- a/inference/models/bigdl/mpt-7b-bigdl.yaml +++ b/inference/models/bigdl/mpt-7b-bigdl.yaml @@ -1,13 +1,14 @@ port: 8000 name: mpt-7b-bigdl route_prefix: /mpt-7b-bigdl -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: false + precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b bigdl: true diff --git a/inference/models/bloom-560m.yaml b/inference/models/bloom-560m.yaml index 39c956af9..43f63cb62 100644 --- a/inference/models/bloom-560m.yaml +++ b/inference/models/bloom-560m.yaml @@ -1,13 +1,14 @@ port: 8000 name: bloom-560m route_prefix: /bloom-560m -precision: 'bf16' cpus_per_worker: 10 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m diff --git a/inference/models/gpt-j-6b.yaml b/inference/models/gpt-j-6b.yaml index ca0a0636f..82518baa9 100644 --- a/inference/models/gpt-j-6b.yaml +++ b/inference/models/gpt-j-6b.yaml @@ -1,13 +1,15 @@ port: 8000 name: gpt-j-6b route_prefix: /gpt-j-6b -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + # false here for ci coverage + enabled: false + precision: bf16 model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b diff --git a/inference/models/gpt2.yaml b/inference/models/gpt2.yaml index 4b135cc10..617c8a64d 100644 --- a/inference/models/gpt2.yaml +++ b/inference/models/gpt2.yaml @@ -1,13 +1,14 @@ port: 8000 name: gpt2 route_prefix: /gpt2 -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 diff --git a/inference/models/llama-2-7b-chat-hf.yaml b/inference/models/llama-2-7b-chat-hf.yaml index de94e023f..b0dc029da 100644 --- a/inference/models/llama-2-7b-chat-hf.yaml +++ b/inference/models/llama-2-7b-chat-hf.yaml @@ -1,13 +1,14 @@ port: 8000 name: llama-2-7b-chat-hf route_prefix: /llama-2-7b-chat-hf -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + 
enabled: true + precision: bf16 model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf diff --git a/inference/models/mistral-7b-v0.1.yaml b/inference/models/mistral-7b-v0.1.yaml index 3b3fb732b..60ad1c602 100644 --- a/inference/models/mistral-7b-v0.1.yaml +++ b/inference/models/mistral-7b-v0.1.yaml @@ -1,13 +1,14 @@ port: 8000 name: mistral-7b-v0.1 route_prefix: /mistral-7b-v0.1 -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 bigdl: false diff --git a/inference/models/mpt-7b.yaml b/inference/models/mpt-7b.yaml index 1388fa796..b0b2ac7b9 100644 --- a/inference/models/mpt-7b.yaml +++ b/inference/models/mpt-7b.yaml @@ -1,13 +1,14 @@ port: 8000 name: mpt-7b route_prefix: /mpt-7b -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b diff --git a/inference/models/neural-chat-7b-v3-1.yaml b/inference/models/neural-chat-7b-v3-1.yaml new file mode 100644 index 000000000..076ef6f7c --- /dev/null +++ b/inference/models/neural-chat-7b-v3-1.yaml @@ -0,0 +1,24 @@ +port: 8000 +name: neural-chat-7b-v3-1 +route_prefix: /neural-chat-7b-v3-1 +cpus_per_worker: 24 +gpus_per_worker: 0 +deepspeed: false +workers_per_group: 2 +device: "cpu" +ipex: + enabled: true + precision: bf16 +model_description: + model_id_or_path: Intel/neural-chat-7b-v3-1 + tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 + chat_processor: ChatModelGptJ + prompt: + intro: '### System: + You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' 
+ human_id: ' + + ### User' + bot_id: ' + + ### Assistant' diff --git a/inference/models/opt-125m.yaml b/inference/models/opt-125m.yaml index b7801bfa9..c8f40aa04 100644 --- a/inference/models/opt-125m.yaml +++ b/inference/models/opt-125m.yaml @@ -1,13 +1,14 @@ port: 8000 name: opt-125m route_prefix: /opt-125m -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: false + precision: bf16 model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m diff --git a/inference/models/template/inference_config_template.yaml b/inference/models/template/inference_config_template.yaml index aeeb67064..7a8a18507 100644 --- a/inference/models/template/inference_config_template.yaml +++ b/inference/models/template/inference_config_template.yaml @@ -1,14 +1,15 @@ port: 8000 name: null route_prefix: null -precision: bf16 cpus_per_worker: 24 gpus_per_worker: 0 hpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: cpu +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: null bigdl:: false diff --git a/inference/predictor.py b/inference/predictor.py index 6a61f0ae6..fb0cc1ef3 100644 --- a/inference/predictor.py +++ b/inference/predictor.py @@ -1,49 +1,88 @@ import re +import torch +from transformers import AutoTokenizer, StoppingCriteriaList +from inference_config import InferenceConfig +from utils import max_input_len, StoppingCriteriaSub class Predictor: - def configure_tokenizer(self, model_name, tokenizer): - model = self.model - if re.search("llama", model.config.architectures[0], re.IGNORECASE): - # unwind broken decapoda-research config - model.generation_config.pad_token_id = 0 - model.generation_config.bos_token_id = 1 - model.generation_config.eos_token_id = 2 - - if ( - hasattr(model.generation_config, "pad_token_id") - and model.generation_config.pad_token_id is not None - and not "chatglm" in model_name - ): - tokenizer.pad_token_id = model.generation_config.pad_token_id - if ( - hasattr(model.generation_config, "eos_token_id") - and model.generation_config.eos_token_id is not None - and not "chatglm" in model_name - ): - tokenizer.eos_token_id = model.generation_config.eos_token_id - if ( - hasattr(model.generation_config, "bos_token_id") - and model.generation_config.bos_token_id is not None - ): - tokenizer.bos_token_id = model.generation_config.bos_token_id - - if tokenizer.pad_token_id is None: - model.generation_config.pad_token_id = ( - tokenizer.pad_token_id - ) = tokenizer.eos_token_id - - if model.generation_config.eos_token_id is None: - model.generation_config.eos_token_id = tokenizer.eos_token_id + def __init__(self, infer_conf: InferenceConfig) -> None: + self.infer_conf = infer_conf + self.tokenizer = AutoTokenizer.from_pretrained(infer_conf.model_description.tokenizer_name_or_path) + self.device = torch.device(infer_conf.device) + # now deepspeed predictor don't have the model + # so configure_tokenizer cannot be called + # this should be solved in the next pr + # where it is also a worker + # This can be removed then + if self.tokenizer.pad_token_id is None: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + + prompt = infer_conf.model_description.prompt + stop_words = prompt.stop_words + stop_words_ids = [self.tokenizer(stop_word, return_tensors='pt').input_ids.squeeze() for stop_word in stop_words] + self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) + + def 
tokenize_inputs(self, text): + if self.device.type == "hpu": + input_tokens = self.tokenizer( + text, + return_tensors="pt", + padding="max_length", + max_length=max_input_len(input_token_len), + ) + else: + input_tokens = self.tokenizer( + text, return_tensors="pt", padding=True + ) + return input_tokens.input_ids.to(device=self.device) + + def configure_tokenizer(self, model_name): + model = self.model + tokenizer = self.tokenizer + if re.search("llama", model.config.architectures[0], re.IGNORECASE): + # unwind broken decapoda-research config + model.generation_config.pad_token_id = 0 + model.generation_config.bos_token_id = 1 + model.generation_config.eos_token_id = 2 + + if ( + hasattr(model.generation_config, "pad_token_id") + and model.generation_config.pad_token_id is not None + and not "chatglm" in model_name + ): + tokenizer.pad_token_id = model.generation_config.pad_token_id + if ( + hasattr(model.generation_config, "eos_token_id") + and model.generation_config.eos_token_id is not None + and not "chatglm" in model_name + ): + tokenizer.eos_token_id = model.generation_config.eos_token_id + if ( + hasattr(model.generation_config, "bos_token_id") + and model.generation_config.bos_token_id is not None + ): + tokenizer.bos_token_id = model.generation_config.bos_token_id + + if tokenizer.pad_token_id is None: + model.generation_config.pad_token_id = ( + tokenizer.pad_token_id + ) = tokenizer.eos_token_id + + if model.generation_config.eos_token_id is None: + model.generation_config.eos_token_id = tokenizer.eos_token_id + + if not model.config.is_encoder_decoder: + tokenizer.padding_side = "left" + + if tokenizer.pad_token is None and tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + model.generation_config.pad_token_id = model.generation_config.eos_token_id - if not model.config.is_encoder_decoder: - tokenizer.padding_side = "left" - - if tokenizer.pad_token is None and tokenizer.pad_token_id is None: - tokenizer.pad_token = tokenizer.eos_token - model.generation_config.pad_token_id = model.generation_config.eos_token_id - - def generate(self, inputs, **config): - pass - - def streaming_generate(self, inputs, streamer, **config): - pass \ No newline at end of file + def generate(self, prompt, **config): + pass + + def streaming_generate(self, prompt, streamer, **config): + pass + + def get_streamer(self): + pass diff --git a/inference/run_model_batch_predict.py b/inference/run_model_batch_predict.py index 880a0e860..30d74a5c3 100644 --- a/inference/run_model_batch_predict.py +++ b/inference/run_model_batch_predict.py @@ -46,12 +46,9 @@ def __call__(self, batch: pd.DataFrame) -> pd.DataFrame: import argparse parser = argparse.ArgumentParser('GPT-J generation script', add_help=False) - parser.add_argument('--precision', default='bf16', type=str, help="fp32 or bf16") parser.add_argument('--model', default='EleutherAI/gpt-j-6B', type=str, help="model name or path") parser.add_argument('--max-new-tokens', default=100, type=int, help="output max new tokens") args = parser.parse_args() - amp_enabled = True if args.precision != "fp32" else False - amp_dtype = torch.bfloat16 if args.precision != "fp32" else torch.float32 ray.init(address="auto") prompt = ( @@ -68,8 +65,6 @@ def __call__(self, batch: pd.DataFrame) -> pd.DataFrame: batch_size=4, fn_constructor_kwargs=dict( model_id=args.model, - amp_enabled=amp_enabled, - amp_dtype=amp_dtype, max_new_tokens=args.max_new_tokens ), compute="actors" diff --git a/inference/run_model_infer.py b/inference/run_model_infer.py 
index ba3214d29..30b158bf8 100644 --- a/inference/run_model_infer.py +++ b/inference/run_model_infer.py @@ -32,7 +32,7 @@ print("iter: ", i) tic = time.time() proxies = { "http": None, "https": None} - outputs = requests.post(args.model_endpoint, proxies=proxies, json=[sample_input], stream=args.streaming_response) + outputs = requests.post(args.model_endpoint, proxies=proxies, json=sample_input, stream=args.streaming_response) if args.streaming_response: outputs.raise_for_status() for output in outputs.iter_content(chunk_size=None, decode_unicode=True): diff --git a/inference/run_model_serve.py b/inference/run_model_serve.py index 9dd33b9e3..d5e2c8ac5 100644 --- a/inference/run_model_serve.py +++ b/inference/run_model_serve.py @@ -6,135 +6,38 @@ from starlette.requests import Request from queue import Empty import torch -from transformers import AutoTokenizer, TextIteratorStreamer -from transformers import StoppingCriteria, StoppingCriteriaList +from transformers import TextIteratorStreamer from inference_config import ModelDescription, InferenceConfig, all_models import sys - +from utils import get_deployment_actor_options from typing import Generator, Union, Optional, List from starlette.responses import StreamingResponse from pydantic_yaml import parse_yaml_raw_as -class StoppingCriteriaSub(StoppingCriteria): - - def __init__(self, stops = [], encounters=1): - super().__init__() - self.stops = stops - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): - for stop in self.stops: - length = 1 if len(stop.size())==0 else stop.size()[0] - if torch.all((stop == input_ids[0][-length:])).item(): - return True - return False - -def max_input_len(input_text_length): - if input_text_length <= 128: - return 128 - elif input_text_length <= 512: - return 512 - elif input_text_length <= 2048: - return 2048 - else: - print("Max support length is 4096") - return 4096 - @serve.deployment class PredictDeployment: - def __init__(self, inferenceConfig: InferenceConfig): - self.device = torch.device(inferenceConfig.device) - self.tokenizer = AutoTokenizer.from_pretrained(inferenceConfig.model_description.tokenizer_name_or_path) - if self.tokenizer.pad_token_id is None: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + def __init__(self, infer_conf: InferenceConfig): + self.device = torch.device(infer_conf.device) self.process_tool = None - chat_processor_name = inferenceConfig.model_description.chat_processor - prompt = inferenceConfig.model_description.prompt + chat_processor_name = infer_conf.model_description.chat_processor + prompt = infer_conf.model_description.prompt if chat_processor_name: module = __import__("chat_process") chat_processor = getattr(module, chat_processor_name, None) if chat_processor is None: - raise ValueError(inferenceConfig.name + " deployment failed. chat_processor(" + chat_processor_name + ") does not exist.") + raise ValueError(infer_conf.name + " deployment failed. 
chat_processor(" + chat_processor_name + ") does not exist.") self.process_tool = chat_processor(**prompt.dict()) - stop_words = prompt.stop_words - stop_words_ids = [self.tokenizer(stop_word, return_tensors='pt').input_ids.squeeze() for stop_word in stop_words] - self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) - self.use_deepspeed = inferenceConfig.deepspeed - self.amp_dtype = torch.bfloat16 if inferenceConfig.precision != "fp32" else torch.float32 + + self.use_deepspeed = infer_conf.deepspeed if self.use_deepspeed: from deepspeed_predictor import DeepSpeedPredictor - self.streamer = self.create_streamer() - # now deepspeed predictor don't have the model - # this should be solved in the next pr - # where it is also a worker - if self.tokenizer.pad_token_id is None: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - self.predictor = DeepSpeedPredictor(inferenceConfig, self.amp_dtype, self.tokenizer.pad_token_id, self.stopping_criteria) + self.predictor = DeepSpeedPredictor(infer_conf) + self.streamer = self.predictor.get_streamer() else: from transformer_predictor import TransformerPredictor - self.predictor = TransformerPredictor(inferenceConfig, self.amp_dtype, self.stopping_criteria) - self.predictor.configure_tokenizer(inferenceConfig.model_description.model_id_or_path, self.tokenizer) + self.predictor = TransformerPredictor(infer_conf) self.loop = asyncio.get_running_loop() - - def create_streamer(self): - from transformers import TextStreamer - from typing import Optional - from ray.util.queue import Queue - - class RayTextIteratorStreamer(TextStreamer): - def __init__( - self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs - ): - super().__init__(tokenizer, skip_prompt, **decode_kwargs) - self.text_queue = Queue() - self.stop_signal = None - self.timeout = timeout - - def on_finalized_text(self, text: str, stream_end: bool = False): - self.text_queue.put(text, timeout=self.timeout) - if stream_end: - self.text_queue.put(self.stop_signal, timeout=self.timeout) - - def __iter__(self): - return self - - def __next__(self): - value = self.text_queue.get(timeout=self.timeout) - if value == self.stop_signal: - raise StopIteration() - else: - return value - return RayTextIteratorStreamer(self.tokenizer, skip_special_tokens=True) - - def tokenize_inputs(self, text: List[str]): - if self.device.type == "hpu": - input_tokens_no_pad = self.tokenizer(text, return_tensors="pt") - input_token_len = input_tokens_no_pad.input_ids.shape[-1] - input_tokens = self.tokenizer.batch_encode_plus( - text, - return_tensors="pt", - padding="max_length", - max_length=max_input_len(input_token_len), - ) - else: - input_tokens = self.tokenizer.batch_encode_plus( - text, return_tensors="pt", padding=True - ) - input_token_len = input_tokens.input_ids.shape[-1] - inputs = {k: v.to(device=self.device) \ - for k,v in input_tokens.items() \ - if torch.is_tensor(v)} - return inputs, input_token_len - - def predict(self, text: List[str], **config) -> str: - inputs, _ = self.tokenize_inputs(text) - gen_tokens = self.predictor.generate(inputs, **config) - return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] - - def predict_stream(self, text: List[str], streamer: TextIteratorStreamer, **config) -> Generator[str, None, None]: - # with torch.cpu.amp.autocast(enabled=self.amp_enabled, dtype=self.amp_dtype): - inputs, _ = self.tokenize_inputs(text) - self.predictor.streaming_generate(inputs, 
streamer, **config) def consume_streamer(self): for text in self.streamer: @@ -155,37 +58,27 @@ async def consume_streamer_async(self, streamer: TextIteratorStreamer): async def __call__(self, http_request: Request) -> Union[StreamingResponse, str]: json_request: str = await http_request.json() prompts = [] - for prompt in json_request: - text = prompt["text"] - config = prompt["config"] if "config" in prompt else {} - streaming_response = prompt["stream"] - if isinstance(text, list): - if self.process_tool is not None: - prompt = self.process_tool.get_prompt(text) - prompts.append(prompt) - else: - prompts.extend(text) + text = json_request["text"] + config = json_request["config"] if "config" in json_request else {} + streaming_response = json_request["stream"] + if isinstance(text, list): + if self.process_tool is not None: + prompt = self.process_tool.get_prompt(text) + prompts.append(prompt) else: - prompts.append(text) + prompts.extend(text) + else: + prompts.append(text) if not streaming_response: - return self.predict(prompts, **config) + return self.predictor.generate(prompts, **config) if self.use_deepspeed: - self.predict_stream(prompts, self.streamer, **config) + self.predictor.streaming_generate(prompts, self.streamer, **config) return StreamingResponse(self.consume_streamer(), status_code=200, media_type="text/plain") else: - streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, timeout=0, skip_special_tokens=True) - self.loop.run_in_executor(None, functools.partial(self.predict_stream, prompts, streamer, **config)) + streamer = self.predictor.get_streamer() + self.loop.run_in_executor(None, functools.partial(self.predictor.streaming_generate, prompts, streamer, **config)) return StreamingResponse(self.consume_streamer_async(streamer), status_code=200, media_type="text/plain") -_ray_env_key = "env_vars" -# OMP_NUM_THREADS will be set by num_cpus, so not set in env -_predictor_runtime_env_ipex = { - "KMP_BLOCKTIME": "1", - "KMP_SETTINGS": "1", - "KMP_AFFINITY": "granularity=fine,compact,1,0", - "MALLOC_CONF": "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" -} - # make it unittest friendly def main(argv=None): # args @@ -203,7 +96,6 @@ def main(argv=None): parser.add_argument("--workers_per_group", default="2", type=int, help="workers per group, used with --deepspeed") parser.add_argument("--ipex", action='store_true', help="enable ipex optimization") parser.add_argument("--device", default="cpu", type=str, help="cpu, xpu, hpu or cuda") - parser.add_argument("--precision", default="bf16", type=str, help="fp32 or bf16") parser.add_argument("--serve_local_only", action="store_true", help="Only support local access to url") args = parser.parse_args(argv) @@ -216,55 +108,40 @@ def main(argv=None): if args.config_file: print("reading from config file, " + args.config_file) with open(args.config_file, "r") as f: - inferenceConfig = parse_yaml_raw_as(InferenceConfig, f) + infer_conf = parse_yaml_raw_as(InferenceConfig, f) else: # args.model should be set print("reading from command line, " + args.model) model_desc = ModelDescription() model_desc.model_id_or_path = args.model model_desc.tokenizer_name_or_path = args.tokenizer if args.tokenizer is not None else args.model - inferenceConfig = InferenceConfig(model_description=model_desc) - inferenceConfig.host = "127.0.0.1" if args.serve_local_only else "0.0.0.0" - inferenceConfig.port = args.port + infer_conf = InferenceConfig(model_description=model_desc) + 
infer_conf.host = "127.0.0.1" if args.serve_local_only else "0.0.0.0" + infer_conf.port = args.port rp = args.route_prefix if args.route_prefix else "custom_model" - inferenceConfig.route_prefix = "/{}".format(rp) - inferenceConfig.name = rp - inferenceConfig.ipex = args.ipex + infer_conf.route_prefix = "/{}".format(rp) + infer_conf.name = rp + infer_conf.ipex.enabled = args.ipex model_list = {} - model_list[inferenceConfig.name] = inferenceConfig + model_list[infer_conf.name] = infer_conf ray.init(address="auto") deployments = [] - for model_id, inferCfg in model_list.items(): + for model_id, infer_conf in model_list.items(): print("deploy model: ", model_id) - runtime_env = {_ray_env_key: {}} - if inferCfg.ipex: - runtime_env[_ray_env_key].update(_predictor_runtime_env_ipex) - if inferCfg.deepspeed: - runtime_env[_ray_env_key]["DS_ACCELERATOR"] = inferCfg.device - # now PredictDeployment itself is a worker, we should require resources for it - ray_actor_options = {"runtime_env": runtime_env} - if inferCfg.device == "cpu": - ray_actor_options["num_cpus"] = inferCfg.cpus_per_worker - elif inferCfg.device == "cuda": - ray_actor_options["num_gpus"] = inferCfg.gpus_per_worker - elif inferCfg.device == "hpu": - ray_actor_options["resources"] = {"HPU": inferCfg.hpus_per_worker} - else: - # TODO add xpu - pass - deployment = PredictDeployment.options(ray_actor_options=ray_actor_options).bind(inferCfg) - handle = serve.run(deployment, _blocking=True, host=inferCfg.host, port=inferCfg.port, name=inferCfg.name, route_prefix=inferCfg.route_prefix) - deployment_name = inferCfg.name - if inferCfg.host == "0.0.0.0": + ray_actor_options = get_deployment_actor_options(infer_conf) + deployment = PredictDeployment.options(ray_actor_options=ray_actor_options).bind(infer_conf) + handle = serve.run(deployment, _blocking=True, host=infer_conf.host, port=infer_conf.port, name=infer_conf.name, route_prefix=infer_conf.route_prefix) + deployment_name = infer_conf.name + if infer_conf.host == "0.0.0.0": all_nodes = ray.nodes() for node in all_nodes: if "node:__internal_head__" in node["Resources"]: host_ip = node["NodeManagerAddress"] break else: - host_ip = inferCfg.host - url = f"http://{host_ip}:{inferCfg.port}{inferCfg.route_prefix}" + host_ip = infer_conf.host + url = f"http://{host_ip}:{infer_conf.port}{infer_conf.route_prefix}" print(f"Deployment '{deployment_name}' is ready at `{url}`.") deployments.append(handle) diff --git a/inference/transformer_predictor.py b/inference/transformer_predictor.py index e1178af2f..942d2a26b 100644 --- a/inference/transformer_predictor.py +++ b/inference/transformer_predictor.py @@ -1,15 +1,17 @@ import torch -from transformers import AutoModelForCausalLM, AutoConfig -from inference_config import InferenceConfig +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig +from transformers import TextIteratorStreamer +from inference_config import InferenceConfig, IPEX_PRECISION_BF16 from predictor import Predictor +from utils import get_torch_dtype class TransformerPredictor(Predictor): - def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteria): - self.amp_dtype = amp_dtype - self.device = torch.device(inferenceConfig.device) - model_desc = inferenceConfig.model_description + def __init__(self, infer_conf: InferenceConfig): + super().__init__(infer_conf) + + model_desc = infer_conf.model_description model_config = model_desc.config - config = AutoConfig.from_pretrained(model_desc.model_id_or_path, torchscript=True, 
trust_remote_code=model_config.trust_remote_code) + hf_config = AutoConfig.from_pretrained(model_desc.model_id_or_path, torchscript=True, trust_remote_code=model_config.trust_remote_code) if self.device.type == "hpu": from optimum.habana.transformers.modeling_utils import ( @@ -17,6 +19,8 @@ def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteri ) adapt_transformers_to_gaudi() + # get correct torch type for loading HF model + torch_dtype = get_torch_dtype(infer_conf, hf_config) if model_desc.bigdl: from bigdl.llm.transformers import AutoModelForCausalLM as BigDLAutoModelForCLM bmodel_config = {} @@ -25,16 +29,16 @@ def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteri bmodel_config.update(model_desc.bigdl_config.dict()) model = BigDLAutoModelForCLM.from_pretrained( model_desc.model_id_or_path, - torch_dtype=amp_dtype, - config=config, + torch_dtype=torch_dtype, + config=hf_config, low_cpu_mem_usage=True, **bmodel_config ) else: model = AutoModelForCausalLM.from_pretrained( model_desc.model_id_or_path, - torch_dtype=amp_dtype, - config=config, + torch_dtype=torch_dtype, + config=hf_config, low_cpu_mem_usage=True, **model_config.dict() ) @@ -58,7 +62,7 @@ def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteri # to channels last model = model.to(memory_format=torch.channels_last) # to ipex - if inferenceConfig.ipex: + if infer_conf.ipex.enabled: import intel_extension_for_pytorch as ipex torch._C._jit_set_texpr_fuser_enabled(False) @@ -66,13 +70,12 @@ def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteri except: pass model = ipex.optimize_transformers( model.eval(), - dtype=amp_dtype, + dtype=torch.bfloat16 if infer_conf.ipex.precision == IPEX_PRECISION_BF16 else torch.float32, inplace=True ) self.model = model - self.stopping_criteria = stopping_criteria - def _process_config(self, **config): + def _process_config(self, config): if self.device.type == "hpu": if "max_new_tokens" not in config: # hpu requires setting max_new_tokens @@ -82,18 +85,23 @@ def _process_config(self, **config): # lazy mode should be True when using hpu graphs config["lazy_mode"] = True - def streaming_generate(self, inputs, streamer, **config): - self._process_config(**config) - self.model.generate(**inputs, + def streaming_generate(self, prompt, streamer, **config): + self._process_config(config) + input_ids = self.tokenize_inputs(prompt) + self.model.generate(input_ids, stopping_criteria=self.stopping_criteria, streamer=streamer, **config) - def generate(self, inputs, **config): - self._process_config(**config) + def generate(self, prompt, **config): + self._process_config(config) + input_ids = self.tokenize_inputs(prompt) gen_tokens = self.model.generate( - **inputs, + input_ids, stopping_criteria=self.stopping_criteria, **config ) - return gen_tokens + return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] + + def get_streamer(self): + return TextIteratorStreamer(self.tokenizer, skip_prompt=True, timeout=0, skip_special_tokens=True) diff --git a/inference/utils.py b/inference/utils.py new file mode 100644 index 000000000..43149435c --- /dev/null +++ b/inference/utils.py @@ -0,0 +1,76 @@ +from transformers import StoppingCriteria +import torch + +from inference_config import InferenceConfig, DEVICE_CPU + +def get_deployment_actor_options(infer_conf: InferenceConfig): + _ray_env_key = "env_vars" + # OMP_NUM_THREADS will be set by num_cpus, so not set in env + _predictor_runtime_env_ipex = { 
+ "KMP_BLOCKTIME": "1", + "KMP_SETTINGS": "1", + "KMP_AFFINITY": "granularity=fine,compact,1,0", + "MALLOC_CONF": "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" + } + runtime_env = {_ray_env_key: {}} + if infer_conf.ipex.enabled: + runtime_env[_ray_env_key].update(_predictor_runtime_env_ipex) + if infer_conf.deepspeed: + runtime_env[_ray_env_key]["DS_ACCELERATOR"] = infer_conf.device + # now PredictDeployment itself is a worker, we should require resources for it + ray_actor_options = {"runtime_env": runtime_env} + if infer_conf.device == "cpu": + ray_actor_options["num_cpus"] = infer_conf.cpus_per_worker + elif infer_conf.device == "cuda": + ray_actor_options["num_gpus"] = infer_conf.gpus_per_worker + elif infer_conf.device == "hpu": + ray_actor_options["resources"] = {"HPU": infer_conf.hpus_per_worker} + else: + # TODO add xpu + pass + return ray_actor_options + +class StoppingCriteriaSub(StoppingCriteria): + + def __init__(self, stops = [], encounters=1): + super().__init__() + self.stops = stops + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): + for stop in self.stops: + length = 1 if len(stop.size())==0 else stop.size()[0] + if torch.all((stop == input_ids[0][-length:])).item(): + return True + return False + +# used in inference with Gaudi +def max_input_len(input_text_length): + if input_text_length <= 128: + return 128 + elif input_text_length <= 512: + return 512 + elif input_text_length <= 2048: + return 2048 + else: + print("Max support length is 4096") + return 4096 + +def get_torch_dtype(infer_conf: InferenceConfig, hf_config) -> torch.dtype: + ''' + return torch default dtype, a.k.a float32, if it's cpu only inference without ipex because + bfloat16 is too slow and float16 is not supported in CPU + ''' + if hf_config is None or is_cpu_without_ipex(infer_conf): + return torch.get_default_dtype() + if hasattr(hf_config, 'torch_dtype'): + t = hf_config.torch_dtype + if t: + return t + if hasattr(hf_config, '__getitem__'): + t = hf_config['torch_dtype'] + if t: + return t + return torch.get_default_dtype() + +def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: + return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU diff --git a/pretrain/config/llama2_7b_8Guadi_pretrain.conf b/pretrain/config/llama_7b_8Guadi_pretrain.conf similarity index 80% rename from pretrain/config/llama2_7b_8Guadi_pretrain.conf rename to pretrain/config/llama_7b_8Guadi_pretrain.conf index 5de0e91f0..62a0c32b1 100644 --- a/pretrain/config/llama2_7b_8Guadi_pretrain.conf +++ b/pretrain/config/llama_7b_8Guadi_pretrain.conf @@ -11,17 +11,18 @@ "type":"MegatronInitializer", "name": "megatron", "megatron_config": { - "data_path": ["/home/user/workspace/data/my-gpt2_text_document"], + "data_path": ["/home/user/workspace/data/tokenized_NIH"], "data_impl": "mmap", "micro_batch_size": 1, "global_batch_size": 8, "seq_length": 2048, "use_dataset_only": True, - "vocab_file": "/home/user/workspace/data/gpt2-vocab.json", - "tokenizer_type": "GPT2BPETokenizer", - "merge_file": "/home/user/workspace/data/gpt2-merges.txt", - "train_iters": 300, - "eval_interval": 10, + #"vocab_file": "/home/user/workspace/data/gpt2-vocab.json", + "tokenizer_type": "HFTokenizer", + "tokenizer_model": "huggyllama/llama-7b", + #"merge_file": "/home/user/workspace/data/gpt2-merges.txt", + "eval_interval": 1000, + "train_samples": 300_000_000, "split": "949,50,1", }, }, @@ -62,21 +63,25 @@ "per_device_eval_batch_size": 1, 
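Note, referring back to the refactored inference predictor and the new inference/utils.py introduced above: `generate()` now takes a prompt, tokenizes it itself, and decodes with `batch_decode`, while `StoppingCriteriaSub` supplies custom stop sequences and `get_torch_dtype` picks float32 for CPU-without-IPEX. The following is a minimal sketch of exercising the stopping-criteria helper outside the Ray deployment; the model id, prompt, and stop word are placeholders, and the flat `from utils import ...` assumes the script runs from the inference/ directory.

```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList

from utils import StoppingCriteriaSub  # helper added in inference/utils.py above

model_id = "gpt2"  # placeholder; any causal LM works for this sketch
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Stop as soon as the encoded stop sequence appears at the end of the generated ids.
stop_words = ["\n\n"]
stops = [tokenizer(w, return_tensors="pt").input_ids.squeeze() for w in stop_words]
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stops)])

input_ids = tokenizer("Q: What is Ray Serve?\nA:", return_tensors="pt").input_ids
gen_tokens = model.generate(
    input_ids,
    stopping_criteria=stopping_criteria,
    max_new_tokens=64,
)
# Mirrors the predictor's generate(): decode and return the first sequence.
print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0])
```

The diff resumes below with the Gaudi pretraining config.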
"do_train": True, "do_eval": True, - "save_strategy": "no", - "output_dir": "/tmp/hf_trainer/", + "save_strategy": "steps", + "save_steps": 1000, + "output_dir": "/home/user/workspace/data/hf_trainer/", "gaudi_config_name": "Habana/gpt2", "use_habana": True, + "max_steps": 100000, "throughput_warmup_steps": 3, "use_lazy_mode": True, "overwrite_output_dir": True, - "max_steps": 300, "seed": 42, "bf16": True, + "report_to":'tensorboard', "deepspeed":{ "steps_per_print": 64, "train_batch_size": 8, "train_micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": "auto", + "gradient_accumulation_steps": 1, + "gradient_checkpoint": True, + "memory_efficient_linear": False, "bf16": { "enabled": True }, @@ -85,7 +90,8 @@ "stage": 3, "overlap_comm": False, "reduce_scatter": False, - "contiguous_gradients": False + "contiguous_gradients": False, + "stage3_gather_16bit_weights_on_model_save": True } }, }, @@ -103,10 +109,10 @@ "runtime_env": { "env_vars": { "OMP_NUM_THREADS": "56", - #"ACCELERATE_USE_CPU": "True", "ACCELERATE_MIXED_PRECISION": "no", - #"CCL_WORKER_COUNT": "1", # CCL setting - #"CCL_LOG_LEVEL": "info", + "ACCELERATE_USE_DEEPSPEED": "true", + "HABANA_VISIBLE_MODULES":"0,1,2,3,4,5,6,7", + "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES":"true", "WORLD_SIZE": "8", # Enable multi-process } }, @@ -121,6 +127,7 @@ # The amount of resources per worker. "resources_per_worker": { "CPU": 10, + "HPU": 1 }, # The placement strategy to use for the placement group of the Ray actors. "placement_strategy": "SPREAD" diff --git a/pretrain/config/llama2_7b_8gpu_pretrain.conf b/pretrain/config/llama_7b_8gpu_pretrain.conf similarity index 98% rename from pretrain/config/llama2_7b_8gpu_pretrain.conf rename to pretrain/config/llama_7b_8gpu_pretrain.conf index 71a7f1192..43afe1140 100644 --- a/pretrain/config/llama2_7b_8gpu_pretrain.conf +++ b/pretrain/config/llama_7b_8gpu_pretrain.conf @@ -97,6 +97,7 @@ "overlap_comm": False, "reduce_scatter": False, "contiguous_gradients": False, + "stage3_gather_16bit_weights_on_model_save": True # "stage3_max_live_parameters" : 1e8, # "stage3_max_reuse_distance" : 1e8, # "stage3_prefetch_bucket_size" : 2e8, diff --git a/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf b/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf index 00b6d7d37..5848fdae8 100644 --- a/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +++ b/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf @@ -7,6 +7,7 @@ # you should setup this path based on your environment. 
"megatron_deepspeed_path": '/home/user/Model-References/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed', + "pretrain_module": 'pretrain_llama', "megatron_config": { "num_layers": 32, "hidden_size": 4096, @@ -15,9 +16,10 @@ #"max_position_embeddings":2048, #"num_key_value_heads": 32, # setup the file path based on your data dir - "vocab_file": "/home/user/workspace/data/gpt2-vocab.json", - "tokenizer_type": "GPT2BPETokenizer", - "merge_file": "/home/user/workspace/data/gpt2-merges.txt", + #"vocab_file": "/home/user/workspace/data/gpt2-vocab.json", + "tokenizer_type": "HFTokenizer", + "tokenizer_model_file": "huggyllama/llama-7b", + #"merge_file": "/home/user/workspace/data/gpt2-merges.txt", "seq_length": 2048, "micro_batch_size": 2, #"eval_micro_batch_size": 2, @@ -57,6 +59,8 @@ "layernorm_type":"rmsnorm", "activation_func_type": "swiglu", "layernorm_epsilon": 1e-6, + "use_fused_sdpa": False, + "use_fused_sdpa_with_recompute": True, "bf16": True, #"checkpoint_activations": True, #"deepspeed_activation_checkpointing": True, @@ -80,14 +84,17 @@ #"min_loss_scale": 1, #"initial_scale_power": 12 }, - "bf16": {"enabled": True}, + "bf16": { + "enabled": True, + "accumulate_grads_via_hooks": True + }, "wall_clock_breakdown": False }, "zero_stage":0, "deepspeed_activation_checkpointing": True, "save": "./checkpoint_megatron", # setup the file path based on your data dir - "data_path": ["/home/user/workspace/data/my-gpt2_text_document"], + "data_path": ["/home/user/workspace/data/tokenized_NIH"], "data_impl": "mmap", "split": "949,50,1", "distributed_backend": "hccl", diff --git a/tools/workload_in_containers/Dockerfile.megatron.habana b/pretrain/docker/Dockerfile.megatron.habana similarity index 83% rename from tools/workload_in_containers/Dockerfile.megatron.habana rename to pretrain/docker/Dockerfile.megatron.habana index ee7739e4a..a12fe7dc6 100644 --- a/tools/workload_in_containers/Dockerfile.megatron.habana +++ b/pretrain/docker/Dockerfile.megatron.habana @@ -1,16 +1,18 @@ FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu20.04/habanalabs/pytorch-installer-2.1.0:latest ENV DEBIAN_FRONTEND=noninteractive WORKDIR /home/user -RUN pip install lz4 numpy==1.24.4 tensorboard gpustat==1.0.0 sentencepiece accelerate==0.19.0 datasets==2.12.0 gymnasium transformers==4.26.0 dm-tree scikit-image peft deltatuner==1.1.9 +RUN pip install lz4 numpy==1.24.4 tensorboard gpustat==1.0.0 sentencepiece accelerate datasets==2.12.0 gymnasium transformers dm-tree scikit-image peft deltatuner==1.1.9 RUN pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl RUN pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.13.0 COPY pretrain/patch/hpu/0001-Init-megatron-deepspeed-with-Ray-cluster.patch . +COPY pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch . RUN git config --global user.email "root@example.com" RUN git config --global user.name "root" RUN git clone https://github.com/HabanaAI/Model-References.git && \ cd Model-References && \ git checkout -b ray bde21928ea8c295cd029fafe2cf737d50e715fe2 && \ git am /home/user/0001-Init-megatron-deepspeed-with-Ray-cluster.patch && \ + git am /home/user/0001-Add-the-Huggingface-tokenizer.patch && \ cd PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/ && \ pip install . 
# enable password-less ssh diff --git a/tools/workload_in_containers/Dockerfile.megatron.gpu b/pretrain/docker/Dockerfile.nvidia similarity index 100% rename from tools/workload_in_containers/Dockerfile.megatron.gpu rename to pretrain/docker/Dockerfile.nvidia diff --git a/pretrain/docker/Dockerfile.optimum.habana b/pretrain/docker/Dockerfile.optimum.habana new file mode 100644 index 000000000..52e7d7d89 --- /dev/null +++ b/pretrain/docker/Dockerfile.optimum.habana @@ -0,0 +1,12 @@ +FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu20.04/habanalabs/pytorch-installer-2.1.0:latest +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /home/user +COPY pretrain/requirements.optimum-habana.txt /home/user/ +RUN pip install -r requirements.optimum-habana.txt +RUN pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.13.0 +RUN pip install --upgrade-strategy eager optimum[habana] +# enable password-less ssh +RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -P '' && \ + cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys +RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config +CMD ["sh", "-c", "service ssh restart; bash"] \ No newline at end of file diff --git a/tools/workload_in_containers/build-image.sh b/pretrain/docker/build-image.sh similarity index 75% rename from tools/workload_in_containers/build-image.sh rename to pretrain/docker/build-image.sh index a6452bdfc..7b4d05293 100755 --- a/tools/workload_in_containers/build-image.sh +++ b/pretrain/docker/build-image.sh @@ -1,9 +1,7 @@ #!/bin/bash dockerfile=Dockerfile -if [[ $1 = "megatron-gpu" ]]; then - dockerfile=Dockerfile.megatron.gpu -elif [[ $1 = "dp" ]]; then - dockerfile=Dockerfile.dp +if [[ $1 = "nvidia" ]]; then + dockerfile=Dockerfile.nvidia elif [[ $1 = "megatron-habana" ]]; then dockerfile=Dockerfile.megatron.habana elif [[ $1 = "optimum-habana" ]]; then diff --git a/pretrain/megatron_deepspeed_pretrain.py b/pretrain/megatron_deepspeed_pretrain.py index 3ee347f6a..2890a4044 100644 --- a/pretrain/megatron_deepspeed_pretrain.py +++ b/pretrain/megatron_deepspeed_pretrain.py @@ -26,29 +26,39 @@ def train_func(config: Dict[str, Any]): os.chdir(cwd) try: - import pretrain_gpt + import pretrain_gpt as pretrain_module except ImportError: megatron_deepspeed_path = config.get("megatron_deepspeed_path", None) if megatron_deepspeed_path is not None: sys.path.append(megatron_deepspeed_path) - pretrain_gpt = importlib.import_module('pretrain_gpt') + pretrain_module_name = config.get("pretrain_module", None) + if pretrain_module_name is not None: + pretrain_module = importlib.import_module(pretrain_module_name) + else: + pretrain_module = importlib.import_module('pretrain_gpt') else: raise ImportError("Please set megatron_deepspeed_path in config") common.init(config) megatron_config = config.get('megatron_config', {}) - if hasattr(pretrain_gpt, 'ModelType'): - pretrain(pretrain_gpt.train_valid_test_datasets_provider, - pretrain_gpt.model_provider, - pretrain_gpt.ModelType.encoder_or_decoder, - pretrain_gpt.forward_step, + if hasattr(pretrain_module, 'ModelType'): + pretrain(pretrain_module.train_valid_test_datasets_provider, + pretrain_module.model_provider, + pretrain_module.ModelType.encoder_or_decoder, + pretrain_module.forward_step, args_defaults=megatron_config, - data_post_process=pretrain_gpt.data_post_process) + data_post_process=pretrain_module.data_post_process) + elif hasattr(pretrain_module, 'llama_argument_handler'): + pretrain(pretrain_module.train_valid_test_datasets_provider, + 
pretrain_module.model_provider, + pretrain_module.forward_step, + pretrain_module.llama_argument_handler, + args_defaults=megatron_config) else: - pretrain(pretrain_gpt.train_valid_test_datasets_provider, - pretrain_gpt.model_provider, - pretrain_gpt.forward_step, + pretrain(pretrain_module.train_valid_test_datasets_provider, + pretrain_module.model_provider, + pretrain_module.forward_step, args_defaults=megatron_config) def main(external_config = None): diff --git a/pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch b/pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch new file mode 100644 index 000000000..4d2f206b5 --- /dev/null +++ b/pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch @@ -0,0 +1,145 @@ +From 1df9ba8d085f55d5141cdbe0857987dec12f1f7b Mon Sep 17 00:00:00 2001 +From: yuanwu +Date: Fri, 8 Dec 2023 04:53:13 +0000 +Subject: [PATCH] Add the Huggingface tokenizer + +Signed-off-by: yuanwu +--- + .../Megatron-DeepSpeed/megatron/arguments.py | 6 +- + .../megatron/tokenizer/tokenizer.py | 86 +++++++++++++++++++ + 2 files changed, 90 insertions(+), 2 deletions(-) + +diff --git a/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/arguments.py b/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/arguments.py +index b9861fa0..516c2abb 100644 +--- a/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/arguments.py ++++ b/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/arguments.py +@@ -871,8 +871,10 @@ def _add_data_args(parser): + 'BertWordPieceCase', + 'GPT2BPETokenizer', + 'SentencePieceTokenizer', +- 'LlamaTokenizer'], ++ 'HFTokenizer'], + help='What type of tokenizer to use.') ++ group.add_argument('--tokenizer-model', type=str, default=None, ++ help='Sentencepiece tokenizer model.') + group.add_argument('--data-impl', type=str, default='infer', + choices=['lazy', 'cached', 'mmap', 'infer'], + help='Implementation of indexed datasets.') +@@ -1174,4 +1176,4 @@ def _add_hpu_optimizations_args(parser): + action='store_true', + help='Flatten operands of linear layers what yields better performance') + +- return parser +\ No newline at end of file ++ return parser +diff --git a/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/tokenizer/tokenizer.py b/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/tokenizer/tokenizer.py +index e4a49306..7989be48 100644 +--- a/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/tokenizer/tokenizer.py ++++ b/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/tokenizer/tokenizer.py +@@ -18,6 +18,7 @@ + from abc import ABC + from abc import abstractmethod + ++from transformers import AutoTokenizer + from .bert_tokenization import FullTokenizer as FullBertTokenizer + from .gpt2_tokenization import GPT2Tokenizer + from .sentencepiece_tokenization import SentencePieceTokenizer +@@ -47,6 +48,10 @@ def build_tokenizer(args): + elif args.tokenizer_type == 'SentencePieceTokenizer': + assert args.tokenizer_model_file is not None + tokenizer = _SentencePieceTokenizer(args.tokenizer_model_file, args.tokenizer_eod_id) ++ elif args.tokenizer_type == 'HFTokenizer': ++ assert args.tokenizer_model is not None ++ tokenizer = _HFTokenizer(args.tokenizer_model) ++ + else: + raise NotImplementedError('{} tokenizer is not ' + 'implemented.'.format(args.tokenizer_type)) +@@ -328,3 +333,84 @@ class _SentencePieceTokenizer(AbstractTokenizer): + @property + def eod(self): + return self.eod_id ++ ++class _HFTokenizer(AbstractTokenizer): ++ """HF Tokenizer""" ++ def __init__(self, 
tokenizer_name_or_path): ++ name = tokenizer_name_or_path ++ super().__init__(name) ++ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) ++ self.encoder = self.tokenizer.get_vocab() ++ self.decoder = {v: k for k, v in self.encoder.items()} ++ ++ @property ++ def vocab_size(self): ++ return self.tokenizer.vocab_size ++ ++ @property ++ def vocab(self): ++ return self.encoder ++ ++ @property ++ def inv_vocab(self): ++ return self.decoder ++ ++ def tokenize(self, text): ++ return self.tokenizer.encode(text) ++ ++ def detokenize(self, token_ids): ++ return self.tokenizer.decode(token_ids) ++ ++ @property ++ def bos(self): ++ return self.bos_token_id ++ ++ @property ++ def bos_token_id(self): ++ candidate = self.tokenizer.eos_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def cls(self): ++ candidate = self.tokenizer.cls_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def sep(self): ++ candidate = self.tokenizer.sep_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def pad(self): ++ candidate = self.tokenizer.pad_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def eod(self): ++ candidate = self.tokenizer.eos_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def eos(self): ++ return self.eos_token_id ++ ++ @property ++ def eos_token_id(self): ++ candidate = self.tokenizer.eos_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def mask(self): ++ candidate = self.tokenizer.mask_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def additional_special_tokens_ids(self): ++ return self.tokenizer.additional_special_tokens_ids ++ ++ @staticmethod ++ def _check_token_candidate(candidate): ++ """Checks whether the candidate is None or not, and raises an exception if it is.""" ++ if candidate is None: ++ raise AttributeError("Requested token doesn't exist in current tokenizer") ++ return candidate +-- +2.25.1 + diff --git a/pretrain/plugin/megatron_dataset.py b/pretrain/plugin/megatron_dataset.py index 41f5a0355..36fc33c9a 100644 --- a/pretrain/plugin/megatron_dataset.py +++ b/pretrain/plugin/megatron_dataset.py @@ -1,7 +1,7 @@ import numpy as np from megatron import get_args, print_rank_0 -from megatron.training import build_train_valid_test_datasets +from megatron.training import build_train_valid_test_datasets, update_train_iters from megatron.data import gpt_dataset from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset @@ -30,6 +30,8 @@ def _train_valid_test_datasets_provider(train_val_test_num_samples): return train_ds, valid_ds, test_ds + args = get_args() + update_train_iters(args) datasets = build_train_valid_test_datasets(_train_valid_test_datasets_provider) print_rank_0(datasets) return datasets diff --git a/pretrain/plugin/megtron_initializer.py b/pretrain/plugin/megtron_initializer.py index 959bcbe8d..5b520038e 100644 --- a/pretrain/plugin/megtron_initializer.py +++ b/pretrain/plugin/megtron_initializer.py @@ -16,4 +16,4 @@ def init(self): args = self.config["megatron_config"] initialize_megatron(ignore_unknown_args=True, args_defaults=args, allow_no_cuda=True) else: - logger.error("cannot initialize the megatron without the megatron_config") \ No newline at end of file + logger.error("cannot initialize the megatron without the megatron_config") diff --git a/pretrain/pretrain.py b/pretrain/pretrain.py index 46dce45a6..69ce217d3 100644 --- 
a/pretrain/pretrain.py +++ b/pretrain/pretrain.py @@ -17,9 +17,11 @@ import common import importlib +use_habana = False loader = importlib.util.find_spec('habana_frameworks') if loader is not None: from backend.habana_backend import TorchConfig + use_habana = True else: from ray.train.torch import TorchConfig from backend.deepspeed_backend import TorchConfig as DeepSpeedTorchConfig @@ -126,7 +128,8 @@ def main(external_config = None): if ( config['trainer'].get("training_config", None) and - config['trainer'].get("training_config").get("deepspeed", None) + config['trainer'].get("training_config").get("deepspeed", None) and + use_habana == False ): torch_config = DeepSpeedTorchConfig(**ray_config.get("torch_config", {})) else: diff --git a/pretrain/requirements.optimum-habana.txt b/pretrain/requirements.optimum-habana.txt new file mode 100644 index 000000000..4ff265841 --- /dev/null +++ b/pretrain/requirements.optimum-habana.txt @@ -0,0 +1,22 @@ +accelerate==0.21.0 +datasets==2.12.0 +numpy==1.24.4 +https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl +transformers==4.31.0 +typing==3.7.4.3 +tabulate +ray[tune] +ray[serve] +gradio +gymnasium +dm-tree +scikit-image +pydantic==1.10.11 +tensorboard +einops +gpustat==1.0.0 +peft==0.4.0 +evaluate +deltatuner==1.1.9 +scikit-learn +git+https://github.com/microsoft/Megatron-DeepSpeed.git#egg=megatron-core diff --git a/pyproject.toml b/pyproject.toml index 2a980317a..01d5160cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ classifiers = [ "Framework :: Ray" ] dependencies = [ - "accelerate>=0.21.0", + "accelerate", "datasets>=2.14.6", "numpy", "ray @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl", @@ -61,7 +61,7 @@ deepspeed = [ "psutil", "tqdm", # 0.10.2 is required to support bloom - "deepspeed==0.10.2" + "deepspeed>=0.10.2, <0.11.2" ] bigdl-cpu = [ diff --git a/rlhf/ppo.conf b/rlhf/ppo.conf deleted file mode 100644 index 076a270c3..000000000 --- a/rlhf/ppo.conf +++ /dev/null @@ -1,25 +0,0 @@ -# I am python, not json -{ - "General": { - "model_name": "EleutherAI/gpt2", - "model_pretrain": None, - "rm_name": "EleutherAI/gpt2", - "rm_pretrain": None, - }, - "Dataset": { - "train_file": "examples/data/sample_ppo_data.jsonl", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "experience_batch_size": 2, - "training_iteration": 1000, - "learning_rate": 1e-5, - "kl_coeff": 0.2, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 56 - }, - }, -} diff --git a/rlhf/ppo.yaml b/rlhf/ppo.yaml new file mode 100644 index 000000000..36e8fbbc5 --- /dev/null +++ b/rlhf/ppo.yaml @@ -0,0 +1,18 @@ +General: + model_name: EleutherAI/gpt2 + model_pretrain: null + rm_name: EleutherAI/gpt2 + rm_pretrain: null +Dataset: + train_file: examples/data/sample_ppo_data.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + experience_batch_size: 2 + training_iteration: 1000 + learning_rate: 1.0e-05 + kl_coeff: 0.2 + num_training_workers: 2 + resources_per_worker: + CPU: 56 diff --git a/rlhf/reward.conf b/rlhf/reward.conf deleted file mode 100644 index dccaa149d..000000000 --- a/rlhf/reward.conf +++ /dev/null @@ -1,24 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt2", - "output_dir": "/tmp/llm-ray/output/rm", - "checkpoint_dir": "/tmp/llm-ray/checkpoint/rm" - }, - "Dataset": { - "train_file": "examples/data/sample_rm_data.jsonl", - 
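Note on the `.conf` to YAML migration above: the new `rlhf/ppo.yaml` (and `reward.yaml` below) carries the same nested structure the Python-dict `.conf` files expressed, so it reads back into an equivalent dict. A small PyYAML sketch follows; the project itself may load these through `pydantic-yaml`, which is already in the dependency list, so treat this as illustrative rather than the actual loading path.

```
import yaml

with open("rlhf/ppo.yaml") as f:
    config = yaml.safe_load(f)

# Keys mirror the sections of the old ppo.conf.
assert config["General"]["model_name"] == "EleutherAI/gpt2"
assert config["Training"]["resources_per_worker"] == {"CPU": 56}
assert config["Dataset"]["validation_file"] is None  # YAML null -> Python None
```

The diff resumes below with the reward-model config.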
"validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32 - }, - }, -} diff --git a/rlhf/reward.yaml b/rlhf/reward.yaml new file mode 100644 index 000000000..77da9bd54 --- /dev/null +++ b/rlhf/reward.yaml @@ -0,0 +1,18 @@ +General: + base_model: EleutherAI/gpt2 + output_dir: /tmp/llm-ray/output/rm + checkpoint_dir: /tmp/llm-ray/checkpoint/rm +Dataset: + train_file: examples/data/sample_rm_data.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + num_training_workers: 2 + resources_per_worker: + CPU: 32 diff --git a/tools/workload_in_containers/Dockerfile.dp b/tools/workload_in_containers/Dockerfile.dp deleted file mode 100755 index 7a8df2ab3..000000000 --- a/tools/workload_in_containers/Dockerfile.dp +++ /dev/null @@ -1,34 +0,0 @@ -FROM ubuntu:22.04 - -RUN apt-get update && apt-get install -y \ - python3.10 \ - python3-pip \ - python-is-python3 \ - wget \ - git \ - build-essential \ - vim \ - htop \ - ssh \ - net-tools - -WORKDIR /home/user - -RUN pip install -U ray[default,data] - -RUN pip install astunparse nltk gymnasium pyyaml datasets presidio_analyzer presidio_anonymizer sentencepiece transformers -RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu -RUN python -m spacy download en_core_web_lg parquet-tools - -#install PII detection/redaction related libs for code -RUN pip install gibberish-detector -RUN pip install detect-secrets - -# enable password-less ssh -RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -P '' && \ - cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \ - sed -i 's/# Port 22/Port 12345/' /etc/ssh/ssh_config && \ - sed -i 's/#Port 22/Port 12345/' /etc/ssh/sshd_config - -CMD ["sh", "-c", "service ssh start; bash"] - diff --git a/tools/workload_in_containers/Dockerfile.optimum.habana b/tools/workload_in_containers/Dockerfile.optimum.habana deleted file mode 100644 index f09c0d7f0..000000000 --- a/tools/workload_in_containers/Dockerfile.optimum.habana +++ /dev/null @@ -1,23 +0,0 @@ -FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu20.04/habanalabs/pytorch-installer-2.1.0:latest -ENV DEBIAN_FRONTEND=noninteractive -WORKDIR /home/user -RUN pip install lz4 numpy==1.24.4 \ - https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl \ - tensorboard gpustat==1.0.0 sentencepiece \ - accelerate==0.19.0 \ - datasets==2.12.0 gymnasium transformers==4.34.0 \ - dm-tree scikit-image evaluate peft==0.5.0 scikit-learn -RUN pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.13.0 -RUN pip install --upgrade-strategy eager optimum[habana] -COPY pretrain/patch/hpu/constants.py /usr/local/lib/python3.8/dist-packages/deepspeed/checkpoint/ -COPY pretrain/patch/hpu/state.py /usr/local/lib/python3.8/dist-packages/optimum/habana/accelerate/ -COPY pretrain/patch/gpu/0001-Change-the-sample-s-column-name.patch . -COPY pretrain/patch/gpu/0001-Add-init.py-to-include-the-megatron.model.vision-int.patch . 
-RUN git config --global user.email "root@example.com" -RUN git config --global user.name "root" -RUN git clone https://github.com/microsoft/Megatron-DeepSpeed.git && \ - cd Megatron-DeepSpeed && \ - git checkout -b ray 796866fa74f23850b977d4023a7ed4f0031844ae && \ - git am /home/user/0001-Change-the-sample-s-column-name.patch && \ - git am /home/user/0001-Add-init.py-to-include-the-megatron.model.vision-int.patch && \ - pip install . From c8a0da495dc67687d51e5ab3eb7ab644fe6c51da Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Thu, 21 Dec 2023 21:43:00 +0800 Subject: [PATCH 07/14] add pyproject.toml to be ci monitored Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_orders_on_merge.yml | 1 + .github/workflows/workflow_orders_on_pr.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/workflow_orders_on_merge.yml b/.github/workflows/workflow_orders_on_merge.yml index e453f242b..56bda5006 100644 --- a/.github/workflows/workflow_orders_on_merge.yml +++ b/.github/workflows/workflow_orders_on_merge.yml @@ -13,6 +13,7 @@ on: - 'inference/**' - 'rlhf/**' - 'tools/**' + - 'pyproject.toml' jobs: diff --git a/.github/workflows/workflow_orders_on_pr.yml b/.github/workflows/workflow_orders_on_pr.yml index e13bccecf..2c8f93f3d 100644 --- a/.github/workflows/workflow_orders_on_pr.yml +++ b/.github/workflows/workflow_orders_on_pr.yml @@ -13,6 +13,7 @@ on: - 'inference/**' - 'rlhf/**' - 'tools/**' + - 'pyproject.toml' jobs: From 019ba9ae82136cd0103daa91260514e3d0f7d571 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 21 Dec 2023 15:08:53 +0800 Subject: [PATCH 08/14] Bump paramiko from 3.2.0 to 3.4.0 (#7) Bumps [paramiko](https://github.com/paramiko/paramiko) from 3.2.0 to 3.4.0. - [Commits](https://github.com/paramiko/paramiko/compare/3.2.0...3.4.0) --- updated-dependencies: - dependency-name: paramiko dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 01d5160cc..47dccaec7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "deltatuner==1.1.9", "py-cpuinfo", "pydantic-yaml", - "paramiko==3.2.0", + "paramiko==3.4.0", ] [project.optional-dependencies] From 6ab749f044c383888b8309b4e0f638a6c7935efc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 21 Dec 2023 16:04:10 +0800 Subject: [PATCH 09/14] Bump transformers from 4.31.0 to 4.36.0 in /pretrain (#8) Bumps [transformers](https://github.com/huggingface/transformers) from 4.31.0 to 4.36.0. - [Release notes](https://github.com/huggingface/transformers/releases) - [Commits](https://github.com/huggingface/transformers/compare/v4.31.0...v4.36.0) --- updated-dependencies: - dependency-name: transformers dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pretrain/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain/requirements.txt b/pretrain/requirements.txt index fa0d041cd..e25a3e55b 100644 --- a/pretrain/requirements.txt +++ b/pretrain/requirements.txt @@ -4,7 +4,7 @@ numpy==1.24.4 https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl torchvision==0.14.1 torch==1.13.1 -transformers==4.31.0 +transformers==4.36.0 typing==3.7.4.3 tabulate ray[tune] From c8e46c6b8e35bd52a5a9ce9dfa0ee8d4e34add3b Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Wed, 27 Dec 2023 18:37:00 +0800 Subject: [PATCH 10/14] fixed some merge error Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 2 +- .github/workflows/workflow_inference.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 4749750e4..106e9118b 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Load environment variables - run: cat /root/actions-runner-config/.env >> $GITHUB_ENV + run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV - name: Build Docker Image run: | diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index eb3d978b9..5c3f0f5f6 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -16,7 +16,6 @@ jobs: name: inference test strategy: matrix: - # for mistral-7b-v0.1, we use bigdl-cpu to verify model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1 ] isPR: - ${{inputs.ci_type == 'pr'}} From 2730576700948fea8452a52c0e0ed2ac4a73bcd6 Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Wed, 27 Dec 2023 21:15:22 +0800 Subject: [PATCH 11/14] fixed some merge error Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 106e9118b..df236c76b 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Load environment variables - run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV + run: cat ${{ ACTIONS_RUNNER_CONFIG_PATH }}/.env >> $GITHUB_ENV - name: Build Docker Image run: | From 8db74ec6b3c9c47e242f721f840821b46cb490e8 Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Wed, 27 Dec 2023 21:17:46 +0800 Subject: [PATCH 12/14] fixed some merge error Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index df236c76b..261421c42 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Load environment variables - run: cat ${{ ACTIONS_RUNNER_CONFIG_PATH }}/.env >> $GITHUB_ENV + run: cat ${{ vars.ACTIONS_RUNNER_CONFIG_PATH }}/.env >> $GITHUB_ENV - name: Build Docker Image run: | From 91df9e9733d92089abeab443779a433d4821065a Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Wed, 27 Dec 2023 21:20:21 +0800 
Subject: [PATCH 13/14] fixed some merge error Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 261421c42..4749750e4 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Load environment variables - run: cat ${{ vars.ACTIONS_RUNNER_CONFIG_PATH }}/.env >> $GITHUB_ENV + run: cat /root/actions-runner-config/.env >> $GITHUB_ENV - name: Build Docker Image run: | From 7a2b54ac1fdba6bb7145420f45ea3bee857e5d52 Mon Sep 17 00:00:00 2001 From: harborn Date: Thu, 28 Dec 2023 10:05:45 +0800 Subject: [PATCH 14/14] remove nightly ray, and use newest release version (#15) * remove nightly ray, and use newest release version * update * update * update --- common/trainer/default_trainer.py | 6 ++++-- finetune/finetune.py | 3 +++ pyproject.toml | 6 +++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index d013f28e2..f3aa965b9 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -57,8 +57,10 @@ def recovery(self, config): self.starting_epoch = checkpoint_epoch["epoch"] + 1 logger.info(f"recovery to epoch {self.starting_epoch}") + except FileNotFoundError as e: + logger.info(e) except Exception as e: - logger.warning(f"recovery error", exc_info=True) + logger.warning("recovery error", exc_info=True) def _coordinate(self, accelerator): self.accelerator = accelerator @@ -174,7 +176,7 @@ def train(self): except OverflowError: eval_loss = float("inf") perplexity = float("inf") - logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss}]\tppl:[{perplexity}]\ttime:[{time.time()-start}]") + logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]") if checkpoint is not None: self.save(checkpoint, idx) diff --git a/finetune/finetune.py b/finetune/finetune.py index 088ef89f1..430c452de 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -108,6 +108,7 @@ def train_func(config: Dict[str, Any]): trainer = common.trainer.Trainer.registory.get("DefaultTrainer")(config = { "num_train_epochs": config["Training"]["epochs"], "max_train_step": config["Training"].get("max_train_steps", None), + "log_step": 1, "output": config["General"]["output_dir"], "dataprocesser": { "type": "GeneralProcesser", @@ -200,6 +201,8 @@ def main(external_config = None): ray.init(runtime_env = runtime_env) + common.logger.info(f"ray available resources = {ray.available_resources()}") + scaling_config = ScalingConfig( num_workers = num_training_workers, use_gpu = use_gpu, diff --git a/pyproject.toml b/pyproject.toml index 47dccaec7..d5c4396c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "accelerate", "datasets>=2.14.6", "numpy", - "ray @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl", + "ray>=2.9", "typing>=3.7.4.3", "tabulate", "ray[tune]", @@ -52,8 +52,8 @@ gpu = [ "torch==2.0.1a0", "torchvision==0.15.2a0", "intel-extension-for-pytorch==2.0.110+xpu", - "oneccl_bind_pt", - "dpctl" + "oneccl_bind_pt==2.0.100+gpu", + "dpctl==0.14.5" ] deepspeed = [
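Note on the trainer change in PATCH 14 above: the reformatted eval log assumes `eval_loss` and `perplexity` are plain floats, with perplexity presumably computed as `exp(eval_loss)` inside the existing `try/except OverflowError` guard. A standalone sketch of that pattern, with illustrative numbers:

```
import math
import time

start = time.time()
eval_loss = 2.31  # illustrative; in the trainer this comes from the eval loop

try:
    perplexity = math.exp(eval_loss)
except OverflowError:
    eval_loss = float("inf")
    perplexity = float("inf")

idx, num_train_epochs = 1, 3
print(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\t"
      f"ppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]")
```

With the `:.6f` format specifiers, an overflowed loss still prints cleanly as `inf`, which is why the guard sets both values to infinity rather than skipping the log line.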