From b6c28ff4dd4ad0002ea4abdcd85cc097d4d226fb Mon Sep 17 00:00:00 2001 From: Robert Dower Date: Sun, 12 Nov 2023 21:08:24 -0800 Subject: [PATCH 01/14] add code_of_conduct, contributing agreement, and security.md file --- CODE_OF_CONDUCT.md | 131 +++++++++++++++++++++++++++++++++++++++++++++ CONTRIBUTING.md | 57 ++++++++++++++++++++ security.md | 5 ++ 3 files changed, 193 insertions(+) create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 security.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..58dba18db --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,131 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +CommunityCodeOfConduct AT intel DOT com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. 
+ +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..f682f4e4c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,57 @@ +# Contributing + +### License + + is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. + +### Sign your work + +Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify +the below (from [developercertificate.org](http://developercertificate.org/)): + +``` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 
+660 York Street, Suite 102, +San Francisco, CA 94110 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +Then you just add a line to every git commit message: + + Signed-off-by: Joe Smith + +Use your real name (sorry, no pseudonyms or anonymous contributions.) + +If you set your `user.name` and `user.email` git configs, you can sign your +commit automatically with `git commit -s`. diff --git a/security.md b/security.md new file mode 100644 index 000000000..cb59eb893 --- /dev/null +++ b/security.md @@ -0,0 +1,5 @@ +# Security Policy +Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. + +## Reporting a Vulnerability +Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). From d17a473f1f56ff69c83de4f7710c2a10848b720f Mon Sep 17 00:00:00 2001 From: Carson Wang Date: Thu, 7 Dec 2023 19:30:23 +0800 Subject: [PATCH 02/14] Remove duplicate Security.md --- Security.md | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 Security.md diff --git a/Security.md b/Security.md deleted file mode 100644 index d85d4358b..000000000 --- a/Security.md +++ /dev/null @@ -1,5 +0,0 @@ -# Security Policy -Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. - -## Reporting a Vulnerability -Please report any security vulnerabilities in this project [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 
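The CONTRIBUTING.md added above asks for a `Signed-off-by` trailer on every commit and notes that `git commit -s` can add it automatically once `user.name` and `user.email` are set. A minimal sketch of that workflow, assuming placeholder identity values (substitute your real name and email):

```bash
# One-time identity setup; the name and address below are placeholders.
git config --global user.name "Joe Smith"
git config --global user.email "joe.smith@example.com"

# -s / --signoff appends "Signed-off-by: Joe Smith <joe.smith@example.com>"
# to the commit message automatically.
git commit -s -m "Describe your change here"

# Verify the trailer is present before pushing.
git log -1 --format=%B
```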
From cbb9898b34a81ab9e5814e92df94dde8ac9dd959 Mon Sep 17 00:00:00 2001 From: Carson Wang Date: Thu, 7 Dec 2023 19:43:36 +0800 Subject: [PATCH 03/14] Update image link in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 55198574f..8e38da957 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ LLM-on-Ray's modular workflow structure is designed to comprehensively cater to * **Interactive Web UI for Enhanced Usability**: Except for command line, LLM-on-Ray introduces a Web UI, allowing users to easily finetune and deploy LLMs through a user-friendly interface. Additionally, the UI includes a chatbot application, enabling users to immediately test and refine the models. -![image](https://github.com/intel-sandbox/llm-ray/assets/9278199/addd7a7f-83ef-43ae-b3ac-dd81cc2570e4) +![llm-on-ray](https://github.com/intel/llm-on-ray/assets/9278199/68017c14-c0be-4b91-8d71-4b74ab89bd81) ## Getting Started From 83cb052ee9cc032139d6349db845cfbc37d1bc22 Mon Sep 17 00:00:00 2001 From: Carson Wang Date: Fri, 15 Dec 2023 13:59:24 +0800 Subject: [PATCH 04/14] Update web_ui.md Signed-off-by: Carson Wang --- docs/web_ui.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/web_ui.md b/docs/web_ui.md index 3b0578643..da92b7f66 100644 --- a/docs/web_ui.md +++ b/docs/web_ui.md @@ -3,7 +3,7 @@ LLM-on-Ray introduces a Web UI, allowing users to easily finetune and deploy LLMs through a user-friendly interface. Additionally, the UI includes a chatbot application, enabling users to immediately test and refine the models. ## Setup -Please follow [docs/setup.md](docs/setup.md) to setup the environment first. +Please follow [setup.md](setup.md) to setup the environment first. ## Start Web UI @@ -17,17 +17,19 @@ python -u inference/start_ui.py --node_user_name $user --conda_env_name $conda_e ## Finetune LLMs On the `Finetune` tab, you can configure the base model, finetuning parameters, the dataset path and the new model name. Click `Start To Finetune` to start finetuning. -![image](https://github.com/carsonwang/llm-ray/assets/9278199/38cb6f1f-b5de-495e-a4db-741eb1e15980) +![webui1](https://github.com/intel/llm-on-ray/assets/9278199/895be765-13d3-455e-a00d-c9ba67ac6781) + ## Deploy and Serve LLM On the `Deployment` tab, you can choose a model to deploy, configure parameter `Model Replica Number`, `Cpus per Worker` and `Gpus per Worker`. Click `Deploy` and you will get a model endpoint. -![image](https://github.com/carsonwang/llm-ray/assets/9278199/937613ad-951c-4543-9e2d-e5b8e7f38d1b) +![webui2](https://github.com/intel/llm-on-ray/assets/9278199/2a1fb8f2-a2a8-4868-9d1c-418c5c2a6180) + ## Chatbot On the `Inferenc` tab, you can now test the model by asking questions. 
-![image](https://github.com/carsonwang/llm-ray/assets/9278199/5aa3dace-238a-4b34-9ce2-b3abbd6de2ba) +![webui3](https://github.com/intel/llm-on-ray/assets/9278199/f7b9dc79-92fe-4e75-85fa-2cf7f36bb58d) From c5738a1e2dd3bf9bf9a04c01e7cbd4b7dd66159a Mon Sep 17 00:00:00 2001 From: jiafu zhang Date: Wed, 20 Dec 2023 07:55:32 +0000 Subject: [PATCH 05/14] Setup ci (#6) * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang * verify Signed-off-by: Jiafu Zhang --------- Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 30 ++++++++++++------- .github/workflows/workflow_finetune_gpu.yml | 19 +++++++++--- .github/workflows/workflow_inference.yml | 24 ++++++++++++--- .../workflows/workflow_orders on_merge.yml | 23 ++++++++++++++ .github/workflows/workflow_orders_nightly.yml | 6 ++-- ...w_orders.yml => workflow_orders_on_pr.yml} | 0 dev/docker/Dockerfile.bigdl-cpu | 9 +++--- dev/docker/Dockerfile.cpu_and_deepspeed | 9 +++--- 8 files changed, 91 insertions(+), 29 deletions(-) create mode 100644 .github/workflows/workflow_orders on_merge.yml rename .github/workflows/{workflow_orders.yml => workflow_orders_on_pr.yml} (100%) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 1aaaf4df5..bb7c99326 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -19,7 +19,7 @@ jobs: model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b-chat, huggyllama/llama-7b ] isPR: - ${{inputs.ci_type == 'pr'}} - + exclude: - { isPR: true } include: @@ -27,22 +27,36 @@ jobs: - { model: "meta-llama/Llama-2-7b-chat-hf"} runs-on: self-hosted + + defaults: + run: + shell: bash + container: + image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }} + env: + http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }} + https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ${{ vars.ACTIONS_RUNNER_CONFIG_PATH }}:/root/actions-runner-config + steps: - name: Checkout uses: actions/checkout@v2 - name: Load environment variables - run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV + run: cat /root/actions-runner-config/.env >> $GITHUB_ENV - name: Build Docker Image - run: docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_IMAGE_BUILD }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_IMAGE_BUILD }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest && yes | docker container prune && yes | docker image prune + run: | + docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest && yes | docker container prune && yes + docker image prune -f - name: Start Docker Container run: | cid=$(docker ps -q --filter 
"name=finetune") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi - docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/llm-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER_RUN }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER_RUN }} --name="finetune" --hostname="finetune-container" finetune:latest - + docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="finetune" --hostname="finetune-container" finetune:latest - name: Run Finetune Test run: | docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external" @@ -76,7 +90,6 @@ jobs: ) docker exec "finetune" python -c "$CMD" docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" - - name: Run PEFT-LoRA Test run: | docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*" @@ -96,7 +109,6 @@ jobs: ) docker exec "finetune" python -c "$CMD" docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" - - name: Run Deltatuner Test on DENAS-LoRA Model run: | if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf)$ ]]; then @@ -126,7 +138,6 @@ jobs: docker exec "finetune" python -c "$CMD" docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" fi - - name: Stop Ray run: | cid=$(docker ps -q --filter "name=finetune") @@ -139,6 +150,5 @@ jobs: run: | cid=$(docker ps -q --filter "name=finetune") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi - - name: Test Summary - run: echo "to be continued" \ No newline at end of file + run: echo "to be continued" diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml index e3adb7923..f18e4eaf5 100644 --- a/.github/workflows/workflow_finetune_gpu.yml +++ b/.github/workflows/workflow_finetune_gpu.yml @@ -10,16 +10,27 @@ jobs: matrix: model: [ pythia-6.9b, gpt-j-6b ] runs-on: self-hosted + + defaults: + run: + shell: bash + container: + image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }} + env: + http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }} + https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + steps: - name: Checkout uses: actions/checkout@v2 - name: Running task on Intel GPU run: | - rm ~/borealis-runner/llm-ray.tar.gz -f - tar zcf ~/borealis-runner/llm-ray.tar.gz -C ~/actions-runner/_work/llm-ray . + rm ~/borealis-runner/llm-on-ray.tar.gz -f + tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray . 
cd ~/borealis-runner/ python3 finetune_on_pvc.py --base_model "${{ matrix.model }}" - - name: Test Summary - run: echo "to be continued" + run: echo "to be continued" \ No newline at end of file diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index 4662ee5eb..d4e8d21b6 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -32,10 +32,22 @@ jobs: model: mpt-7b runs-on: self-hosted + + defaults: + run: + shell: bash + container: + image: ${{ vars.ACTIONS_RUNNER_CONTAINER_IMAGE }} + env: + http_proxy: ${{ vars.HTTP_PROXY_CONTAINER }} + https_proxy: ${{ vars.HTTPS_PROXY_CONTAINER }} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + steps: - name: Checkout uses: actions/checkout@v2 - + - name: Set Name Prefix id: "prefix" run: | @@ -54,14 +66,15 @@ jobs: DF_SUFFIX=".cpu_and_deepspeed" fi PREFIX=${{steps.prefix.outputs.prefix}} - docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_IMAGE_BUILD }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_IMAGE_BUILD }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes | docker image prune + docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} --build-arg https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${PREFIX}:latest && yes | docker container prune && yes + docker image prune -f - name: Start Docker Container run: | PREFIX=${{steps.prefix.outputs.prefix}} cid=$(docker ps -q --filter "name=${PREFIX}") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi - docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/llm-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER_RUN }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER_RUN }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest + docker run -tid -v ${{ vars.MODEL_CACHE_PATH }}:/root/.cache/huggingface/hub -v ${{ vars.CODE_CHECKOUT_PATH }}:/root/llm-on-ray -e http_proxy=${{ vars.HTTP_PROXY_CONTAINER }} -e https_proxy=${{ vars.HTTPS_PROXY_CONTAINER }} --name="${PREFIX}" --hostname="${PREFIX}-container" ${PREFIX}:latest - name: Start Ray Cluster run: | @@ -126,4 +139,7 @@ jobs: if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid; fi - name: Test Summary - run: echo "to be continued" \ No newline at end of file + run: echo "to be continued" + + + \ No newline at end of file diff --git a/.github/workflows/workflow_orders on_merge.yml b/.github/workflows/workflow_orders on_merge.yml new file mode 100644 index 000000000..e453f242b --- /dev/null +++ b/.github/workflows/workflow_orders on_merge.yml @@ -0,0 +1,23 @@ +name: llm-ray inference & finetune + +on: + push: + branches: + - main + paths: + - '.github/**' + - 'docker/**' + - 'common/**' + - 'dev/docker/**' + - 'finetune/**' + - 'inference/**' + - 'rlhf/**' + - 'tools/**' + +jobs: + + call-inference: + uses: ./.github/workflows/workflow_inference.yml + + call-finetune: + uses: ./.github/workflows/workflow_finetune.yml diff --git a/.github/workflows/workflow_orders_nightly.yml b/.github/workflows/workflow_orders_nightly.yml index 2ba24db1a..9ee0fd202 100644 --- a/.github/workflows/workflow_orders_nightly.yml +++ b/.github/workflows/workflow_orders_nightly.yml @@ -1,8 +1,8 @@ -name: llm-ray inference & finetune +name: llm-ray inference & finetune nightly on: schedule: - - cron: "0 21 * * *" + - cron: "0 16 * * *" jobs: @@ -17,4 +17,4 @@ jobs: ci_type: nightly call-finetune-on-intel-gpu: - uses: ./.github/workflows/workflow_finetune_gpu.yml + uses: ./.github/workflows/workflow_finetune_gpu.yml \ No newline at end of file diff --git a/.github/workflows/workflow_orders.yml b/.github/workflows/workflow_orders_on_pr.yml similarity index 100% rename from .github/workflows/workflow_orders.yml rename to .github/workflows/workflow_orders_on_pr.yml diff --git a/dev/docker/Dockerfile.bigdl-cpu b/dev/docker/Dockerfile.bigdl-cpu index 449a456b4..403848876 100644 --- a/dev/docker/Dockerfile.bigdl-cpu +++ b/dev/docker/Dockerfile.bigdl-cpu @@ -1,10 +1,11 @@ +# syntax=docker/dockerfile:1 FROM ubuntu:22.04 ENV LANG C.UTF-8 -WORKDIR /root/llm-ray +WORKDIR /root/llm-on-ray -RUN apt-get update -y \ +RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \ && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -17,7 +18,7 @@ ENV PATH $CONDA_DIR/bin:$PATH # setup env SHELL ["/bin/bash", "--login", "-c"] -RUN conda init bash && \ +RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ unset -f conda && \ export PATH=$CONDA_DIR/bin/:${PATH} && \ conda config --add channels intel && \ @@ -27,7 +28,7 @@ COPY ./pyproject.toml . 
RUN mkdir ./finetune && mkdir ./inference -RUN pip install -e .[bigdl-cpu] -f https://developer.intel.com/ipex-whl-stable-cpu \ +RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[bigdl-cpu] -f https://developer.intel.com/ipex-whl-stable-cpu \ -f https://download.pytorch.org/whl/torch_stable.html # Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s) diff --git a/dev/docker/Dockerfile.cpu_and_deepspeed b/dev/docker/Dockerfile.cpu_and_deepspeed index a84ed7bdc..c907d775f 100644 --- a/dev/docker/Dockerfile.cpu_and_deepspeed +++ b/dev/docker/Dockerfile.cpu_and_deepspeed @@ -1,10 +1,11 @@ +# syntax=docker/dockerfile:1 FROM ubuntu:22.04 ENV LANG C.UTF-8 -WORKDIR /root/llm-ray +WORKDIR /root/llm-on-ray -RUN apt-get update -y \ +RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \ && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -17,7 +18,7 @@ ENV PATH $CONDA_DIR/bin:$PATH # setup env SHELL ["/bin/bash", "--login", "-c"] -RUN conda init bash && \ +RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ unset -f conda && \ export PATH=$CONDA_DIR/bin/:${PATH} && \ conda config --add channels intel && \ @@ -27,7 +28,7 @@ COPY ./pyproject.toml . RUN mkdir ./finetune && mkdir ./inference -RUN pip install -e .[cpu,deepspeed] -f https://developer.intel.com/ipex-whl-stable-cpu \ +RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu,deepspeed] -f https://developer.intel.com/ipex-whl-stable-cpu \ -f https://download.pytorch.org/whl/torch_stable.html RUN ds_report From 3498b4df3dd3888a46467dee19ed48f507b30ffc Mon Sep 17 00:00:00 2001 From: jiafu zhang Date: Thu, 21 Dec 2023 06:05:27 +0000 Subject: [PATCH 06/14] Sync with internal (cherry-picked from 644488 to 25118e3) (#9) * merged [common] unified conf to yaml Signed-off-by: Jiafu Zhang * reconstruct config by moving ipex and precision to ipex struct (#168) * reconstruct config by moving ipex and precision to ipex struct Signed-off-by: Jiafu Zhang * reconstruct config by moving ipex and precision to ipex struct Signed-off-by: Jiafu Zhang --------- Signed-off-by: Jiafu Zhang * [Inference] Add Neural-chat inference support (#149) * add neural chat inference * change transformers from 4.31 to 4.35 * update prompt * nit * trigger ci * remove from ci * add auth token to all models * revert * merged [Inference] Add Neural-chat inference support Signed-off-by: Jiafu Zhang * remove llama-2-7b in inferencce ci since ipex failed to optimize it Signed-off-by: Jiafu Zhang * Add the HFTonkenizer patch for Model-References (#169) Add the HFTokenizer patch Add the pretrain_module to invoke different pretrain module Signed-off-by: yuanwu * Abstract common features into Predictor (#166) * fix bug of precess config; use tokenizer.__call__ Signed-off-by: Zhi Lin * init Signed-off-by: Zhi Lin * create utils & move tokenizer to predictor Signed-off-by: Zhi Lin * fix Signed-off-by: Zhi Lin * change inferCfg to infer_conf; simplify code Signed-off-by: Zhi Lin * fix Signed-off-by: Zhi Lin * replace inferenceConfig with infer_conf Signed-off-by: Zhi Lin * fix deepspeed Signed-off-by: Zhi Lin * further simplify Signed-off-by: Zhi Lin * move actor_options to utils Signed-off-by: Zhi Lin * fix Signed-off-by: Zhi Lin * remove input len Signed-off-by: Zhi Lin * remove input len follow-up Signed-off-by: Zhi Lin --------- Signed-off-by: Zhi Lin * Update the Dockerfile.optimum.habana (#184) Updathe the dockerfile Fix the 
HABANA_VISIBLE_MODULES envs issue Signed-off-by: yuanwu * Update the Pretrain ReadME (#186) Signed-off-by: yuanwu * Move the dockerfiles of pretrain into pretrain/docker (#187) Delete the useless dp dockerfile Change the nvidia GPU dockerfile name, because it use the same dockerfile for both megatron-deepspeed and huggingface trainer refactor the folder path of pretrain Signed-off-by: yuanwu * renamed workflow_orders to workflow_orders_on_pr Signed-off-by: Jiafu Zhang --------- Signed-off-by: Jiafu Zhang Signed-off-by: yuanwu Signed-off-by: Zhi Lin Co-authored-by: harborn Co-authored-by: Yizhong Zhang Co-authored-by: yuanwu2017 Co-authored-by: Zhi Lin --- .github/workflows/config/mpt_deltatuner.yaml | 5 +- .../config/mpt_deltatuner_deepspeed.yaml | 5 +- .../update_finetune_config_on_intel_gpu.py | 7 +- .../config/update_inference_config.py | 27 +++ .github/workflows/workflow_finetune.yml | 31 +-- .github/workflows/workflow_inference.yml | 5 +- ...merge.yml => workflow_orders_on_merge.yml} | 0 README.md | 2 +- common/config.py | 19 +- dev/scripts/head_node_monitor.sh | 2 +- docs/pretrain.md | 89 ++++++-- docs/rlhf.md | 6 +- .../finetune/dolly1/dolly_1_finetune.conf | 35 --- .../finetune/dolly1/dolly_1_finetune.yaml | 29 +++ .../finetune/dolly2/dolly_2_finetune.conf | 35 --- .../finetune/dolly2/dolly_2_finetune.yaml | 29 +++ .../finetune/gpt_j_6b/finetune_intel_gpu.conf | 41 ---- .../finetune/gpt_j_6b/finetune_intel_gpu.yaml | 30 +++ .../open_assistant_finetune.conf | 35 --- .../open_assistant_finetune.yaml | 29 +++ finetune/finetune.conf | 40 ---- finetune/finetune.py | 24 +- finetune/finetune.yaml | 29 +++ finetune/finetune_config.py | 93 ++++++++ finetune/models/bloom-560m.yaml | 29 +++ finetune/models/finetune_config_template.yaml | 29 +++ finetune/models/gpt-j-6b.yaml | 29 +++ finetune/models/gpt2.yaml | 29 +++ finetune/models/llama-2-7b-chat-hf.yaml | 29 +++ finetune/models/llama-7b.yaml | 29 +++ finetune/models/mistral-7b-v0.1.yaml | 38 ++++ finetune/models/mpt-7b-chat.yaml | 29 +++ finetune/models/opt-125m.yaml | 29 +++ inference/deepspeed_predictor.py | 122 +++++++---- inference/inference_config.py | 31 ++- .../models/bigdl/mistral-7b-v0.1-bigdl.yaml | 5 +- inference/models/bigdl/mpt-7b-bigdl.yaml | 5 +- inference/models/bloom-560m.yaml | 5 +- inference/models/gpt-j-6b.yaml | 6 +- inference/models/gpt2.yaml | 5 +- inference/models/llama-2-7b-chat-hf.yaml | 5 +- inference/models/mistral-7b-v0.1.yaml | 5 +- inference/models/mpt-7b.yaml | 5 +- inference/models/neural-chat-7b-v3-1.yaml | 24 ++ inference/models/opt-125m.yaml | 5 +- .../template/inference_config_template.yaml | 5 +- inference/predictor.py | 129 +++++++---- inference/run_model_batch_predict.py | 5 - inference/run_model_infer.py | 2 +- inference/run_model_serve.py | 207 ++++-------------- inference/transformer_predictor.py | 52 +++-- inference/utils.py | 76 +++++++ ...ain.conf => llama_7b_8Guadi_pretrain.conf} | 35 +-- ...train.conf => llama_7b_8gpu_pretrain.conf} | 1 + ...egatron_deepspeed_zs0_8Gaudi_pretrain.conf | 17 +- .../docker}/Dockerfile.megatron.habana | 4 +- .../docker/Dockerfile.nvidia | 0 pretrain/docker/Dockerfile.optimum.habana | 12 + .../docker}/build-image.sh | 6 +- pretrain/megatron_deepspeed_pretrain.py | 32 ++- .../0001-Add-the-Huggingface-tokenizer.patch | 145 ++++++++++++ pretrain/plugin/megatron_dataset.py | 4 +- pretrain/plugin/megtron_initializer.py | 2 +- pretrain/pretrain.py | 5 +- pretrain/requirements.optimum-habana.txt | 22 ++ pyproject.toml | 4 +- rlhf/ppo.conf | 25 --- rlhf/ppo.yaml | 18 ++ 
rlhf/reward.conf | 24 -- rlhf/reward.yaml | 18 ++ tools/workload_in_containers/Dockerfile.dp | 34 --- .../Dockerfile.optimum.habana | 23 -- 72 files changed, 1353 insertions(+), 694 deletions(-) create mode 100644 .github/workflows/config/update_inference_config.py rename .github/workflows/{workflow_orders on_merge.yml => workflow_orders_on_merge.yml} (100%) delete mode 100644 examples/finetune/dolly1/dolly_1_finetune.conf create mode 100644 examples/finetune/dolly1/dolly_1_finetune.yaml delete mode 100644 examples/finetune/dolly2/dolly_2_finetune.conf create mode 100644 examples/finetune/dolly2/dolly_2_finetune.yaml delete mode 100644 examples/finetune/gpt_j_6b/finetune_intel_gpu.conf create mode 100644 examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml delete mode 100644 examples/finetune/open_assistant/open_assistant_finetune.conf create mode 100644 examples/finetune/open_assistant/open_assistant_finetune.yaml delete mode 100644 finetune/finetune.conf create mode 100644 finetune/finetune.yaml create mode 100644 finetune/finetune_config.py create mode 100644 finetune/models/bloom-560m.yaml create mode 100644 finetune/models/finetune_config_template.yaml create mode 100644 finetune/models/gpt-j-6b.yaml create mode 100644 finetune/models/gpt2.yaml create mode 100644 finetune/models/llama-2-7b-chat-hf.yaml create mode 100644 finetune/models/llama-7b.yaml create mode 100644 finetune/models/mistral-7b-v0.1.yaml create mode 100644 finetune/models/mpt-7b-chat.yaml create mode 100644 finetune/models/opt-125m.yaml create mode 100644 inference/models/neural-chat-7b-v3-1.yaml create mode 100644 inference/utils.py rename pretrain/config/{llama2_7b_8Guadi_pretrain.conf => llama_7b_8Guadi_pretrain.conf} (80%) rename pretrain/config/{llama2_7b_8gpu_pretrain.conf => llama_7b_8gpu_pretrain.conf} (98%) rename {tools/workload_in_containers => pretrain/docker}/Dockerfile.megatron.habana (83%) rename tools/workload_in_containers/Dockerfile.megatron.gpu => pretrain/docker/Dockerfile.nvidia (100%) create mode 100644 pretrain/docker/Dockerfile.optimum.habana rename {tools/workload_in_containers => pretrain/docker}/build-image.sh (75%) create mode 100644 pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch create mode 100644 pretrain/requirements.optimum-habana.txt delete mode 100644 rlhf/ppo.conf create mode 100644 rlhf/ppo.yaml delete mode 100644 rlhf/reward.conf create mode 100644 rlhf/reward.yaml delete mode 100755 tools/workload_in_containers/Dockerfile.dp delete mode 100644 tools/workload_in_containers/Dockerfile.optimum.habana diff --git a/.github/workflows/config/mpt_deltatuner.yaml b/.github/workflows/config/mpt_deltatuner.yaml index 7a801045f..d9a41398d 100644 --- a/.github/workflows/config/mpt_deltatuner.yaml +++ b/.github/workflows/config/mpt_deltatuner.yaml @@ -1,13 +1,14 @@ port: 8000 name: mpt-7b route_prefix: /mpt-7b -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b diff --git a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml index c0aca37f8..227f79cc1 100644 --- a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml +++ b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml @@ -1,13 +1,14 @@ port: 8000 name: mpt-7b route_prefix: /mpt-7b -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: true 
workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: false + precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b diff --git a/.github/workflows/config/update_finetune_config_on_intel_gpu.py b/.github/workflows/config/update_finetune_config_on_intel_gpu.py index fd1f579eb..f0e2a715e 100644 --- a/.github/workflows/config/update_finetune_config_on_intel_gpu.py +++ b/.github/workflows/config/update_finetune_config_on_intel_gpu.py @@ -1,10 +1,11 @@ +import yaml import argparse def update_finetune_config(base_model): - conf_file = "finetune/finetune.conf" + conf_file = "finetune/finetune.yaml" with open(conf_file) as f: - config = eval(f.read()) + config = yaml.load(f, Loader=yaml.FullLoader) # due to compute node can't connect network # base models are downloaded as local files in directory ~/models/ # avaiable base models are: @@ -29,7 +30,7 @@ def update_finetune_config(base_model): config["Training"]["accelerate_mode"] = "GPU_DDP" with open(conf_file, "w") as f: - f.write(str(config)) + yaml.dump(config, f, sort_keys=False) def get_parser(): diff --git a/.github/workflows/config/update_inference_config.py b/.github/workflows/config/update_inference_config.py new file mode 100644 index 000000000..c1c700cdd --- /dev/null +++ b/.github/workflows/config/update_inference_config.py @@ -0,0 +1,27 @@ +import yaml +import argparse + + +def update_inference_config(config_file: str, output_file: str, deepspeed: bool, ipex: bool): + with open(config_file) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + config["deepspeed"] = deepspeed + config["ipex"]["enabled"] = ipex + + with open(output_file, "w") as f: + yaml.dump(config, f, sort_keys=False) + + +def get_parser(): + parser = argparse.ArgumentParser(description="Adjust Inference Config File") + parser.add_argument("--config_file", type=str, required=True) + parser.add_argument("--output_file", type=str, required=True) + parser.add_argument("--deepspeed", action='store_true') + parser.add_argument("--ipex", action='store_true') + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + update_inference_config(args.config_file, args.output_file, args.deepspeed, args.ipex) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index bb7c99326..4749750e4 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -61,9 +61,10 @@ jobs: run: | docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external" CMD=$(cat << EOF - conf_path = "finetune/finetune.conf" + import yaml + conf_path = "finetune/finetune.yaml" with open(conf_path, encoding="utf-8") as reader: - result = eval(reader.read()) + result = yaml.load(reader, Loader=yaml.FullLoader) result['General']['base_model'] = "${{ matrix.model }}" if "${{ matrix.model }}" == "mosaicml/mpt-7b-chat": result['General']['config']['trust_remote_code'] = True @@ -85,18 +86,20 @@ jobs: result['Training']['num_training_workers'] = 1 result['General']['lora_config'] = None with open(conf_path, 'w') as output: - print(result, file=output) + yaml.dump(result, output, sort_keys=False) EOF ) docker exec "finetune" python -c 
"$CMD" - docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" + docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml" + - name: Run PEFT-LoRA Test run: | docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*" CMD=$(cat << EOF - conf_path = "finetune/finetune.conf" + import yaml + conf_path = "finetune/finetune.yaml" with open(conf_path, encoding="utf-8") as reader: - result = eval(reader.read()) + result = yaml.load(reader, Loader=yaml.FullLoader) result['General']['lora_config'] = { "task_type": "CAUSAL_LM", "r": 8, @@ -104,11 +107,12 @@ jobs: "lora_dropout": 0.1 } with open(conf_path, 'w') as output: - print(result, file=output) + yaml.dump(result, output, sort_keys=False) EOF ) docker exec "finetune" python -c "$CMD" - docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" + docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml" + - name: Run Deltatuner Test on DENAS-LoRA Model run: | if [[ ${{ matrix.model }} =~ ^(mosaicml\/mpt-7b-chat|huggyllama\/llama-7b|meta-llama\/Llama-2-7b-chat-hf)$ ]]; then @@ -117,10 +121,11 @@ jobs: docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*" CMD=$(cat << EOF import os + import yaml os.system("cp -r $(python -m pip show deltatuner | grep Location | cut -d: -f2)/deltatuner/conf/best_structure examples/") - conf_path = "finetune/finetune.conf" + conf_path = "finetune/finetune.yaml" with open(conf_path, encoding="utf-8") as reader: - result = eval(reader.read()) + result = yaml.load(reader, Loader=yaml.FullLoader) result['General']['lora_config'] = { "task_type": "CAUSAL_LM", "r": 8, @@ -133,10 +138,10 @@ jobs: "best_model_structure": f"examples/best_structure/${{ matrix.model }}-best_structure.jsonl", } with open(conf_path, 'w') as output: - print(result, file=output) + yaml.dump(result, output, sort_keys=False) EOF) docker exec "finetune" python -c "$CMD" - docker exec "finetune" bash -c "python finetune/finetune.py --config_path finetune/finetune.conf" + docker exec "finetune" bash -c "python finetune/finetune.py --config_file finetune/finetune.yaml" fi - name: Stop Ray run: | @@ -144,7 +149,7 @@ jobs: if [[ ! -z "$cid" ]]; then docker exec "finetune" bash -c "ray stop" fi - + - name: Stop Container if: success() || failure() run: | diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index d4e8d21b6..eb3d978b9 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -17,7 +17,7 @@ jobs: strategy: matrix: # for mistral-7b-v0.1, we use bigdl-cpu to verify - model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl ] + model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1 ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -106,7 +106,8 @@ jobs: if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then echo ${{ matrix.model }} is not supported! 
else - docker exec "${PREFIX}" bash -c "KEEP_SERVE_TERMINAL='false' MODEL_TO_SERVE=\"${{ matrix.model }}\" python inference/run_model_serve.py --deepspeed" + docker exec "${PREFIX}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed" + docker exec "${PREFIX}" bash -c "KEEP_SERVE_TERMINAL='false' python inference/run_model_serve.py --config_file \"${{ matrix.model }}\".yaml.deepspeed" docker exec "${PREFIX}" bash -c "python inference/run_model_infer.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}" docker exec "${PREFIX}" bash -c "python inference/run_model_infer.py --num_iter 1 --model_endpoint http://127.0.0.1:8000/${{ matrix.model }} --streaming_response" fi diff --git a/.github/workflows/workflow_orders on_merge.yml b/.github/workflows/workflow_orders_on_merge.yml similarity index 100% rename from .github/workflows/workflow_orders on_merge.yml rename to .github/workflows/workflow_orders_on_merge.yml diff --git a/README.md b/README.md index 8e38da957..258f66102 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ ray start --head Use the following command to finetune a model using an example dataset and default configurations. The finetuned model will be stored in `/tmp/llm-ray/output` by default. To customize the base model, dataset and configurations, please see the [finetuning document](#finetune): ```bash -python finetune/finetune.py --config_path finetune/finetune.conf +python finetune/finetune.py --config_file finetune/finetune.yaml ``` ### Serving diff --git a/common/config.py b/common/config.py index 002189f55..801e48f72 100644 --- a/common/config.py +++ b/common/config.py @@ -1,11 +1,12 @@ import os +import yaml import argparse from typing import Dict def parse_args(): parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") parser.add_argument( - "--config_path", + "--config_file", type=str, required=False, default=None, @@ -14,14 +15,18 @@ def parse_args(): args, unparsed = parser.parse_known_args() return args -def parse_config(config_path=None): - if config_path is None: +def parse_config(config_file=None): + if config_file is None: args = parse_args() - config_path = args.config_path - if config_path is None: + config_file = args.config_file + if config_file is None: return {} - with open(config_path) as f: - config = eval(f.read()) + if config_file.endswith("yaml"): + with open(config_file) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + else: + with open(config_file) as f: + config = eval(f.read()) assert isinstance(config, dict) return config diff --git a/dev/scripts/head_node_monitor.sh b/dev/scripts/head_node_monitor.sh index b17892ac4..0f3d6a385 100644 --- a/dev/scripts/head_node_monitor.sh +++ b/dev/scripts/head_node_monitor.sh @@ -85,7 +85,7 @@ do echo "Restart ray cluster on head node: ${head_node}, worker nodes: ${worker_node[*]}" ray_cluster up echo "Resubmit job." 
- RAY_ADDRESS="http://${head_node}:8265" ray job submit --no-wait --working-dir $CURRENT_DIR/Finetune/ -- python $CURRENT_DIR/Finetune/main.py --config_path $CURRENT_DIR/Finetune/llm_finetune_template.conf + RAY_ADDRESS="http://${head_node}:8265" ray job submit --no-wait --working-dir $CURRENT_DIR/Finetune/ -- python $CURRENT_DIR/Finetune/main.py --config_file $CURRENT_DIR/Finetune/llm_finetune_template.conf else echo "Report: head node ${head_node} works well" diff --git a/docs/pretrain.md b/docs/pretrain.md index 639afa8f3..2b3667523 100644 --- a/docs/pretrain.md +++ b/docs/pretrain.md @@ -10,12 +10,19 @@ git checkout main ``` #### 2. build Docker images for pretrain ```bash -cd llm-on-ray/tools/workload_in_containers -./build-image.sh megatron-habana # for Gaudi2 platform +cd llm-on-ray/pretrain/docker ``` -or +Build the habana docker image for Megatron-DeepSpeed. ```bash -./build-image.sh megatron-nvidia # for Nvidia GPU platform +./build-image.sh megatron-habana +``` +Build the habana docker image for Huggingface trainer +```bash +./build-image.sh optimum-habana +``` +Build the Nvidia docker image for both Megatron-DeepSpeed and Huggingface trainer +```bash +./build-image.sh nvidia ``` #### 3. Run the docker containers on head node and worker nodes for pretrain. @@ -24,12 +31,19 @@ make the logs directory for saving the ray logs. mkdir ~/workspace/logs ``` Gaudi2: + +##### Megatron-DeepSpeed ```bash docker run -it --name megatron-habana --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v ~/workspace:/home/user/workspace -v ~/workspace/logs:/tmp --cap-add=sys_nice --net=host --ipc=host llm-ray:megatron-habana ``` + +##### Huggingface trainer +```bash +docker run -it --name megatron-habana --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v ~/workspace:/home/user/workspace -v ~/workspace/logs:/tmp --cap-add=sys_nice --net=host --ipc=host llm-ray:optimum-habana +``` Nvidia GPU: ```bash -docker run --gpus all -it --ulimit memlock=-1 --ulimit stack=67108864 --network host --name megatron-nvidia --shm-size=64g -v ~/workspace/logs:/tmp -v ~/workspace:/home/user/workspace llm-ray:megatron-gpu /bin/bash +docker run --gpus all -it --ulimit memlock=-1 --ulimit stack=67108864 --network host --name megatron-nvidia --shm-size=64g -v ~/workspace/logs:/tmp -v ~/workspace:/home/user/workspace llm-ray:nvidia /bin/bash ``` #### 4. Launch ray cluster @@ -46,22 +60,54 @@ RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' If deploying a ray cluster on multiple nodes, please download the workflow repository on each node. More information about ray cluster, please refer to https://www.ray.io/ ### Pretrain Workflow -This workflow integrates the Megatron-DeepSpeed and Ray for pretrain. +This workflow integrates two different pretrain solutions. +#### Megatron-DeepSpeed For GPU version, we use the [Microsoft Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed). For Gaudi2 version, we use the [HabanaAI Megatron-DeepSpeed](https://github.com/HabanaAI/Model-References/tree/master/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed) +#### Huggingface Trainer +It integrates the megatron dataloader for pretrain. For habana support, it uses the [optimum-habana](https://github.com/huggingface/optimum-habana). It can use deepspeed ZeRO stage3 to train medium and large language models #### 1. 
Generate megatron datasets -Please refer to [this tutorial](../tools/redpajama_data_processing/README.md). Copy the datasets bin and idx files into ~/workspace/data +Please refer to [this tutorial](https://github.com/intel/e2eAIOK/tree/main/RecDP/pyrecdp/primitives/llmutils/tokenize_and_save). Copy the datasets bin and idx files into ~/workspace/data -#### 2. Download the vocab file and merge table. -Download the GPT [vocab file](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json) and [merge table](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt) into ~/workspace/data. +#### 2. Tokenizer +##### Download the vocab file and merge table. +If using the tokenizer files for Megatron_DeepSpeed pretrain, Download the GPT [vocab file](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json) and [merge table](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt) into ~/workspace/data. ```bash cd ~/workspace/data/ wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt ``` - - +Modify the vocab_file and merge_file of megatron_config in config files +```bash +#llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +"megatron_config": { + "vocab_file": "megatron-data/gpt2-vocab.json", + "merge_file": "megatron-data/gpt2-merges.txt", +} +``` +##### Huggingface Tokenizer +For Huggingface trainer, the Huggingface tokenizer is preferred. +Modify the tokenizer_type and tokenizer_model of megatron_config for megatron dataset. +```bash +#llama_7b_8Guadi_pretrain.conf +"megatron_config": { + "tokenizer_type": "HFTokenizer", + "tokenizer_model": "huggyllama/llama-7b", +} +``` +Modify the tokenizer parameters of trainer. The tokenizer of trainer and megatron dataset should be consistent +```bash +#llama_7b_8Guadi_pretrain.conf +"tokenizer": { + # The type of dataset, now only HuggingfaceTokenizer is supported. + "type": "HuggingFaceTokenizer", + # The name/path of tokenizer in huggingface. + "name": "huggyllama/llama-7b", + # Config of tokenizer, all items will be transfered to transformers.AutoTokenizer.from_pretrained(). + "config": {} +} +``` #### 3. Pretrain Command @@ -70,19 +116,34 @@ Please ensure that you check and modify the configuration files located in ~/wor After your environment configuration are properly set up, you can use the following instructions to pretrain the language model: ##### Gaudi2: +###### Megatron-DeepSpeed Set up `megatron_deepspeed_path` in the configuration. 
```bash cd /home/user/workspace/llm-on-ray #Bloom-7B -python pretrain/megatron_deepspeed_pretrain.py --config_path pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/bloom_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +#llama-7B +python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +``` + +##### Huggingface Trainer +```bash +cd /home/user/workspace/llm-on-ray #llama-7B -python pretrain/megatron_deepspeed_pretrain.py --config_path pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +python pretrain/pretrain.py --config_file pretrain/config/llama_7b_8Guadi_pretrain.conf ``` ##### Nvidia GPU: +###### Megatron-DeepSpeed ```bash cd /home/user/workspace/llm-on-ray #llama2-7B -python pretrain/megatron_deepspeed_pretrain.py --config_path pretrain/config/llama2_3b_megatron_deepspeed_zs0_8gpus_pretrain.conf +python pretrain/megatron_deepspeed_pretrain.py --config_file pretrain/config/llama2_3b_megatron_deepspeed_zs0_8gpus_pretrain.conf ``` +##### Huggingface Trainer +```bash +cd /home/user/workspace/llm-on-ray +#llama-7B +python pretrain/pretrain.py --config_file pretrain/config/llama_7b_8gpu_pretrain.conf +``` \ No newline at end of file diff --git a/docs/rlhf.md b/docs/rlhf.md index 1f18d03ba..5f3fd65b0 100755 --- a/docs/rlhf.md +++ b/docs/rlhf.md @@ -30,7 +30,7 @@ Additionally, we provide a simple data example at the path `examples/data/sample You can run the following command to start a SFT model training: ```bash -python Finetune/finetune.py --config_path Finetune/finetune.conf +python Finetune/finetune.py --config_file Finetune/finetune.conf ``` Once the model training is completed, based on your settings in the configuration file (such as `"checkpoint_dir": "/tmp/llm-ray/checkpoint/sft"`), you will obtain the final trained model in the `/tmp/llm-ray/checkpoint/sft` directory. This trained model will be utilized in the final stage of the PPO training process. @@ -67,7 +67,7 @@ Additionally, we provide a simple data example at the path `examples/data/sample You can run the following command to start a reward model training: ```bash -python rlhf/reward_trainer.py --config_path rlhf/reward.conf +python rlhf/reward_trainer.py --config_file rlhf/reward.conf ``` Once the reward model training is complete, based on your settings in the configuration file (such as `"checkpoint_dir": "/tmp/llm-ray/checkpoint/rm"`), you will obtain the final trained model in the `/tmp/llm-ray/checkpoint/rm` directory. This trained reward model will also be utilized in the final stage of the PPO training process. @@ -151,7 +151,7 @@ Additionally, we provide a simple data example at the path `examples/data/sample You can run the following command to start a ppo training: ```bash -python rlhf/ppo_trainer.py --config_path rlhf/ppo.conf +python rlhf/ppo_trainer.py --config_file rlhf/ppo.conf ``` It is important to note that before training, we need to configure the corresponding settings in `ppo.conf` based on the saved paths of the SFT (Structured Fine-Tuning) model and the RM (Reward Model) model. 
Assuming that we are using pre-trained models of `EleutherAI/gpt2` and considering the previous model save settings, we should configure the following in the `ppo.conf`: diff --git a/examples/finetune/dolly1/dolly_1_finetune.conf b/examples/finetune/dolly1/dolly_1_finetune.conf deleted file mode 100644 index f23e2b955..000000000 --- a/examples/finetune/dolly1/dolly_1_finetune.conf +++ /dev/null @@ -1,35 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt-j-6b", - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "examples/finetune/dolly1/data/train/train.jsonl", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32 - }, - }, -} diff --git a/examples/finetune/dolly1/dolly_1_finetune.yaml b/examples/finetune/dolly1/dolly_1_finetune.yaml new file mode 100644 index 000000000..eb9a93b93 --- /dev/null +++ b/examples/finetune/dolly1/dolly_1_finetune.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/finetune/dolly1/data/train/train.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/examples/finetune/dolly2/dolly_2_finetune.conf b/examples/finetune/dolly2/dolly_2_finetune.conf deleted file mode 100644 index 63d937eb5..000000000 --- a/examples/finetune/dolly2/dolly_2_finetune.conf +++ /dev/null @@ -1,35 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/pythia-6.9b", - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "databricks/databricks-dolly-15k", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32 - }, - }, -} \ No newline at end of file diff --git a/examples/finetune/dolly2/dolly_2_finetune.yaml b/examples/finetune/dolly2/dolly_2_finetune.yaml new file mode 100644 index 000000000..95dd0dd86 --- /dev/null +++ b/examples/finetune/dolly2/dolly_2_finetune.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/pythia-6.9b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 
+Dataset: + train_file: databricks/databricks-dolly-15k + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.conf b/examples/finetune/gpt_j_6b/finetune_intel_gpu.conf deleted file mode 100644 index 970544011..000000000 --- a/examples/finetune/gpt_j_6b/finetune_intel_gpu.conf +++ /dev/null @@ -1,41 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt-j-6b", - # fix issue: https://github.com/huggingface/transformers/issues/22482 - # tranformers version 4.26.0 is required for gpt2, gpt-j-6B, pythia... - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "examples/data/sample_finetune_data.jsonl", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 4, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "device": "GPU", - "num_training_workers": 2, - "accelerate_mode": "GPU_DDP", - "resources_per_worker": { - "CPU": 1, - "GPU": 1, - }, - }, -} - diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml new file mode 100644 index 000000000..41303f615 --- /dev/null +++ b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml @@ -0,0 +1,30 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 4 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: GPU + num_training_workers: 2 + accelerate_mode: GPU_DDP + resources_per_worker: + CPU: 1 + GPU: 1 diff --git a/examples/finetune/open_assistant/open_assistant_finetune.conf b/examples/finetune/open_assistant/open_assistant_finetune.conf deleted file mode 100644 index d6913df44..000000000 --- a/examples/finetune/open_assistant/open_assistant_finetune.conf +++ /dev/null @@ -1,35 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt-j-6b", - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "examples/finetune/open_assistant/data/train/train.jsonl", - "validation_file": "examples/finetune/open_assistant/data/validation/validation.jsonl", - "validation_split_percentage": 0 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32 - }, - }, -} \ No newline at end of file diff 
--git a/examples/finetune/open_assistant/open_assistant_finetune.yaml b/examples/finetune/open_assistant/open_assistant_finetune.yaml new file mode 100644 index 000000000..96a355cb4 --- /dev/null +++ b/examples/finetune/open_assistant/open_assistant_finetune.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/finetune/open_assistant/data/train/train.jsonl + validation_file: examples/finetune/open_assistant/data/validation/validation.jsonl + validation_split_percentage: 0 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/finetune.conf b/finetune/finetune.conf deleted file mode 100644 index 794decce1..000000000 --- a/finetune/finetune.conf +++ /dev/null @@ -1,40 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt-j-6b", - # fix issue: https://github.com/huggingface/transformers/issues/22482 - # tranformers version 4.26.0 is required for gpt2, gpt-j-6B, pythia... - "gpt_base_model": True, - "output_dir": "/tmp/llm-ray/output", - "checkpoint_dir": "/tmp/llm-ray/checkpoint", - "config": { - "trust_remote_code": False, - "use_auth_token": None, - }, - "lora_config": { - "task_type": "CAUSAL_LM", - "r": 8, - "lora_alpha": 32, - "lora_dropout": 0.1 - } - }, - "Dataset": { - "train_file": "examples/data/sample_finetune_data_small.jsonl", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "device": "CPU", - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32, - # "GPU": 1, - }, - "accelerate_mode": "CPU_DDP", - }, -} diff --git a/finetune/finetune.py b/finetune/finetune.py index 1e7b878d9..088ef89f1 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -2,6 +2,7 @@ import os import time +import argparse import traceback from typing import Any, Dict @@ -13,6 +14,8 @@ from ray.air.config import ScalingConfig from ray.air import RunConfig, FailureConfig +from pydantic_yaml import parse_yaml_raw_as + from accelerate import FullyShardedDataParallelPlugin from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig @@ -20,6 +23,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '..')) import common +from finetune_config import FinetuneConfig def get_accelerate_environment_variable(mode: str) -> dict: @@ -140,8 +144,26 @@ def train_func(config: Dict[str, Any]): exit(1) common.logger.info(f"train finish") + +def get_finetune_config(): + parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") + parser.add_argument( + "--config_file", + type=str, + required=True, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + args = parser.parse_args() + config_file = args.config_file + + with open(config_file) as f: + finetune_config = parse_yaml_raw_as(FinetuneConfig, f) + return finetune_config.dict() + + def main(external_config = None): - config = common.Config() + config = 
get_finetune_config() if external_config is not None: config.merge(external_config) diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml new file mode 100644 index 000000000..44bcadcf4 --- /dev/null +++ b/finetune/finetune.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py new file mode 100644 index 000000000..fad9d751b --- /dev/null +++ b/finetune/finetune_config.py @@ -0,0 +1,93 @@ +from pydantic import BaseModel, validator +from typing import Optional, List + + +class GeneralConfig(BaseModel): + trust_remote_code: bool + use_auth_token: Optional[str] + + +class LoraConfig(BaseModel): + task_type: str + r: int + lora_alpha: int + lora_dropout: float + target_modules: Optional[List[str]] = None + + +class DeltatunerConfig(BaseModel): + algo: str + denas: bool + best_model_structure: str + + +class General(BaseModel): + base_model: str + gpt_base_model: bool + output_dir: str + checkpoint_dir: str + config: GeneralConfig + lora_config: Optional[LoraConfig] = None + deltatuner_config: Optional[DeltatunerConfig] = None + + +class Dataset(BaseModel): + train_file: str + validation_file: Optional[str] + validation_split_percentage: int + + +class RayResourceConfig(BaseModel): + CPU: int + GPU: int = 0 + + +class Training(BaseModel): + optimizer: str + batch_size: int + epochs: int + learning_rate: float + lr_scheduler: str + weight_decay: float + device: str + num_training_workers: int + resources_per_worker: RayResourceConfig + accelerate_mode: str + + @validator("device") + def check_device(cls, v: str): + devices = ["CPU", "GPU"] + if v not in devices: + raise ValueError(f"device must be one of {devices}") + return v + + @validator("accelerate_mode") + def check_accelerate_mode(cls, v: str): + modes = ["CPU_DDP", "GPU_DDP", "GPU_FSDP"] + if v not in modes: + raise ValueError(f"accelerate_mode must be one of {modes}") + return v + + # @model_validator(mode='after') + # def check_device_and_accelerate_mode(self) -> "Training": + # dev = self.device + # res = self.resources_per_worker + # mode = self.accelerate_mode + # if dev == "CPU": + # if res.GPU is not None and res.GPU > 0: + # raise ValueError("Please not specified GPU resource when use CPU only in Ray.") + # if mode != "CPU_DDP": + # raise ValueError("Please specified CPU related accelerate mode when use CPU only in Ray.") + # elif dev == "GPU": + # if res.GPU is None or res.GPU == 0: + # raise ValueError("Please specified GPU resource when use GPU to fine tune in Ray.") + # if mode not in ["GPU_DDP", "GPU_FSDP"]: + # raise ValueError("Please speicifed GPU related accelerate mode when use GPU to fine tune in Ray.") + + # return self + + +class FinetuneConfig(BaseModel): + General: General + Dataset: Dataset + Training: Training diff --git a/finetune/models/bloom-560m.yaml b/finetune/models/bloom-560m.yaml new file mode 100644 index 000000000..4c361fc55 --- 
/dev/null +++ b/finetune/models/bloom-560m.yaml @@ -0,0 +1,29 @@ +General: + base_model: bigscience/bloom-560m + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/finetune_config_template.yaml b/finetune/models/finetune_config_template.yaml new file mode 100644 index 000000000..44bcadcf4 --- /dev/null +++ b/finetune/models/finetune_config_template.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml new file mode 100644 index 000000000..44bcadcf4 --- /dev/null +++ b/finetune/models/gpt-j-6b.yaml @@ -0,0 +1,29 @@ +General: + base_model: EleutherAI/gpt-j-6b + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml new file mode 100644 index 000000000..a0887b324 --- /dev/null +++ b/finetune/models/gpt2.yaml @@ -0,0 +1,29 @@ +General: + base_model: gpt2 + gpt_base_model: true + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml new file mode 100644 index 000000000..c7e7430f4 --- /dev/null +++ b/finetune/models/llama-2-7b-chat-hf.yaml @@ -0,0 +1,29 @@ +General: + base_model: meta-llama/Llama-2-7b-chat-hf + gpt_base_model: false + output_dir: /tmp/llm-ray/output + 
checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/llama-7b.yaml b/finetune/models/llama-7b.yaml new file mode 100644 index 000000000..3bd823253 --- /dev/null +++ b/finetune/models/llama-7b.yaml @@ -0,0 +1,29 @@ +General: + base_model: meta-llama/Llama-2-7b + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml new file mode 100644 index 000000000..46b05a5a8 --- /dev/null +++ b/finetune/models/mistral-7b-v0.1.yaml @@ -0,0 +1,38 @@ +General: + base_model: mistralai/Mistral-7B-v0.1 + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 + target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + - lm_head +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml new file mode 100644 index 000000000..149514c07 --- /dev/null +++ b/finetune/models/mpt-7b-chat.yaml @@ -0,0 +1,29 @@ +General: + base_model: mosaicml/mpt-7b-chat + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: true + use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/finetune/models/opt-125m.yaml b/finetune/models/opt-125m.yaml new file mode 100644 index 000000000..4d8dc7e13 --- /dev/null +++ b/finetune/models/opt-125m.yaml @@ -0,0 +1,29 @@ +General: + base_model: facebook/opt-125m + gpt_base_model: false + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint + config: + trust_remote_code: false + 
use_auth_token: null + lora_config: + task_type: CAUSAL_LM + r: 8 + lora_alpha: 32 + lora_dropout: 0.1 +Dataset: + train_file: examples/data/sample_finetune_data_small.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + device: CPU + num_training_workers: 2 + resources_per_worker: + CPU: 32 + accelerate_mode: CPU_DDP diff --git a/inference/deepspeed_predictor.py b/inference/deepspeed_predictor.py index 3ff008c71..98fbed0aa 100644 --- a/inference/deepspeed_predictor.py +++ b/inference/deepspeed_predictor.py @@ -13,37 +13,39 @@ from typing import List import os from predictor import Predictor -from peft import PeftModel -from deltatuner import DeltaTunerModel -from inference_config import InferenceConfig +from utils import get_torch_dtype + + +from inference_config import InferenceConfig, DEVICE_CPU, DEVICE_XPU, IPEX_PRECISION_BF16 class DSPipeline: def __init__( self, - inferenceConfig: InferenceConfig, + infer_conf: InferenceConfig, pad_token_id, - stopping_criteria, - dtype + stopping_criteria ): - - self.dtype = dtype - self.device = torch.device(inferenceConfig.device) + self.device = torch.device(infer_conf.device) self.pad_token_id = pad_token_id self.stopping_criteria = stopping_criteria - model_desc = inferenceConfig.model_description + model_desc = infer_conf.model_description model_config = model_desc.config - config = AutoConfig.from_pretrained(model_desc.model_id_or_path, torchscript=True, trust_remote_code=model_config.trust_remote_code) + hf_config = AutoConfig.from_pretrained(model_desc.model_id_or_path, torchscript=True, trust_remote_code=model_config.trust_remote_code) + # get correct torch type for loading HF model + torch_dtype = get_torch_dtype(infer_conf, hf_config) self.model = AutoModelForCausalLM.from_pretrained(model_desc.model_id_or_path, - torch_dtype=dtype, - config=config, + config=hf_config, + torch_dtype=torch_dtype, low_cpu_mem_usage=True, **model_config.dict()) if model_desc.peft_model_id_or_path: + from peft import PeftModel self.model = PeftModel.from_pretrained(self.model, model_desc.peft_model_id_or_path) if model_desc.peft_type == "deltatuner": + from deltatuner import DeltaTunerModel self.model = DeltaTunerModel.from_pretrained(self.model, model_desc.peft_model_id_or_path) self.model = self.model.merge_and_unload() @@ -53,7 +55,7 @@ def __init__( self.model.eval() def streaming_generate(self, inputs, streamer, **generate_kwargs): - self.model.generate(**inputs, + self.model.generate(inputs, pad_token_id=self.pad_token_id, stopping_criteria=self.stopping_criteria, streamer=streamer, @@ -61,7 +63,7 @@ def streaming_generate(self, inputs, streamer, **generate_kwargs): def generate(self, inputs, **config): gen_tokens = self.model.generate( - **inputs, + inputs, pad_token_id=self.pad_token_id, stopping_criteria=self.stopping_criteria, **config @@ -75,19 +77,18 @@ class PredictionWorker(TorchDistributedWorker): Multiple PredictionWorkers of the same WorkerGroup form a PyTorch DDP process group and work together under the orchestration of DeepSpeed. 
""" - def __init__(self, world_size: int, inferenceConfig: InferenceConfig, amp_dtype, pad_token_id, stopping_criteria): + def __init__(self, world_size: int, infer_conf: InferenceConfig, pad_token_id, stopping_criteria): self.world_size = world_size - self.inferenceConfig = inferenceConfig - self.amp_dtype = amp_dtype + self.infer_conf = infer_conf self.pad_token_id = pad_token_id self.stopping_criteria = stopping_criteria def init_model(self, local_rank: int): """Initialize model for inference.""" - if self.inferenceConfig.device == 'cpu': + if self.infer_conf.device == DEVICE_CPU: replace_with_kernel_inject = False - elif self.inferenceConfig.device == 'xpu': + elif self.infer_conf.device == DEVICE_XPU: replace_with_kernel_inject = False else: replace_with_kernel_inject = True @@ -96,24 +97,26 @@ def init_model(self, local_rank: int): os.environ['WORLD_SIZE'] = str(self.world_size) pipe = DSPipeline( - self.inferenceConfig, + self.infer_conf, pad_token_id=self.pad_token_id, stopping_criteria=self.stopping_criteria, - dtype=self.amp_dtype ) pipe.model = deepspeed.init_inference( pipe.model, - dtype=self.amp_dtype, mp_size=self.world_size, + dtype=torch.bfloat16, replace_with_kernel_inject=replace_with_kernel_inject ) - if self.ipex_enabled: + if self.infer_conf.ipex.enabled: import intel_extension_for_pytorch as ipex try: ipex._C.disable_jit_linear_repack() except: pass - pipe.model = ipex.optimize_transformers(pipe.model.eval(), dtype=self.amp_dtype, inplace=True) + pipe.model = ipex.optimize_transformers( + pipe.model.eval(), + dtype=torch.bfloat16 if self.infer_conf.ipex.precision == IPEX_PRECISION_BF16 else torch.float32, + inplace=True) self.generator = pipe @@ -124,21 +127,18 @@ def generate(self, inputs, **config): return self.generator.generate(inputs, **config) class DeepSpeedPredictor(Predictor): - def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, pad_token_id, stopping_criteria) -> None: - self.inferenceConfig = inferenceConfig - self.amp_dtype = amp_dtype - self.pad_token_id = pad_token_id - self.stopping_criteria = stopping_criteria - - use_gpu = True if (inferenceConfig.device == "cuda") else False - + def __init__(self, infer_conf: InferenceConfig) -> None: + super().__init__(infer_conf) + # TODO this should be removed later + self.pad_token_id = self.tokenizer.pad_token_id # Scaling config for one worker group. - resource = {"CPU": inferenceConfig.cpus_per_worker} - if inferenceConfig.device == "cuda": - resource["GPU"] = inferenceConfig.gpus_per_worker + resource = {"CPU": infer_conf.cpus_per_worker} + use_gpu = True if (infer_conf.device == "cuda") else False + if use_gpu: + resource["GPU"] = infer_conf.gpus_per_worker scaling_conf = ScalingConfig( use_gpu=use_gpu, - num_workers=inferenceConfig.workers_per_group, + num_workers=infer_conf.workers_per_group, resources_per_worker=resource ) @@ -181,13 +181,13 @@ def _init_worker_group(self, scaling_config: ScalingConfig): # Create the prediction workers. self.prediction_workers = [ - prediction_worker_cls.remote(scaling_config.num_workers, self.inferenceConfig, self.amp_dtype, + prediction_worker_cls.remote(scaling_config.num_workers, self.infer_conf, self.pad_token_id, self.stopping_criteria) for i in range(scaling_config.num_workers) ] # Initialize torch distributed process group for the workers. 
- local_ranks = init_torch_dist_process_group(self.prediction_workers, backend="ccl" if self.inferenceConfig.device != "cuda" else "nccl") + local_ranks = init_torch_dist_process_group(self.prediction_workers, backend="ccl" if self.infer_conf.device != "cuda" else "nccl") # Initialize the model on each worker. ray.get([ @@ -195,21 +195,53 @@ def _init_worker_group(self, scaling_config: ScalingConfig): for worker, local_rank in zip(self.prediction_workers, local_ranks) ]) - def streaming_generate(self, inputs, streamer, **config): - inputs_ref = ray.put(inputs) + def streaming_generate(self, prompt, streamer, **config): + input_ids = self.tokenize_inputs(prompt) + inputs_ref = ray.put(input_ids) self.prediction_workers[0].streaming_generate.remote(inputs_ref, streamer, **config) for worker in self.prediction_workers[1:]: worker.streaming_generate.remote(inputs_ref, self._create_dummy_streamer(), **config) - def generate(self, inputs, **config): - inputs_ref = ray.put(inputs) - prediction = ray.get( + def generate(self, prompt, **config): + input_ids = self.tokenize_inputs(prompt) + inputs_ref = ray.put(input_ids) + gen_tokens = ray.get( [ worker.generate.remote(inputs_ref, **config) for worker in self.prediction_workers ] )[0] - return prediction + return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] + + def get_streamer(self): + from transformers import TextStreamer + from typing import Optional + from ray.util.queue import Queue + + class RayTextIteratorStreamer(TextStreamer): + def __init__( + self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs + ): + super().__init__(tokenizer, skip_prompt, **decode_kwargs) + self.text_queue = Queue() + self.stop_signal = None + self.timeout = timeout + + def on_finalized_text(self, text: str, stream_end: bool = False): + self.text_queue.put(text, timeout=self.timeout) + if stream_end: + self.text_queue.put(self.stop_signal, timeout=self.timeout) + + def __iter__(self): + return self + + def __next__(self): + value = self.text_queue.get(timeout=self.timeout) + if value == self.stop_signal: + raise StopIteration() + else: + return value + return RayTextIteratorStreamer(self.tokenizer, skip_special_tokens=True) def predict( self, diff --git a/inference/inference_config.py b/inference/inference_config.py index 661d83765..9b2a434df 100644 --- a/inference/inference_config.py +++ b/inference/inference_config.py @@ -3,6 +3,14 @@ from pydantic_yaml import parse_yaml_raw_as from typing import List, Dict +IPEX_PRECISION_BF16 = 'bf16' +IPEX_PRECISION_FP32 = 'fp32' + +DEVICE_CPU = "cpu" +DEVICE_HPU = "hpu" +DEVICE_XPU = "xpu" +DEVICE_CUDA = "cuda" + class Prompt(BaseModel): intro: str = "" human_id: str = "" @@ -14,6 +22,16 @@ class ModelConfig(BaseModel): use_auth_token: str = None load_in_4bit: bool = False +class Ipex(BaseModel): + enabled: bool = True + precision: str = 'bf16' + + @validator('precision') + def _check_precision(cls, v: str): + if v: + assert v in [IPEX_PRECISION_BF16, IPEX_PRECISION_FP32] + return v + # for bigdl model class BigDLModelConfig(BaseModel): load_in_low_bit: str = "" @@ -61,14 +79,13 @@ class InferenceConfig(BaseModel): port: int = 8000 name: str = None route_prefix: str = None - precision: str = 'bf16' cpus_per_worker: int = 24 gpus_per_worker: int = 0 hpus_per_worker: int = 0 deepspeed: bool = False workers_per_group: int = 2 - ipex: bool = False - device: str = "cpu" + device: str = DEVICE_CPU + ipex: Ipex = Ipex() model_description: ModelDescription 
= ModelDescription() # prevent warning of protected namespaces @@ -89,13 +106,7 @@ def _check_port(cls, v: int): @validator('device') def _check_device(cls, v: str): if v: - assert v in ['cpu', 'xpu', 'cuda', 'hpu'] - return v - - @validator('precision') - def _check_precision(cls, v: str): - if v: - assert v in ['bf16', 'fp32'] + assert v in [DEVICE_CPU, DEVICE_XPU, DEVICE_CUDA, DEVICE_HPU] return v @validator('workers_per_group') diff --git a/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml b/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml index b562dfb3c..6da907c24 100644 --- a/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml +++ b/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml @@ -1,13 +1,14 @@ port: 8000 name: mistral-7b-v0.1-bigdl route_prefix: /mistral-7b-v0.1-bigdl -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: false + precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 bigdl: true diff --git a/inference/models/bigdl/mpt-7b-bigdl.yaml b/inference/models/bigdl/mpt-7b-bigdl.yaml index bc05487ab..f306c0507 100644 --- a/inference/models/bigdl/mpt-7b-bigdl.yaml +++ b/inference/models/bigdl/mpt-7b-bigdl.yaml @@ -1,13 +1,14 @@ port: 8000 name: mpt-7b-bigdl route_prefix: /mpt-7b-bigdl -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: false + precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b bigdl: true diff --git a/inference/models/bloom-560m.yaml b/inference/models/bloom-560m.yaml index 39c956af9..43f63cb62 100644 --- a/inference/models/bloom-560m.yaml +++ b/inference/models/bloom-560m.yaml @@ -1,13 +1,14 @@ port: 8000 name: bloom-560m route_prefix: /bloom-560m -precision: 'bf16' cpus_per_worker: 10 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m diff --git a/inference/models/gpt-j-6b.yaml b/inference/models/gpt-j-6b.yaml index ca0a0636f..82518baa9 100644 --- a/inference/models/gpt-j-6b.yaml +++ b/inference/models/gpt-j-6b.yaml @@ -1,13 +1,15 @@ port: 8000 name: gpt-j-6b route_prefix: /gpt-j-6b -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + # false here for ci coverage + enabled: false + precision: bf16 model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b diff --git a/inference/models/gpt2.yaml b/inference/models/gpt2.yaml index 4b135cc10..617c8a64d 100644 --- a/inference/models/gpt2.yaml +++ b/inference/models/gpt2.yaml @@ -1,13 +1,14 @@ port: 8000 name: gpt2 route_prefix: /gpt2 -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 diff --git a/inference/models/llama-2-7b-chat-hf.yaml b/inference/models/llama-2-7b-chat-hf.yaml index de94e023f..b0dc029da 100644 --- a/inference/models/llama-2-7b-chat-hf.yaml +++ b/inference/models/llama-2-7b-chat-hf.yaml @@ -1,13 +1,14 @@ port: 8000 name: llama-2-7b-chat-hf route_prefix: /llama-2-7b-chat-hf -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + 
enabled: true + precision: bf16 model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf diff --git a/inference/models/mistral-7b-v0.1.yaml b/inference/models/mistral-7b-v0.1.yaml index 3b3fb732b..60ad1c602 100644 --- a/inference/models/mistral-7b-v0.1.yaml +++ b/inference/models/mistral-7b-v0.1.yaml @@ -1,13 +1,14 @@ port: 8000 name: mistral-7b-v0.1 route_prefix: /mistral-7b-v0.1 -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 bigdl: false diff --git a/inference/models/mpt-7b.yaml b/inference/models/mpt-7b.yaml index 1388fa796..b0b2ac7b9 100644 --- a/inference/models/mpt-7b.yaml +++ b/inference/models/mpt-7b.yaml @@ -1,13 +1,14 @@ port: 8000 name: mpt-7b route_prefix: /mpt-7b -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b diff --git a/inference/models/neural-chat-7b-v3-1.yaml b/inference/models/neural-chat-7b-v3-1.yaml new file mode 100644 index 000000000..076ef6f7c --- /dev/null +++ b/inference/models/neural-chat-7b-v3-1.yaml @@ -0,0 +1,24 @@ +port: 8000 +name: neural-chat-7b-v3-1 +route_prefix: /neural-chat-7b-v3-1 +cpus_per_worker: 24 +gpus_per_worker: 0 +deepspeed: false +workers_per_group: 2 +device: "cpu" +ipex: + enabled: true + precision: bf16 +model_description: + model_id_or_path: Intel/neural-chat-7b-v3-1 + tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 + chat_processor: ChatModelGptJ + prompt: + intro: '### System: + You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' 
+ human_id: ' + + ### User' + bot_id: ' + + ### Assistant' diff --git a/inference/models/opt-125m.yaml b/inference/models/opt-125m.yaml index b7801bfa9..c8f40aa04 100644 --- a/inference/models/opt-125m.yaml +++ b/inference/models/opt-125m.yaml @@ -1,13 +1,14 @@ port: 8000 name: opt-125m route_prefix: /opt-125m -precision: 'bf16' cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: "cpu" +ipex: + enabled: false + precision: bf16 model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m diff --git a/inference/models/template/inference_config_template.yaml b/inference/models/template/inference_config_template.yaml index aeeb67064..7a8a18507 100644 --- a/inference/models/template/inference_config_template.yaml +++ b/inference/models/template/inference_config_template.yaml @@ -1,14 +1,15 @@ port: 8000 name: null route_prefix: null -precision: bf16 cpus_per_worker: 24 gpus_per_worker: 0 hpus_per_worker: 0 deepspeed: false workers_per_group: 2 -ipex: false device: cpu +ipex: + enabled: true + precision: bf16 model_description: model_id_or_path: null bigdl:: false diff --git a/inference/predictor.py b/inference/predictor.py index 6a61f0ae6..fb0cc1ef3 100644 --- a/inference/predictor.py +++ b/inference/predictor.py @@ -1,49 +1,88 @@ import re +import torch +from transformers import AutoTokenizer, StoppingCriteriaList +from inference_config import InferenceConfig +from utils import max_input_len, StoppingCriteriaSub class Predictor: - def configure_tokenizer(self, model_name, tokenizer): - model = self.model - if re.search("llama", model.config.architectures[0], re.IGNORECASE): - # unwind broken decapoda-research config - model.generation_config.pad_token_id = 0 - model.generation_config.bos_token_id = 1 - model.generation_config.eos_token_id = 2 - - if ( - hasattr(model.generation_config, "pad_token_id") - and model.generation_config.pad_token_id is not None - and not "chatglm" in model_name - ): - tokenizer.pad_token_id = model.generation_config.pad_token_id - if ( - hasattr(model.generation_config, "eos_token_id") - and model.generation_config.eos_token_id is not None - and not "chatglm" in model_name - ): - tokenizer.eos_token_id = model.generation_config.eos_token_id - if ( - hasattr(model.generation_config, "bos_token_id") - and model.generation_config.bos_token_id is not None - ): - tokenizer.bos_token_id = model.generation_config.bos_token_id - - if tokenizer.pad_token_id is None: - model.generation_config.pad_token_id = ( - tokenizer.pad_token_id - ) = tokenizer.eos_token_id - - if model.generation_config.eos_token_id is None: - model.generation_config.eos_token_id = tokenizer.eos_token_id + def __init__(self, infer_conf: InferenceConfig) -> None: + self.infer_conf = infer_conf + self.tokenizer = AutoTokenizer.from_pretrained(infer_conf.model_description.tokenizer_name_or_path) + self.device = torch.device(infer_conf.device) + # now deepspeed predictor don't have the model + # so configure_tokenizer cannot be called + # this should be solved in the next pr + # where it is also a worker + # This can be removed then + if self.tokenizer.pad_token_id is None: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + + prompt = infer_conf.model_description.prompt + stop_words = prompt.stop_words + stop_words_ids = [self.tokenizer(stop_word, return_tensors='pt').input_ids.squeeze() for stop_word in stop_words] + self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) + + def 
tokenize_inputs(self, text): + if self.device.type == "hpu": + input_tokens = self.tokenizer( + text, + return_tensors="pt", + padding="max_length", + max_length=max_input_len(input_token_len), + ) + else: + input_tokens = self.tokenizer( + text, return_tensors="pt", padding=True + ) + return input_tokens.input_ids.to(device=self.device) + + def configure_tokenizer(self, model_name): + model = self.model + tokenizer = self.tokenizer + if re.search("llama", model.config.architectures[0], re.IGNORECASE): + # unwind broken decapoda-research config + model.generation_config.pad_token_id = 0 + model.generation_config.bos_token_id = 1 + model.generation_config.eos_token_id = 2 + + if ( + hasattr(model.generation_config, "pad_token_id") + and model.generation_config.pad_token_id is not None + and not "chatglm" in model_name + ): + tokenizer.pad_token_id = model.generation_config.pad_token_id + if ( + hasattr(model.generation_config, "eos_token_id") + and model.generation_config.eos_token_id is not None + and not "chatglm" in model_name + ): + tokenizer.eos_token_id = model.generation_config.eos_token_id + if ( + hasattr(model.generation_config, "bos_token_id") + and model.generation_config.bos_token_id is not None + ): + tokenizer.bos_token_id = model.generation_config.bos_token_id + + if tokenizer.pad_token_id is None: + model.generation_config.pad_token_id = ( + tokenizer.pad_token_id + ) = tokenizer.eos_token_id + + if model.generation_config.eos_token_id is None: + model.generation_config.eos_token_id = tokenizer.eos_token_id + + if not model.config.is_encoder_decoder: + tokenizer.padding_side = "left" + + if tokenizer.pad_token is None and tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + model.generation_config.pad_token_id = model.generation_config.eos_token_id - if not model.config.is_encoder_decoder: - tokenizer.padding_side = "left" - - if tokenizer.pad_token is None and tokenizer.pad_token_id is None: - tokenizer.pad_token = tokenizer.eos_token - model.generation_config.pad_token_id = model.generation_config.eos_token_id - - def generate(self, inputs, **config): - pass - - def streaming_generate(self, inputs, streamer, **config): - pass \ No newline at end of file + def generate(self, prompt, **config): + pass + + def streaming_generate(self, prompt, streamer, **config): + pass + + def get_streamer(self): + pass diff --git a/inference/run_model_batch_predict.py b/inference/run_model_batch_predict.py index 880a0e860..30d74a5c3 100644 --- a/inference/run_model_batch_predict.py +++ b/inference/run_model_batch_predict.py @@ -46,12 +46,9 @@ def __call__(self, batch: pd.DataFrame) -> pd.DataFrame: import argparse parser = argparse.ArgumentParser('GPT-J generation script', add_help=False) - parser.add_argument('--precision', default='bf16', type=str, help="fp32 or bf16") parser.add_argument('--model', default='EleutherAI/gpt-j-6B', type=str, help="model name or path") parser.add_argument('--max-new-tokens', default=100, type=int, help="output max new tokens") args = parser.parse_args() - amp_enabled = True if args.precision != "fp32" else False - amp_dtype = torch.bfloat16 if args.precision != "fp32" else torch.float32 ray.init(address="auto") prompt = ( @@ -68,8 +65,6 @@ def __call__(self, batch: pd.DataFrame) -> pd.DataFrame: batch_size=4, fn_constructor_kwargs=dict( model_id=args.model, - amp_enabled=amp_enabled, - amp_dtype=amp_dtype, max_new_tokens=args.max_new_tokens ), compute="actors" diff --git a/inference/run_model_infer.py b/inference/run_model_infer.py 
index ba3214d29..30b158bf8 100644 --- a/inference/run_model_infer.py +++ b/inference/run_model_infer.py @@ -32,7 +32,7 @@ print("iter: ", i) tic = time.time() proxies = { "http": None, "https": None} - outputs = requests.post(args.model_endpoint, proxies=proxies, json=[sample_input], stream=args.streaming_response) + outputs = requests.post(args.model_endpoint, proxies=proxies, json=sample_input, stream=args.streaming_response) if args.streaming_response: outputs.raise_for_status() for output in outputs.iter_content(chunk_size=None, decode_unicode=True): diff --git a/inference/run_model_serve.py b/inference/run_model_serve.py index 9dd33b9e3..d5e2c8ac5 100644 --- a/inference/run_model_serve.py +++ b/inference/run_model_serve.py @@ -6,135 +6,38 @@ from starlette.requests import Request from queue import Empty import torch -from transformers import AutoTokenizer, TextIteratorStreamer -from transformers import StoppingCriteria, StoppingCriteriaList +from transformers import TextIteratorStreamer from inference_config import ModelDescription, InferenceConfig, all_models import sys - +from utils import get_deployment_actor_options from typing import Generator, Union, Optional, List from starlette.responses import StreamingResponse from pydantic_yaml import parse_yaml_raw_as -class StoppingCriteriaSub(StoppingCriteria): - - def __init__(self, stops = [], encounters=1): - super().__init__() - self.stops = stops - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): - for stop in self.stops: - length = 1 if len(stop.size())==0 else stop.size()[0] - if torch.all((stop == input_ids[0][-length:])).item(): - return True - return False - -def max_input_len(input_text_length): - if input_text_length <= 128: - return 128 - elif input_text_length <= 512: - return 512 - elif input_text_length <= 2048: - return 2048 - else: - print("Max support length is 4096") - return 4096 - @serve.deployment class PredictDeployment: - def __init__(self, inferenceConfig: InferenceConfig): - self.device = torch.device(inferenceConfig.device) - self.tokenizer = AutoTokenizer.from_pretrained(inferenceConfig.model_description.tokenizer_name_or_path) - if self.tokenizer.pad_token_id is None: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + def __init__(self, infer_conf: InferenceConfig): + self.device = torch.device(infer_conf.device) self.process_tool = None - chat_processor_name = inferenceConfig.model_description.chat_processor - prompt = inferenceConfig.model_description.prompt + chat_processor_name = infer_conf.model_description.chat_processor + prompt = infer_conf.model_description.prompt if chat_processor_name: module = __import__("chat_process") chat_processor = getattr(module, chat_processor_name, None) if chat_processor is None: - raise ValueError(inferenceConfig.name + " deployment failed. chat_processor(" + chat_processor_name + ") does not exist.") + raise ValueError(infer_conf.name + " deployment failed. 
chat_processor(" + chat_processor_name + ") does not exist.") self.process_tool = chat_processor(**prompt.dict()) - stop_words = prompt.stop_words - stop_words_ids = [self.tokenizer(stop_word, return_tensors='pt').input_ids.squeeze() for stop_word in stop_words] - self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) - self.use_deepspeed = inferenceConfig.deepspeed - self.amp_dtype = torch.bfloat16 if inferenceConfig.precision != "fp32" else torch.float32 + + self.use_deepspeed = infer_conf.deepspeed if self.use_deepspeed: from deepspeed_predictor import DeepSpeedPredictor - self.streamer = self.create_streamer() - # now deepspeed predictor don't have the model - # this should be solved in the next pr - # where it is also a worker - if self.tokenizer.pad_token_id is None: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - self.predictor = DeepSpeedPredictor(inferenceConfig, self.amp_dtype, self.tokenizer.pad_token_id, self.stopping_criteria) + self.predictor = DeepSpeedPredictor(infer_conf) + self.streamer = self.predictor.get_streamer() else: from transformer_predictor import TransformerPredictor - self.predictor = TransformerPredictor(inferenceConfig, self.amp_dtype, self.stopping_criteria) - self.predictor.configure_tokenizer(inferenceConfig.model_description.model_id_or_path, self.tokenizer) + self.predictor = TransformerPredictor(infer_conf) self.loop = asyncio.get_running_loop() - - def create_streamer(self): - from transformers import TextStreamer - from typing import Optional - from ray.util.queue import Queue - - class RayTextIteratorStreamer(TextStreamer): - def __init__( - self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs - ): - super().__init__(tokenizer, skip_prompt, **decode_kwargs) - self.text_queue = Queue() - self.stop_signal = None - self.timeout = timeout - - def on_finalized_text(self, text: str, stream_end: bool = False): - self.text_queue.put(text, timeout=self.timeout) - if stream_end: - self.text_queue.put(self.stop_signal, timeout=self.timeout) - - def __iter__(self): - return self - - def __next__(self): - value = self.text_queue.get(timeout=self.timeout) - if value == self.stop_signal: - raise StopIteration() - else: - return value - return RayTextIteratorStreamer(self.tokenizer, skip_special_tokens=True) - - def tokenize_inputs(self, text: List[str]): - if self.device.type == "hpu": - input_tokens_no_pad = self.tokenizer(text, return_tensors="pt") - input_token_len = input_tokens_no_pad.input_ids.shape[-1] - input_tokens = self.tokenizer.batch_encode_plus( - text, - return_tensors="pt", - padding="max_length", - max_length=max_input_len(input_token_len), - ) - else: - input_tokens = self.tokenizer.batch_encode_plus( - text, return_tensors="pt", padding=True - ) - input_token_len = input_tokens.input_ids.shape[-1] - inputs = {k: v.to(device=self.device) \ - for k,v in input_tokens.items() \ - if torch.is_tensor(v)} - return inputs, input_token_len - - def predict(self, text: List[str], **config) -> str: - inputs, _ = self.tokenize_inputs(text) - gen_tokens = self.predictor.generate(inputs, **config) - return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] - - def predict_stream(self, text: List[str], streamer: TextIteratorStreamer, **config) -> Generator[str, None, None]: - # with torch.cpu.amp.autocast(enabled=self.amp_enabled, dtype=self.amp_dtype): - inputs, _ = self.tokenize_inputs(text) - self.predictor.streaming_generate(inputs, 
streamer, **config) def consume_streamer(self): for text in self.streamer: @@ -155,37 +58,27 @@ async def consume_streamer_async(self, streamer: TextIteratorStreamer): async def __call__(self, http_request: Request) -> Union[StreamingResponse, str]: json_request: str = await http_request.json() prompts = [] - for prompt in json_request: - text = prompt["text"] - config = prompt["config"] if "config" in prompt else {} - streaming_response = prompt["stream"] - if isinstance(text, list): - if self.process_tool is not None: - prompt = self.process_tool.get_prompt(text) - prompts.append(prompt) - else: - prompts.extend(text) + text = json_request["text"] + config = json_request["config"] if "config" in json_request else {} + streaming_response = json_request["stream"] + if isinstance(text, list): + if self.process_tool is not None: + prompt = self.process_tool.get_prompt(text) + prompts.append(prompt) else: - prompts.append(text) + prompts.extend(text) + else: + prompts.append(text) if not streaming_response: - return self.predict(prompts, **config) + return self.predictor.generate(prompts, **config) if self.use_deepspeed: - self.predict_stream(prompts, self.streamer, **config) + self.predictor.streaming_generate(prompts, self.streamer, **config) return StreamingResponse(self.consume_streamer(), status_code=200, media_type="text/plain") else: - streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, timeout=0, skip_special_tokens=True) - self.loop.run_in_executor(None, functools.partial(self.predict_stream, prompts, streamer, **config)) + streamer = self.predictor.get_streamer() + self.loop.run_in_executor(None, functools.partial(self.predictor.streaming_generate, prompts, streamer, **config)) return StreamingResponse(self.consume_streamer_async(streamer), status_code=200, media_type="text/plain") -_ray_env_key = "env_vars" -# OMP_NUM_THREADS will be set by num_cpus, so not set in env -_predictor_runtime_env_ipex = { - "KMP_BLOCKTIME": "1", - "KMP_SETTINGS": "1", - "KMP_AFFINITY": "granularity=fine,compact,1,0", - "MALLOC_CONF": "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" -} - # make it unittest friendly def main(argv=None): # args @@ -203,7 +96,6 @@ def main(argv=None): parser.add_argument("--workers_per_group", default="2", type=int, help="workers per group, used with --deepspeed") parser.add_argument("--ipex", action='store_true', help="enable ipex optimization") parser.add_argument("--device", default="cpu", type=str, help="cpu, xpu, hpu or cuda") - parser.add_argument("--precision", default="bf16", type=str, help="fp32 or bf16") parser.add_argument("--serve_local_only", action="store_true", help="Only support local access to url") args = parser.parse_args(argv) @@ -216,55 +108,40 @@ def main(argv=None): if args.config_file: print("reading from config file, " + args.config_file) with open(args.config_file, "r") as f: - inferenceConfig = parse_yaml_raw_as(InferenceConfig, f) + infer_conf = parse_yaml_raw_as(InferenceConfig, f) else: # args.model should be set print("reading from command line, " + args.model) model_desc = ModelDescription() model_desc.model_id_or_path = args.model model_desc.tokenizer_name_or_path = args.tokenizer if args.tokenizer is not None else args.model - inferenceConfig = InferenceConfig(model_description=model_desc) - inferenceConfig.host = "127.0.0.1" if args.serve_local_only else "0.0.0.0" - inferenceConfig.port = args.port + infer_conf = InferenceConfig(model_description=model_desc) + 
infer_conf.host = "127.0.0.1" if args.serve_local_only else "0.0.0.0" + infer_conf.port = args.port rp = args.route_prefix if args.route_prefix else "custom_model" - inferenceConfig.route_prefix = "/{}".format(rp) - inferenceConfig.name = rp - inferenceConfig.ipex = args.ipex + infer_conf.route_prefix = "/{}".format(rp) + infer_conf.name = rp + infer_conf.ipex.enabled = args.ipex model_list = {} - model_list[inferenceConfig.name] = inferenceConfig + model_list[infer_conf.name] = infer_conf ray.init(address="auto") deployments = [] - for model_id, inferCfg in model_list.items(): + for model_id, infer_conf in model_list.items(): print("deploy model: ", model_id) - runtime_env = {_ray_env_key: {}} - if inferCfg.ipex: - runtime_env[_ray_env_key].update(_predictor_runtime_env_ipex) - if inferCfg.deepspeed: - runtime_env[_ray_env_key]["DS_ACCELERATOR"] = inferCfg.device - # now PredictDeployment itself is a worker, we should require resources for it - ray_actor_options = {"runtime_env": runtime_env} - if inferCfg.device == "cpu": - ray_actor_options["num_cpus"] = inferCfg.cpus_per_worker - elif inferCfg.device == "cuda": - ray_actor_options["num_gpus"] = inferCfg.gpus_per_worker - elif inferCfg.device == "hpu": - ray_actor_options["resources"] = {"HPU": inferCfg.hpus_per_worker} - else: - # TODO add xpu - pass - deployment = PredictDeployment.options(ray_actor_options=ray_actor_options).bind(inferCfg) - handle = serve.run(deployment, _blocking=True, host=inferCfg.host, port=inferCfg.port, name=inferCfg.name, route_prefix=inferCfg.route_prefix) - deployment_name = inferCfg.name - if inferCfg.host == "0.0.0.0": + ray_actor_options = get_deployment_actor_options(infer_conf) + deployment = PredictDeployment.options(ray_actor_options=ray_actor_options).bind(infer_conf) + handle = serve.run(deployment, _blocking=True, host=infer_conf.host, port=infer_conf.port, name=infer_conf.name, route_prefix=infer_conf.route_prefix) + deployment_name = infer_conf.name + if infer_conf.host == "0.0.0.0": all_nodes = ray.nodes() for node in all_nodes: if "node:__internal_head__" in node["Resources"]: host_ip = node["NodeManagerAddress"] break else: - host_ip = inferCfg.host - url = f"http://{host_ip}:{inferCfg.port}{inferCfg.route_prefix}" + host_ip = infer_conf.host + url = f"http://{host_ip}:{infer_conf.port}{infer_conf.route_prefix}" print(f"Deployment '{deployment_name}' is ready at `{url}`.") deployments.append(handle) diff --git a/inference/transformer_predictor.py b/inference/transformer_predictor.py index e1178af2f..942d2a26b 100644 --- a/inference/transformer_predictor.py +++ b/inference/transformer_predictor.py @@ -1,15 +1,17 @@ import torch -from transformers import AutoModelForCausalLM, AutoConfig -from inference_config import InferenceConfig +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig +from transformers import TextIteratorStreamer +from inference_config import InferenceConfig, IPEX_PRECISION_BF16 from predictor import Predictor +from utils import get_torch_dtype class TransformerPredictor(Predictor): - def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteria): - self.amp_dtype = amp_dtype - self.device = torch.device(inferenceConfig.device) - model_desc = inferenceConfig.model_description + def __init__(self, infer_conf: InferenceConfig): + super().__init__(infer_conf) + + model_desc = infer_conf.model_description model_config = model_desc.config - config = AutoConfig.from_pretrained(model_desc.model_id_or_path, torchscript=True, 
trust_remote_code=model_config.trust_remote_code) + hf_config = AutoConfig.from_pretrained(model_desc.model_id_or_path, torchscript=True, trust_remote_code=model_config.trust_remote_code) if self.device.type == "hpu": from optimum.habana.transformers.modeling_utils import ( @@ -17,6 +19,8 @@ def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteri ) adapt_transformers_to_gaudi() + # get correct torch type for loading HF model + torch_dtype = get_torch_dtype(infer_conf, hf_config) if model_desc.bigdl: from bigdl.llm.transformers import AutoModelForCausalLM as BigDLAutoModelForCLM bmodel_config = {} @@ -25,16 +29,16 @@ def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteri bmodel_config.update(model_desc.bigdl_config.dict()) model = BigDLAutoModelForCLM.from_pretrained( model_desc.model_id_or_path, - torch_dtype=amp_dtype, - config=config, + torch_dtype=torch_dtype, + config=hf_config, low_cpu_mem_usage=True, **bmodel_config ) else: model = AutoModelForCausalLM.from_pretrained( model_desc.model_id_or_path, - torch_dtype=amp_dtype, - config=config, + torch_dtype=torch_dtype, + config=hf_config, low_cpu_mem_usage=True, **model_config.dict() ) @@ -58,7 +62,7 @@ def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteri # to channels last model = model.to(memory_format=torch.channels_last) # to ipex - if inferenceConfig.ipex: + if infer_conf.ipex.enabled: import intel_extension_for_pytorch as ipex torch._C._jit_set_texpr_fuser_enabled(False) @@ -66,13 +70,12 @@ def __init__(self, inferenceConfig: InferenceConfig, amp_dtype, stopping_criteri except: pass model = ipex.optimize_transformers( model.eval(), - dtype=amp_dtype, + dtype=torch.bfloat16 if infer_conf.ipex.precision == IPEX_PRECISION_BF16 else torch.float32, inplace=True ) self.model = model - self.stopping_criteria = stopping_criteria - def _process_config(self, **config): + def _process_config(self, config): if self.device.type == "hpu": if "max_new_tokens" not in config: # hpu requires setting max_new_tokens @@ -82,18 +85,23 @@ def _process_config(self, **config): # lazy mode should be True when using hpu graphs config["lazy_mode"] = True - def streaming_generate(self, inputs, streamer, **config): - self._process_config(**config) - self.model.generate(**inputs, + def streaming_generate(self, prompt, streamer, **config): + self._process_config(config) + input_ids = self.tokenize_inputs(prompt) + self.model.generate(input_ids, stopping_criteria=self.stopping_criteria, streamer=streamer, **config) - def generate(self, inputs, **config): - self._process_config(**config) + def generate(self, prompt, **config): + self._process_config(config) + input_ids = self.tokenize_inputs(prompt) gen_tokens = self.model.generate( - **inputs, + input_ids, stopping_criteria=self.stopping_criteria, **config ) - return gen_tokens + return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] + + def get_streamer(self): + return TextIteratorStreamer(self.tokenizer, skip_prompt=True, timeout=0, skip_special_tokens=True) diff --git a/inference/utils.py b/inference/utils.py new file mode 100644 index 000000000..43149435c --- /dev/null +++ b/inference/utils.py @@ -0,0 +1,76 @@ +from transformers import StoppingCriteria +import torch + +from inference_config import InferenceConfig, DEVICE_CPU + +def get_deployment_actor_options(infer_conf: InferenceConfig): + _ray_env_key = "env_vars" + # OMP_NUM_THREADS will be set by num_cpus, so not set in env + _predictor_runtime_env_ipex = { 
+ "KMP_BLOCKTIME": "1", + "KMP_SETTINGS": "1", + "KMP_AFFINITY": "granularity=fine,compact,1,0", + "MALLOC_CONF": "oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" + } + runtime_env = {_ray_env_key: {}} + if infer_conf.ipex.enabled: + runtime_env[_ray_env_key].update(_predictor_runtime_env_ipex) + if infer_conf.deepspeed: + runtime_env[_ray_env_key]["DS_ACCELERATOR"] = infer_conf.device + # now PredictDeployment itself is a worker, we should require resources for it + ray_actor_options = {"runtime_env": runtime_env} + if infer_conf.device == "cpu": + ray_actor_options["num_cpus"] = infer_conf.cpus_per_worker + elif infer_conf.device == "cuda": + ray_actor_options["num_gpus"] = infer_conf.gpus_per_worker + elif infer_conf.device == "hpu": + ray_actor_options["resources"] = {"HPU": infer_conf.hpus_per_worker} + else: + # TODO add xpu + pass + return ray_actor_options + +class StoppingCriteriaSub(StoppingCriteria): + + def __init__(self, stops = [], encounters=1): + super().__init__() + self.stops = stops + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): + for stop in self.stops: + length = 1 if len(stop.size())==0 else stop.size()[0] + if torch.all((stop == input_ids[0][-length:])).item(): + return True + return False + +# used in inference with Gaudi +def max_input_len(input_text_length): + if input_text_length <= 128: + return 128 + elif input_text_length <= 512: + return 512 + elif input_text_length <= 2048: + return 2048 + else: + print("Max support length is 4096") + return 4096 + +def get_torch_dtype(infer_conf: InferenceConfig, hf_config) -> torch.dtype: + ''' + return torch default dtype, a.k.a float32, if it's cpu only inference without ipex because + bfloat16 is too slow and float16 is not supported in CPU + ''' + if hf_config is None or is_cpu_without_ipex(infer_conf): + return torch.get_default_dtype() + if hasattr(hf_config, 'torch_dtype'): + t = hf_config.torch_dtype + if t: + return t + if hasattr(hf_config, '__getitem__'): + t = hf_config['torch_dtype'] + if t: + return t + return torch.get_default_dtype() + +def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: + return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU diff --git a/pretrain/config/llama2_7b_8Guadi_pretrain.conf b/pretrain/config/llama_7b_8Guadi_pretrain.conf similarity index 80% rename from pretrain/config/llama2_7b_8Guadi_pretrain.conf rename to pretrain/config/llama_7b_8Guadi_pretrain.conf index 5de0e91f0..62a0c32b1 100644 --- a/pretrain/config/llama2_7b_8Guadi_pretrain.conf +++ b/pretrain/config/llama_7b_8Guadi_pretrain.conf @@ -11,17 +11,18 @@ "type":"MegatronInitializer", "name": "megatron", "megatron_config": { - "data_path": ["/home/user/workspace/data/my-gpt2_text_document"], + "data_path": ["/home/user/workspace/data/tokenized_NIH"], "data_impl": "mmap", "micro_batch_size": 1, "global_batch_size": 8, "seq_length": 2048, "use_dataset_only": True, - "vocab_file": "/home/user/workspace/data/gpt2-vocab.json", - "tokenizer_type": "GPT2BPETokenizer", - "merge_file": "/home/user/workspace/data/gpt2-merges.txt", - "train_iters": 300, - "eval_interval": 10, + #"vocab_file": "/home/user/workspace/data/gpt2-vocab.json", + "tokenizer_type": "HFTokenizer", + "tokenizer_model": "huggyllama/llama-7b", + #"merge_file": "/home/user/workspace/data/gpt2-merges.txt", + "eval_interval": 1000, + "train_samples": 300_000_000, "split": "949,50,1", }, }, @@ -62,21 +63,25 @@ "per_device_eval_batch_size": 1, 
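Note, referring back to the refactored inference predictor and the new inference/utils.py introduced above: `generate()` now takes a prompt, tokenizes it itself, and decodes with `batch_decode`, while `StoppingCriteriaSub` supplies custom stop sequences and `get_torch_dtype` picks float32 for CPU-without-IPEX. The following is a minimal sketch of exercising the stopping-criteria helper outside the Ray deployment; the model id, prompt, and stop word are placeholders, and the flat `from utils import ...` assumes the script runs from the inference/ directory.

```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList

from utils import StoppingCriteriaSub  # helper added in inference/utils.py above

model_id = "gpt2"  # placeholder; any causal LM works for this sketch
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Stop as soon as the encoded stop sequence appears at the end of the generated ids.
stop_words = ["\n\n"]
stops = [tokenizer(w, return_tensors="pt").input_ids.squeeze() for w in stop_words]
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stops)])

input_ids = tokenizer("Q: What is Ray Serve?\nA:", return_tensors="pt").input_ids
gen_tokens = model.generate(
    input_ids,
    stopping_criteria=stopping_criteria,
    max_new_tokens=64,
)
# Mirrors the predictor's generate(): decode and return the first sequence.
print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0])
```

The diff resumes below with the Gaudi pretraining config.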
"do_train": True, "do_eval": True, - "save_strategy": "no", - "output_dir": "/tmp/hf_trainer/", + "save_strategy": "steps", + "save_steps": 1000, + "output_dir": "/home/user/workspace/data/hf_trainer/", "gaudi_config_name": "Habana/gpt2", "use_habana": True, + "max_steps": 100000, "throughput_warmup_steps": 3, "use_lazy_mode": True, "overwrite_output_dir": True, - "max_steps": 300, "seed": 42, "bf16": True, + "report_to":'tensorboard', "deepspeed":{ "steps_per_print": 64, "train_batch_size": 8, "train_micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": "auto", + "gradient_accumulation_steps": 1, + "gradient_checkpoint": True, + "memory_efficient_linear": False, "bf16": { "enabled": True }, @@ -85,7 +90,8 @@ "stage": 3, "overlap_comm": False, "reduce_scatter": False, - "contiguous_gradients": False + "contiguous_gradients": False, + "stage3_gather_16bit_weights_on_model_save": True } }, }, @@ -103,10 +109,10 @@ "runtime_env": { "env_vars": { "OMP_NUM_THREADS": "56", - #"ACCELERATE_USE_CPU": "True", "ACCELERATE_MIXED_PRECISION": "no", - #"CCL_WORKER_COUNT": "1", # CCL setting - #"CCL_LOG_LEVEL": "info", + "ACCELERATE_USE_DEEPSPEED": "true", + "HABANA_VISIBLE_MODULES":"0,1,2,3,4,5,6,7", + "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES":"true", "WORLD_SIZE": "8", # Enable multi-process } }, @@ -121,6 +127,7 @@ # The amount of resources per worker. "resources_per_worker": { "CPU": 10, + "HPU": 1 }, # The placement strategy to use for the placement group of the Ray actors. "placement_strategy": "SPREAD" diff --git a/pretrain/config/llama2_7b_8gpu_pretrain.conf b/pretrain/config/llama_7b_8gpu_pretrain.conf similarity index 98% rename from pretrain/config/llama2_7b_8gpu_pretrain.conf rename to pretrain/config/llama_7b_8gpu_pretrain.conf index 71a7f1192..43afe1140 100644 --- a/pretrain/config/llama2_7b_8gpu_pretrain.conf +++ b/pretrain/config/llama_7b_8gpu_pretrain.conf @@ -97,6 +97,7 @@ "overlap_comm": False, "reduce_scatter": False, "contiguous_gradients": False, + "stage3_gather_16bit_weights_on_model_save": True # "stage3_max_live_parameters" : 1e8, # "stage3_max_reuse_distance" : 1e8, # "stage3_prefetch_bucket_size" : 2e8, diff --git a/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf b/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf index 00b6d7d37..5848fdae8 100644 --- a/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf +++ b/pretrain/config/llama_7b_megatron_deepspeed_zs0_8Gaudi_pretrain.conf @@ -7,6 +7,7 @@ # you should setup this path based on your environment. 
"megatron_deepspeed_path": '/home/user/Model-References/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed', + "pretrain_module": 'pretrain_llama', "megatron_config": { "num_layers": 32, "hidden_size": 4096, @@ -15,9 +16,10 @@ #"max_position_embeddings":2048, #"num_key_value_heads": 32, # setup the file path based on your data dir - "vocab_file": "/home/user/workspace/data/gpt2-vocab.json", - "tokenizer_type": "GPT2BPETokenizer", - "merge_file": "/home/user/workspace/data/gpt2-merges.txt", + #"vocab_file": "/home/user/workspace/data/gpt2-vocab.json", + "tokenizer_type": "HFTokenizer", + "tokenizer_model_file": "huggyllama/llama-7b", + #"merge_file": "/home/user/workspace/data/gpt2-merges.txt", "seq_length": 2048, "micro_batch_size": 2, #"eval_micro_batch_size": 2, @@ -57,6 +59,8 @@ "layernorm_type":"rmsnorm", "activation_func_type": "swiglu", "layernorm_epsilon": 1e-6, + "use_fused_sdpa": False, + "use_fused_sdpa_with_recompute": True, "bf16": True, #"checkpoint_activations": True, #"deepspeed_activation_checkpointing": True, @@ -80,14 +84,17 @@ #"min_loss_scale": 1, #"initial_scale_power": 12 }, - "bf16": {"enabled": True}, + "bf16": { + "enabled": True, + "accumulate_grads_via_hooks": True + }, "wall_clock_breakdown": False }, "zero_stage":0, "deepspeed_activation_checkpointing": True, "save": "./checkpoint_megatron", # setup the file path based on your data dir - "data_path": ["/home/user/workspace/data/my-gpt2_text_document"], + "data_path": ["/home/user/workspace/data/tokenized_NIH"], "data_impl": "mmap", "split": "949,50,1", "distributed_backend": "hccl", diff --git a/tools/workload_in_containers/Dockerfile.megatron.habana b/pretrain/docker/Dockerfile.megatron.habana similarity index 83% rename from tools/workload_in_containers/Dockerfile.megatron.habana rename to pretrain/docker/Dockerfile.megatron.habana index ee7739e4a..a12fe7dc6 100644 --- a/tools/workload_in_containers/Dockerfile.megatron.habana +++ b/pretrain/docker/Dockerfile.megatron.habana @@ -1,16 +1,18 @@ FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu20.04/habanalabs/pytorch-installer-2.1.0:latest ENV DEBIAN_FRONTEND=noninteractive WORKDIR /home/user -RUN pip install lz4 numpy==1.24.4 tensorboard gpustat==1.0.0 sentencepiece accelerate==0.19.0 datasets==2.12.0 gymnasium transformers==4.26.0 dm-tree scikit-image peft deltatuner==1.1.9 +RUN pip install lz4 numpy==1.24.4 tensorboard gpustat==1.0.0 sentencepiece accelerate datasets==2.12.0 gymnasium transformers dm-tree scikit-image peft deltatuner==1.1.9 RUN pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl RUN pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.13.0 COPY pretrain/patch/hpu/0001-Init-megatron-deepspeed-with-Ray-cluster.patch . +COPY pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch . RUN git config --global user.email "root@example.com" RUN git config --global user.name "root" RUN git clone https://github.com/HabanaAI/Model-References.git && \ cd Model-References && \ git checkout -b ray bde21928ea8c295cd029fafe2cf737d50e715fe2 && \ git am /home/user/0001-Init-megatron-deepspeed-with-Ray-cluster.patch && \ + git am /home/user/0001-Add-the-Huggingface-tokenizer.patch && \ cd PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/ && \ pip install . 
# enable password-less ssh diff --git a/tools/workload_in_containers/Dockerfile.megatron.gpu b/pretrain/docker/Dockerfile.nvidia similarity index 100% rename from tools/workload_in_containers/Dockerfile.megatron.gpu rename to pretrain/docker/Dockerfile.nvidia diff --git a/pretrain/docker/Dockerfile.optimum.habana b/pretrain/docker/Dockerfile.optimum.habana new file mode 100644 index 000000000..52e7d7d89 --- /dev/null +++ b/pretrain/docker/Dockerfile.optimum.habana @@ -0,0 +1,12 @@ +FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu20.04/habanalabs/pytorch-installer-2.1.0:latest +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /home/user +COPY pretrain/requirements.optimum-habana.txt /home/user/ +RUN pip install -r requirements.optimum-habana.txt +RUN pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.13.0 +RUN pip install --upgrade-strategy eager optimum[habana] +# enable password-less ssh +RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -P '' && \ + cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys +RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config +CMD ["sh", "-c", "service ssh restart; bash"] \ No newline at end of file diff --git a/tools/workload_in_containers/build-image.sh b/pretrain/docker/build-image.sh similarity index 75% rename from tools/workload_in_containers/build-image.sh rename to pretrain/docker/build-image.sh index a6452bdfc..7b4d05293 100755 --- a/tools/workload_in_containers/build-image.sh +++ b/pretrain/docker/build-image.sh @@ -1,9 +1,7 @@ #!/bin/bash dockerfile=Dockerfile -if [[ $1 = "megatron-gpu" ]]; then - dockerfile=Dockerfile.megatron.gpu -elif [[ $1 = "dp" ]]; then - dockerfile=Dockerfile.dp +if [[ $1 = "nvidia" ]]; then + dockerfile=Dockerfile.nvidia elif [[ $1 = "megatron-habana" ]]; then dockerfile=Dockerfile.megatron.habana elif [[ $1 = "optimum-habana" ]]; then diff --git a/pretrain/megatron_deepspeed_pretrain.py b/pretrain/megatron_deepspeed_pretrain.py index 3ee347f6a..2890a4044 100644 --- a/pretrain/megatron_deepspeed_pretrain.py +++ b/pretrain/megatron_deepspeed_pretrain.py @@ -26,29 +26,39 @@ def train_func(config: Dict[str, Any]): os.chdir(cwd) try: - import pretrain_gpt + import pretrain_gpt as pretrain_module except ImportError: megatron_deepspeed_path = config.get("megatron_deepspeed_path", None) if megatron_deepspeed_path is not None: sys.path.append(megatron_deepspeed_path) - pretrain_gpt = importlib.import_module('pretrain_gpt') + pretrain_module_name = config.get("pretrain_module", None) + if pretrain_module_name is not None: + pretrain_module = importlib.import_module(pretrain_module_name) + else: + pretrain_module = importlib.import_module('pretrain_gpt') else: raise ImportError("Please set megatron_deepspeed_path in config") common.init(config) megatron_config = config.get('megatron_config', {}) - if hasattr(pretrain_gpt, 'ModelType'): - pretrain(pretrain_gpt.train_valid_test_datasets_provider, - pretrain_gpt.model_provider, - pretrain_gpt.ModelType.encoder_or_decoder, - pretrain_gpt.forward_step, + if hasattr(pretrain_module, 'ModelType'): + pretrain(pretrain_module.train_valid_test_datasets_provider, + pretrain_module.model_provider, + pretrain_module.ModelType.encoder_or_decoder, + pretrain_module.forward_step, args_defaults=megatron_config, - data_post_process=pretrain_gpt.data_post_process) + data_post_process=pretrain_module.data_post_process) + elif hasattr(pretrain_module, 'llama_argument_handler'): + pretrain(pretrain_module.train_valid_test_datasets_provider, + 
pretrain_module.model_provider, + pretrain_module.forward_step, + pretrain_module.llama_argument_handler, + args_defaults=megatron_config) else: - pretrain(pretrain_gpt.train_valid_test_datasets_provider, - pretrain_gpt.model_provider, - pretrain_gpt.forward_step, + pretrain(pretrain_module.train_valid_test_datasets_provider, + pretrain_module.model_provider, + pretrain_module.forward_step, args_defaults=megatron_config) def main(external_config = None): diff --git a/pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch b/pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch new file mode 100644 index 000000000..4d2f206b5 --- /dev/null +++ b/pretrain/patch/hpu/0001-Add-the-Huggingface-tokenizer.patch @@ -0,0 +1,145 @@ +From 1df9ba8d085f55d5141cdbe0857987dec12f1f7b Mon Sep 17 00:00:00 2001 +From: yuanwu +Date: Fri, 8 Dec 2023 04:53:13 +0000 +Subject: [PATCH] Add the Huggingface tokenizer + +Signed-off-by: yuanwu +--- + .../Megatron-DeepSpeed/megatron/arguments.py | 6 +- + .../megatron/tokenizer/tokenizer.py | 86 +++++++++++++++++++ + 2 files changed, 90 insertions(+), 2 deletions(-) + +diff --git a/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/arguments.py b/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/arguments.py +index b9861fa0..516c2abb 100644 +--- a/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/arguments.py ++++ b/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/arguments.py +@@ -871,8 +871,10 @@ def _add_data_args(parser): + 'BertWordPieceCase', + 'GPT2BPETokenizer', + 'SentencePieceTokenizer', +- 'LlamaTokenizer'], ++ 'HFTokenizer'], + help='What type of tokenizer to use.') ++ group.add_argument('--tokenizer-model', type=str, default=None, ++ help='Sentencepiece tokenizer model.') + group.add_argument('--data-impl', type=str, default='infer', + choices=['lazy', 'cached', 'mmap', 'infer'], + help='Implementation of indexed datasets.') +@@ -1174,4 +1176,4 @@ def _add_hpu_optimizations_args(parser): + action='store_true', + help='Flatten operands of linear layers what yields better performance') + +- return parser +\ No newline at end of file ++ return parser +diff --git a/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/tokenizer/tokenizer.py b/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/tokenizer/tokenizer.py +index e4a49306..7989be48 100644 +--- a/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/tokenizer/tokenizer.py ++++ b/PyTorch/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/tokenizer/tokenizer.py +@@ -18,6 +18,7 @@ + from abc import ABC + from abc import abstractmethod + ++from transformers import AutoTokenizer + from .bert_tokenization import FullTokenizer as FullBertTokenizer + from .gpt2_tokenization import GPT2Tokenizer + from .sentencepiece_tokenization import SentencePieceTokenizer +@@ -47,6 +48,10 @@ def build_tokenizer(args): + elif args.tokenizer_type == 'SentencePieceTokenizer': + assert args.tokenizer_model_file is not None + tokenizer = _SentencePieceTokenizer(args.tokenizer_model_file, args.tokenizer_eod_id) ++ elif args.tokenizer_type == 'HFTokenizer': ++ assert args.tokenizer_model is not None ++ tokenizer = _HFTokenizer(args.tokenizer_model) ++ + else: + raise NotImplementedError('{} tokenizer is not ' + 'implemented.'.format(args.tokenizer_type)) +@@ -328,3 +333,84 @@ class _SentencePieceTokenizer(AbstractTokenizer): + @property + def eod(self): + return self.eod_id ++ ++class _HFTokenizer(AbstractTokenizer): ++ """HF Tokenizer""" ++ def __init__(self, 
tokenizer_name_or_path): ++ name = tokenizer_name_or_path ++ super().__init__(name) ++ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) ++ self.encoder = self.tokenizer.get_vocab() ++ self.decoder = {v: k for k, v in self.encoder.items()} ++ ++ @property ++ def vocab_size(self): ++ return self.tokenizer.vocab_size ++ ++ @property ++ def vocab(self): ++ return self.encoder ++ ++ @property ++ def inv_vocab(self): ++ return self.decoder ++ ++ def tokenize(self, text): ++ return self.tokenizer.encode(text) ++ ++ def detokenize(self, token_ids): ++ return self.tokenizer.decode(token_ids) ++ ++ @property ++ def bos(self): ++ return self.bos_token_id ++ ++ @property ++ def bos_token_id(self): ++ candidate = self.tokenizer.eos_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def cls(self): ++ candidate = self.tokenizer.cls_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def sep(self): ++ candidate = self.tokenizer.sep_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def pad(self): ++ candidate = self.tokenizer.pad_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def eod(self): ++ candidate = self.tokenizer.eos_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def eos(self): ++ return self.eos_token_id ++ ++ @property ++ def eos_token_id(self): ++ candidate = self.tokenizer.eos_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def mask(self): ++ candidate = self.tokenizer.mask_token_id ++ return self._check_token_candidate(candidate) ++ ++ @property ++ def additional_special_tokens_ids(self): ++ return self.tokenizer.additional_special_tokens_ids ++ ++ @staticmethod ++ def _check_token_candidate(candidate): ++ """Checks whether the candidate is None or not, and raises an exception if it is.""" ++ if candidate is None: ++ raise AttributeError("Requested token doesn't exist in current tokenizer") ++ return candidate +-- +2.25.1 + diff --git a/pretrain/plugin/megatron_dataset.py b/pretrain/plugin/megatron_dataset.py index 41f5a0355..36fc33c9a 100644 --- a/pretrain/plugin/megatron_dataset.py +++ b/pretrain/plugin/megatron_dataset.py @@ -1,7 +1,7 @@ import numpy as np from megatron import get_args, print_rank_0 -from megatron.training import build_train_valid_test_datasets +from megatron.training import build_train_valid_test_datasets, update_train_iters from megatron.data import gpt_dataset from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset @@ -30,6 +30,8 @@ def _train_valid_test_datasets_provider(train_val_test_num_samples): return train_ds, valid_ds, test_ds + args = get_args() + update_train_iters(args) datasets = build_train_valid_test_datasets(_train_valid_test_datasets_provider) print_rank_0(datasets) return datasets diff --git a/pretrain/plugin/megtron_initializer.py b/pretrain/plugin/megtron_initializer.py index 959bcbe8d..5b520038e 100644 --- a/pretrain/plugin/megtron_initializer.py +++ b/pretrain/plugin/megtron_initializer.py @@ -16,4 +16,4 @@ def init(self): args = self.config["megatron_config"] initialize_megatron(ignore_unknown_args=True, args_defaults=args, allow_no_cuda=True) else: - logger.error("cannot initialize the megatron without the megatron_config") \ No newline at end of file + logger.error("cannot initialize the megatron without the megatron_config") diff --git a/pretrain/pretrain.py b/pretrain/pretrain.py index 46dce45a6..69ce217d3 100644 --- 
a/pretrain/pretrain.py +++ b/pretrain/pretrain.py @@ -17,9 +17,11 @@ import common import importlib +use_habana = False loader = importlib.util.find_spec('habana_frameworks') if loader is not None: from backend.habana_backend import TorchConfig + use_habana = True else: from ray.train.torch import TorchConfig from backend.deepspeed_backend import TorchConfig as DeepSpeedTorchConfig @@ -126,7 +128,8 @@ def main(external_config = None): if ( config['trainer'].get("training_config", None) and - config['trainer'].get("training_config").get("deepspeed", None) + config['trainer'].get("training_config").get("deepspeed", None) and + use_habana == False ): torch_config = DeepSpeedTorchConfig(**ray_config.get("torch_config", {})) else: diff --git a/pretrain/requirements.optimum-habana.txt b/pretrain/requirements.optimum-habana.txt new file mode 100644 index 000000000..4ff265841 --- /dev/null +++ b/pretrain/requirements.optimum-habana.txt @@ -0,0 +1,22 @@ +accelerate==0.21.0 +datasets==2.12.0 +numpy==1.24.4 +https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl +transformers==4.31.0 +typing==3.7.4.3 +tabulate +ray[tune] +ray[serve] +gradio +gymnasium +dm-tree +scikit-image +pydantic==1.10.11 +tensorboard +einops +gpustat==1.0.0 +peft==0.4.0 +evaluate +deltatuner==1.1.9 +scikit-learn +git+https://github.com/microsoft/Megatron-DeepSpeed.git#egg=megatron-core diff --git a/pyproject.toml b/pyproject.toml index 2a980317a..01d5160cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ classifiers = [ "Framework :: Ray" ] dependencies = [ - "accelerate>=0.21.0", + "accelerate", "datasets>=2.14.6", "numpy", "ray @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl", @@ -61,7 +61,7 @@ deepspeed = [ "psutil", "tqdm", # 0.10.2 is required to support bloom - "deepspeed==0.10.2" + "deepspeed>=0.10.2, <0.11.2" ] bigdl-cpu = [ diff --git a/rlhf/ppo.conf b/rlhf/ppo.conf deleted file mode 100644 index 076a270c3..000000000 --- a/rlhf/ppo.conf +++ /dev/null @@ -1,25 +0,0 @@ -# I am python, not json -{ - "General": { - "model_name": "EleutherAI/gpt2", - "model_pretrain": None, - "rm_name": "EleutherAI/gpt2", - "rm_pretrain": None, - }, - "Dataset": { - "train_file": "examples/data/sample_ppo_data.jsonl", - "validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "experience_batch_size": 2, - "training_iteration": 1000, - "learning_rate": 1e-5, - "kl_coeff": 0.2, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 56 - }, - }, -} diff --git a/rlhf/ppo.yaml b/rlhf/ppo.yaml new file mode 100644 index 000000000..36e8fbbc5 --- /dev/null +++ b/rlhf/ppo.yaml @@ -0,0 +1,18 @@ +General: + model_name: EleutherAI/gpt2 + model_pretrain: null + rm_name: EleutherAI/gpt2 + rm_pretrain: null +Dataset: + train_file: examples/data/sample_ppo_data.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + experience_batch_size: 2 + training_iteration: 1000 + learning_rate: 1.0e-05 + kl_coeff: 0.2 + num_training_workers: 2 + resources_per_worker: + CPU: 56 diff --git a/rlhf/reward.conf b/rlhf/reward.conf deleted file mode 100644 index dccaa149d..000000000 --- a/rlhf/reward.conf +++ /dev/null @@ -1,24 +0,0 @@ -{ - "General": { - "base_model": "EleutherAI/gpt2", - "output_dir": "/tmp/llm-ray/output/rm", - "checkpoint_dir": "/tmp/llm-ray/checkpoint/rm" - }, - "Dataset": { - "train_file": "examples/data/sample_rm_data.jsonl", - 
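Note on the `.conf` to YAML migration above: the new `rlhf/ppo.yaml` (and `reward.yaml` below) carries the same nested structure the Python-dict `.conf` files expressed, so it reads back into an equivalent dict. A small PyYAML sketch follows; the project itself may load these through `pydantic-yaml`, which is already in the dependency list, so treat this as illustrative rather than the actual loading path.

```
import yaml

with open("rlhf/ppo.yaml") as f:
    config = yaml.safe_load(f)

# Keys mirror the sections of the old ppo.conf.
assert config["General"]["model_name"] == "EleutherAI/gpt2"
assert config["Training"]["resources_per_worker"] == {"CPU": 56}
assert config["Dataset"]["validation_file"] is None  # YAML null -> Python None
```

The diff resumes below with the reward-model config.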
"validation_file": None, - "validation_split_percentage": 5 - }, - "Training": { - "optimizer": "AdamW", - "batch_size": 2, - "epochs": 3, - "learning_rate": 1e-5, - "lr_scheduler": "linear", - "weight_decay": 0.0, - "num_training_workers": 2, - "resources_per_worker": { - "CPU": 32 - }, - }, -} diff --git a/rlhf/reward.yaml b/rlhf/reward.yaml new file mode 100644 index 000000000..77da9bd54 --- /dev/null +++ b/rlhf/reward.yaml @@ -0,0 +1,18 @@ +General: + base_model: EleutherAI/gpt2 + output_dir: /tmp/llm-ray/output/rm + checkpoint_dir: /tmp/llm-ray/checkpoint/rm +Dataset: + train_file: examples/data/sample_rm_data.jsonl + validation_file: null + validation_split_percentage: 5 +Training: + optimizer: AdamW + batch_size: 2 + epochs: 3 + learning_rate: 1.0e-05 + lr_scheduler: linear + weight_decay: 0.0 + num_training_workers: 2 + resources_per_worker: + CPU: 32 diff --git a/tools/workload_in_containers/Dockerfile.dp b/tools/workload_in_containers/Dockerfile.dp deleted file mode 100755 index 7a8df2ab3..000000000 --- a/tools/workload_in_containers/Dockerfile.dp +++ /dev/null @@ -1,34 +0,0 @@ -FROM ubuntu:22.04 - -RUN apt-get update && apt-get install -y \ - python3.10 \ - python3-pip \ - python-is-python3 \ - wget \ - git \ - build-essential \ - vim \ - htop \ - ssh \ - net-tools - -WORKDIR /home/user - -RUN pip install -U ray[default,data] - -RUN pip install astunparse nltk gymnasium pyyaml datasets presidio_analyzer presidio_anonymizer sentencepiece transformers -RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu -RUN python -m spacy download en_core_web_lg parquet-tools - -#install PII detection/redaction related libs for code -RUN pip install gibberish-detector -RUN pip install detect-secrets - -# enable password-less ssh -RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -P '' && \ - cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \ - sed -i 's/# Port 22/Port 12345/' /etc/ssh/ssh_config && \ - sed -i 's/#Port 22/Port 12345/' /etc/ssh/sshd_config - -CMD ["sh", "-c", "service ssh start; bash"] - diff --git a/tools/workload_in_containers/Dockerfile.optimum.habana b/tools/workload_in_containers/Dockerfile.optimum.habana deleted file mode 100644 index f09c0d7f0..000000000 --- a/tools/workload_in_containers/Dockerfile.optimum.habana +++ /dev/null @@ -1,23 +0,0 @@ -FROM vault.habana.ai/gaudi-docker/1.13.0/ubuntu20.04/habanalabs/pytorch-installer-2.1.0:latest -ENV DEBIAN_FRONTEND=noninteractive -WORKDIR /home/user -RUN pip install lz4 numpy==1.24.4 \ - https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl \ - tensorboard gpustat==1.0.0 sentencepiece \ - accelerate==0.19.0 \ - datasets==2.12.0 gymnasium transformers==4.34.0 \ - dm-tree scikit-image evaluate peft==0.5.0 scikit-learn -RUN pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.13.0 -RUN pip install --upgrade-strategy eager optimum[habana] -COPY pretrain/patch/hpu/constants.py /usr/local/lib/python3.8/dist-packages/deepspeed/checkpoint/ -COPY pretrain/patch/hpu/state.py /usr/local/lib/python3.8/dist-packages/optimum/habana/accelerate/ -COPY pretrain/patch/gpu/0001-Change-the-sample-s-column-name.patch . -COPY pretrain/patch/gpu/0001-Add-init.py-to-include-the-megatron.model.vision-int.patch . 
-RUN git config --global user.email "root@example.com" -RUN git config --global user.name "root" -RUN git clone https://github.com/microsoft/Megatron-DeepSpeed.git && \ - cd Megatron-DeepSpeed && \ - git checkout -b ray 796866fa74f23850b977d4023a7ed4f0031844ae && \ - git am /home/user/0001-Change-the-sample-s-column-name.patch && \ - git am /home/user/0001-Add-init.py-to-include-the-megatron.model.vision-int.patch && \ - pip install . From c8a0da495dc67687d51e5ab3eb7ab644fe6c51da Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Thu, 21 Dec 2023 21:43:00 +0800 Subject: [PATCH 07/14] add pyproject.toml to be ci monitored Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_orders_on_merge.yml | 1 + .github/workflows/workflow_orders_on_pr.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/workflow_orders_on_merge.yml b/.github/workflows/workflow_orders_on_merge.yml index e453f242b..56bda5006 100644 --- a/.github/workflows/workflow_orders_on_merge.yml +++ b/.github/workflows/workflow_orders_on_merge.yml @@ -13,6 +13,7 @@ on: - 'inference/**' - 'rlhf/**' - 'tools/**' + - 'pyproject.toml' jobs: diff --git a/.github/workflows/workflow_orders_on_pr.yml b/.github/workflows/workflow_orders_on_pr.yml index e13bccecf..2c8f93f3d 100644 --- a/.github/workflows/workflow_orders_on_pr.yml +++ b/.github/workflows/workflow_orders_on_pr.yml @@ -13,6 +13,7 @@ on: - 'inference/**' - 'rlhf/**' - 'tools/**' + - 'pyproject.toml' jobs: From 019ba9ae82136cd0103daa91260514e3d0f7d571 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 21 Dec 2023 15:08:53 +0800 Subject: [PATCH 08/14] Bump paramiko from 3.2.0 to 3.4.0 (#7) Bumps [paramiko](https://github.com/paramiko/paramiko) from 3.2.0 to 3.4.0. - [Commits](https://github.com/paramiko/paramiko/compare/3.2.0...3.4.0) --- updated-dependencies: - dependency-name: paramiko dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 01d5160cc..47dccaec7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "deltatuner==1.1.9", "py-cpuinfo", "pydantic-yaml", - "paramiko==3.2.0", + "paramiko==3.4.0", ] [project.optional-dependencies] From 6ab749f044c383888b8309b4e0f638a6c7935efc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 21 Dec 2023 16:04:10 +0800 Subject: [PATCH 09/14] Bump transformers from 4.31.0 to 4.36.0 in /pretrain (#8) Bumps [transformers](https://github.com/huggingface/transformers) from 4.31.0 to 4.36.0. - [Release notes](https://github.com/huggingface/transformers/releases) - [Commits](https://github.com/huggingface/transformers/compare/v4.31.0...v4.36.0) --- updated-dependencies: - dependency-name: transformers dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pretrain/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain/requirements.txt b/pretrain/requirements.txt index fa0d041cd..e25a3e55b 100644 --- a/pretrain/requirements.txt +++ b/pretrain/requirements.txt @@ -4,7 +4,7 @@ numpy==1.24.4 https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl torchvision==0.14.1 torch==1.13.1 -transformers==4.31.0 +transformers==4.36.0 typing==3.7.4.3 tabulate ray[tune] From c8e46c6b8e35bd52a5a9ce9dfa0ee8d4e34add3b Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Wed, 27 Dec 2023 18:37:00 +0800 Subject: [PATCH 10/14] fixed some merge error Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 2 +- .github/workflows/workflow_inference.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 4749750e4..106e9118b 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Load environment variables - run: cat /root/actions-runner-config/.env >> $GITHUB_ENV + run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV - name: Build Docker Image run: | diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index eb3d978b9..5c3f0f5f6 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -16,7 +16,6 @@ jobs: name: inference test strategy: matrix: - # for mistral-7b-v0.1, we use bigdl-cpu to verify model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1 ] isPR: - ${{inputs.ci_type == 'pr'}} From 2730576700948fea8452a52c0e0ed2ac4a73bcd6 Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Wed, 27 Dec 2023 21:15:22 +0800 Subject: [PATCH 11/14] fixed some merge error Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 106e9118b..df236c76b 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Load environment variables - run: cat ~/llm-ray-actions-runner/.env >> $GITHUB_ENV + run: cat ${{ ACTIONS_RUNNER_CONFIG_PATH }}/.env >> $GITHUB_ENV - name: Build Docker Image run: | From 8db74ec6b3c9c47e242f721f840821b46cb490e8 Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Wed, 27 Dec 2023 21:17:46 +0800 Subject: [PATCH 12/14] fixed some merge error Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index df236c76b..261421c42 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Load environment variables - run: cat ${{ ACTIONS_RUNNER_CONFIG_PATH }}/.env >> $GITHUB_ENV + run: cat ${{ vars.ACTIONS_RUNNER_CONFIG_PATH }}/.env >> $GITHUB_ENV - name: Build Docker Image run: | From 91df9e9733d92089abeab443779a433d4821065a Mon Sep 17 00:00:00 2001 From: Jiafu Zhang Date: Wed, 27 Dec 2023 21:20:21 +0800 
Subject: [PATCH 13/14] fixed some merge error Signed-off-by: Jiafu Zhang --- .github/workflows/workflow_finetune.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 261421c42..4749750e4 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Load environment variables - run: cat ${{ vars.ACTIONS_RUNNER_CONFIG_PATH }}/.env >> $GITHUB_ENV + run: cat /root/actions-runner-config/.env >> $GITHUB_ENV - name: Build Docker Image run: | From 7a2b54ac1fdba6bb7145420f45ea3bee857e5d52 Mon Sep 17 00:00:00 2001 From: harborn Date: Thu, 28 Dec 2023 10:05:45 +0800 Subject: [PATCH 14/14] remove nightly ray, and use newest release version (#15) * remove nightly ray, and use newest release version * update * update * update --- common/trainer/default_trainer.py | 6 ++++-- finetune/finetune.py | 3 +++ pyproject.toml | 6 +++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index d013f28e2..f3aa965b9 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -57,8 +57,10 @@ def recovery(self, config): self.starting_epoch = checkpoint_epoch["epoch"] + 1 logger.info(f"recovery to epoch {self.starting_epoch}") + except FileNotFoundError as e: + logger.info(e) except Exception as e: - logger.warning(f"recovery error", exc_info=True) + logger.warning("recovery error", exc_info=True) def _coordinate(self, accelerator): self.accelerator = accelerator @@ -174,7 +176,7 @@ def train(self): except OverflowError: eval_loss = float("inf") perplexity = float("inf") - logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss}]\tppl:[{perplexity}]\ttime:[{time.time()-start}]") + logger.info(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]") if checkpoint is not None: self.save(checkpoint, idx) diff --git a/finetune/finetune.py b/finetune/finetune.py index 088ef89f1..430c452de 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -108,6 +108,7 @@ def train_func(config: Dict[str, Any]): trainer = common.trainer.Trainer.registory.get("DefaultTrainer")(config = { "num_train_epochs": config["Training"]["epochs"], "max_train_step": config["Training"].get("max_train_steps", None), + "log_step": 1, "output": config["General"]["output_dir"], "dataprocesser": { "type": "GeneralProcesser", @@ -200,6 +201,8 @@ def main(external_config = None): ray.init(runtime_env = runtime_env) + common.logger.info(f"ray available resources = {ray.available_resources()}") + scaling_config = ScalingConfig( num_workers = num_training_workers, use_gpu = use_gpu, diff --git a/pyproject.toml b/pyproject.toml index 47dccaec7..d5c4396c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "accelerate", "datasets>=2.14.6", "numpy", - "ray @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl", + "ray>=2.9", "typing>=3.7.4.3", "tabulate", "ray[tune]", @@ -52,8 +52,8 @@ gpu = [ "torch==2.0.1a0", "torchvision==0.15.2a0", "intel-extension-for-pytorch==2.0.110+xpu", - "oneccl_bind_pt", - "dpctl" + "oneccl_bind_pt==2.0.100+gpu", + "dpctl==0.14.5" ] deepspeed = [
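Note on the trainer change in PATCH 14 above: the reformatted eval log assumes `eval_loss` and `perplexity` are plain floats, with perplexity presumably computed as `exp(eval_loss)` inside the existing `try/except OverflowError` guard. A standalone sketch of that pattern, with illustrative numbers:

```
import math
import time

start = time.time()
eval_loss = 2.31  # illustrative; in the trainer this comes from the eval loop

try:
    perplexity = math.exp(eval_loss)
except OverflowError:
    eval_loss = float("inf")
    perplexity = float("inf")

idx, num_train_epochs = 1, 3
print(f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\t"
      f"ppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]")
```

With the `:.6f` format specifiers, an overflowed loss still prints cleanly as `inf`, which is why the guard sets both values to infinity rather than skipping the log line.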