Skip to content

Commit

Permalink
Merge branch 'intel:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
XinyaoWa authored Nov 7, 2023
2 parents b8219d5 + 4553256 commit 35dfba2
Show file tree
Hide file tree
Showing 144 changed files with 27,180 additions and 3,720 deletions.
37 changes: 17 additions & 20 deletions .github/workflows/e2eaiok_deltatuner_nightly_pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,38 @@ name: Publish E2EAIOK Deltatuner nightly to PyPI

on:
workflow_dispatch:
push:
branches:
- main
paths:
- 'e2eAIOK/deltatuner/version'

permissions:
contents: read
packages: write

jobs:
e2eaiok-recdp-nightly-python-pypi:
runs-on: self-hosted
e2eaiok-dtuner-nightly-python-pypi:
runs-on: ubuntu-latest
permissions:
contents: read
id-token: write

steps:
- uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2

- name: days since the commit date
run: |
:
timestamp=$(git log --no-walk --date=unix --format=%cd $GITHUB_SHA)
days=$(( ( $(date --utc +%s) - $timestamp ) / 86400 ))
if [ $days -eq 0 ]; then
echo COMMIT_TODAY=true >> $GITHUB_ENV
fi
- name: Build sdist
run: |
pip install build wheel
cd e2eAIOK/deltatuner
release_version=$(cat version | head -1)
nightly_build_date=`date '+%Y%m%d%H'`
release_version=$(cat deltatuner/version | head -1)
nightly_build_date=`date '+%Y%m%d%H%M'`
nightly_version=${release_version}b${nightly_build_date}
echo $nightly_version > version
echo $nightly_version > deltatuner/version
python3 setup.py sdist
- name: Upload
uses: pypa/gh-action-pypi-publish@master
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN_PYRECDP }}
packages_dir: e2eAIOK/deltatuner/dist
packages-dir: e2eAIOK/deltatuner/dist
verbose: true
28 changes: 28 additions & 0 deletions .github/workflows/e2eaiok_deltatuner_release_pypi.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Manually triggered workflow that builds the Deltatuner source distribution
# from e2eAIOK/deltatuner and uploads it to PyPI.
# Unlike the nightly workflow, the version file is left untouched, so the
# package is published with the version committed in the repository.
name: Publish E2EAIOK Deltatuner release to PyPI

on:
  workflow_dispatch:

jobs:
  e2eaiok-dtuner-release-python-pypi:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # No password input is passed to the upload step, so publishing
      # presumably relies on PyPI trusted publishing (OIDC) - confirm that
      # the PyPI project is configured for this workflow file.
      id-token: write

    steps:
      - uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2

      - name: Build sdist
        run: |
          pip install build wheel
          cd e2eAIOK/deltatuner
          python3 setup.py sdist

      - name: Upload
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          # Directory the "Build sdist" step writes the archive to.
          packages-dir: e2eAIOK/deltatuner/dist
55 changes: 55 additions & 0 deletions .github/workflows/release_docker_e2eaiokv12.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Builds Dockerfile-ubuntu/Dockerfile-v1.2 and pushes the image to Docker Hub,
# tagged with the version read from e2eAIOK/version and with "latest".
name: Publish e2eAIOK Docker v1.2 for RecDP and deltatuner

on:
  # Runs after the nightly Docker Hub workflow finishes (any conclusion).
  workflow_run:
    workflows:
      - "Publish e2eAIOK Nightly Release to dockerhub"
    types: [completed]
  workflow_dispatch:
  # Runs when the Dockerfile or this workflow changes on main.
  push:
    branches: [main]
    paths:
      - 'Dockerfile-ubuntu/Dockerfile-v1.2'
      - '.github/workflows/release_docker_e2eaiokv12.yml'

jobs:
  e2eaiok-release-docker:
    runs-on: self-hosted
    # Only publish from the upstream organisation, never from forks.
    if: ${{ github.repository_owner == 'intel' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # Export the release version and the runner's proxy settings so that
      # later steps can read them through the `env` context.
      - name: Set up Env
        run: |
          echo RELEASE_VERSION=$(cat e2eAIOK/version | head -1) >> $GITHUB_ENV
          echo http_proxy=${http_proxy} >> $GITHUB_ENV
          echo https_proxy=${https_proxy} >> $GITHUB_ENV
          echo no_proxy=${no_proxy} >> $GITHUB_ENV

      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}

      # Forward the proxy settings to the buildx builder container.
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          driver-opts: |
            env.http_proxy=${{ env.http_proxy }}
            env.https_proxy=${{ env.https_proxy }}
            "env.no_proxy='${{ env.no_proxy}}'"

      - name: Build and Push Docker images
        uses: docker/build-push-action@v3
        with:
          context: ./Dockerfile-ubuntu
          file: ./Dockerfile-ubuntu/Dockerfile-v1.2
          push: true
          tags: ${{ secrets.DOCKER_HUB_USERNAME }}/e2eaiok-v1.2:${{ env.RELEASE_VERSION }} , ${{ secrets.DOCKER_HUB_USERNAME }}/e2eaiok-v1.2:latest
          build-args: |
            "http_proxy=${{ env.http_proxy }}"
            "https_proxy=${{ env.https_proxy }}"
30 changes: 30 additions & 0 deletions .github/workflows/unittest_llmutils_operations.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Runs the LLM operations unit tests inside a Docker image on pull requests
# to main that touch the LLM code, the operation primitives, or the tests.
name: Unittest LLM Operations
on:
  pull_request:
    branches:
      - main
    paths:
      - 'RecDP/pyrecdp/LLM/**'
      - 'RecDP/pyrecdp/primitives/operations/**'
      # Must match the script executed by the "RUN Unittests" step, so that
      # edits to that script re-run this workflow.
      - 'RecDP/tests/cicd/bashrun_unittest_llmutils_operations.sh'
      - 'RecDP/tests/test_llmutils_operations.py'

jobs:
  unittest:
    name: Unittest LLM
    runs-on: self-hosted
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # Build the test image, then prune stopped containers and dangling
      # images (`yes |` answers the confirmation prompts).
      - name: Build Docker
        run: |
          cd RecDP && cd Dockerfile \
            && docker build -t pyrecdp-test-env . -f DockerfileUbuntu \
              --build-arg https_proxy=${https_proxy} \
            && cd .. \
            && yes | docker container prune \
            && yes | docker image prune

      # Mount the RecDP checkout as the working directory and run the suite.
      - name: RUN Unittests
        run: |
          cd RecDP && docker run --rm --name unittest-pyrecdp-autofe-pandas \
            --shm-size=300g --privileged --network host --device=/dev/dri \
            -e RECDP_MODELS_CACHE=/home/vmagent/models \
            -v `pwd`:/home/vmagent/app/ \
            -v `pwd`/../../models:/home/vmagent/models/ \
            -w /home/vmagent/app/ \
            pyrecdp-test-env \
            /bin/bash -c "sh tests/cicd/bashrun_unittest_llmutils_operations.sh"

      - name: Clean env
        # Run even if the tests failed or the job was cancelled, so that a
        # leftover container cannot break the next run on this runner.
        if: always()
        run: |
          # -q prints container IDs only (no header row); the pipe feeds them
          # to the loop, which is a no-op when nothing matches the filter.
          docker ps -q --filter name=unittest-pyrecdp-autofe-pandas | while read -r cid; do docker stop "$cid"; done
23 changes: 23 additions & 0 deletions Dockerfile-ubuntu/Dockerfile-v1.2
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Base image for e2eAIOK v1.2 (RecDP and deltatuner): Ubuntu with Python 3,
# Java 8, and the Python packages installed below.
#
# The release is pinned on purpose. The bare `ubuntu` tag follows the newest
# LTS, whose Python is newer than the pinned torch==1.13.0 supports and whose
# pip rejects system-wide installs (PEP 668), which would break the
# `pip install` layers below.
FROM ubuntu:22.04

# See http://bugs.python.org/issue19846
ENV LANG=C.UTF-8

# Install system dependencies
RUN apt-get -y update \
    && apt-get install -y build-essential \
    && apt-get install -y wget numactl git \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y python3 python3-pip python-is-python3 graphviz \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Fine-tuning stack; torch, peft and protobuf are pinned to specific versions.
RUN pip install torch==1.13.0 transformers datasets accelerate SentencePiece \
    evaluate peft==0.4.0 torchsummary \
    nltk rouge_score protobuf==3.20.1 tokenizers einops wandb \
    sigopt

# Data-processing and notebook tooling.
RUN pip install --upgrade pip
RUN pip install pyspark
RUN pip install graphviz jupyterlab
RUN pip install featuretools
23 changes: 8 additions & 15 deletions RecDP/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,37 +37,30 @@ transformed_train_df = pipeline.fit_transform()

Low Code to build your own pipeline
```
from pyrecdp.LLM import TextPipeline
pipeline = TextPipeline("usecase/finetune_pipeline.yaml")
from pyrecdp.LLM import ResumableTextPipeline
pipeline = ResumableTextPipeline("usecase/finetune_pipeline.yaml")
ret = pipeline.execute()
```
or
```
from pyrecdp.primitives.operations import *
from pyrecdp.LLM import TextPipeline
from pyrecdp.LLM import ResumableTextPipeline
pipeline = TextPipeline()
pipeline = ResumableTextPipeline()
ops = [
JsonlReader(input_dir = "in_path"),
JsonlReader("data/"),
URLFilter(),
LengthFilter(),
ProfanityFilter(),
TextFix(),
LanguageIdentify(),
Classify(),
FuzzyDeduplicate(),
DocumentSplit(),
GlobalDeduplicate(),
PIIRemoval(),
ParquetWriter(out_dir = "out_path"),
PerfileParquetWriter("ResumableTextPipeline_output")
]
ret = pipeline.add_operations(ops).execute(dataset)
pipeline.add_operations(ops)
pipeline.execute()
```

* Pre-Training LLM Quality Control

WIP...

## LICENSE
* Apache 2.0

Expand Down
Loading

0 comments on commit 35dfba2

Please sign in to comment.