Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
BeachWang committed Jan 23, 2025
2 parents 0195bd7 + ba40e47 commit 30ccaa7
Show file tree
Hide file tree
Showing 364 changed files with 27,115 additions and 3,986 deletions.
14 changes: 14 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# coverage.py configuration.
[run]
# Paths excluded from measurement. Every continuation line of this
# multi-line value must be indented deeper than `omit`, otherwise the INI
# parser does not treat it as part of the value.
omit =
    # avoid measuring strange non-existing files
    /workspace/config.py
    /workspace/config-3.py

    # avoid measuring third-party dist packages
    */dist-packages/*

    # avoid measuring code of unittest
    tests/*

[report]
# Do not abort the report when a measured source file cannot be processed.
ignore_errors = True
5 changes: 4 additions & 1 deletion .github/workflows/deploy_sphinx_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@ on:
jobs:
pages:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [ "3.9", "3.10" ]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@master
with:
python_version: ${{ matrix.python-version }}
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ services:
- TORCH_HOME=/data/torch
- NLTK_DATA=/data/nltk
- DATA_JUICER_CACHE_HOME=/data/dj
- EASYOCR_MODULE_PATH=/data/EasyOCR
- RAY_ADDRESS=auto
working_dir: /workspace
networks:
Expand All @@ -20,7 +21,7 @@ services:
ports:
- "6379:6379"
- "8265:8265"
shm_size: "64G"
shm_size: "128G"
deploy:
resources:
reservations:
Expand All @@ -39,6 +40,7 @@ services:
- TORCH_HOME=/data/torch
- NLTK_DATA=/data/nltk
- DATA_JUICER_CACHE_HOME=/data/dj
- EASYOCR_MODULE_PATH=/data/EasyOCR
working_dir: /workspace
volumes:
- huggingface_cache:/data
Expand All @@ -47,7 +49,7 @@ services:
- ray-head
networks:
- ray-network
shm_size: "64G"
shm_size: "128G"
deploy:
resources:
reservations:
Expand Down
56 changes: 56 additions & 0 deletions .github/workflows/perf-bench.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Runs the performance benchmark script inside the docker compose `ray-head`
# service, on a runner labelled `GPU` and `unittest`.
# Triggered manually (workflow_dispatch) or by a push to `main`.

name: performance_benchmark

on:
  workflow_dispatch:
  push:
    branches:
      - main

permissions:
  contents: read

env:
  # NOTE(review): presumably required so the older actions/checkout@v3 can run
  # on this runner's Node version; confirm before removing.
  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

jobs:
  perf_bench:
    runs-on: [GPU, unittest]
    environment: Testing
    steps:
      # Check out into a directory named after the run id; the final step
      # removes exactly this directory again.
      - uses: actions/checkout@v3
        with:
          path: dj-${{ github.run_id }}

      - name: Setup docker compose
        working-directory: dj-${{ github.run_id }}/.github/workflows/docker
        run: |
          docker compose up -d

      - name: Install data-juicer
        working-directory: dj-${{ github.run_id }}/.github/workflows/docker
        run: |
          docker compose exec ray-head pip install -e .\[all\]

      - name: Clean dataset cache
        working-directory: dj-${{ github.run_id }}/.github/workflows/docker
        run: |
          docker compose exec ray-head rm -rf /data/huggingface/dataset

      - name: Run performance benchmark standalone
        working-directory: dj-${{ github.run_id }}/.github/workflows/docker
        # Secrets are handed over through intermediate environment variables
        # and expanded (quoted) by the shell, instead of being pasted into the
        # script text by the expression engine.
        env:
          INTERNAL_WANDB_URL: ${{ secrets.INTERNAL_WANDB_URL }}
          INTERNAL_WANDB_API_KEY: ${{ secrets.INTERNAL_WANDB_API_KEY }}
        run: |
          docker compose exec ray-head bash tests/benchmark_performance/run.sh "$INTERNAL_WANDB_URL" "$INTERNAL_WANDB_API_KEY"

      # Teardown steps use `always()` so they run even when an earlier step fails.
      - name: Remove docker compose
        working-directory: dj-${{ github.run_id }}/.github/workflows/docker
        if: always()
        run: |
          docker compose down --remove-orphans

      - name: Cleanup workspace
        if: always()
        run: |
          rm -rf dj-${{ github.run_id }}
87 changes: 87 additions & 0 deletions .github/workflows/publish-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
name: Publish Docker Image

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# Runs on manual dispatch or when a release is published. There is no
# pull-request trigger, so every run logs in, pushes and signs.
on:
  workflow_dispatch:
  release:
    types: [published]

env:
  IMAGE_NAME: datajuicer/data-juicer
  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

jobs:
  build:
    runs-on: [docker]
    permissions:
      contents: read
      packages: write
      attestations: write
      # This is used to complete the identity challenge
      # with sigstore/fulcio.
      id-token: write

    steps:
      # Check out into a directory named after the run id; the same path is
      # used as the Docker build context below.
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          path: dj-${{ github.run_id }}

      # Install the cosign tool used by the signing step.
      # https://github.com/sigstore/cosign-installer
      # NOTE(review): the action ref was garbled in the reviewed copy of this
      # file; `v3.7.0` is an assumption and must be confirmed against the
      # repository before merging.
      - name: Install cosign
        uses: sigstore/cosign-installer@v3.7.0
        with:
          cosign-release: 'v2.4.1'

      # Set up BuildKit Docker container builder to be able to build
      # multi-platform images and export cache
      # https://github.com/docker/setup-buildx-action
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      # Log in to Docker Hub with the repository secrets.
      # https://github.com/docker/login-action
      - name: Log into Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      # Extract metadata (tags, labels) for Docker
      # https://github.com/docker/metadata-action
      - name: Extract Docker metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.IMAGE_NAME }}

      # Build the image with Buildx and push it (push is unconditional).
      # https://github.com/docker/build-push-action
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v6
        with:
          context: dj-${{ github.run_id }}
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

      # Sign the resulting Docker image digest.
      # This will only write to the public Rekor transparency log when the Docker
      # repository is public to avoid leaking data. If you would like to publish
      # transparency data even for private images, pass --force to cosign below.
      # https://github.com/sigstore/cosign
      - name: Sign the published Docker image
        env:
          # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable
          TAGS: ${{ steps.meta.outputs.tags }}
          DIGEST: ${{ steps.build-and-push.outputs.digest }}
        # This step uses the identity token to provision an ephemeral certificate
        # against the sigstore community Fulcio instance.
        # Each tag from TAGS is signed as <tag>@<digest>.
        run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST}
40 changes: 40 additions & 0 deletions .github/workflows/publish-pypi.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Builds the Python distribution with `python -m build` and uploads it to PyPI
# when a release is published (or when the workflow is dispatched manually).
# Background: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Publish PyPi Package

on:
  workflow_dispatch:
  release:
    types:
      - published

permissions:
  contents: read

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      # `build` is the only tool needed to produce the distribution files.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build

      - name: Build package
        run: python -m build

      # The publish action is pinned to a full commit SHA; authentication uses
      # the `__token__` user with the API token stored in repository secrets.
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
        with:
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}
4 changes: 2 additions & 2 deletions .github/workflows/unit-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ env:

jobs:
unittest-single:
runs-on: [self-hosted, linux]
runs-on: [GPU, unittest]
environment: Testing
steps:
- uses: actions/checkout@v3
Expand All @@ -44,7 +44,7 @@ jobs:
- name: Run unittest standalone
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head python tests/run.py --tag standalone
docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head python tests/run.py --tag standalone
- name: Run unittest ray
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
Expand Down
11 changes: 10 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,21 @@ repos:
- id: mixed-line-ending
exclude: thirdparty/
args: [ "--fix=lf" ]
- repo: local
hooks:
- id: build-op-doc
name: build OP doc
entry: python .pre-commit-hooks/build_op_doc.py
language: python
require_serial: true
additional_dependencies:
- translators==5.9.3

exclude: |
(?x)^(
docs/.*|
tests/.*|
demos/.*|
demos/(?!api_service/).*|
tools/mm_eval/inception_metrics/.*|
thirdparty/easy_animate/.*|
.*\.md
Expand Down
Loading

0 comments on commit 30ccaa7

Please sign in to comment.