diff --git a/.github/workflows/test-packaging-python.yml b/.github/workflows/test-packaging-python.yml new file mode 100644 index 000000000..4ee491c8e --- /dev/null +++ b/.github/workflows/test-packaging-python.yml @@ -0,0 +1,49 @@ +name: Test - transforms/packaging/python + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - "transforms/packaging/python/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - "transforms/packaging/python/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +jobs: + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/packaging/python + run: | + if [ -e "transforms/packaging/python/Makefile" ]; then + make -C transforms/packaging/python DOCKER=docker test-src + else + echo "transforms/packaging/python/Makefile not found - source testing disabled for this transform." 
+ fi diff --git a/.github/workflows/test-packaging-ray.yml b/.github/workflows/test-packaging-ray.yml new file mode 100644 index 000000000..4b812540c --- /dev/null +++ b/.github/workflows/test-packaging-ray.yml @@ -0,0 +1,49 @@ +name: Test - transforms/packaging/ray + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - "transforms/packaging/ray/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - "transforms/packaging/ray/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +jobs: + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/packaging/ray + run: | + if [ -e "transforms/packaging/ray/Makefile" ]; then + make -C transforms/packaging/ray DOCKER=docker test-src + else + echo "transforms/packaging/ray/Makefile not found - source testing disabled for this transform." 
+ fi diff --git a/.make.defaults b/.make.defaults index 510e3fc05..8d7f454da 100644 --- a/.make.defaults +++ b/.make.defaults @@ -480,7 +480,8 @@ endif if [ -e requirements.txt ]; then \ echo Installing requirements from requirements.txt; \ pip install $(PIP_INSTALL_EXTRA_ARGS) $$extra_url -r requirements.txt; \ - elif [ -e pyproject.toml ]; then \ + fi; \ + if [ -e pyproject.toml ]; then \ echo Installing from pyproject.toml; \ pip install $(PIP_INSTALL_EXTRA_ARGS) $$extra_url -e .; \ fi @@ -587,6 +588,18 @@ MINIO_ADMIN_PWD= localminiosecretkey > tt.toml; \ mv tt.toml pyproject.toml; \ fi + @if [ -e requirements.txt ]; then \ + cat requirements.txt | sed \ + -e 's/data-prep-toolkit-ray\([=><~][=]\).*/data-prep-toolkit-ray\1$(DPK_LIB_VERSION)/' \ + -e 's/data-prep-toolkit-transforms\([=><~][=]\).*/data-prep-toolkit-transforms\1$(DPK_TRANSFORMS_VERSION)/' \ + -e 's/data-prep-toolkit-spark\([=><~][=]\).*/data-prep-toolkit-spark\1$(DPK_LIB_VERSION)/' \ + -e 's/data-prep-toolkit-kfp\([=><~][=]\).*/data-prep-toolkit-kfp\1$(DPK_LIB_KFP_VERSION)/' \ + -e 's/data-prep-toolkit\([=><~][=]\).*/data-prep-toolkit\1$(DPK_LIB_VERSION)/' \ + -e 's/ray\[default\]\([=><~][=]\).*/ray\[default\]\1$(RAY)/' \ + -e 's/data-prep-toolkit-kfp-shared\(..\).*/data-prep-toolkit-kfp-shared\1$(DPK_LIB_KFP_VERSION)/' \ + > tt.txt; \ + mv tt.txt requirements.txt; \ + fi # Build the distribution, usually in preparation for publishing using ith the .defaults.publish-dist target .PHONY: .defaults.build-dist diff --git a/.make.versions b/.make.versions index f0dc3428c..54e6d8ca1 100644 --- a/.make.versions +++ b/.make.versions @@ -19,7 +19,7 @@ DPK_MINOR_VERSION=2 DPK_MICRO_VERSION=1 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. 
-DPK_VERSION_SUFFIX=.dev0 +DPK_VERSION_SUFFIX=.dev3 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) @@ -103,6 +103,8 @@ PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION) HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION) +DPK_TRANSFORMS_VERSION=$(DPK_VERSION) + ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. @@ -117,3 +119,4 @@ ifeq ($(KFPv2), 1) else WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support endif + diff --git a/data-processing-lib/python/pyproject.toml b/data-processing-lib/python/pyproject.toml index d8e98aa8d..9ff6c2d7f 100644 --- a/data-processing-lib/python/pyproject.toml +++ b/data-processing-lib/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Library" diff --git a/data-processing-lib/ray/pyproject.toml b/data-processing-lib/ray/pyproject.toml index 8fad2e9de..3f347cdf4 100644 --- a/data-processing-lib/ray/pyproject.toml +++ b/data-processing-lib/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10" description = "Data Preparation Toolkit Library for Ray" @@ -11,7 +11,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit>=0.2.1.dev3", "ray[default]==2.24.0", # These two are to fix security issues identified by quay.io "fastapi>=0.110.2", diff --git a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml index 
30cb8f032..b6e9edddb 100644 --- a/data-processing-lib/spark/pyproject.toml +++ b/data-processing-lib/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_spark" -version = "0.2.1.dev0" +version = "0.2.1.dev3" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10" description = "Data Preparation Toolkit Library for Spark" @@ -11,7 +11,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "pyspark>=3.5.2", "psutil>=6.0.0" ] diff --git a/examples/notebooks/rag/requirements.txt b/examples/notebooks/rag/requirements.txt index 3c1a464d0..4578b1ea8 100644 --- a/examples/notebooks/rag/requirements.txt +++ b/examples/notebooks/rag/requirements.txt @@ -1,6 +1,6 @@ ## Data prep kit -data-prep-toolkit-transforms==0.2.1.dev1 -data-prep-toolkit-transforms-ray==0.2.1.dev1 +#data-prep-toolkit-transforms==0.2.1.dev1 +#data-prep-toolkit-transforms-ray==0.2.1.dev1 @@ -53,4 +53,4 @@ ipython ipywidgets IProgress chardet==5.2.0 -charset-normalizer==3.3.2 \ No newline at end of file +charset-normalizer==3.3.2 diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index 6e8a7e458..eaea5fb0d 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10,<3.12" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.1.dev0", + "data-prep-toolkit-kfp-shared==0.2.1.dev3", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 632e414ca..c5ca32f1a 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10,<3.12" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -12,9 +12,9 @@ authors = [ { name = "Revital Eres", email = "eres@il.ibm.com" }, ] dependencies = [ - "kfp==2.7.0", + "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.1.dev0", + "data-prep-toolkit-kfp-shared==0.2.1.dev3", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index c8d2648df..b4f509433 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10,<3.12" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit-ray==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index b8c97541d..79f0988be 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "parameterized", "pandas", ] diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index f610754d0..c7f1a1563 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", - "dpk-code2parquet-transform-python==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", + "dpk-code2parquet-transform-python==0.2.1.dev3", "parameterized", "pandas", ] diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 60bdf9e91..88c8f9031 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 
"dpk_code_quality_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "bs4==0.0.2", "transformers==4.38.2", ] diff --git a/transforms/code/code_quality/ray/pyproject.toml b/transforms/code/code_quality/ray/pyproject.toml index 574c06d5a..6925f45c0 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-code-quality-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index 38f4f6fb1..2799974b4 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/header_cleanser/ray/pyproject.toml 
b/transforms/code/header_cleanser/ray/pyproject.toml index 4105907fe..d40aa9373 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-header-cleanser-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index ce4a6a088..9e5e122ca 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index d19eb2336..60d9a3089 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - 
"dpk-malware-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-malware-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index e21924116..7fcef9bfc 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", ] [build-system] diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index 323c16c1e..703bf5279 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-proglang-select-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml b/transforms/code/repo_level_ordering/ray/pyproject.toml index d1d973902..6f54a65ed 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml 
@@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml index 1dbf38560..4deb09d47 100644 --- a/transforms/language/doc_chunk/python/pyproject.toml +++ b/transforms/language/doc_chunk/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_chunk_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "chunk documents Python Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "docling-core==1.3.0", "llama-index-core>=0.11.0,<0.12.0", ] diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 1ba60d642..19288e2db 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_chunk_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "chunk documents Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] dependencies = [ - "dpk-doc-chunk-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-doc-chunk-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git 
a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index e6d9a2ada..e63a6d5e5 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", ] [build-system] diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index 6ed293e09..6bc9cc6c6 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0" + "dpk-doc_quality-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3" ] [build-system] diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index bc472c766..f2dd72919 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Daiki 
Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "fasttext==0.9.2", "langcodes==3.3.0", "huggingface-hub >= 0.21.4, <1.0.0", diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index 2244d27c5..4833913a4 100644 --- a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-lang_id-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/language/pdf2parquet/python/pyproject.toml b/transforms/language/pdf2parquet/python/pyproject.toml index 91f7a14b5..24f2294b5 100644 --- a/transforms/language/pdf2parquet/python/pyproject.toml +++ b/transforms/language/pdf2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pdf2parquet_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "PDF2PARQUET Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "docling-core==1.2.0", "docling-ibm-models==1.1.7", "deepsearch-glm==0.21.0", diff --git a/transforms/language/pdf2parquet/ray/pyproject.toml b/transforms/language/pdf2parquet/ray/pyproject.toml index 9d81f8ada..950e5ce3d 100644 --- a/transforms/language/pdf2parquet/ray/pyproject.toml +++ b/transforms/language/pdf2parquet/ray/pyproject.toml @@ -1,6 
+1,6 @@ [project] name = "dpk_pdf2parquet_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "PDF2PARQUET Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] dependencies = [ - "dpk-pdf2parquet-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-pdf2parquet-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml index b63e6d676..a61987a45 100644 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ b/transforms/language/pii_redactor/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "PII redactor Transform for Python" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Sowmya.L.R", email = "lrsowmya@gmail.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index 349b24075..a1b01be94 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + 
"dpk_pii_redactor_transform_python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index 374c36d12..1ed8725ab 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "sentence-transformers==3.0.1", ] diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index 3c53415f8..aa8af8b44 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-text_encoder-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/packaging/.gitignore b/transforms/packaging/.gitignore new file mode 100644 index 000000000..863607847 --- /dev/null +++ b/transforms/packaging/.gitignore @@ -0,0 +1,5 @@ +**/src +**/dist +**/*.egg-info +**/build + diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging new 
file mode 100644 index 000000000..0ecc05484 --- /dev/null +++ b/transforms/packaging/.make.packaging @@ -0,0 +1,48 @@ + +venv: + $(MAKE) .defaults.create-venv + +test:: test-src + +clean:: .transforms.clean + -rm -fr src + +image:: .transforms.python-image + +run-ut:: + source venv/bin/activate; \ + for T in $(TRANSFORMS_NAMES); do \ + echo running unit test on: $$T ; \ + $(PYTEST) $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/test; \ + done; + @# Help: Setup environment and run unit tests for all transforms + + +setup: .transforms.setup venv + $(MAKE) src + source venv/bin/activate; \ + $(PYTHON) -m pip install . + @# Help: Do any default transform setup before running make src and setting up a test environment + + +src: + mkdir src + for T in $(TRANSFORMS_NAMES); do \ + echo copy src from $$T ; \ + cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/* src ; \ + rm -fr *.egg-info ; \ + rm -fr dist ; \ + rm -fr build ; \ + done; + @# Help: Setup src folder and remove old distribution + + +build:: build-dist + +publish:: publish-dist + +build-dist:: src .defaults.build-dist + +publish-dist:: .defaults.publish-dist + + diff --git a/transforms/packaging/Makefile b/transforms/packaging/Makefile new file mode 100644 index 000000000..aa75d525e --- /dev/null +++ b/transforms/packaging/Makefile @@ -0,0 +1,60 @@ +REPOROOT=../../ +# Use make help, to see the available rules +include ../../.make.defaults + +setup:: + +clean:: + # Clean up workflows common virtual environment. 
+ rm -rf venv || true + rm -rf *.back || true + @# Help: Recursively make $@ all subdirs + $(MAKE) RULE=$@ .recurse + +src:: + @# Help: Recursively setup $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +setup:: + +build:: + +build-dist:: + @# Help: Recursively build distributions in all subdirs + $(MAKE) RULE=$@ .recurse + +publish-dist:: + @# Help: Recursively publish distributions in all subdirs + $(MAKE) RULE=$@ .recurse + +venv:: + +image:: + +publish:: + +test-image:: + +test:: + +test-src:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +kind-load-image:: + +docker-load-image:: + +docker-save-image:: + +workflow-venv:: + +workflow-test:: + +workflow-build:: + +workflow-upload:: + +set-versions:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse diff --git a/transforms/packaging/README.md b/transforms/packaging/README.md new file mode 100644 index 000000000..e0d23ad52 --- /dev/null +++ b/transforms/packaging/README.md @@ -0,0 +1,55 @@ +# Transforms Packages for both Python and Ray + +Most available Transforms can be published to pypi as a single package. A detailed list of available Python transforms is available at this [link](python/README.md). 
Similarly, the following [link](ray/README.md) provides a detailed list and installation instructions for Ray transforms. + + + +## Clone folder and update version number +```` +git clone https://github.com/IBM/data-prep-kit.git package-release +cd package-release +```` +In `.make.versions`, set the values for DPK_MAJOR_VERSION, DPK_MINOR_VERSION and DPK_MICRO_VERSION to specify the DPK library to use and, as appropriate, set the value for `DPK_TRANSFORMS_VERSION` that will be used to tag the latest version released to pypi. + +`make set-versions` + +## Creating src folder + +Given that the transforms do not currently have their own name spaces, the first step is to copy all the transforms to the same src folder prior to running unit tests of the individual transforms and/or building the distribution: + + +```` +cd transforms/packaging +make clean +make src +```` + +## Build and Test + +This procedure will run all the unit tests for each individual transform using a single package configuration: + +```` +cd transforms/packaging +make clean +make src +make test-src +```` + +## Build and Deploy + +This procedure will build two wheels: one for the python transforms and one for the ray transforms. + +```` +cd transforms/packaging +make clean +make src +make build-dist +```` + +To publish the wheels to pypi.org, run: + +`make publish-dist` + + + + diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile new file mode 100644 index 000000000..1271a20c3 --- /dev/null +++ b/transforms/packaging/python/Makefile @@ -0,0 +1,61 @@ +# Define the root of the local git clone for the common rules to be able +# to know where they are running from. +REPOROOT=../../.. +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. 
+ +# $(REPOROOT)/.make.versions file contains the versions + +include $(REPOROOT)/transforms/.make.transforms +include ../.make.packaging + +PACKAGING_RUN_TIME=python + + +#Excluded List +# ./code/malware +# ./universal/html2parquet +# ./universal/profiler # Missing implementation +# ./universal/fdedup # Missing implementation +# code/repo_level_ordering # Missing implementation + + +TRANSFORMS_NAMES = code/code_quality \ + code/code2parquet \ + code/header_cleanser \ + code/proglang_select \ + language/doc_chunk \ + language/doc_quality \ + language/lang_id \ + language/pdf2parquet \ + language/pii_redactor \ + language/text_encoder \ + universal/ededup \ + universal/filter \ + universal/resize \ + universal/tokenization \ + universal/doc_id + + +# The distribution version is the same as the image version. +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions + +test-src:: + $(MAKE) src + $(MAKE) .transforms.python-venv + $(MAKE) run-ut + @# Help: Do any default transform setup before running make src and setting up a test environment + +test-with-pypi: + $(MAKE) clean + $(MAKE) .defaults.create-venv + source venv/bin/activate && \ + $(PYTHON) -m pip install data_prep_toolkit_transforms==$(DPK_TRANSFORMS_VERSION) + $(MAKE) run-ut + @# Help: Load wheel from pypi and run all unit tests (final step in verification after deploying to pypi) + + + + diff --git a/transforms/packaging/python/README.md b/transforms/packaging/python/README.md new file mode 100644 index 000000000..45260ce56 --- /dev/null +++ b/transforms/packaging/python/README.md @@ -0,0 +1,39 @@ +# DPK Python Transforms + +## installation + +The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard python library available on pypi and can be installed using pip install: + +`python -m pip install data-prep-toolkit-transforms` + +Installing the python transforms will also install 
`data-prep-toolkit` + +## List of Transforms in current package + +Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained on a best-effort basis but may not always be up to date. Users are encouraged to raise an issue in git when they discover missing components. + +* code + * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md) + * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/header_cleanser/python/README.md) + * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/python/README.md) + * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/python/README.md) +* language + * [doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/doc_chunk/python/README.md) + * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/doc_quality/python/README.md) + * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/lang_id/python/README.md) + * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/python/README.md) + * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/text_encoder/python/README.md) + * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pii_redactor/python/README.md) +* universal + * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/ededup/python/README.md) + * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/filter/python/README.md) + * [resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/resize/python/README.md) + * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/tokenization/python/README.md) + * 
[doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/doc_id/python/README.md) + + + + + + + diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml new file mode 100644 index 000000000..5ddb40aae --- /dev/null +++ b/transforms/packaging/python/pyproject.toml @@ -0,0 +1,39 @@ +[project] +name = "data_prep_toolkit_transforms" +version = "0.2.1.dev3" +requires-python = ">=3.10,<3.12" +keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +description = "Data Preparation Toolkit Transforms" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Maroun Touma", email = "touma@us.ibm.com" }, +] +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[options] +package_dir = ["src"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] + + + + + + diff --git a/transforms/packaging/python/requirements.txt b/transforms/packaging/python/requirements.txt new file mode 100644 index 000000000..6dec1e2de --- /dev/null +++ b/transforms/packaging/python/requirements.txt @@ -0,0 +1,31 @@ +data-prep-toolkit>=0.2.1.dev3 +bs4==0.0.2 +#pdf2parquet +# conflict with chunking.... 
+#docling-core==1.2.0, +docling-ibm-models==1.1.7 +deepsearch-glm==0.21.0 +docling==1.11.0 +filetype >=1.2.0, <2.0.0 +#Doc chunking +docling-core==1.3.0 +llama-index-core>=0.11.0,<0.12.0 +duckdb==0.10.1 +fasttext==0.9.2 +huggingface-hub >= 0.21.4, <1.0.0 +langcodes==3.3.0 +mmh3==4.1.0 +numpy==1.26.4 +pandas +parameterized +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' +sentence-transformers==3.0.1 +transformers==4.38.2 +xxhash==3.4.1 +# PII-redactor +presidio-analyzer>=2.2.355 +presidio-anonymizer>=2.2.355 +flair>=0.14.0 +pandas>=2.2.2 + + diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile new file mode 100644 index 000000000..0a1d6d911 --- /dev/null +++ b/transforms/packaging/ray/Makefile @@ -0,0 +1,66 @@ +# Define the root of the local git clone for the common rules to be able to +# know where they are running from. +REPOROOT=../../.. +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. + +# $(REPOROOT)/.make.versions file contains the versions + +include $(REPOROOT)/transforms/.make.transforms +include ../.make.packaging + +PACKAGING_RUN_TIME=ray + +# Excluded from build +# ./code/malware/ray + +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions + + +## Ray Transforms: `find . 
-name src | grep ray/src` +TRANSFORMS_NAMES = code/proglang_select \ + code/header_cleanser \ + code/code_quality \ + code/repo_level_ordering \ + code/code2parquet \ + language/doc_chunk \ + language/doc_quality \ + language/lang_id \ + language/text_encoder \ + language/pii_redactor \ + language/pdf2parquet \ + universal/fdedup \ + universal/tokenization \ + universal/ededup \ + universal/profiler \ + universal/doc_id \ + universal/filter \ + universal/resize + +# doc chunk has conflict dependencies with pdf2parquet that need to be resolved +# doc_chunk depends on docling>=1.8.2,<2.0.0 +# pdf2parquet depends on docling==1.7.0 + + +test-src:: + $(MAKE) src + $(MAKE) -C ../python src + $(MAKE) .transforms.ray-venv + $(MAKE) run-ut + @# Help: Do any default transform setup before running make src and setting up a test environment + +test-with-python-pypi: + $(MAKE) clean + $(MAKE) .defaults.create-venv + source venv/bin/activate && cd ../ray && $(MAKE) src && $(PYTHON) -m pip install . + $(MAKE) test-src + +test-with-pypi: + $(MAKE) clean + $(MAKE) .defaults.create-venv + source venv/bin/activate && \ + $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==$(DPK_TRANSFORMS_VERSION) + $(MAKE) test-src + diff --git a/transforms/packaging/ray/README.md b/transforms/packaging/ray/README.md new file mode 100644 index 000000000..b7d4cf2eb --- /dev/null +++ b/transforms/packaging/ray/README.md @@ -0,0 +1,41 @@ +# DPK Ray Transforms + +## installation + +The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard python library available on pypi and can be installed using pip install: + +`python -m pip install data-prep-toolkit-transforms-ray` + +Installing the Ray transforms will also install `data_prep_toolkit_transforms` and `data-prep-toolkit-ray` + +## List of Ray Transforms available in current package + +Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained 
on best effort but may not always be up to date. Users are encouraged to raise an issue in git when they discover missing components. + +* code + * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) + * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/ray/README.md) + * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/header_cleanser/ray/README.md) + * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/ray/README.md) + * [repo_level_ordering](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/repo_level_ordering/ray/README.md) +* language + * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/doc_quality/ray/README.md) + * [doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/doc_chunk/ray/README.md) + * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/lang_id/ray/README.md) + * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/text_encoder/ray/README.md) + * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pdf2parquet/ray/README.md) + * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/language/pii_redactor/ray/README.md) +* universal + * [fdedup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/fdedup/ray/README.md) + * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/tokenization/ray/README.md) + * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/ededup/ray/README.md) + * [profiler](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/profiler/ray/README.md) + * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/doc_id/ray/README.md) + * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/filter/ray/README.md) + * 
[resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/resize/ray/README.md) + + + + + + diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml new file mode 100644 index 000000000..9c1509472 --- /dev/null +++ b/transforms/packaging/ray/pyproject.toml @@ -0,0 +1,40 @@ +[project] +name = "data_prep_toolkit_transforms_ray" +version = "0.2.1.dev3" +requires-python = ">=3.10,<3.12" +keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +description = "Data Preparation Toolkit Transforms using Ray" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Maroun Touma", email = "touma@us.ibm.com" }, +] +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + +[options] +package_dir = ["src"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] + + + + + + diff --git a/transforms/packaging/ray/requirements.txt b/transforms/packaging/ray/requirements.txt new file mode 100644 index 000000000..2e75ae185 --- /dev/null +++ b/transforms/packaging/ray/requirements.txt @@ -0,0 +1,21 @@ +data-prep-toolkit-ray>=0.2.1.dev3 +data-prep-toolkit-transforms>=0.2.1.dev3 +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' +parameterized +tqdm==4.66.3 +mmh3==4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +#The conflict is caused by: +# ray fdedup depends on scipy==1.12.0 +# docling 1.7.0 depends on scipy<2.0.0 and >=1.14.1 +scipy>=1.12.0 +networkx==3.3 
+colorlog==6.8.2 +func-timeout==4.3.5 +pandas==2.2.2 +emerge-viz==2.0.0 + + + + diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index c5af73cce..8e4358b28 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "ededup Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0" + "data-prep-toolkit==0.2.1.dev3" ] [build-system] diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index 022a63db6..e5cb79d95 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0" + "dpk_doc_id_transform_python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3" ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index 7efd8cfac..13d7bc2c3 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris 
Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-spark==0.2.1.dev0", + "data-prep-toolkit-spark==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 2b751b18c..e380bf58e 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "ededup Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "mmh3==4.1.0", "xxhash==3.4.1", ] diff --git a/transforms/universal/ededup/ray/Makefile b/transforms/universal/ededup/ray/Makefile index 2d81bbbe2..f828e107e 100644 --- a/transforms/universal/ededup/ray/Makefile +++ b/transforms/universal/ededup/ray/Makefile @@ -33,7 +33,7 @@ setup:: .transforms.setup # TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(EDEDUP_RAY_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(EDEDUP_PYTHON_VERSION) TOML_VERSION=$(EDEDUP_RAY_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 1503d8c5c..2fdf82392 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris 
Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", - "dpk_ededup_transform_python==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", + "dpk_ededup_transform_python==0.2.1.dev3", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 25e4fe5f9..70f92a23f 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", "mmh3==4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index cbaa4ad20..995247f4f 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Filter Transform for Python" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "duckdb==0.10.1", ] diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index 155e8ef05..fc0035475 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" 
description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-filter-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 90974056b..4d31c2ef2 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_filter_transform_python==0.2.1.dev0", - "data-prep-toolkit-spark==0.2.1.dev0", + "data-prep-toolkit-spark==0.2.1.dev3", ] [project.optional-dependencies] diff --git a/transforms/universal/html2parquet/python/pyproject.toml b/transforms/universal/html2parquet/python/pyproject.toml index 5d183b49d..f49c498d6 100644 --- a/transforms/universal/html2parquet/python/pyproject.toml +++ b/transforms/universal/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Syed Zawad", email = "szawad@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "trafilatura==1.12.0" ] diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index 889cc6cfd..5714e70de 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ 
b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index 86eebe633..9f1353b4e 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-noop-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index d22dadfa8..965770d92 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.1.dev0", - "data-prep-toolkit-spark==0.2.1.dev0", + "dpk-noop-transform-python==0.2.1.dev3", + "data-prep-toolkit-spark==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/profiler/ray/pyproject.toml 
b/transforms/universal/profiler/ray/pyproject.toml index 81439a390..1473b88b4 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", "mmh3==4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 2346a0a14..b1cc13314 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "resize Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index 56fb6f077..86834c1b1 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-resize-transform-python==0.2.1.dev3", + 
"data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index f77067612..1dc0ca104 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "transformers==4.38.2", ] diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index b77a46d6b..fd259a9b6 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-tokenization-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system]