diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index b85ad788..c66f4e09 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -7,12 +7,20 @@ on:
       - "main"
       - "release-**"
     paths:
+      - '.github/actions/*.ya?ml'
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*' # This workflow
   pull_request:
     branches:
       - "main"
     paths:
+      - '.github/actions/*.ya?ml'
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*' # This workflow
@@ -31,7 +39,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 34e2afbb..e940bd1c 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -33,7 +33,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
       - name: "Checkout"
diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
index 2f82071b..9ebcd234 100644
--- a/.github/workflows/e2e-nvidia-l4-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -46,7 +46,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Start EC2 runner
         id: start-ec2-runner
@@ -130,7 +134,11 @@ jobs:
           . ../instructlab/venv/bin/activate
           pip install -v .

-      - name: Check disk
+      - name: Check disk before tests
         run: |
           df -h

@@ -142,6 +150,13 @@ jobs:
           . venv/bin/activate
           ./scripts/e2e-ci.sh -m

+      - name: Check disk after tests
+        run: |
+          df -h
+
   stop-medium-ec2-runner:
     needs:
       - start-medium-ec2-runner
@@ -154,7 +169,11 @@
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index 59531d70..12c17eba 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -24,7 +24,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Start EC2 runner
         id: start-ec2-runner
@@ -160,7 +164,11 @@
           pip install .
           pip install .[cuda]

-      - name: Check disk
+      - name: Check disk before tests
         run: |
           df -h

@@ -172,6 +180,13 @@
           . venv/bin/activate
           ./scripts/e2e-ci.sh -l

+      - name: Check disk after tests
+        run: |
+          df -h
+
       - name: Add comment to PR if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'true'
         working-directory: ./sdg
@@ -191,6 +206,7 @@
       - name: Post job results to Slack if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'false'
         id: slack-report-failure
-        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
-        with:
-          # Slack channel id, channel name, or user id to post message.
@@ -200,10 +216,23 @@
-          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+          method: chat.postMessage
+          payload: |
+            # Slack channel id, channel name, or user id to post message.
+            # See also: https://api.slack.com/methods/chat.postMessage#channels
+            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+            channel: 'e2e-ci-results'
+            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

       - name: Post job results to Slack if the workflow succeeded
         if: success() && steps.check_pr.outputs.is_pr == 'false'
         id: slack-report-success
-        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
-        with:
-          # Slack channel id, channel name, or user id to post message.
@@ -213,6 +242,18 @@
-          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+          method: chat.postMessage
+          payload: |
+            # Slack channel id, channel name, or user id to post message.
+            # See also: https://api.slack.com/methods/chat.postMessage#channels
+            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+            channel: 'e2e-ci-results'
+            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

   stop-large-ec2-runner:
     needs:
@@ -226,7 +267,11 @@
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-t4-x1.yml
index f1e93811..a069a799 100644
--- a/.github/workflows/e2e-nvidia-t4-x1.yml
+++ b/.github/workflows/e2e-nvidia-t4-x1.yml
@@ -46,7 +46,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Start EC2 runner
         id: start-ec2-runner
@@ -128,7 +132,11 @@
           . ../instructlab/venv/bin/activate
           pip install .

-      - name: Check disk
+      - name: Check disk before tests
         run: |
           df -h

@@ -138,6 +146,13 @@
           . venv/bin/activate
           ./scripts/e2e-ci.sh -s

+      - name: Check disk after tests
+        run: |
+          df -h
+
   stop-small-ec2-runner:
     needs:
       - start-small-ec2-runner
@@ -150,7 +165,11 @@
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 039618c2..565049af 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -63,7 +63,11 @@ jobs:
           tox -e validate-pipelines
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
index 91de85f3..f53b72f1 100644
--- a/.github/workflows/pypi.yaml
+++ b/.github/workflows/pypi.yaml
@@ -37,7 +37,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

@@ -67,7 +71,11 @@
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

@@ -99,7 +107,11 @@
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
index 32b02123..146bf734 100644
--- a/.github/workflows/spellcheck.yml
+++ b/.github/workflows/spellcheck.yml
@@ -32,7 +32,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml
index b621e396..2bb7960c 100644
--- a/.github/workflows/stale_bot.yml
+++ b/.github/workflows/stale_bot.yml
@@ -24,7 +24,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
      - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 983a308b..359f0dda 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -40,6 +40,10 @@ jobs:
     name: "${{ matrix.python }} on ${{ matrix.platform }}"
     runs-on: "${{ matrix.platform }}"
     strategy:
+      fail-fast: false
       matrix:
         python:
           - "3.10"
@@ -51,7 +55,11 @@
             platform: "macos-latest"
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

@@ -89,7 +97,11 @@
           pip cache remove llama_cpp_python

       - name: Cache huggingface
-        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+        uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
         with:
           path: ~/.cache/huggingface
           # config contains DEFAULT_MODEL
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
index 2130dd21..a1f3e1e4 100644
--- a/.spellcheck-en-custom.txt
+++ b/.spellcheck-en-custom.txt
@@ -4,10 +4,21 @@ Backport
 backported
 codebase
+configs
 Dataset
 dataset
 datasets
 distractor
+Docling
+docling
+Eval
+eval
 FIXME
 freeform
 ICL
@@ -17,6 +28,7 @@ Langchain's
 LLM
 LLMBlock
 MCQ
-MMLU
-Ouput
-Pre
@@ -29,6 +41,30 @@
-Splitter
-subfolder
-Tatsu
-unchunked
+Merlinite
+Mixtral
+MMLU
+multiphase
+Ouput
+Pre
+pre
+precomputed
+Pregenerated
+qna
+quantized
+repo
+sdg
+Splitter
+subdirectory
+subfolder
+Tatsu
+Tesseract
+tokenizer
+tokenizers
+unchunked
+upsampled
 UUID
 vLLM
 yaml
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..8f56e2c6
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,17 @@
+## v0.6.1
+
+### Fixes
+
+* Fixed a bug where generating data from a taxonomy with 2 or more changed knowledge leaf nodes would fail with a message saying the destination path `already exists and is not an empty directory`
+
+## v0.6.0
+
+### Features
+
+* Small knowledge datasets will automatically get upsampled during final data mixing, based on the length of any precomputed skills datasets used during data mixing. This avoids issues where very large precomputed skills datasets were swamping the comparatively small number of knowledge samples, resulting in lower than optimal knowledge retention during multiphase training. If a large precomputed dataset isn't in use during mixing (which is how things operate by default), this change is a no-op.
+* When chunking PDF documents, we'll now look for the docling models on-disk in `$XDG_DATA_HOME/instructlab/sdg/models` (as well as `$XDG_DATA_DIRS` with the same `instructlab/sdg/models` subdirectory). If they are not found on disk, they'll automatically be downloaded from HuggingFace.
+* When chunking PDF documents with Docling, we'll automatically configure Docling to use `tesserocr` if a working implementation is found, instead of relying on `easyocr`. We fall back to `easyocr` if Tesseract is not properly configured for use by `tesserocr`.
+
+### Breaking Changes
+
+* Teacher model tokenizers are loaded from the local teacher model on-disk and are no longer downloaded automatically from HuggingFace. The typical workflows in use so far expect the teacher model to exist on-disk, and this enforces that at least its tokenizer exists.
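The on-disk model discovery described in the v0.6.0 changelog entry above amounts to a short walk over the XDG data directories. Here is a minimal sketch of that lookup order, assuming illustrative names (`find_local_docling_models` and its structure are not the library's actual implementation):

```python
import os
from pathlib import Path

# Subdirectory named in the changelog entry above.
SUBDIR = "instructlab/sdg/models"

def find_local_docling_models() -> Path | None:
    """Return the first existing docling models directory, or None."""
    # $XDG_DATA_HOME first, with the conventional default.
    candidates = [Path(os.environ.get("XDG_DATA_HOME", "~/.local/share")).expanduser() / SUBDIR]
    # Then each entry of $XDG_DATA_DIRS, with the conventional defaults.
    data_dirs = os.environ.get("XDG_DATA_DIRS", "/usr/local/share:/usr/share")
    candidates += [Path(d) / SUBDIR for d in data_dirs.split(":") if d]
    for candidate in candidates:
        if candidate.is_dir():
            return candidate
    return None  # caller falls back to downloading the models from HuggingFace
```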
diff --git a/README.md b/README.md
index 8752dceb..5eb85c3c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,8 @@
-# sdg
+# Synthetic Data Generation (SDG)

 ![Lint](https://github.com/instructlab/sdg/actions/workflows/lint.yml/badge.svg?branch=main)
 ![Build](https://github.com/instructlab/sdg/actions/workflows/pypi.yaml/badge.svg?branch=main)
@@ -10,3 +14,72 @@
 ![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/sdg/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main)

 Python library for Synthetic Data Generation
+
+## Introduction
+
+Synthetic Data Generation (SDG) is a process that creates an artificially generated dataset that mimics real data based on provided examples. SDG uses a YAML file containing question-and-answer pairs as input data.
+
+## Installing the SDG library
+
+Clone the library and navigate to the repo:
+
+```bash
+git clone https://github.com/instructlab/sdg
+cd sdg
+```
+
+Install the library:
+
+```bash
+pip install .
+```
+
+### Using the library
+
+You can import SDG into your Python files with the following items:
+
+```python
+from instructlab.sdg.generate_data import generate_data
+from instructlab.sdg.utils import GenerateException
+```
+
+## Pipelines
+
+A pipeline is a series of steps to execute in order to generate data.
+
+There are three default pipelines shipped in SDG: `simple`, `full`, and `eval`. Each pipeline has its own hardware requirements.
+
+### Simple Pipeline
+
+The [simple pipeline](src/instructlab/sdg/pipelines/simple) is designed to be used with [quantized Merlinite](https://huggingface.co/instructlab/merlinite-7b-lab-GGUF) as the teacher model. It enables basic data generation results on low-end consumer-grade hardware, such as laptops and desktops with small or no discrete GPUs.
+
+### Full Pipeline
+
+The [full pipeline](src/instructlab/sdg/pipelines/full) is designed to be used with [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) as the teacher model, but it has also been successfully tested with smaller models such as [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) and even some quantized versions of the two teacher models. This is the preferred data generation pipeline on higher-end consumer-grade hardware and all enterprise hardware.
+
+### Eval Pipeline
+
+The [eval pipeline](src/instructlab/sdg/pipelines/eval) is used to generate [MMLU](https://en.wikipedia.org/wiki/MMLU) benchmark data that can be used to later evaluate a trained model on your knowledge dataset. It does not generate data for use during model training.
+
+### Pipeline architecture
+
+All the pipelines are written in a YAML format and must adhere to a [specific schema](src/instructlab/sdg/pipelines/schema/v1.json).
+
+The pipelines that generate data for model training (the simple and full pipelines) expect three different pipeline configs - one each for knowledge, grounded skills, and freeform skills. They are expected to exist in files called `knowledge.yaml`, `grounded_skills.yaml`, and `freeform_skills.yaml` respectively. For background on the difference between knowledge, grounded skills, and freeform skills, refer to the [InstructLab Taxonomy repository](https://github.com/instructlab/taxonomy).
+
+## Repository structure
+
+```bash
+|-- src/instructlab/ (1)
+|-- docs/ (2)
+|-- scripts/ (3)
+|-- tests/ (4)
+```
+
+1. Contains the SDG code that interacts with InstructLab.
+2. Contains documentation on various SDG methodologies.
+3. Contains utility scripts that are not part of any supported API.
+4. Contains all the tests for the SDG repository.
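To connect the pieces in the README above, a minimal invocation of `generate_data` might look like the sketch below. It only uses parameters that are visible in the `generate_data` signature later in this diff; the endpoint URL, paths, and `taxonomy_base` value are placeholders rather than prescribed values:

```python
import logging

import openai

from instructlab.sdg.generate_data import generate_data
from instructlab.sdg.utils import GenerateException

# Placeholder endpoint for a locally served teacher model (e.g. via vLLM).
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
log = logging.getLogger(__name__)

try:
    generate_data(
        client,
        logger=log,
        taxonomy="taxonomy",          # path to a local taxonomy checkout
        taxonomy_base="origin/main",  # base used to detect changed leaf nodes
        output_dir="generated",
        chunk_word_count=1000,
        console_output=True,
    )
except GenerateException as exc:
    log.error("generation failed: %s", exc)
```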
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6a7f352b..342ff18b 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -2,7 +2,11 @@

 -r requirements.txt

-pre-commit>=3.0.4,<4.0
+pre-commit>=3.0.4,<5.0
 pylint>=2.16.2,<4.0
 pylint-pydantic
 pytest
diff --git a/requirements.txt b/requirements.txt
index 7984751c..48a7b5ca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
-docling[tesserocr]>=2.4.2,<3.0.0
+docling[tesserocr]>=2.4.2,<=2.8.3
+docling-parse>=2.0.0,<3.0.0
 GitPython>=3.1.42,<4.0.0
 gguf>=0.6.0
 httpx>=0.25.0,<1.0.0
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index d97cdc27..8d954af3 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -33,6 +33,10 @@
     PipelineContext,
 )
 from instructlab.sdg.utils import GenerateException, models
+from instructlab.sdg.utils.json import jldump
 from instructlab.sdg.utils.taxonomy import (
     leaf_node_to_samples,
     read_taxonomy_leaf_nodes,
@@ -112,6 +116,7 @@
         }
         messages_data.append(_convert_to_messages(sample))

-    with open(output_file_train, "w", encoding="utf-8") as outfile:
-        for entry in train_data:
-            json.dump(entry, outfile, ensure_ascii=False)
@@ -121,6 +126,11 @@
-        for entry in messages_data:
-            json.dump(entry, outfile, ensure_ascii=False)
-            outfile.write("\n")
+    jldump(train_data, output_file_train)
+
+    jldump(messages_data, output_file_messages)


 def _knowledge_seed_example_to_test_data(seed_example, system_prompt):
@@ -170,10 +180,14 @@
             }
         )

-    with open(output_file_test, "w", encoding="utf-8") as outfile:
-        for entry in test_data:
-            json.dump(entry, outfile, ensure_ascii=False)
-            outfile.write("\n")
+    jldump(test_data, output_file_test)


 def _check_pipeline_dir(pipeline):
@@ -295,7 +309,11 @@ def _mixer_init(
 # This is part of the public API, and used by instructlab.
 # TODO - parameter removal needs to be done in sync with a CLI change.
-# to be removed: logger, prompt_file_path, rouge_threshold, tls_*
+# to be removed: logger
 def generate_data(
     client: openai.OpenAI,
     logger: logging.Logger = logger,  # pylint: disable=redefined-outer-name
@@ -308,10 +326,13 @@
     taxonomy: Optional[str] = None,  # TODO rename to taxonomy_path to match config
     taxonomy_base: Optional[str] = None,
     output_dir: Optional[str] = None,
-    # TODO - not used and should be removed from the CLI
-    prompt_file_path: Optional[str] = None,  # pylint: disable=unused-argument
-    # TODO - probably should be removed
-    rouge_threshold: Optional[float] = None,  # pylint: disable=unused-argument
     console_output=True,
     yaml_rules: Optional[str] = None,
     chunk_word_count=None,
diff --git a/src/instructlab/sdg/utils/json.py b/src/instructlab/sdg/utils/json.py
index 8fd25268..8bbae4e9 100644
--- a/src/instructlab/sdg/utils/json.py
+++ b/src/instructlab/sdg/utils/json.py
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0

 # Standard
+from typing import Any, Iterable
 import io
 import json
 import os
@@ -46,3 +50,19 @@
 def jload(f, mode="r"):
     """Load a .json file into a dictionary."""
     with _make_r_io_base(f, mode) as f_:
         return json.load(f_)
+
+
+def jldump(data: Iterable[Any], out: str | io.IOBase) -> None:
+    """Dump an iterable to a file in jsonl format.
+
+    Args:
+        data: The entries to be written, one JSON object per line.
+        out: io.IOBase or file path
+    """
+    with _make_w_io_base(out, "w") as outfile:
+        for entry in data:
+            json.dump(entry, outfile, ensure_ascii=False)
+            outfile.write("\n")
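The new `jldump` helper above consolidates the hand-rolled dump loops removed from `generate_data.py` earlier in this diff. A quick usage sketch, with an illustrative file name and sample records:

```python
from instructlab.sdg.utils.json import jldump

samples = [
    {"id": 1, "messages": [{"role": "user", "content": "What is SDG?"}]},
    {"id": 2, "messages": [{"role": "user", "content": "Name one pipeline."}]},
]

# Writes one JSON object per line (JSON Lines). Non-ASCII text is preserved
# because the helper passes ensure_ascii=False to json.dump.
jldump(samples, "train_gen.jsonl")
```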
diff --git a/tox.ini b/tox.ini
index 723212a9..e41069cc 100644
--- a/tox.ini
+++ b/tox.ini
@@ -73,10 +73,20 @@
 allowlist_externals = sh

 [testenv:mypy]
 description = Python type checking with mypy
-deps =
-    mypy>=1.10.0,<2.0
-    types-PyYAML
-    pytest
+# Note: 'mypy<1.14' by default pulls in the latest 'pydantic' release as a dependency, but 'pydantic>=2.10' does not
+# work with 'mypy<1.14', so for compatibility purposes we pin 'pydantic<=2.9.2'
+deps =
+    mypy>=1.10.0,<1.14
+    types-PyYAML
+    pytest
+    pydantic<=2.9.2
 commands =
     mypy src
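Finally, the Docling OCR fallback noted in the v0.6.0 changelog can be pictured with the sketch below. The probe is an assumption about how such a check could work, not the library's actual code:

```python
# Illustrative sketch of the OCR engine selection described in the changelog;
# the function name and probe are assumptions, not the library's actual code.
def choose_ocr_engine() -> str:
    """Prefer tesserocr when it is usable, otherwise fall back to easyocr."""
    try:
        # Importing tesserocr fails outright when the native Tesseract
        # libraries are missing; probing the API catches misconfiguration
        # such as missing language data.
        import tesserocr

        tesserocr.get_languages()  # raises if tessdata is not set up
        return "tesserocr"
    except Exception:
        return "easyocr"
```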