From 5b5072a30b53a9860111f51598dd6cb7640a58e4 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Tue, 10 Dec 2024 12:23:59 -0500 Subject: [PATCH] Loosen up md-lint so we can write changelogs Signed-off-by: Ben Browning (cherry picked from commit fe31e7d3f8cddbd9d0e8e1b38094efde68bad79d) # Conflicts: # .github/workflows/actionlint.yml # .github/workflows/docs.yml # .github/workflows/e2e-nvidia-l4-x1.yml # .github/workflows/e2e-nvidia-l40s-x4.yml # .github/workflows/e2e-nvidia-t4-x1.yml # .github/workflows/lint.yml # .github/workflows/pypi.yaml # .github/workflows/spellcheck.yml # .github/workflows/stale_bot.yml # .github/workflows/test.yml # .markdownlint-cli2.yaml # .spellcheck-en-custom.txt # README.md # requirements-dev.txt # requirements.txt # src/instructlab/sdg/generate_data.py # src/instructlab/sdg/utils/json.py # tox.ini --- .github/workflows/actionlint.yml | 12 ++++ .github/workflows/docs.yml | 4 ++ .github/workflows/e2e-nvidia-l4-x1.yml | 11 ++++ .github/workflows/e2e-nvidia-l40s-x4.yml | 37 ++++++++++++ .github/workflows/e2e-nvidia-t4-x1.yml | 11 ++++ .github/workflows/lint.yml | 4 ++ .github/workflows/pypi.yaml | 20 +++++++ .github/workflows/spellcheck.yml | 4 ++ .github/workflows/stale_bot.yml | 4 ++ .github/workflows/test.yml | 12 ++++ .markdownlint-cli2.yaml | 8 +++ .spellcheck-en-custom.txt | 36 ++++++++++++ README.md | 73 ++++++++++++++++++++++++ requirements-dev.txt | 4 ++ requirements.txt | 4 ++ src/instructlab/sdg/generate_data.py | 21 +++++++ src/instructlab/sdg/utils/json.py | 20 +++++++ tox.ini | 7 +++ 18 files changed, 292 insertions(+) diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index b85ad788..87c3f88e 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -7,12 +7,20 @@ on: - "main" - "release-**" paths: +<<<<<<< HEAD +======= + - '.github/actions/*.ya?ml' +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) - '.github/workflows/*.ya?ml' - 
'.github/workflows/actionlint.*' # This workflow pull_request: branches: - "main" paths: +<<<<<<< HEAD +======= + - '.github/actions/*.ya?ml' +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' # This workflow @@ -31,7 +39,11 @@ jobs: runs-on: ubuntu-latest steps: - name: "Harden Runner" +<<<<<<< HEAD uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 +======= + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 34e2afbb..2107eb1e 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -33,7 +33,11 @@ jobs: runs-on: ubuntu-latest steps: - name: "Harden Runner" +<<<<<<< HEAD uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 +======= + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - name: "Checkout" diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml index 63f4ff8a..bf84733b 100644 --- a/.github/workflows/e2e-nvidia-l4-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -130,7 +130,11 @@ jobs: . ../instructlab/venv/bin/activate pip install -v . +<<<<<<< HEAD - name: Check disk +======= + - name: Check disk before tests +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) run: | df -h @@ -142,6 +146,13 @@ jobs: . 
venv/bin/activate ./scripts/e2e-ci.sh -m +<<<<<<< HEAD +======= + - name: Check disk after tests + run: | + df -h + +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) stop-medium-ec2-runner: needs: - start-medium-ec2-runner diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index 5f5d5c5c..d2d96410 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -160,7 +160,11 @@ jobs: pip install . pip install .[cuda] +<<<<<<< HEAD - name: Check disk +======= + - name: Check disk before tests +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) run: | df -h @@ -172,6 +176,13 @@ jobs: . venv/bin/activate ./scripts/e2e-ci.sh -l +<<<<<<< HEAD +======= + - name: Check disk after tests + run: | + df -h + +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) - name: Add comment to PR if the workflow failed if: failure() && steps.check_pr.outputs.is_pr == 'true' working-directory: ./sdg @@ -191,6 +202,7 @@ jobs: - name: Post job results to Slack if the workflow failed if: failure() && steps.check_pr.outputs.is_pr == 'false' id: slack-report-failure +<<<<<<< HEAD uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0 with: # Slack channel id, channel name, or user id to post message. @@ -200,10 +212,23 @@ jobs: slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }} +======= + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + with: + token: ${{ secrets.SON_OF_JEEVES_TOKEN }} + method: chat.postMessage + payload: | + # Slack channel id, channel name, or user id to post message. 
+ # See also: https://api.slack.com/methods/chat.postMessage#channels + # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. + channel: 'e2e-ci-results' + text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) - name: Post job results to Slack if the workflow succeeded if: success() && steps.check_pr.outputs.is_pr == 'false' id: slack-report-success +<<<<<<< HEAD uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0 with: # Slack channel id, channel name, or user id to post message. @@ -213,6 +238,18 @@ jobs: slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }} +======= + uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + with: + token: ${{ secrets.SON_OF_JEEVES_TOKEN }} + method: chat.postMessage + payload: | + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. 
+ channel: 'e2e-ci-results' + text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) stop-large-ec2-runner: needs: diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-t4-x1.yml index 3814da44..455e7eb4 100644 --- a/.github/workflows/e2e-nvidia-t4-x1.yml +++ b/.github/workflows/e2e-nvidia-t4-x1.yml @@ -128,7 +128,11 @@ jobs: . ../instructlab/venv/bin/activate pip install . +<<<<<<< HEAD - name: Check disk +======= + - name: Check disk before tests +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) run: | df -h @@ -138,6 +142,13 @@ jobs: . venv/bin/activate ./scripts/e2e-ci.sh -s +<<<<<<< HEAD +======= + - name: Check disk after tests + run: | + df -h + +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) stop-small-ec2-runner: needs: - start-small-ec2-runner diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 039618c2..5426209b 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -63,7 +63,11 @@ jobs: tox -e validate-pipelines steps: - name: "Harden Runner" +<<<<<<< HEAD uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 +======= + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 91de85f3..8121373d 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -37,7 +37,11 @@ jobs: runs-on: ubuntu-latest steps: - name: "Harden Runner" +<<<<<<< HEAD uses: 
step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 +======= + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs @@ -67,7 +71,11 @@ jobs: steps: - name: "Harden Runner" +<<<<<<< HEAD uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 +======= + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs @@ -78,7 +86,11 @@ jobs: path: dist - name: "Upload to Test PyPI" +<<<<<<< HEAD uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2 +======= + uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: repository-url: https://test.pypi.org/legacy/ @@ -99,7 +111,11 @@ jobs: steps: - name: "Harden Runner" +<<<<<<< HEAD uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 +======= + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs @@ -130,4 +146,8 @@ jobs: rm ./dist/*.sigstore.json - name: "Upload to PyPI" +<<<<<<< HEAD uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2 +======= + uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml index 32b02123..8c0d5eee 100644 --- 
a/.github/workflows/spellcheck.yml +++ b/.github/workflows/spellcheck.yml @@ -32,7 +32,11 @@ jobs: runs-on: ubuntu-latest steps: - name: "Harden Runner" +<<<<<<< HEAD uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 +======= + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml index b621e396..db3fd0b4 100644 --- a/.github/workflows/stale_bot.yml +++ b/.github/workflows/stale_bot.yml @@ -24,7 +24,11 @@ jobs: runs-on: ubuntu-latest steps: - name: "Harden Runner" +<<<<<<< HEAD uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 +======= + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 983a308b..e12b4ee1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,6 +40,10 @@ jobs: name: "${{ matrix.python }} on ${{ matrix.platform }}" runs-on: "${{ matrix.platform }}" strategy: +<<<<<<< HEAD +======= + fail-fast: false +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) matrix: python: - "3.10" @@ -51,7 +55,11 @@ jobs: platform: "macos-latest" steps: - name: "Harden Runner" +<<<<<<< HEAD uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 +======= + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs @@ -89,7 +97,11 @@ jobs: pip 
cache remove llama_cpp_python - name: Cache huggingface +<<<<<<< HEAD uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 +======= + uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) with: path: ~/.cache/huggingface # config contains DEFAULT_MODEL diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml index c248cb45..45e72c76 100644 --- a/.markdownlint-cli2.yaml +++ b/.markdownlint-cli2.yaml @@ -7,10 +7,18 @@ config: code-block-style: false no-duplicate-header: false single-trailing-newline: false +<<<<<<< HEAD +======= + no-duplicate-heading: false +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) globs: - "**/*.md" ignores: - ".github/**" +<<<<<<< HEAD +======= + - ".tox/**" +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) - "venv/**" - ".venv/**" - "**/testdata/**" diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt index 2130dd21..60fbe180 100644 --- a/.spellcheck-en-custom.txt +++ b/.spellcheck-en-custom.txt @@ -4,10 +4,21 @@ Backport backported codebase +<<<<<<< HEAD +======= +configs +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) Dataset dataset datasets distractor +<<<<<<< HEAD +======= +Docling +docling +Eval +eval +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) FIXME freeform ICL @@ -17,6 +28,7 @@ Langchain's LLM LLMBlock MCQ +<<<<<<< HEAD MMLU Ouput Pre @@ -29,6 +41,30 @@ Splitter subfolder Tatsu unchunked +======= +Merlinite +Mixtral +MMLU +multiphase +Ouput +Pre +pre +precomputed +Pregenerated +qna +quantized +repo +sdg +Splitter +subdirectory +subfolder +Tatsu +Tesseract +tokenizer +tokenizers +unchunked +upsampled +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) UUID vLLM yaml diff --git a/README.md b/README.md index 8752dceb..1e750855 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,8 @@ +<<<<<<< HEAD # sdg +======= +# Synthetic Data Generation (SDG) 
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) ![Lint](https://github.com/instructlab/sdg/actions/workflows/lint.yml/badge.svg?branch=main) ![Build](https://github.com/instructlab/sdg/actions/workflows/pypi.yaml/badge.svg?branch=main) @@ -10,3 +14,72 @@ ![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/sdg/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main) Python library for Synthetic Data Generation +<<<<<<< HEAD +======= + +## Introduction + +Synthetic Data Generation (SDG) is a process that creates an artificially generated dataset that mimics real data based on provided examples. SDG uses a YAML file containing question-and-answer pairs as input data. + +## Installing the SDG library + +Clone the library and navigate to the repo: + +```bash +git clone https://github.com/instructlab/sdg +cd sdg +``` + +Install the library: + +```bash +pip install . +``` + +### Using the library + +You can import SDG into your Python files with the following items: + +```python + from instructlab.sdg.generate_data import generate_data + from instructlab.sdg.utils import GenerateException +``` + +## Pipelines + +A pipeline is a series of steps to execute in order to generate data. + +There are three default pipelines shipped in SDG: `simple`, `full`, and `eval`. Each pipeline requires specific hardware specifications. + +### Simple Pipeline + +The [simple pipeline](src/instructlab/sdg/pipelines/simple) is designed to be used with [quantized Merlinite](https://huggingface.co/instructlab/merlinite-7b-lab-GGUF) as the teacher model. It enables basic data generation results on low-end consumer grade hardware, such as laptops and desktops with small or no discrete GPUs. 
+ +### Full Pipeline + +The [full pipeline](src/instructlab/sdg/pipelines/full) is designed to be used with [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) as the teacher model, but has also been successfully tested with smaller models such as [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) and even some quantized versions of the two teacher models. This is the preferred data generation pipeline on higher end consumer grade hardware and all enterprise hardware. + +### Eval Pipeline + +The [eval pipeline](src/instructlab/sdg/pipelines/eval) is used to generate [MMLU](https://en.wikipedia.org/wiki/MMLU) benchmark data that can be used to later evaluate a trained model on your knowledge dataset. It does not generate data for use during model training. + +### Pipeline architecture + +All the pipelines are written in a YAML format and must adhere to a [specific schema](src/instructlab/sdg/pipelines/schema/v1.json). + +The pipelines that generate data for model training (simple and full pipelines) expect to have three different pipeline configs - one each for knowledge, grounded skills, and freeform skills. They are expected to exist in files called `knowledge.yaml`, `grounded_skills.yaml`, and `freeform_skills.yaml` respectively. For background on the difference in knowledge, grounded skills, and freeform skills, refer to the [InstructLab Taxonomy repository](https://github.com/instructlab/taxonomy). + +## Repository structure + +```bash +|-- src/instructlab/ (1) +|-- docs/ (2) +|-- scripts/ (3) +|-- tests/ (4) +``` + +1. Contains the SDG code that interacts with InstructLab. +2. Contains documentation on various SDG methodologies. +3. Contains some utility scripts, but not part of any supported API. +4. Contains all the tests for the SDG repository. 
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) diff --git a/requirements-dev.txt b/requirements-dev.txt index 6a7f352b..4086f277 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,7 +2,11 @@ -r requirements.txt +<<<<<<< HEAD pre-commit>=3.0.4,<4.0 +======= +pre-commit>=3.0.4,<5.0 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) pylint>=2.16.2,<4.0 pylint-pydantic pytest diff --git a/requirements.txt b/requirements.txt index 3d577c7a..72760fdc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 datasets>=2.18.0,<3.0.0 +<<<<<<< HEAD docling[tesserocr]>=2.4.2,<=2.8.3 docling-parse>=2.0.0,<3.0.0 +======= +docling[tesserocr]>=2.4.2,<3.0.0 +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) GitPython>=3.1.42,<4.0.0 gguf>=0.6.0 httpx>=0.25.0,<1.0.0 diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index d97cdc27..2c989751 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -33,6 +33,10 @@ PipelineContext, ) from instructlab.sdg.utils import GenerateException, models +<<<<<<< HEAD +======= +from instructlab.sdg.utils.json import jldump +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, read_taxonomy_leaf_nodes, @@ -112,6 +116,7 @@ def _gen_train_data( } messages_data.append(_convert_to_messages(sample)) +<<<<<<< HEAD with open(output_file_train, "w", encoding="utf-8") as outfile: for entry in train_data: json.dump(entry, outfile, ensure_ascii=False) @@ -121,6 +126,11 @@ def _gen_train_data( for entry in messages_data: json.dump(entry, outfile, ensure_ascii=False) outfile.write("\n") +======= + jldump(train_data, output_file_train) + + jldump(messages_data, output_file_messages) +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) def 
_knowledge_seed_example_to_test_data(seed_example, system_prompt): @@ -170,10 +180,14 @@ def _gen_test_data( } ) +<<<<<<< HEAD with open(output_file_test, "w", encoding="utf-8") as outfile: for entry in test_data: json.dump(entry, outfile, ensure_ascii=False) outfile.write("\n") +======= + jldump(test_data, output_file_test) +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) def _check_pipeline_dir(pipeline): @@ -295,7 +309,11 @@ def _mixer_init( # This is part of the public API, and used by instructlab. # TODO - parameter removal needs to be done in sync with a CLI change. +<<<<<<< HEAD # to be removed: logger, prompt_file_path, rouge_threshold, tls_* +======= +# to be removed: logger +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) def generate_data( client: openai.OpenAI, logger: logging.Logger = logger, # pylint: disable=redefined-outer-name @@ -308,10 +326,13 @@ def generate_data( taxonomy: Optional[str] = None, # TODO rename to taxonomy_path to match config taxonomy_base: Optional[str] = None, output_dir: Optional[str] = None, +<<<<<<< HEAD # TODO - not used and should be removed from the CLI prompt_file_path: Optional[str] = None, # pylint: disable=unused-argument # TODO - probably should be removed rouge_threshold: Optional[float] = None, # pylint: disable=unused-argument +======= +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) console_output=True, yaml_rules: Optional[str] = None, chunk_word_count=None, diff --git a/src/instructlab/sdg/utils/json.py b/src/instructlab/sdg/utils/json.py index 8fd25268..e6c73f61 100644 --- a/src/instructlab/sdg/utils/json.py +++ b/src/instructlab/sdg/utils/json.py @@ -1,6 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +<<<<<<< HEAD +======= +from typing import Any, Iterable +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) import io import json import os @@ -46,3 +50,19 @@ def jload(f, mode="r"): """Load a .json file into a dictionary.""" with 
_make_r_io_base(f, mode) as f_: return json.load(f_) +<<<<<<< HEAD +======= + + +def jldump(data: Iterable[Any], out: str | io.IOBase) -> None: + """Dump a list to a file in jsonl format. + + Args: + data: The data to be written. + out: io.IOBase or file path + """ + with _make_w_io_base(out, "w") as outfile: + for entry in data: + json.dump(entry, outfile, ensure_ascii=False) + outfile.write("\n") +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) diff --git a/tox.ini b/tox.ini index 7590a2ba..2d27e98a 100644 --- a/tox.ini +++ b/tox.ini @@ -73,6 +73,7 @@ allowlist_externals = sh [testenv:mypy] description = Python type checking with mypy +<<<<<<< HEAD # Note: 'mypy<1.14' by default pulls the latest 'pydantic' release as a dependency, but 'pydantic>=2.10' does not # work with 'mypy<1.14', so for compatibility purposes, we set 'pydantic<=2.9.2' deps = @@ -80,6 +81,12 @@ deps = types-PyYAML pytest pydantic<=2.9.2 +======= +deps = + mypy>=1.10.0,<2.0 + types-PyYAML + pytest +>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs) commands = mypy src