From 5b5072a30b53a9860111f51598dd6cb7640a58e4 Mon Sep 17 00:00:00 2001
From: Ben Browning <bbrownin@redhat.com>
Date: Tue, 10 Dec 2024 12:23:59 -0500
Subject: [PATCH] Loosen up md-lint so we can write changelogs

Signed-off-by: Ben Browning <bbrownin@redhat.com>
(cherry picked from commit fe31e7d3f8cddbd9d0e8e1b38094efde68bad79d)

# Conflicts:
#	.github/workflows/actionlint.yml
#	.github/workflows/docs.yml
#	.github/workflows/e2e-nvidia-l4-x1.yml
#	.github/workflows/e2e-nvidia-l40s-x4.yml
#	.github/workflows/e2e-nvidia-t4-x1.yml
#	.github/workflows/lint.yml
#	.github/workflows/pypi.yaml
#	.github/workflows/spellcheck.yml
#	.github/workflows/stale_bot.yml
#	.github/workflows/test.yml
#	.markdownlint-cli2.yaml
#	.spellcheck-en-custom.txt
#	README.md
#	requirements-dev.txt
#	requirements.txt
#	src/instructlab/sdg/generate_data.py
#	src/instructlab/sdg/utils/json.py
#	tox.ini
---
 .github/workflows/actionlint.yml         | 12 ++++
 .github/workflows/docs.yml               |  4 ++
 .github/workflows/e2e-nvidia-l4-x1.yml   | 11 ++++
 .github/workflows/e2e-nvidia-l40s-x4.yml | 37 ++++++++++++
 .github/workflows/e2e-nvidia-t4-x1.yml   | 11 ++++
 .github/workflows/lint.yml               |  4 ++
 .github/workflows/pypi.yaml              | 20 +++++++
 .github/workflows/spellcheck.yml         |  4 ++
 .github/workflows/stale_bot.yml          |  4 ++
 .github/workflows/test.yml               | 12 ++++
 .markdownlint-cli2.yaml                  |  8 +++
 .spellcheck-en-custom.txt                | 36 ++++++++++++
 README.md                                | 73 ++++++++++++++++++++++++
 requirements-dev.txt                     |  4 ++
 requirements.txt                         |  4 ++
 src/instructlab/sdg/generate_data.py     | 21 +++++++
 src/instructlab/sdg/utils/json.py        | 20 +++++++
 tox.ini                                  |  7 +++
 18 files changed, 292 insertions(+)

diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index b85ad788..87c3f88e 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -7,12 +7,20 @@ on:
       - "main"
       - "release-**"
     paths:
+<<<<<<< HEAD
+=======
+      - '.github/actions/*.ya?ml'
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*' # This workflow
   pull_request:
     branches:
       - "main"
     paths:
+<<<<<<< HEAD
+=======
+      - '.github/actions/*.ya?ml'
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*' # This workflow
 
@@ -31,7 +39,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 34e2afbb..2107eb1e 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -33,7 +33,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
       - name: "Checkout"
diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
index 63f4ff8a..bf84733b 100644
--- a/.github/workflows/e2e-nvidia-l4-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -130,7 +130,11 @@ jobs:
           . ../instructlab/venv/bin/activate
           pip install  -v .
 
+<<<<<<< HEAD
       - name: Check disk
+=======
+      - name: Check disk before tests
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         run: |
           df -h
   
@@ -142,6 +146,13 @@ jobs:
           . venv/bin/activate
           ./scripts/e2e-ci.sh -m
 
+<<<<<<< HEAD
+=======
+      - name: Check disk after tests
+        run: |
+          df -h
+
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
   stop-medium-ec2-runner:
     needs:
       - start-medium-ec2-runner
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index 5f5d5c5c..d2d96410 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -160,7 +160,11 @@ jobs:
           pip install .
           pip install .[cuda]
 
+<<<<<<< HEAD
       - name: Check disk
+=======
+      - name: Check disk before tests
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         run: |
           df -h
 
@@ -172,6 +176,13 @@ jobs:
           . venv/bin/activate
           ./scripts/e2e-ci.sh -l
 
+<<<<<<< HEAD
+=======
+      - name: Check disk after tests
+        run: |
+          df -h
+
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
       - name: Add comment to PR if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'true'
         working-directory: ./sdg
@@ -191,6 +202,7 @@ jobs:
       - name: Post job results to Slack if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'false'
         id: slack-report-failure
+<<<<<<< HEAD
         uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
         with:
           # Slack channel id, channel name, or user id to post message.
@@ -200,10 +212,23 @@ jobs:
           slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+=======
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+          method: chat.postMessage
+          payload: |
+            # Slack channel id, channel name, or user id to post message.
+            # See also: https://api.slack.com/methods/chat.postMessage#channels
+            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+            channel: 'e2e-ci-results'
+            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 
       - name: Post job results to Slack if the workflow succeeded
         if: success() && steps.check_pr.outputs.is_pr == 'false'
         id: slack-report-success
+<<<<<<< HEAD
         uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
         with:
           # Slack channel id, channel name, or user id to post message.
@@ -213,6 +238,18 @@ jobs:
           slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+=======
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+          method: chat.postMessage
+          payload: |
+            # Slack channel id, channel name, or user id to post message.
+            # See also: https://api.slack.com/methods/chat.postMessage#channels
+            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+            channel: 'e2e-ci-results'
+            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 
   stop-large-ec2-runner:
     needs:
diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-t4-x1.yml
index 3814da44..455e7eb4 100644
--- a/.github/workflows/e2e-nvidia-t4-x1.yml
+++ b/.github/workflows/e2e-nvidia-t4-x1.yml
@@ -128,7 +128,11 @@ jobs:
           . ../instructlab/venv/bin/activate
           pip install .
 
+<<<<<<< HEAD
       - name: Check disk
+=======
+      - name: Check disk before tests
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         run: |
           df -h
 
@@ -138,6 +142,13 @@ jobs:
           . venv/bin/activate
           ./scripts/e2e-ci.sh -s
 
+<<<<<<< HEAD
+=======
+      - name: Check disk after tests
+        run: |
+          df -h
+
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
   stop-small-ec2-runner:
     needs:
       - start-small-ec2-runner
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 039618c2..5426209b 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -63,7 +63,11 @@ jobs:
               tox -e validate-pipelines
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
index 91de85f3..8121373d 100644
--- a/.github/workflows/pypi.yaml
+++ b/.github/workflows/pypi.yaml
@@ -37,7 +37,11 @@ jobs:
         runs-on: ubuntu-latest
         steps:
             - name: "Harden Runner"
+<<<<<<< HEAD
               uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+              uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
               with:
                   egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
@@ -67,7 +71,11 @@ jobs:
 
         steps:
             - name: "Harden Runner"
+<<<<<<< HEAD
               uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+              uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
               with:
                   egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
@@ -78,7 +86,11 @@ jobs:
                   path: dist
 
             - name: "Upload to Test PyPI"
+<<<<<<< HEAD
               uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2
+=======
+              uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
               with:
                   repository-url: https://test.pypi.org/legacy/
 
@@ -99,7 +111,11 @@ jobs:
 
         steps:
             - name: "Harden Runner"
+<<<<<<< HEAD
               uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+              uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
               with:
                   egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
@@ -130,4 +146,8 @@ jobs:
                   rm ./dist/*.sigstore.json
 
             - name: "Upload to PyPI"
+<<<<<<< HEAD
               uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2
+=======
+              uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
index 32b02123..8c0d5eee 100644
--- a/.github/workflows/spellcheck.yml
+++ b/.github/workflows/spellcheck.yml
@@ -32,7 +32,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         with:
          egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml
index b621e396..db3fd0b4 100644
--- a/.github/workflows/stale_bot.yml
+++ b/.github/workflows/stale_bot.yml
@@ -24,7 +24,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 983a308b..e12b4ee1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -40,6 +40,10 @@ jobs:
     name: "${{ matrix.python }} on ${{ matrix.platform }}"
     runs-on: "${{ matrix.platform }}"
     strategy:
+<<<<<<< HEAD
+=======
+      fail-fast: false
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
       matrix:
         python:
           - "3.10"
@@ -51,7 +55,11 @@ jobs:
             platform: "macos-latest"
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
@@ -89,7 +97,11 @@ jobs:
           pip cache remove llama_cpp_python
 
       - name: Cache huggingface
+<<<<<<< HEAD
         uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+=======
+        uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
         with:
           path: ~/.cache/huggingface
           # config contains DEFAULT_MODEL
diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml
index c248cb45..45e72c76 100644
--- a/.markdownlint-cli2.yaml
+++ b/.markdownlint-cli2.yaml
@@ -7,10 +7,18 @@ config:
   code-block-style: false
   no-duplicate-header: false
   single-trailing-newline: false
+<<<<<<< HEAD
+=======
+  no-duplicate-heading: false
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 globs:
   - "**/*.md"
 ignores:
   - ".github/**"
+<<<<<<< HEAD
+=======
+  - ".tox/**"
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
   - "venv/**"
   - ".venv/**"
   - "**/testdata/**"
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
index 2130dd21..60fbe180 100644
--- a/.spellcheck-en-custom.txt
+++ b/.spellcheck-en-custom.txt
@@ -4,10 +4,21 @@
 Backport
 backported
 codebase
+<<<<<<< HEAD
+=======
+configs
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 Dataset
 dataset
 datasets
 distractor
+<<<<<<< HEAD
+=======
+Docling
+docling
+Eval
+eval
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 FIXME
 freeform
 ICL
@@ -17,6 +28,7 @@ Langchain's
 LLM
 LLMBlock
 MCQ
+<<<<<<< HEAD
 MMLU
 Ouput
 Pre
@@ -29,6 +41,30 @@ Splitter
 subfolder
 Tatsu
 unchunked
+=======
+Merlinite
+Mixtral
+MMLU
+multiphase
+Ouput
+Pre
+pre
+precomputed
+Pregenerated
+qna
+quantized
+repo
+sdg
+Splitter
+subdirectory
+subfolder
+Tatsu
+Tesseract
+tokenizer
+tokenizers
+unchunked
+upsampled
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 UUID
 vLLM
 yaml
diff --git a/README.md b/README.md
index 8752dceb..1e750855 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,8 @@
+<<<<<<< HEAD
 # sdg
+=======
+# Synthetic Data Generation (SDG)
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 
 ![Lint](https://github.com/instructlab/sdg/actions/workflows/lint.yml/badge.svg?branch=main)
 ![Build](https://github.com/instructlab/sdg/actions/workflows/pypi.yaml/badge.svg?branch=main)
@@ -10,3 +14,72 @@
 ![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/sdg/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main)
 
 Python library for Synthetic Data Generation
+<<<<<<< HEAD
+=======
+
+## Introduction
+
+Synthetic Data Generation (SDG) is a process that creates an artificially generated dataset that mimics real data based on provided examples. SDG uses a YAML file containing question-and-answer pairs as input data.
+
+## Installing the SDG library
+
+Clone the library and navigate to the repo:
+
+```bash
+git clone https://github.com/instructlab/sdg
+cd sdg
+```
+
+Install the library:
+
+```bash
+pip install .
+```
+
+### Using the library
+
+You can import SDG into your Python files with the following items:
+
+```python
+from instructlab.sdg.generate_data import generate_data
+from instructlab.sdg.utils import GenerateException
+```
+
+## Pipelines
+
+A pipeline is a series of steps to execute in order to generate data.
+
+There are three default pipelines shipped in SDG: `simple`, `full`, and `eval`. Each pipeline has specific hardware requirements.
+
+### Simple Pipeline
+
+The [simple pipeline](src/instructlab/sdg/pipelines/simple) is designed to be used with [quantized Merlinite](https://huggingface.co/instructlab/merlinite-7b-lab-GGUF) as the teacher model. It enables basic data generation results on low-end consumer grade hardware, such as laptops and desktops with small or no discrete GPUs.
+
+### Full Pipeline
+
+The [full pipeline](src/instructlab/sdg/pipelines/full) is designed to be used with [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) as the teacher model, but has also been successfully tested with smaller models such as [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) and even some quantized versions of the two teacher models. This is the preferred data generation pipeline on higher end consumer grade hardware and all enterprise hardware.
+
+### Eval Pipeline
+
+The [eval pipeline](src/instructlab/sdg/pipelines/eval) is used to generate [MMLU](https://en.wikipedia.org/wiki/MMLU) benchmark data that can be used to later evaluate a trained model on your knowledge dataset. It does not generate data for use during model training.
+
+### Pipeline architecture
+
+All the pipelines are written in a YAML format and must adhere to a [specific schema](src/instructlab/sdg/pipelines/schema/v1.json).
+
+The pipelines that generate data for model training (simple and full pipelines) expect to have three different pipeline configs - one each for knowledge, grounded skills, and freeform skills. They are expected to exist in files called `knowledge.yaml`, `grounded_skills.yaml`, and `freeform_skills.yaml` respectively. For background on the difference in knowledge, grounded skills, and freeform skills, refer to the [InstructLab Taxonomy repository](https://github.com/instructlab/taxonomy).
+
+## Repository structure
+
+```bash
+|-- src/instructlab/ (1)
+|-- docs/ (2)
+|-- scripts/ (3)
+|-- tests/ (4)
+```
+
+1. Contains the SDG code that interacts with InstructLab.
+2. Contains documentation on various SDG methodologies.
+3. Contains utility scripts that are not part of any supported API.
+4. Contains all the tests for the SDG repository.
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6a7f352b..4086f277 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -2,7 +2,11 @@
 
 -r requirements.txt
 
+<<<<<<< HEAD
 pre-commit>=3.0.4,<4.0
+=======
+pre-commit>=3.0.4,<5.0
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 pylint>=2.16.2,<4.0
 pylint-pydantic
 pytest
diff --git a/requirements.txt b/requirements.txt
index 3d577c7a..72760fdc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
+<<<<<<< HEAD
 docling[tesserocr]>=2.4.2,<=2.8.3
 docling-parse>=2.0.0,<3.0.0
+=======
+docling[tesserocr]>=2.4.2,<3.0.0
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 GitPython>=3.1.42,<4.0.0
 gguf>=0.6.0
 httpx>=0.25.0,<1.0.0
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index d97cdc27..2c989751 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -33,6 +33,10 @@
     PipelineContext,
 )
 from instructlab.sdg.utils import GenerateException, models
+<<<<<<< HEAD
+=======
+from instructlab.sdg.utils.json import jldump
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 from instructlab.sdg.utils.taxonomy import (
     leaf_node_to_samples,
     read_taxonomy_leaf_nodes,
@@ -112,6 +116,7 @@ def _gen_train_data(
             }
             messages_data.append(_convert_to_messages(sample))
 
+<<<<<<< HEAD
     with open(output_file_train, "w", encoding="utf-8") as outfile:
         for entry in train_data:
             json.dump(entry, outfile, ensure_ascii=False)
@@ -121,6 +126,11 @@ def _gen_train_data(
         for entry in messages_data:
             json.dump(entry, outfile, ensure_ascii=False)
             outfile.write("\n")
+=======
+    jldump(train_data, output_file_train)
+
+    jldump(messages_data, output_file_messages)
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 
 
 def _knowledge_seed_example_to_test_data(seed_example, system_prompt):
@@ -170,10 +180,14 @@ def _gen_test_data(
                 }
             )
 
+<<<<<<< HEAD
     with open(output_file_test, "w", encoding="utf-8") as outfile:
         for entry in test_data:
             json.dump(entry, outfile, ensure_ascii=False)
             outfile.write("\n")
+=======
+        jldump(test_data, output_file_test)
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 
 
 def _check_pipeline_dir(pipeline):
@@ -295,7 +309,11 @@ def _mixer_init(
 
 # This is part of the public API, and used by instructlab.
 # TODO - parameter removal needs to be done in sync with a CLI change.
+<<<<<<< HEAD
 # to be removed: logger, prompt_file_path, rouge_threshold, tls_*
+=======
+# to be removed: logger
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 def generate_data(
     client: openai.OpenAI,
     logger: logging.Logger = logger,  # pylint: disable=redefined-outer-name
@@ -308,10 +326,13 @@ def generate_data(
     taxonomy: Optional[str] = None,  # TODO rename to taxonomy_path to match config
     taxonomy_base: Optional[str] = None,
     output_dir: Optional[str] = None,
+<<<<<<< HEAD
     # TODO - not used and should be removed from the CLI
     prompt_file_path: Optional[str] = None,  # pylint: disable=unused-argument
     # TODO - probably should be removed
     rouge_threshold: Optional[float] = None,  # pylint: disable=unused-argument
+=======
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
     console_output=True,
     yaml_rules: Optional[str] = None,
     chunk_word_count=None,
diff --git a/src/instructlab/sdg/utils/json.py b/src/instructlab/sdg/utils/json.py
index 8fd25268..e6c73f61 100644
--- a/src/instructlab/sdg/utils/json.py
+++ b/src/instructlab/sdg/utils/json.py
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Standard
+<<<<<<< HEAD
+=======
+from typing import Any, Iterable
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 import io
 import json
 import os
@@ -46,3 +50,19 @@ def jload(f, mode="r"):
     """Load a .json file into a dictionary."""
     with _make_r_io_base(f, mode) as f_:
         return json.load(f_)
+<<<<<<< HEAD
+=======
+
+
+def jldump(data: Iterable[Any], out: str | io.IOBase) -> None:
+    """Dump a list to a file in jsonl format.
+
+    Args:
+        data: An iterable of entries to be written, one JSON document per line.
+        out: io.IOBase or file path
+    """
+    with _make_w_io_base(out, "w") as outfile:
+        for entry in data:
+            json.dump(entry, outfile, ensure_ascii=False)
+            outfile.write("\n")
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
diff --git a/tox.ini b/tox.ini
index 7590a2ba..2d27e98a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -73,6 +73,7 @@ allowlist_externals = sh
 
 [testenv:mypy]
 description = Python type checking with mypy
+<<<<<<< HEAD
 # Note: 'mypy<1.14' by default pulls the latest 'pydantic' release as a dependency, but 'pydantic>=2.10' does not
 # work with 'mypy<1.14', so for compatibility purposes, we set 'pydantic<=2.9.2'
 deps =
@@ -80,6 +81,12 @@ deps =
   types-PyYAML
   pytest
   pydantic<=2.9.2
+=======
+deps =
+  mypy>=1.10.0,<2.0
+  types-PyYAML
+  pytest
+>>>>>>> fe31e7d (Loosen up md-lint so we can write changelogs)
 commands =
   mypy src