diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index b85ad788..c66f4e09 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -7,12 +7,20 @@ on:
       - "main"
       - "release-**"
     paths:
+      - '.github/actions/*.ya?ml'
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*' # This workflow
   pull_request:
     branches:
       - "main"
     paths:
+      - '.github/actions/*.ya?ml'
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*' # This workflow
@@ -31,7 +39,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 34e2afbb..e940bd1c 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -33,7 +33,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
       - name: "Checkout"
diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
index 2f82071b..9ebcd234 100644
--- a/.github/workflows/e2e-nvidia-l4-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -46,7 +46,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Start EC2 runner
         id: start-ec2-runner
@@ -130,7 +134,11 @@ jobs:
           . ../instructlab/venv/bin/activate
           pip install -v .

-      - name: Check disk
+      - name: Check disk before tests
         run: |
           df -h

@@ -142,6 +150,13 @@ jobs:
           . venv/bin/activate
           ./scripts/e2e-ci.sh -m

+      - name: Check disk after tests
+        run: |
+          df -h
+
   stop-medium-ec2-runner:
     needs:
       - start-medium-ec2-runner
@@ -154,7 +169,11 @@
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index 59531d70..12c17eba 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -24,7 +24,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Start EC2 runner
         id: start-ec2-runner
@@ -160,7 +164,11 @@
           pip install .
           pip install .[cuda]

-      - name: Check disk
+      - name: Check disk before tests
         run: |
           df -h

@@ -172,6 +180,13 @@
           . venv/bin/activate
           ./scripts/e2e-ci.sh -l

+      - name: Check disk after tests
+        run: |
+          df -h
+
       - name: Add comment to PR if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'true'
         working-directory: ./sdg
@@ -191,6 +206,7 @@
       - name: Post job results to Slack if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'false'
         id: slack-report-failure
-        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
-        with:
-          # Slack channel id, channel name, or user id to post message.
@@ -200,10 +216,23 @@
-          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+          method: chat.postMessage
+          payload: |
+            # Slack channel id, channel name, or user id to post message.
+            # See also: https://api.slack.com/methods/chat.postMessage#channels
+            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+            channel: 'e2e-ci-results'
+            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

       - name: Post job results to Slack if the workflow succeeded
         if: success() && steps.check_pr.outputs.is_pr == 'false'
         id: slack-report-success
-        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
-        with:
-          # Slack channel id, channel name, or user id to post message.
@@ -213,6 +242,18 @@
-          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+          method: chat.postMessage
+          payload: |
+            # Slack channel id, channel name, or user id to post message.
+            # See also: https://api.slack.com/methods/chat.postMessage#channels
+            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+            channel: 'e2e-ci-results'
+            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

   stop-large-ec2-runner:
     needs:
@@ -226,7 +267,11 @@
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-t4-x1.yml
index f1e93811..a069a799 100644
--- a/.github/workflows/e2e-nvidia-t4-x1.yml
+++ b/.github/workflows/e2e-nvidia-t4-x1.yml
@@ -46,7 +46,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Start EC2 runner
         id: start-ec2-runner
@@ -128,7 +132,11 @@
           . ../instructlab/venv/bin/activate
           pip install .

-      - name: Check disk
+      - name: Check disk before tests
         run: |
           df -h

@@ -138,6 +146,13 @@
           . venv/bin/activate
           ./scripts/e2e-ci.sh -s

+      - name: Check disk after tests
+        run: |
+          df -h
+
   stop-small-ec2-runner:
     needs:
       - start-small-ec2-runner
@@ -150,7 +165,11 @@
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}

       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 039618c2..565049af 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -63,7 +63,11 @@ jobs:
           tox -e validate-pipelines
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
index 91de85f3..f53b72f1 100644
--- a/.github/workflows/pypi.yaml
+++ b/.github/workflows/pypi.yaml
@@ -37,7 +37,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

@@ -67,7 +71,11 @@
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

@@ -99,7 +107,11 @@
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
index 32b02123..146bf734 100644
--- a/.github/workflows/spellcheck.yml
+++ b/.github/workflows/spellcheck.yml
@@ -32,7 +32,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml
index b621e396..2bb7960c 100644
--- a/.github/workflows/stale_bot.yml
+++ b/.github/workflows/stale_bot.yml
@@ -24,7 +24,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
      - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 983a308b..359f0dda 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -40,6 +40,10 @@ jobs:
     name: "${{ matrix.python }} on ${{ matrix.platform }}"
     runs-on: "${{ matrix.platform }}"
     strategy:
+      fail-fast: false
       matrix:
         python:
           - "3.10"
@@ -51,7 +55,11 @@
             platform: "macos-latest"
     steps:
       - name: "Harden Runner"
-        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

@@ -89,7 +97,11 @@
           pip cache remove llama_cpp_python

       - name: Cache huggingface
-        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+        uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
         with:
           path: ~/.cache/huggingface
           # config contains DEFAULT_MODEL
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
index 2130dd21..a1f3e1e4 100644
--- a/.spellcheck-en-custom.txt
+++ b/.spellcheck-en-custom.txt
@@ -4,10 +4,21 @@ Backport
 backported
 codebase
+configs
 Dataset
 dataset
 datasets
 distractor
+Docling
+docling
+Eval
+eval
 FIXME
 freeform
 ICL
@@ -17,6 +28,7 @@ Langchain's
 LLM
 LLMBlock
 MCQ
-MMLU
-Ouput
-Pre
@@ -29,6 +41,30 @@
-Splitter
-subfolder
-Tatsu
-unchunked
+Merlinite
+Mixtral
+MMLU
+multiphase
+Ouput
+Pre
+pre
+precomputed
+Pregenerated
+qna
+quantized
+repo
+sdg
+Splitter
+subdirectory
+subfolder
+Tatsu
+Tesseract
+tokenizer
+tokenizers
+unchunked
+upsampled
 UUID
 vLLM
 yaml
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..8f56e2c6
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,17 @@
+## v0.6.1
+
+### Fixes
+
+* Fixed a bug where generating data from a taxonomy with 2 or more changed knowledge leaf nodes would fail with a message saying the destination path `already exists and is not an empty directory`
+
+## v0.6.0
+
+### Features
+
+* Small knowledge datasets will automatically get upsampled during final data mixing, based on the length of any precomputed skills datasets used during data mixing. This avoids issues where very large precomputed skills datasets were swamping the comparatively small number of knowledge samples, resulting in lower than optimal knowledge retention during multiphase training. If a large precomputed dataset isn't in use during mixing (which is how things operate by default), this change is a no-op.
+* When chunking PDF documents, we'll now look for the docling models on-disk in `$XDG_DATA_HOME/instructlab/sdg/models` (as well as `$XDG_DATA_DIRS` with the same `instructlab/sdg/models` subdirectory). If they are not found on disk, they'll automatically be downloaded from HuggingFace.
+* When chunking PDF documents with Docling, we'll automatically configure Docling to use `tesserocr` if a working implementation is found, instead of relying on `easyocr`. We fall back to `easyocr` if Tesseract is not properly configured for use by `tesserocr`.
+
+### Breaking Changes
+
+* Teacher model tokenizers are loaded from the local teacher model on-disk and are no longer downloaded automatically from HuggingFace. The typical workflows in use so far expect the teacher model to exist on-disk, and this enforces that at least its tokenizer exists.
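The on-disk model discovery described in the v0.6.0 changelog entry above amounts to a short walk over the XDG data directories. Here is a minimal sketch of that lookup order, assuming illustrative names (`find_local_docling_models` and its structure are not the library's actual implementation):

```python
import os
from pathlib import Path

# Subdirectory named in the changelog entry above.
SUBDIR = "instructlab/sdg/models"

def find_local_docling_models() -> Path | None:
    """Return the first existing docling models directory, or None."""
    # $XDG_DATA_HOME first, with the conventional default.
    candidates = [Path(os.environ.get("XDG_DATA_HOME", "~/.local/share")).expanduser() / SUBDIR]
    # Then each entry of $XDG_DATA_DIRS, with the conventional defaults.
    data_dirs = os.environ.get("XDG_DATA_DIRS", "/usr/local/share:/usr/share")
    candidates += [Path(d) / SUBDIR for d in data_dirs.split(":") if d]
    for candidate in candidates:
        if candidate.is_dir():
            return candidate
    return None  # caller falls back to downloading the models from HuggingFace
```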
diff --git a/README.md b/README.md
index 8752dceb..5eb85c3c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,8 @@
-# sdg
+# Synthetic Data Generation (SDG)

 ![Lint](https://github.com/instructlab/sdg/actions/workflows/lint.yml/badge.svg?branch=main)
 ![Build](https://github.com/instructlab/sdg/actions/workflows/pypi.yaml/badge.svg?branch=main)
@@ -10,3 +14,72 @@
 ![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/sdg/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main)

 Python library for Synthetic Data Generation
+
+## Introduction
+
+Synthetic Data Generation (SDG) is a process that creates an artificially generated dataset that mimics real data based on provided examples. SDG uses a YAML file containing question-and-answer pairs as input data.
+
+## Installing the SDG library
+
+Clone the library and navigate to the repo:
+
+```bash
+git clone https://github.com/instructlab/sdg
+cd sdg
+```
+
+Install the library:
+
+```bash
+pip install .
+```
+
+### Using the library
+
+You can import SDG into your Python files with the following items:
+
+```python
+from instructlab.sdg.generate_data import generate_data
+from instructlab.sdg.utils import GenerateException
+```
+
+## Pipelines
+
+A pipeline is a series of steps to execute in order to generate data.
+
+There are three default pipelines shipped in SDG: `simple`, `full`, and `eval`. Each pipeline has its own hardware requirements.
+
+### Simple Pipeline
+
+The [simple pipeline](src/instructlab/sdg/pipelines/simple) is designed to be used with [quantized Merlinite](https://huggingface.co/instructlab/merlinite-7b-lab-GGUF) as the teacher model. It enables basic data generation results on low-end consumer-grade hardware, such as laptops and desktops with small or no discrete GPUs.
+
+### Full Pipeline
+
+The [full pipeline](src/instructlab/sdg/pipelines/full) is designed to be used with [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) as the teacher model, but it has also been successfully tested with smaller models such as [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) and even some quantized versions of the two teacher models. This is the preferred data generation pipeline on higher-end consumer-grade hardware and all enterprise hardware.
+
+### Eval Pipeline
+
+The [eval pipeline](src/instructlab/sdg/pipelines/eval) is used to generate [MMLU](https://en.wikipedia.org/wiki/MMLU) benchmark data that can be used to later evaluate a trained model on your knowledge dataset. It does not generate data for use during model training.
+
+### Pipeline architecture
+
+All the pipelines are written in a YAML format and must adhere to a [specific schema](src/instructlab/sdg/pipelines/schema/v1.json).
+
+The pipelines that generate data for model training (the simple and full pipelines) expect three different pipeline configs - one each for knowledge, grounded skills, and freeform skills. They are expected to exist in files called `knowledge.yaml`, `grounded_skills.yaml`, and `freeform_skills.yaml` respectively. For background on the difference between knowledge, grounded skills, and freeform skills, refer to the [InstructLab Taxonomy repository](https://github.com/instructlab/taxonomy).
+
+## Repository structure
+
+```bash
+|-- src/instructlab/ (1)
+|-- docs/ (2)
+|-- scripts/ (3)
+|-- tests/ (4)
+```
+
+1. Contains the SDG code that interacts with InstructLab.
+2. Contains documentation on various SDG methodologies.
+3. Contains utility scripts that are not part of any supported API.
+4. Contains all the tests for the SDG repository.
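To connect the pieces in the README above, a minimal invocation of `generate_data` might look like the sketch below. It only uses parameters that are visible in the `generate_data` signature later in this diff; the endpoint URL, paths, and `taxonomy_base` value are placeholders rather than prescribed values:

```python
import logging

import openai

from instructlab.sdg.generate_data import generate_data
from instructlab.sdg.utils import GenerateException

# Placeholder endpoint for a locally served teacher model (e.g. via vLLM).
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
log = logging.getLogger(__name__)

try:
    generate_data(
        client,
        logger=log,
        taxonomy="taxonomy",          # path to a local taxonomy checkout
        taxonomy_base="origin/main",  # base used to detect changed leaf nodes
        output_dir="generated",
        chunk_word_count=1000,
        console_output=True,
    )
except GenerateException as exc:
    log.error("generation failed: %s", exc)
```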
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6a7f352b..342ff18b 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -2,7 +2,11 @@

 -r requirements.txt

-pre-commit>=3.0.4,<4.0
+pre-commit>=3.0.4,<5.0
 pylint>=2.16.2,<4.0
 pylint-pydantic
 pytest
diff --git a/requirements.txt b/requirements.txt
index 7984751c..48a7b5ca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
-docling[tesserocr]>=2.4.2,<3.0.0
+docling[tesserocr]>=2.4.2,<=2.8.3
+docling-parse>=2.0.0,<3.0.0
 GitPython>=3.1.42,<4.0.0
 gguf>=0.6.0
 httpx>=0.25.0,<1.0.0
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index d97cdc27..8d954af3 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -33,6 +33,10 @@
     PipelineContext,
 )
 from instructlab.sdg.utils import GenerateException, models
+from instructlab.sdg.utils.json import jldump
 from instructlab.sdg.utils.taxonomy import (
     leaf_node_to_samples,
     read_taxonomy_leaf_nodes,
@@ -112,6 +116,7 @@
         }
         messages_data.append(_convert_to_messages(sample))

-    with open(output_file_train, "w", encoding="utf-8") as outfile:
-        for entry in train_data:
-            json.dump(entry, outfile, ensure_ascii=False)
@@ -121,6 +126,11 @@
-        for entry in messages_data:
-            json.dump(entry, outfile, ensure_ascii=False)
-            outfile.write("\n")
+    jldump(train_data, output_file_train)
+
+    jldump(messages_data, output_file_messages)


 def _knowledge_seed_example_to_test_data(seed_example, system_prompt):
@@ -170,10 +180,14 @@
             }
         )

-    with open(output_file_test, "w", encoding="utf-8") as outfile:
-        for entry in test_data:
-            json.dump(entry, outfile, ensure_ascii=False)
-            outfile.write("\n")
+    jldump(test_data, output_file_test)


 def _check_pipeline_dir(pipeline):
@@ -295,7 +309,11 @@ def _mixer_init(
 # This is part of the public API, and used by instructlab.
 # TODO - parameter removal needs to be done in sync with a CLI change.
-# to be removed: logger, prompt_file_path, rouge_threshold, tls_*
+# to be removed: logger
 def generate_data(
     client: openai.OpenAI,
     logger: logging.Logger = logger,  # pylint: disable=redefined-outer-name
@@ -308,10 +326,13 @@
     taxonomy: Optional[str] = None,  # TODO rename to taxonomy_path to match config
     taxonomy_base: Optional[str] = None,
     output_dir: Optional[str] = None,
-    # TODO - not used and should be removed from the CLI
-    prompt_file_path: Optional[str] = None,  # pylint: disable=unused-argument
-    # TODO - probably should be removed
-    rouge_threshold: Optional[float] = None,  # pylint: disable=unused-argument
     console_output=True,
     yaml_rules: Optional[str] = None,
     chunk_word_count=None,
diff --git a/src/instructlab/sdg/utils/json.py b/src/instructlab/sdg/utils/json.py
index 8fd25268..8bbae4e9 100644
--- a/src/instructlab/sdg/utils/json.py
+++ b/src/instructlab/sdg/utils/json.py
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0

 # Standard
+from typing import Any, Iterable
 import io
 import json
 import os
@@ -46,3 +50,19 @@
 def jload(f, mode="r"):
     """Load a .json file into a dictionary."""
     with _make_r_io_base(f, mode) as f_:
         return json.load(f_)
+
+
+def jldump(data: Iterable[Any], out: str | io.IOBase) -> None:
+    """Dump an iterable to a file in jsonl format.
+
+    Args:
+        data: The entries to be written, one JSON object per line.
+        out: io.IOBase or file path
+    """
+    with _make_w_io_base(out, "w") as outfile:
+        for entry in data:
+            json.dump(entry, outfile, ensure_ascii=False)
+            outfile.write("\n")
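The new `jldump` helper above consolidates the hand-rolled dump loops removed from `generate_data.py` earlier in this diff. A quick usage sketch, with an illustrative file name and sample records:

```python
from instructlab.sdg.utils.json import jldump

samples = [
    {"id": 1, "messages": [{"role": "user", "content": "What is SDG?"}]},
    {"id": 2, "messages": [{"role": "user", "content": "Name one pipeline."}]},
]

# Writes one JSON object per line (JSON Lines). Non-ASCII text is preserved
# because the helper passes ensure_ascii=False to json.dump.
jldump(samples, "train_gen.jsonl")
```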
diff --git a/tox.ini b/tox.ini
index 723212a9..e41069cc 100644
--- a/tox.ini
+++ b/tox.ini
@@ -73,10 +73,20 @@
 allowlist_externals = sh

 [testenv:mypy]
 description = Python type checking with mypy
-deps =
-    mypy>=1.10.0,<2.0
-    types-PyYAML
-    pytest
+# Note: 'mypy<1.14' by default pulls in the latest 'pydantic' release as a dependency, but 'pydantic>=2.10' does not
+# work with 'mypy<1.14', so for compatibility purposes we pin 'pydantic<=2.9.2'
+deps =
+    mypy>=1.10.0,<1.14
+    types-PyYAML
+    pytest
+    pydantic<=2.9.2
 commands =
     mypy src
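Finally, the Docling OCR fallback noted in the v0.6.0 changelog can be pictured with the sketch below. The probe is an assumption about how such a check could work, not the library's actual code:

```python
# Illustrative sketch of the OCR engine selection described in the changelog;
# the function name and probe are assumptions, not the library's actual code.
def choose_ocr_engine() -> str:
    """Prefer tesserocr when it is usable, otherwise fall back to easyocr."""
    try:
        # Importing tesserocr fails outright when the native Tesseract
        # libraries are missing; probing the API catches misconfiguration
        # such as missing language data.
        import tesserocr

        tesserocr.get_languages()  # raises if tessdata is not set up
        return "tesserocr"
    except Exception:
        return "easyocr"
```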