Pin docling-parse>=2.0.0,<3.0.0 + pin pydantic==2.9.2

`docling-parse` v3.0.0 contains breaking changes to the syntax, which is currently breaking our builds. Also, `mypy` < v1.14 pulls in the latest version of `pydantic` by default, so this commit hardcodes the correct span of `pydantic` versions that are compatible with `mypy` < v1.14. Finally, we want to pin `docling[tesserocr]>=2.4.2,<=2.8.3` due to breaking changes in v2.9.0 on 9 Dec 2024. Signed-off-by: Courtney Pacheco <[email protected]> (cherry picked from commit 2e00bb8) # Conflicts: # .github/workflows/actionlint.yml # .github/workflows/docs.yml # .github/workflows/e2e-nvidia-l4-x1.yml # .github/workflows/e2e-nvidia-l40s-x4.yml # .github/workflows/e2e-nvidia-t4-x1.yml # .github/workflows/lint.yml # .github/workflows/pypi.yaml # .github/workflows/spellcheck.yml # .github/workflows/stale_bot.yml # .github/workflows/test.yml # .spellcheck-en-custom.txt # README.md # requirements-dev.txt # requirements.txt # src/instructlab/sdg/generate_data.py # src/instructlab/sdg/utils/json.py # tox.ini
instructlab · Dec 10, 2024 · f159971 · f159971
1 parent c220b5f
commit f159971
Show file tree

Hide file tree

Showing 18 changed files with 321 additions and 0 deletions.
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
@@ -7,12 +7,20 @@ on:
       - "main"
       - "release-**"
     paths:
+<<<<<<< HEAD
+=======
+      - '.github/actions/*.ya?ml'
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*' # This workflow
   pull_request:
     branches:
       - "main"
     paths:
+<<<<<<< HEAD
+=======
+      - '.github/actions/*.ya?ml'
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*' # This workflow
 
@@ -31,7 +39,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -33,7 +33,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
       - name: "Checkout"

diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -46,7 +46,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+<<<<<<< HEAD
           aws-region: ${{ secrets.AWS_REGION }}
+=======
+          aws-region: ${{ vars.AWS_REGION }}
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
 
       - name: Start EC2 runner
         id: start-ec2-runner
@@ -130,7 +134,11 @@ jobs:
           . ../instructlab/venv/bin/activate
           pip install  -v .
 
+<<<<<<< HEAD
       - name: Check disk
+=======
+      - name: Check disk before tests
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         run: |
           df -h
   
@@ -142,6 +150,13 @@ jobs:
           . venv/bin/activate
           ./scripts/e2e-ci.sh -m
 
+<<<<<<< HEAD
+=======
+      - name: Check disk after tests
+        run: |
+          df -h
+
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
   stop-medium-ec2-runner:
     needs:
       - start-medium-ec2-runner
@@ -154,7 +169,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+<<<<<<< HEAD
           aws-region: ${{ secrets.AWS_REGION }}
+=======
+          aws-region: ${{ vars.AWS_REGION }}
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
 
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -24,7 +24,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+<<<<<<< HEAD
           aws-region: ${{ secrets.AWS_REGION }}
+=======
+          aws-region: ${{ vars.AWS_REGION }}
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
 
       - name: Start EC2 runner
         id: start-ec2-runner
@@ -160,7 +164,11 @@ jobs:
           pip install .
           pip install .[cuda]
 
+<<<<<<< HEAD
       - name: Check disk
+=======
+      - name: Check disk before tests
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         run: |
           df -h
 
@@ -172,6 +180,13 @@ jobs:
           . venv/bin/activate
           ./scripts/e2e-ci.sh -l
 
+<<<<<<< HEAD
+=======
+      - name: Check disk after tests
+        run: |
+          df -h
+
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
       - name: Add comment to PR if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'true'
         working-directory: ./sdg
@@ -191,6 +206,7 @@ jobs:
       - name: Post job results to Slack if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'false'
         id: slack-report-failure
+<<<<<<< HEAD
         uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
         with:
           # Slack channel id, channel name, or user id to post message.
@@ -200,10 +216,23 @@ jobs:
           slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+=======
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+          method: chat.postMessage
+          payload: |
+            # Slack channel id, channel name, or user id to post message.
+            # See also: https://api.slack.com/methods/chat.postMessage#channels
+            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+            channel: 'e2e-ci-results'
+            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
 
       - name: Post job results to Slack if the workflow succeeded
         if: success() && steps.check_pr.outputs.is_pr == 'false'
         id: slack-report-success
+<<<<<<< HEAD
         uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
         with:
           # Slack channel id, channel name, or user id to post message.
@@ -213,6 +242,18 @@ jobs:
           slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+=======
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+          method: chat.postMessage
+          payload: |
+            # Slack channel id, channel name, or user id to post message.
+            # See also: https://api.slack.com/methods/chat.postMessage#channels
+            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+            channel: 'e2e-ci-results'
+            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
 
   stop-large-ec2-runner:
     needs:
@@ -226,7 +267,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+<<<<<<< HEAD
           aws-region: ${{ secrets.AWS_REGION }}
+=======
+          aws-region: ${{ vars.AWS_REGION }}
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
 
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7

diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-t4-x1.yml
@@ -46,7 +46,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+<<<<<<< HEAD
           aws-region: ${{ secrets.AWS_REGION }}
+=======
+          aws-region: ${{ vars.AWS_REGION }}
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
 
       - name: Start EC2 runner
         id: start-ec2-runner
@@ -128,7 +132,11 @@ jobs:
           . ../instructlab/venv/bin/activate
           pip install .
 
+<<<<<<< HEAD
       - name: Check disk
+=======
+      - name: Check disk before tests
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         run: |
           df -h
 
@@ -138,6 +146,13 @@ jobs:
           . venv/bin/activate
           ./scripts/e2e-ci.sh -s
 
+<<<<<<< HEAD
+=======
+      - name: Check disk after tests
+        run: |
+          df -h
+
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
   stop-small-ec2-runner:
     needs:
       - start-small-ec2-runner
@@ -150,7 +165,11 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+<<<<<<< HEAD
           aws-region: ${{ secrets.AWS_REGION }}
+=======
+          aws-region: ${{ vars.AWS_REGION }}
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
 
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -63,7 +63,11 @@ jobs:
               tox -e validate-pipelines
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 

diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
@@ -37,7 +37,11 @@ jobs:
         runs-on: ubuntu-latest
         steps:
             - name: "Harden Runner"
+<<<<<<< HEAD
               uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+              uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
               with:
                   egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
@@ -67,7 +71,11 @@ jobs:
 
         steps:
             - name: "Harden Runner"
+<<<<<<< HEAD
               uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+              uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
               with:
                   egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
@@ -99,7 +107,11 @@ jobs:
 
         steps:
             - name: "Harden Runner"
+<<<<<<< HEAD
               uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+              uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
               with:
                   egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 

diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
@@ -32,7 +32,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         with:
          egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 

diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml
@@ -24,7 +24,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -40,6 +40,10 @@ jobs:
     name: "${{ matrix.python }} on ${{ matrix.platform }}"
     runs-on: "${{ matrix.platform }}"
     strategy:
+<<<<<<< HEAD
+=======
+      fail-fast: false
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
       matrix:
         python:
           - "3.10"
@@ -51,7 +55,11 @@ jobs:
             platform: "macos-latest"
     steps:
       - name: "Harden Runner"
+<<<<<<< HEAD
         uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+=======
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
@@ -89,7 +97,11 @@ jobs:
           pip cache remove llama_cpp_python
 
       - name: Cache huggingface
+<<<<<<< HEAD
         uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+=======
+        uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
+>>>>>>> 2e00bb8 (Pin `docling-parse>=2.0.0,<3.0.0` + pin `pydantic==2.9.2`)
         with:
           path: ~/.cache/huggingface
           # config contains DEFAULT_MODEL