diff --git a/.github/workflows/broken-link-checker.yaml b/.github/workflows/broken-link-checker.yaml
new file mode 100644
index 000000000..fcd6faf0b
--- /dev/null
+++ b/.github/workflows/broken-link-checker.yaml
@@ -0,0 +1,44 @@
+name: Link checker for articles and flatpages on CantusDB
+
+on:
+  schedule:
+    - cron: "8 8 * * 0" # Runs at 08:08 UTC every Sunday
+
+jobs:
+  get-all-links:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - id: set-matrix
+        run: |
+          BASE_URL="https://cantusdatabase.org"
+          flatpages=$(curl -s "$BASE_URL/flatpages-list/" | awk '{ gsub(" ", "\",\"", $0); print }')
+          articles=$(curl -s "$BASE_URL/articles-list/" | awk '{ gsub(" ", "\",\"", $0); print }')
+          list="{\"links\": [\"${flatpages}\",\"${articles}\"]}"
+          echo "$list"
+          echo "matrix=$list" >> "$GITHUB_OUTPUT"
+  link-checker:
+    runs-on: ubuntu-latest
+    needs: get-all-links
+    strategy:
+      fail-fast: false
+      max-parallel: 4
+      matrix: ${{ fromJson(needs.get-all-links.outputs.matrix) }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: Link Checker
+        id: lychee
+        uses: lycheeverse/lychee-action@v1.8.0
+        with:
+          args: --exclude http:\/\/cantus\.sk.* ${{ matrix.links }}
+          format: json
+          output: /tmp/link-checker-output.txt
+      - name: Curating Link Checker Output
+        run: |
+          echo "***Python Version***"
+          python --version
+          echo "***Invoking Parsing Script***"
+          python "$GITHUB_WORKSPACE/scripts/parse_link_checker_output.py" >> "$GITHUB_STEP_SUMMARY"
+          echo "***Printing Summary***"
+          cat "$GITHUB_STEP_SUMMARY"
diff --git a/scripts/parse_link_checker_output.py b/scripts/parse_link_checker_output.py
new file mode 100644
index 000000000..6f46aa01d
--- /dev/null
+++ b/scripts/parse_link_checker_output.py
@@ -0,0 +1,65 @@
+"""Parse the lychee link-checker JSON output and summarize broken links."""
+import itertools
+import json
+import sys
+from pathlib import Path
+
+print(f"Running: {sys.argv[0]}", file=sys.stderr)
+
+FILE_LOCATION = "/tmp/link-checker-output.txt"
+
+# If the link checker produced no output file, there is nothing to parse: exit gracefully.
+if not Path(FILE_LOCATION).exists():
+    print("✅ No Broken Links Found.")
+    sys.exit(0)
+else:
+    print("Link checker output found. Proceeding to parsing step.", file=sys.stderr)
+
+# Load the link checker output.
+with open(FILE_LOCATION, encoding="utf-8") as link_checker_output_file:
+    print(f"Parsing the JSON data in {FILE_LOCATION}", file=sys.stderr)
+    link_checker_results = json.load(link_checker_output_file)
+
+list_of_failures = link_checker_results["fail_map"]
+
+if not list_of_failures:
+    print("✅ No Broken Links")
+    sys.exit(0)
+
+# fail_map maps each checked input to a list of failure records;
+# flatten the per-input lists into a single list.
+all_failures = list(itertools.chain.from_iterable(list_of_failures.values()))
+
+real_errors = []
+skippable_errors = []
+
+# Sort each failure into real (4xx) errors and skippable ones.
+for failure in all_failures:
+    error_code = failure["status"].get("code")
+
+    # Statuses without an HTTP code (timeouts, network errors) are skippable.
+    if not error_code:
+        skippable_errors.append(failure)
+        continue
+
+    # Treat 4xx responses as genuinely broken links.
+    if 400 <= error_code < 500:
+        real_errors.append(failure)
+    else:
+        skippable_errors.append(failure)
+
+if real_errors:
+    print("❌ Broken Links:")
+    for error in real_errors:
+        print(f"* {error['url']}: {error['status']['code']}")
+    print()
+
+if skippable_errors:
+    print("🆗 Skippable Errors:")
+    for error in skippable_errors:
+        print(f"* {error['url']}: {error['status']['text']}")
+    print()
+
+# Fail the workflow step only when genuine broken links were found.
+if real_errors:
+    sys.exit(1)
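
Note on the set-matrix step: it assumes the /flatpages-list/ and /articles-list/ endpoints each return a single line of space-separated URLs, which the awk substitution turns into a quoted, comma-separated list. The sketch below mirrors that transformation in Python so the expected matrix shape is easy to see; build_matrix is an illustrative helper, not part of the workflow, and the URLs in the example are hypothetical.

    def build_matrix(flatpages: str, articles: str) -> str:
        """Mirror the awk gsub + JSON assembly from the set-matrix step."""
        def quote(line: str) -> str:
            # Turn "a b c" into a","b","c so it slots between the outer quotes.
            return line.strip().replace(" ", '","')
        return '{"links": ["%s","%s"]}' % (quote(flatpages), quote(articles))

    # Example (hypothetical URLs):
    # build_matrix(
    #     "https://cantusdatabase.org/about https://cantusdatabase.org/contact",
    #     "https://cantusdatabase.org/articles/1",
    # )
    # -> '{"links": ["https://cantusdatabase.org/about","https://cantusdatabase.org/contact","https://cantusdatabase.org/articles/1"]}'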
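
To sanity-check scripts/parse_link_checker_output.py locally without running lychee, a small fixture can be written to /tmp/link-checker-output.txt. This is a sketch under the assumption that fail_map maps each checked page to a list of records carrying url plus a status object with an optional code and a text field - the minimal structure the script reads, not the full lychee schema; the URLs below are hypothetical.

    import json
    import subprocess

    fixture = {
        "fail_map": {
            "https://cantusdatabase.org/about": [
                # A 4xx response: counted as a real broken link.
                {"url": "https://example.org/gone", "status": {"code": 404, "text": "Not Found"}},
                # No HTTP code (e.g. a timeout): reported as skippable.
                {"url": "https://example.org/slow", "status": {"text": "Timeout"}},
            ]
        }
    }
    with open("/tmp/link-checker-output.txt", "w", encoding="utf-8") as f:
        json.dump(fixture, f)

    # Run from the repository root; exits 1 because of the 404.
    subprocess.run(["python", "scripts/parse_link_checker_output.py"], check=False)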