Skip to content

Commit

Permalink
Merge pull request #1272 from DDMAL/broken-link-checker
Browse files Browse the repository at this point in the history
GitHub Actions Broken Link Checker for Production
  • Loading branch information
jacobdgm authored Jan 19, 2024
2 parents 47b6078 + 209500f commit 8908e81
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 0 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/broken-link-checker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Weekly broken-link check for CantusDB production: collect every article and
# flatpage URL, then fan the list out to a matrix of lychee runs.
# NOTE(review): leading indentation was lost in the pasted source; restored
# here to the standard 2-space GitHub Actions layout — verify against the
# original commit.
name: Link checker for articles and flatpages on CantusDB

on:
  schedule:
    - cron: "8 8 * * 0" # Cron job will run at 08h08 UTC time every Sunday

jobs:
  # Scrapes two plain-text listing endpoints and publishes the URLs as a JSON
  # matrix consumed by the checker job below.
  get-all-links:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - id: set-matrix
        run: |
          BASE_URL="https://cantusdatabase.org"
          # The endpoints return space-separated URLs; awk rewrites each
          # space to '","' so the result splices into a JSON string array.
          flatpages=$(curl "$BASE_URL/flatpages-list/" | awk '{ gsub (" ", "\",\"", $0); print}')
          articles=$(curl "$BASE_URL/articles-list/" | awk '{ gsub (" ", "\",\"", $0); print}')
          list="{\"links\": [\"${flatpages}\",\"${articles}\"]}"
          echo "$list"
          echo "matrix=$list" >> "$GITHUB_OUTPUT"

  link-Checker:
    runs-on: ubuntu-latest
    needs: get-all-links
    strategy:
      fail-fast: false
      max-parallel: 4
      # Each matrix entry in "links" is one page URL from get-all-links.
      matrix: ${{ fromJson(needs.get-all-links.outputs.matrix) }}
    steps:
      - uses: actions/checkout@v3
      - name: Link Checker
        id: lychee
        # NOTE(review): the version tag was garbled in the pasted source
        # ("lycheeverse/[email protected]" — an email-obfuscation artifact).
        # "@v1" is a placeholder; restore the project's actual pinned
        # lychee-action tag from the original commit.
        uses: lycheeverse/lychee-action@v1
        with:
          # Check the matrix page's links, skipping the dead cantus.sk host.
          args: --exclude http:\/\/cantus\.sk.* ${{ matrix.links }}
          format: json
          # Path the parsing script below reads (it treats a missing file
          # as "no broken links").
          output: /tmp/link-checker-output.txt
      - name: Curating Link Checker Output
        run: |
          echo "***Python Version***"
          python --version
          echo "***Invoking Parsing Script***"
          # The script prints a Markdown summary to stdout and exits
          # non-zero only when real (4xx) broken links were found.
          python "$GITHUB_WORKSPACE/scripts/parse_link_checker_output.py" >> "$GITHUB_STEP_SUMMARY"
          echo "***Printing Summary***"
          cat "$GITHUB_STEP_SUMMARY"
64 changes: 64 additions & 0 deletions scripts/parse_link_checker_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""Parse lychee link-checker JSON output and summarize broken links.

Reads the checker's JSON report from FILE_LOCATION (its absence means a
clean run), separates genuine 4xx client errors from skippable failures
(timeouts and non-4xx statuses), prints a summary for the GitHub step
summary, and exits with status 1 only when real broken links were found.
"""
import itertools
import json
import sys
from pathlib import Path

# Path where the lychee action is configured to write its JSON report.
FILE_LOCATION = "/tmp/link-checker-output.txt"


def categorize_failures(failures):
    """Split a flat list of failure dicts into (real_errors, skippable_errors).

    A failure is "real" when its status code is a 4xx client error.
    Failures without a numeric status code (e.g. timeouts) or with a
    non-4xx code are considered skippable.
    """
    real_errors = []
    skippable_errors = []
    for failure in failures:
        error_code = failure['status'].get('code')
        if error_code and 400 <= error_code < 500:
            real_errors.append(failure)
        else:
            skippable_errors.append(failure)
    return real_errors, skippable_errors


def main():
    """Load the report, print a summary, and exit (1 iff real errors)."""
    print(f"Running: {sys.argv[0]}", file=sys.stderr)

    # If link checker does not have any errors, exit gracefully:
    # the report file only exists when lychee found problems.
    if not Path(FILE_LOCATION).exists():
        print("✅ No Broken Links Found.")
        sys.exit(0)
    print("❌ Broken Links Found. Proceeding to Parsing Step.", file=sys.stderr)

    # Loading link checker output result
    with open(FILE_LOCATION, encoding='utf-8') as link_checker_output_file:
        print(f"Parsing the json data in {FILE_LOCATION}", file=sys.stderr)
        link_checker_results = json.load(link_checker_output_file)

    # fail_map maps each checked page to the list of its failing links.
    list_of_failures = link_checker_results['fail_map']
    if not list_of_failures:
        print("✅ No Broken Links")
        sys.exit(0)

    # Flatten the list of lists into a single list -
    # fail_map values are per-page lists of failures.
    all_failures = list(itertools.chain.from_iterable(list_of_failures.values()))
    real_errors, skippable_errors = categorize_failures(all_failures)

    if real_errors:
        print("❌ Broken Links:")
        for error in real_errors:
            print(f"* {error['url']}: {error['status']['code']}")
        print("\n")

    if skippable_errors:
        print("🆗 Skippable Errors:")
        for error in skippable_errors:
            # .get: timeout entries may lack a human-readable "text" field.
            print(f"* {error['url']}: {error['status'].get('text')}")
        print("\n")

    # Non-zero exit fails the workflow step only for real breakage.
    sys.exit(1 if real_errors else 0)


if __name__ == "__main__":
    main()

0 comments on commit 8908e81

Please sign in to comment.