diff --git a/.github/workflows/broken-link-checker.yaml b/.github/workflows/broken-link-checker.yaml new file mode 100644 index 000000000..fcd6faf0b --- /dev/null +++ b/.github/workflows/broken-link-checker.yaml @@ -0,0 +1,44 @@ +name: Link checker for articles and flatpages on CantusDB + +on: + schedule: + - cron: "8 8 * * 0" # Cron job will run at 08h08 UTC time every Sunday + +jobs: + get-all-links: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - id: set-matrix + run: | + BASE_URL="https://cantusdatabase.org" + flatpages=$(curl "$BASE_URL/flatpages-list/" | awk '{ gsub (" ", "\",\"", $0); print}') + articles=$(curl "$BASE_URL/articles-list/" | awk '{ gsub (" ", "\",\"", $0); print}') + list="{\"links\": [\"${flatpages}\",\"${articles}\"]}" + echo $list + echo "matrix=$list" >> $GITHUB_OUTPUT + link-Checker: + runs-on: ubuntu-latest + needs: get-all-links + strategy: + fail-fast: false + max-parallel: 4 + matrix: ${{fromJson(needs.get-all-links.outputs.matrix)}} + steps: + - uses: actions/checkout@v3 + - name: Link Checker + id: lychee + uses: lycheeverse/lychee-action@v1.8.0 + with: + args: --exclude http:\/\/cantus\.sk.* ${{ matrix.links }} + format: json + output: /tmp/link-checker-output.txt + - name: Curating Link Checker Output + run: | + echo "***Python Version***" + python --version + echo "***Invoking Parsing Script***" + python "$GITHUB_WORKSPACE/scripts/parse_link_checker_output.py" >> $GITHUB_STEP_SUMMARY + echo "***Printing Summary***" + cat $GITHUB_STEP_SUMMARY \ No newline at end of file diff --git a/.gitignore b/.gitignore index f9d5c63dd..4087c52d7 100644 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,7 @@ django/cantusdb_project/Drupal_scripts # in case you're working on importing data from a database dump *.sql + +# cached files for APIs +api_cache/ + diff --git a/cron/cron.txt b/cron/cron.txt index bc279a4c8..b37423519 100644 --- a/cron/cron.txt +++ b/cron/cron.txt @@ -7,4 +7,4 @@ # min hour day month weekday command 0 4 * * * bash /home/ubuntu/code/CantusDB/cron/postgres/db_backup.sh 40 4 1 * * bash /home/ubuntu/code/CantusDB/cron/management/manage.sh populate_next_chant_fields; bash /home/ubuntu/code/CantusDB/cron/management/manage.sh populate_is_last_chant_in_feast -50 4 * * 7 /usr/local/bin/docker-compose exec nginx lego --path /etc/nginx/ssl/live -d cantusdatabase.org -d www.cantusdatabase.org -d mass.cantusdatabase.org --http --http.webroot /var/www/lego/ renew --days 45 --renew-hook "nginx -s reload" +50 4 * * 7 /usr/local/bin/docker-compose -f /home/ubuntu/code/CantusDB/docker-compose.yml exec -T nginx lego --path /etc/nginx/ssl/live -d cantusdatabase.org -d www.cantusdatabase.org -d mass.cantusdatabase.org -m updateme@example.com --http --http.webroot /var/www/lego/ renew --days 45 --renew-hook "nginx -s reload" diff --git a/django/cantusdb_project/break_json.py b/django/cantusdb_project/break_json.py deleted file mode 100644 index a2a61d07b..000000000 --- a/django/cantusdb_project/break_json.py +++ /dev/null @@ -1,60 +0,0 @@ -import ijson.backends.yajl2_c as ijson -import json -import os - -CWD = os.getcwd() -LARGE_JSON = os.path.join( - CWD, "main_app/fixtures/chant_fixture.json" -) # path to the large json file -TARGET_PATH = os.path.join( - CWD, "main_app/fixtures/chant_fixtures" -) # directory to put generated smaller json file -if not os.path.exists(TARGET_PATH): - os.makedirs(TARGET_PATH) -RESULT_BATCH_SIZE = 100 # how many chants in a resulting smaller json file - - -# to count how many chants are in 
the iterator -# to do this, it has to iterate till the end -def count_iterator(i): - return sum(1 for e in i) - - -file = open(LARGE_JSON, "rb") -chants = ijson.items(file, "item") -number_of_chants = count_iterator(chants) -print("total chants: ", number_of_chants) # 492794 chants -file.close() -# in order to use the elements in the iterator, -# we have to open it again -file = open(LARGE_JSON, "rb") -chants = ijson.items(file, "item") -chant_list = [] -for x in range(number_of_chants + 1): - # +1 in order to push to the limit and raise StopIteration - try: - chant = next(chants) # chant is a dict - chant_list.append(chant) - if (x + 1) % RESULT_BATCH_SIZE == 0: - print("processing chant: ", x) - with open( - os.path.join( - TARGET_PATH, - "chant_fixture_{}.json".format(int((x + 1) / RESULT_BATCH_SIZE)), - ), - "w", - ) as f: - json.dump(chant_list, f, indent=2, separators=(", ", ": ")) - chant_list = [] - except StopIteration: - print("StopIteration_Raised") - with open( - os.path.join( - TARGET_PATH, - "chant_fixture_{}.json".format(int((x + 1) / RESULT_BATCH_SIZE + 1)), - ), - "w", - ) as f: - json.dump(chant_list, f, indent=2, separators=(", ", ": ")) - chant_list = [] -file.close() diff --git a/django/cantusdb_project/create_fixtures.sh b/django/cantusdb_project/create_fixtures.sh deleted file mode 100644 index dd24a61eb..000000000 --- a/django/cantusdb_project/create_fixtures.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -# This script runs the commands necessary to export fixtures from a working version -# of CantusDB, putting them all in a single folder, fixtures/ . -# They can then be unpacked in a fresh clone of CantusDB by running break_json.py -# followed by load_fixtures.sh . -# This file should be in the same directory as manage.py . -# Run this file inside the django container with `bash create_fixtures.sh`. 
- -mkdir fixtures - -echo "creating group_fixture.json" -python manage.py dumpdata auth.Group -o fixtures/group_fixture.json --indent 4 - -echo "creating user_fixture.json" -python manage.py dumpdata users.User -o fixtures/user_fixture.json --indent 4 - -echo "creating flatpage_fixture.json" -python manage.py dumpdata flatpages -o fixtures/flatpage_fixture.json --indent 4 - -echo "creating article_fixture.json" -python manage.py dumpdata articles.Article -o fixtures/article_fixture.json --indent 4 - -echo "creating office_fixture.json" -python manage.py dumpdata main_app.Office -o fixtures/office_fixture.json --indent 4 - -echo "creating genre_fixture.json" -python manage.py dumpdata main_app.Genre -o fixtures/genre_fixture.json --indent 4 - -echo "creating feast_fixture.json" -python manage.py dumpdata main_app.Feast -o fixtures/feast_fixture.json --indent 4 - -echo "creating notation_fixture.json" -python manage.py dumpdata main_app.Notation -o fixtures/notation_fixture.json --indent 4 - -echo "creating century_fixture.json" -python manage.py dumpdata main_app.Century -o fixtures/century_fixture.json --indent 4 - -echo "creating provenance_fixture.json" -python manage.py dumpdata main_app.Provenance -o fixtures/provenance_fixture.json --indent 4 - -echo "creating rism_siglum_fixture.json" -python manage.py dumpdata main_app.RismSiglum -o fixtures/rism_siglum_fixture.json --indent 4 - -echo "creating segment_fixture.json" -python manage.py dumpdata main_app.Segment -o fixtures/segment_fixture.json --indent 4 - -echo "creating source_fixture.json" -python manage.py dumpdata main_app.Source -o fixtures/source_fixture.json --indent 4 - -echo "creating sequence_fixture.json" -python manage.py dumpdata main_app.Sequence -o fixtures/sequence_fixture.json --indent 4 - - -echo "creating chant_fixture.json" -python manage.py dumpdata main_app.Chant -o fixtures/chant_fixture.json --indent 4 diff --git a/django/cantusdb_project/load_fixtures.sh b/django/cantusdb_project/load_fixtures.sh deleted file mode 100755 index c3dc51e5a..000000000 --- a/django/cantusdb_project/load_fixtures.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# generate tables in the database (optional) -# python manage.py makemigrations -# python manage.py migrate - -# load initial data to populate the database -# bash load_fixtures.sh -# these fixtures need to be loaded in a certain order due to foreign key dependencies -# before running this, make sure you put the 'fixtures' folder under 'main_app' -FIXTURES_LIST=( - group_fixture.json - user_fixture.json - flatpage_fixture.json - office_fixture.json - genre_fixture.json - feast_fixture.json - notation_fixture.json - century_fixture.json - provenance_fixture.json - rism_siglum_fixture.json - segment_fixture.json - source_fixture.json - sequence_fixture.json - article_fixture.json -) - -for fixture in ${FIXTURES_LIST[*]} -do - echo $fixture - python manage.py loaddata $fixture -done - -# N.B. As of March 2023, the following part of this script is broken. -# Most Chants in the database have another Chant specified as their `next_chant`. -# If a given Chant's `next_chant` has not yet been loaded, the given Chant -# will also fail to be loaded into the database. 
- -# load all the chants, this takes a few hours as we have half a million chants -FILES=./main_app/fixtures/chant_fixtures/* -for f in $FILES -do - python manage.py loaddata $f -v 2 -done - -# now you can runserver and expect things to work properly \ No newline at end of file diff --git a/django/cantusdb_project/main_app/management/commands/update_cached_concordances.py b/django/cantusdb_project/main_app/management/commands/update_cached_concordances.py new file mode 100644 index 000000000..8acbbd6fa --- /dev/null +++ b/django/cantusdb_project/main_app/management/commands/update_cached_concordances.py @@ -0,0 +1,100 @@ +import ujson +import os +from sys import stdout +from datetime import datetime +from collections import defaultdict +from django.db.models.query import QuerySet +from django.core.management.base import BaseCommand +from main_app.models import Chant + + +class Command(BaseCommand): + def handle(self, *args, **kwargs) -> None: + CACHE_DIR: str = "api_cache" + FILEPATH: str = f"{CACHE_DIR}/concordances.json" + start_time: str = datetime.now().isoformat() + stdout.write(f"Running update_cached_concordances at {start_time}.\n") + concordances: dict = get_concordances() + write_time: str = datetime.now().isoformat() + metadata: dict = { + "last_updated": write_time, + } + data_and_metadata: dict = { + "data": concordances, + "metadata": metadata, + } + stdout.write(f"Attempting to make directory at {CACHE_DIR} to hold cache: ") + try: + os.mkdir(CACHE_DIR) + stdout.write(f"successfully created directory at {CACHE_DIR}.\n") + except FileExistsError: + stdout.write(f"directory at {CACHE_DIR} already exists.\n") + stdout.write(f"Writing concordances to {FILEPATH} at {write_time}.\n") + with open(FILEPATH, "w") as json_file: + ujson.dump(data_and_metadata, json_file) + end_time = datetime.now().isoformat() + stdout.write( + f"Concordances successfully written to {FILEPATH} at {end_time}.\n\n" + ) + + +def get_concordances() -> dict: + DOMAIN: str = "https://cantusdatabase.org" + + stdout.write("Querying database for published chants\n") + published_chants: QuerySet[Chant] = Chant.objects.filter(source__published=True) + values: QuerySet[dict] = published_chants.select_related( + "source", + "feast", + "genre", + "office", + ).values( + "id", + "source_id", + "source__siglum", + "folio", + "c_sequence", + "incipit", + "feast__name", + "genre__name", + "office__name", + "position", + "cantus_id", + "image_link", + "mode", + "manuscript_full_text_std_spelling", + "volpiano", + ) + + stdout.write("Processing chants\n") + concordances: defaultdict = defaultdict(list) + for chant in values: + source_id: int = chant["source_id"] + source_absolute_url: str = f"{DOMAIN}/source/{source_id}/" + chant_id: int = chant["id"] + chant_absolute_url: str = f"{DOMAIN}/chant/{chant_id}/" + + concordances[chant["cantus_id"]].append( + { + "siglum": chant["source__siglum"], + "srclink": source_absolute_url, + "chantlink": chant_absolute_url, + "folio": chant["folio"], + "sequence": chant["c_sequence"], + "incipit": chant["incipit"], + "feast": chant["feast__name"], + "genre": chant["genre__name"], + "office": chant["office__name"], + "position": chant["position"], + "cantus_id": chant["cantus_id"], + "image": chant["image_link"], + "mode": chant["mode"], + "full_text": chant["manuscript_full_text_std_spelling"], + "melody": chant["volpiano"], + "db": "CD", + } + ) + + stdout.write(f"All chants processed - found {len(concordances)} Cantus IDs\n") + + return dict(concordances) diff --git 
a/django/cantusdb_project/main_app/tests/test_functions.py b/django/cantusdb_project/main_app/tests/test_functions.py index 8c00ecffc..f7001af12 100644 --- a/django/cantusdb_project/main_app/tests/test_functions.py +++ b/django/cantusdb_project/main_app/tests/test_functions.py @@ -1,8 +1,18 @@ from django.test import TestCase +from typing import Union from latin_syllabification import ( clean_transcript, syllabify_word, ) +from main_app.models import ( + Chant, + Source, +) +from main_app.tests.make_fakes import ( + make_fake_chant, + make_fake_source, +) +from main_app.management.commands import update_cached_concordances # run with `python -Wa manage.py test main_app.tests.test_functions` # the -Wa flag tells Python to display deprecation warnings @@ -188,3 +198,122 @@ def test_syllabify_word(self): def test_syllabify_text(self): pass + + +class UpdateCachedConcordancesCommandTest(TestCase): + def test_concordances_structure(self): + chant: Chant = make_fake_chant(cantus_id="123456") + concordances: dict = update_cached_concordances.get_concordances() + + with self.subTest(test="Ensure get_concordances returns dict"): + self.assertIsInstance(concordances, dict) + + concordances_for_single_cantus_id: list = concordances["123456"] + with self.subTest(test="Ensure values are lists"): + self.assertIsInstance(concordances_for_single_cantus_id, list) + + single_concordance = concordances_for_single_cantus_id[0] + with self.subTest(test="Ensure each concordance is a dict"): + single_concordance: dict = concordances_for_single_cantus_id[0] + self.assertIsInstance(single_concordance, dict) + + expected_keys = ( + "siglum", + "srclink", + "chantlink", + "folio", + "sequence", + "incipit", + "feast", + "genre", + "office", + "position", + "cantus_id", + "image", + "mode", + "full_text", + "melody", + "db", + ) + concordance_keys = single_concordance.keys() + for key in expected_keys: + with self.subTest(key=key): + self.assertIn(key, concordance_keys) + with self.subTest(test="Ensure no unexpected keys present"): + self.assertEqual(len(concordance_keys), len(expected_keys)) + + def test_number_of_concordances_returned(self): + cantus_ids: tuple[tuple[str, int]] = ( + ("000002", 2), + ("000003", 3), + ("000005", 5), + ("000007", 7), + ("000011", 11), + ) + for cantus_id, n in cantus_ids: + for _ in range(n): + make_fake_chant(cantus_id=cantus_id) + + concordances: dict = update_cached_concordances.get_concordances() + with self.subTest(test="Test all Cantus IDs present"): + self.assertEqual(len(concordances), len(cantus_ids)) + + for cantus_id, n in cantus_ids: + concordances_for_id: list = concordances[cantus_id] + with self.subTest(n=n): + self.assertEqual(len(concordances_for_id), n) + + def test_published_vs_unpublished(self): + published_source: Source = make_fake_source(published=True) + published_chant: Chant = make_fake_chant( + source=published_source, + cantus_id="123456", + incipit="chant in a published source", + ) + unpublished_source: Source = make_fake_source(published=False) + unpublished_chant: Chant = make_fake_chant( + source=unpublished_source, + cantus_id="123456", + incipit="chant in an unpublished source", + ) + + concordances: dict = update_cached_concordances.get_concordances() + concordances_for_single_id: list = concordances["123456"] + self.assertEqual(len(concordances), 1) + + single_concordance: dict = concordances_for_single_id[0] + expected_incipit: str = published_chant.incipit + observed_incipit: str = single_concordance["incipit"] + 
self.assertEqual(expected_incipit, observed_incipit) + + def test_concordances_values(self): + chant: Chant = make_fake_chant() + cantus_id: str = chant.cantus_id + + concordances: dict = update_cached_concordances.get_concordances() + concordances_for_single_id: list = concordances[cantus_id] + single_concordance: dict = concordances_for_single_id[0] + + expected_items: tuple = ( + ("siglum", chant.source.siglum), + ("srclink", f"https://cantusdatabase.org/source/{chant.source.id}/"), + ("chantlink", f"https://cantusdatabase.org/chant/{chant.id}/"), + ("folio", chant.folio), + ("sequence", chant.c_sequence), + ("incipit", chant.incipit), + ("feast", chant.feast.name), + ("genre", chant.genre.name), + ("office", chant.office.name), + ("position", chant.position), + ("cantus_id", chant.cantus_id), + ("image", chant.image_link), + ("mode", chant.mode), + ("full_text", chant.manuscript_full_text_std_spelling), + ("melody", chant.volpiano), + ("db", "CD"), + ) + + for key, value in expected_items: + observed_value: Union[str, int, None] = single_concordance[key] + with self.subTest(key=key): + self.assertEqual(observed_value, value) diff --git a/django/cantusdb_project/main_app/tests/test_views.py b/django/cantusdb_project/main_app/tests/test_views.py index 74908830d..3ccaeca38 100644 --- a/django/cantusdb_project/main_app/tests/test_views.py +++ b/django/cantusdb_project/main_app/tests/test_views.py @@ -3282,22 +3282,6 @@ def test_proofread_chant(self): chant.refresh_from_db() self.assertIs(chant.manuscript_full_text_std_proofread, True) - def test_chant_with_volpiano_with_no_incipit(self): - # in the past, a Chant Proofread page will error rather than loading properly when the chant has volpiano but no fulltext/incipit - source = make_fake_source() - chant = make_fake_chant( - source=source, - volpiano="1---m---l---k---m---h", - ) - chant.manuscript_full_text = None - chant.manuscript_full_text_std_spelling = None - chant.incipit = None - chant.save() - response = self.client.get( - reverse("source-edit-chants", args=[source.id]), {"pk": chant.id} - ) - self.assertEqual(response.status_code, 200) - class ChantEditSyllabificationViewTest(TestCase): @classmethod @@ -5600,9 +5584,13 @@ def test_incipit_search(self): self.assertEqual(asterisk_chant["id"], chant_with_asterisk.id) def test_cantus_id_search(self): - chant_with_normal_cantus_id = make_fake_chant(cantus_id="012345") + chant_with_normal_cantus_id = make_fake_chant( + cantus_id="012345", + incipit="This incipit contains no numerals", + ) chant_with_numerals_in_incipit = make_fake_chant( - incipit="0 me! 0 my! This is unexpected!" + cantus_id="123456", + incipit="0 me! 0 my! 
This is unexpected!", ) # for search terms that contain numerals, we should only return @@ -5620,7 +5608,7 @@ def test_cantus_id_search(self): self.assertNotEqual(matching_id, chant_with_numerals_in_incipit.id) # we should only return istartswith results, and not icontains results - non_matching_search_term = "1" + non_matching_search_term = "2" non_matching_response = self.client.get( reverse("ajax-search-bar", args=[non_matching_search_term]) ) diff --git a/django/cantusdb_project/main_app/views/views.py b/django/cantusdb_project/main_app/views/views.py index 3c3a4f0ff..cee74104b 100644 --- a/django/cantusdb_project/main_app/views/views.py +++ b/django/cantusdb_project/main_app/views/views.py @@ -753,17 +753,13 @@ def notation_json_export(request, id: int) -> JsonResponse: notation: Notation = get_object_or_404(Notation, id=id) - User = get_user_model() - created_by: Optional[User] = notation.created_by - last_updated_by: Optional[User] = notation.last_updated_by - data = { "id": notation.id, "name": notation.name, "date_created": notation.date_created, "date_updated": notation.date_updated, - "created_by": created_by.id if created_by else None, - "last_updated_by": last_updated_by.id if last_updated_by else None, + "created_by": notation.created_by_id, + "last_updated_by": notation.last_updated_by_id, } return JsonResponse(data) @@ -777,17 +773,13 @@ def provenance_json_export(request, id: int) -> JsonResponse: provenance: Provenance = get_object_or_404(Provenance, id=id) - User = get_user_model() - created_by: Optional[User] = provenance.created_by - last_updated_by: Optional[User] = provenance.last_updated_by - data = { "id": provenance.id, "name": provenance.name, "date_created": provenance.date_created, "date_updated": provenance.date_updated, - "created_by": created_by.id if created_by else None, - "last_updated_by": last_updated_by.id if last_updated_by else None, + "created_by": provenance.created_by_id, + "last_updated_by": provenance.last_updated_by_id, } return JsonResponse(data) diff --git a/django/cantusdb_project/requirements.txt b/django/cantusdb_project/requirements.txt index af8f68620..b052dc46d 100644 --- a/django/cantusdb_project/requirements.txt +++ b/django/cantusdb_project/requirements.txt @@ -18,7 +18,6 @@ django_debug_toolbar==3.8.1 Faker==4.1.0 gunicorn==20.0.4 idna==2.10 -ijson==3.1.2.post0 isort==5.6.4 lazy-object-proxy==1.4.3 lxml==4.9.1 @@ -38,5 +37,6 @@ text-unidecode==1.3 toml==0.10.1 typed-ast==1.4.1 typing-extensions==3.10.0.0 +ujson==5.9.0 urllib3==1.26.18 wrapt==1.12.1 diff --git a/nginx/Dockerfile b/nginx/Dockerfile index 05d01c4e4..2b2ba9c0c 100644 --- a/nginx/Dockerfile +++ b/nginx/Dockerfile @@ -5,4 +5,5 @@ RUN curl -LJO https://github.com/go-acme/lego/releases/download/v4.14.2/lego_v4. tar -xvf lego_v4.14.2_linux_amd64.tar.gz && \ mv lego /usr/local/bin/lego && \ rm lego_v4.14.2_linux_amd64.tar.gz +RUN mkdir -p /var/www/lego COPY error_pages . \ No newline at end of file diff --git a/scripts/parse_link_checker_output.py b/scripts/parse_link_checker_output.py new file mode 100644 index 000000000..172f84684 --- /dev/null +++ b/scripts/parse_link_checker_output.py @@ -0,0 +1,64 @@ +"""Modules""" +import json +import sys +from pathlib import Path +import itertools + +print(f"Running: {sys.argv[0]}", file=sys.stderr) + +FILE_LOCATION = "/tmp/link-checker-output.txt" + +# If link checker does not have any errors, exit gracefully +if not Path(FILE_LOCATION).exists(): + print("✅ No Broken Links Found.") + sys.exit(0) +else: + print("❌ Broken Links Found. 
Proceeding to Parsing Step.", file=sys.stderr) + +# Loading link checker output result +with open(FILE_LOCATION, encoding="utf-8") as link_checker_output_file: + print(f"Parsing the json data in {FILE_LOCATION}", file=sys.stderr) + link_checker_results = json.load(link_checker_output_file) + +list_of_failures = link_checker_results["fail_map"] + +if not list_of_failures: + print("✅ No Broken Links") + sys.exit(0) + +# Flatten the list of lists into a single list - +# list_of_failures is returned as a list of lists. +all_failures = list(itertools.chain.from_iterable(list_of_failures.values())) + +real_errors = [] +skippable_errors = [] + +# Process each failure in the flattened list +for failure in all_failures: + error_code = failure["status"].get("code") + + # Check if it's a timeout or a client-side issue + if not error_code: + skippable_errors.append(failure) + continue + + # Find all 4xx errors + if 400 <= error_code < 500: + real_errors.append(failure) + else: + skippable_errors.append(failure) + +if real_errors: + print("❌ Broken Links:") + for error in real_errors: + print(f"* {error['url']}: {error['status']['code']}") + print("\n") + +if skippable_errors: + print("🆗 Skippable Errors:") + for error in skippable_errors: + print(f"* {error['url']}: {error['status']['text']}") + print("\n") + +if real_errors: + sys.exit(1)
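
The get-all-links job in the workflow above builds its matrix by fetching two plain-text lists of URLs and splicing them into a single JSON object for fromJson() in the link-Checker job. A minimal Python sketch of the same transformation, assuming (as the awk gsub implies) that /flatpages-list/ and /articles-list/ each return one line of space-separated URLs; the example URLs are hypothetical:

import json

# Hypothetical responses from the two list endpoints (space-separated URLs).
flatpages = "https://cantusdatabase.org/about/ https://cantusdatabase.org/contact/"
articles = "https://cantusdatabase.org/article/1/ https://cantusdatabase.org/article/2/"

# The workflow's awk/gsub pipeline amounts to: split on spaces and emit
# {"links": [...]}, so each URL becomes one matrix entry handed to lychee.
matrix = {"links": flatpages.split() + articles.split()}
print(json.dumps(matrix))
# {"links": ["https://cantusdatabase.org/about/", "https://cantusdatabase.org/contact/", ...]}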
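
The new update_cached_concordances command (run inside the django container as `python manage.py update_cached_concordances`) writes the concordance mapping under a "data" key next to a "metadata.last_updated" timestamp, at api_cache/concordances.json. A sketch of a consumer, assuming it runs from the directory containing api_cache/ and that the Cantus ID below (hypothetical) exists in the cache:

import ujson

CACHE_FILE = "api_cache/concordances.json"  # path used by the management command

with open(CACHE_FILE, "r") as json_file:
    cached = ujson.load(json_file)

print("cache last updated:", cached["metadata"]["last_updated"])

# "data" maps each Cantus ID to a list of concordance dicts with the keys
# checked in UpdateCachedConcordancesCommandTest (siglum, srclink, chantlink, ...).
for record in cached["data"].get("001010", []):  # "001010" is a hypothetical Cantus ID
    print(record["siglum"], record["folio"], record["incipit"])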
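
parse_link_checker_output.py only reads a handful of fields from lychee's JSON report: a top-level fail_map keyed by page, whose entries carry a url and a status with an optional numeric code and a text description. A rough way to exercise the script locally — the fixture below mirrors only those fields and is an assumption about the report shape, not a verbatim lychee sample:

import json
import subprocess
import sys

# Hand-made fixture with one 4xx failure (reported and fatal) and one
# timeout-style failure without a status code (listed as skippable).
fixture = {
    "fail_map": {
        "https://cantusdatabase.org/some-page/": [
            {"url": "https://example.com/missing", "status": {"code": 404, "text": "404 Not Found"}},
            {"url": "https://example.com/slow", "status": {"text": "Timeout"}},
        ]
    }
}

with open("/tmp/link-checker-output.txt", "w", encoding="utf-8") as f:
    json.dump(fixture, f)

# The script prints its summary and exits 1 when it finds 4xx errors.
result = subprocess.run([sys.executable, "scripts/parse_link_checker_output.py"])
print("exit code:", result.returncode)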