From ff3370835c48459038c4f48f69db86df85a1a599 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 6 Nov 2023 17:51:47 -0800 Subject: [PATCH 01/42] Add RikoltiStorage utility class --- utils/rikolti_storage.py | 138 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 utils/rikolti_storage.py diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py new file mode 100644 index 000000000..0f84be0fe --- /dev/null +++ b/utils/rikolti_storage.py @@ -0,0 +1,138 @@ +import os +import boto3 +from urllib.parse import urlparse +from typing import Optional + +class RikoltiStorage(): + def __init__(self, data_url: str): + self.data_url = data_url + data_loc = urlparse(data_url) + self.data_store = data_loc.scheme + self.data_bucket = data_loc.netloc + self.data_path = data_loc.path + + self.s3 = boto3.client('s3') + + def list_pages(self) -> list: + if self.data_store == 's3': + return self.list_s3_pages() + elif self.data_store == 'file': + return self.list_file_pages() + else: + raise Exception(f"Unknown data store: {self.data_store}") + + def list_s3_pages(self) -> list: + """ + List all objects in s3_bucket with prefix s3_prefix + """ + keys = self.s3.list_objects_v2( + Bucket=self.data_bucket, + Prefix=self.data_path + ) + return keys + + def list_file_pages(self) -> list: + """ + List all files in file_path + """ + file_objects = [] + for root, dirs, files in os.walk(self.data_path): + for file in files: + file_objects.append(os.path.join(root, file)) + return file_objects + + def search_page(self, search_str: str, page: str) -> bool: + if self.data_store == 's3': + return self.search_s3_contents(search_str, page) + elif self.data_store == 'file': + return self.search_file_contents(search_str, page) + else: + raise Exception(f"Unknown data store: {self.data_store}") + + def search_s3_page(self, search_str: str, s3_key: str) -> bool: + """ + Check if search_str is in the body of the object located at s3_key + Returns the s3_key of the object if so, otherwise returns None + """ + obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) + body = obj['Body'].read().decode('utf-8') + if search_str in body: + return True + else: + return False + + def search_file_page(self, search_str: str, file_path: str) -> bool: + """ + Check if search_str is in the body of the file located at file_path + """ + with open(file_path, 'r') as f: + body = f.read() + if search_str in body: + return True + else: + return False + + def get_page_content(self, page: str): + if self.data_store == 's3': + return self.get_s3_contents(page) + elif self.data_store == 'file': + return self.get_file_contents(page) + else: + raise Exception(f"Unknown data store: {self.data_store}") + + def get_s3_contents(self, s3_key: str): + """ + Get the body of the object located at s3_key + """ + obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) + return obj['Body'].read().decode('utf-8') + + def get_file_contents(self, file_path: str): + """ + Get the body of the file located at file_path + """ + with open(file_path, 'r') as f: + return f.read() + + def put_page_content(self, content:str, relative_path: Optional[str]=None): + """ + Write content to a file at relative_path (relative to data_path). + relative_path is a list of strings, each string is a directory name + representing a directory tree. 
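A minimal usage sketch of the RikoltiStorage class introduced above; the local path, bucket name, and collection id are illustrative placeholders, not values taken from this patch:

from rikolti.utils.rikolti_storage import RikoltiStorage

page_json = '{"records": []}'  # placeholder page content

# write a fetched page under a local data directory...
RikoltiStorage("file:///usr/local/rikolti_data").put_page_content(
    page_json, relative_path="/466/vernacular_metadata/0")

# ...or to an s3 bucket, where the appended path becomes part of the object key
RikoltiStorage("s3://rikolti-data").put_page_content(
    page_json, relative_path="/466/vernacular_metadata/0")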
+ handle s3 or file storage, use '/' as separator for s3 key and os.sep + as separtors for file storage + """ + path = self.data_path + if relative_path: + path += relative_path + + if self.data_store == 's3': + return self.put_s3_content(path, content) + elif self.data_store == 'file': + return self.put_file_content(path, content) + else: + raise Exception(f"Unknown data store: {self.data_store}") + + def put_file_content(self, file_path, content): + """ + Write content to a file at file_path + """ + file_path = os.sep.join(file_path.split('/')) + directory_path = os.path.dirname(file_path) + if not os.path.exists(directory_path): + os.makedirs(directory_path) + + with open(file_path, 'w') as f: + f.write(content) + + def put_s3_content(self, s3_key, content): + """ + Write content to an object named s3_key + """ + self.s3.put_object( + ACL='bucket-owner-full-control', + Bucket=self.data_bucket, + Key=s3_key, + Body=content + ) + From 09c1c29ee2eb40070a6db5814dad4bd99f39f4dc Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 6 Nov 2023 17:52:25 -0800 Subject: [PATCH 02/42] Factor out storage considerations from metadata fetcher --- metadata_fetcher/fetchers/Fetcher.py | 62 ++++--------------- metadata_fetcher/fetchers/nuxeo_fetcher.py | 5 -- metadata_fetcher/fetchers/ucd_json_fetcher.py | 14 +++-- metadata_fetcher/settings.py | 5 -- 4 files changed, 21 insertions(+), 65 deletions(-) diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 43c16c41c..0d27e78d3 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -1,12 +1,9 @@ import logging -import os -import sys - -import boto3 import requests from .. import settings from requests.adapters import HTTPAdapter, Retry +from rikolti.utils.rikolti_storage import RikoltiStorage logger = logging.getLogger(__name__) @@ -29,53 +26,11 @@ def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - bucket = settings.DATA_DEST["BUCKET"] + self.data_destination = RikoltiStorage(settings.DATA_DEST_URL) - self.s3_data = { - "ACL": 'bucket-owner-full-control', - "Bucket": bucket, - "Key": f"{self.collection_id}/vernacular_metadata/" - } if not self.collection_id: raise CollectionIdRequired("collection_id is required") - def fetchtolocal(self, page): - path = self.get_local_path() - - filename = os.path.join(path, f"{self.write_page}") - f = open(filename, "w+") - - f.write(page) - - def get_local_path(self): - local_path = os.sep.join([ - settings.DATA_DEST["PATH"], - str(self.collection_id), - 'vernacular_metadata', - ]) - if not os.path.exists(local_path): - os.makedirs(local_path) - - return local_path - - def fetchtos3(self, page): - s3_client = boto3.client('s3') - s3_key = self.s3_data['Key'] - - try: - # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.put_object - s3_client.put_object( - ACL=self.s3_data['ACL'], - Bucket=self.s3_data['Bucket'], - Key=( - f"{s3_key}" - f"{self.write_page}" - ), - Body=page) - except Exception as e: - print(f"Metadata Fetcher: {e}", file=sys.stderr) - raise(e) - def fetch_page(self): page = self.build_fetch_request() logger.debug( @@ -92,10 +47,15 @@ def fetch_page(self): record_count = self.check_page(response) if record_count: content = self.aggregate_vernacular_content(response.text) - if settings.DATA_DEST["STORE"] != 's3': - self.fetchtolocal(content) - else: - 
self.fetchtos3(content) + try: + self.data_destination.put_page_content( + content, relative_path=( + f"{self.collection_id}/vernacular_metadata/{self.write_page}" + ) + ) + except Exception as e: + print(f"Metadata Fetcher: {e}") + raise(e) self.increment(response) diff --git a/metadata_fetcher/fetchers/nuxeo_fetcher.py b/metadata_fetcher/fetchers/nuxeo_fetcher.py index b1a301971..754c88390 100644 --- a/metadata_fetcher/fetchers/nuxeo_fetcher.py +++ b/metadata_fetcher/fetchers/nuxeo_fetcher.py @@ -79,11 +79,6 @@ def __init__(self, params): } if self.nuxeo['query_type'] == 'children': - if settings.DATA_DEST != 's3': - path = self.get_local_path() - children_path = os.path.join(path, "children") - if not os.path.exists(children_path): - os.mkdir(children_path) self.write_page = ( "children/" f"{self.nuxeo['current_path']['uid']}-" diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index a8b99d6c8..7b646b192 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -1,4 +1,5 @@ import json +import sys from .Fetcher import Fetcher, FetchError import requests from xml.etree import ElementTree @@ -63,10 +64,15 @@ def fetch_all_pages(self, response: requests.Response) -> int: records = list(filter(None, [self.fetch_json_ld(url.text) for url in urls])) content = json.dumps(records) - if settings.DATA_DEST.get("STORE") == "file": - self.fetchtolocal(content) - else: - self.fetchtos3(content) + try: + self.data_destination.put_page_content( + content, relative_path=( + f"{self.collection_id}/vernacular_metadata/{self.write_page}" + ) + ) + except Exception as e: + print(f"Metadata Fetcher: {e}", file=sys.stderr) + raise(e) self.write_page += 1 return len(loc_nodes) diff --git a/metadata_fetcher/settings.py b/metadata_fetcher/settings.py index 7c6c2f729..e18110918 100644 --- a/metadata_fetcher/settings.py +++ b/metadata_fetcher/settings.py @@ -13,11 +13,6 @@ FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY') DATA_DEST_URL = os.environ.get("FETCHER_DATA_DEST", "file:///tmp") -DATA_DEST = { - "STORE": urlparse(DATA_DEST_URL).scheme, - "BUCKET": urlparse(DATA_DEST_URL).netloc, - "PATH": urlparse(DATA_DEST_URL).path -} for key, value in os.environ.items(): logger.debug(f"{key}={value}") From 21576bc5e850094a0f08a0288d8279960daca5b0 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 12:05:48 -0800 Subject: [PATCH 03/42] Factor out metadata mapper source data storage considerations --- metadata_mapper/lambda_function.py | 5 +++- metadata_mapper/lambda_shepherd.py | 40 +++++++++--------------------- metadata_mapper/mappers/mapper.py | 31 ----------------------- utils/rikolti_storage.py | 28 +++++++++++---------- 4 files changed, 31 insertions(+), 73 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 0b1676c84..2c2763480 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -7,6 +7,7 @@ from . 
import settings from .mappers.mapper import Record, UCLDCWriter, Vernacular +from rikolti.utils.rikolti_storage import RikoltiStorage logger = logging.getLogger(__name__) @@ -78,7 +79,9 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) source_vernacular = vernacular_reader(collection_id, page_filename) - api_resp = source_vernacular.get_api_response() + storage = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata/{page_filename}") + api_resp = storage.get_page_content() + source_metadata_records = source_vernacular.parse(api_resp) source_metadata_records = run_enrichments( diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index 19d09b369..a371c2a1c 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -1,8 +1,6 @@ import json -import os import sys -import boto3 import requests from urllib.parse import urlparse @@ -10,6 +8,7 @@ from . import settings, validate_mapping from .lambda_function import map_page from .mappers.mapper import Record +from rikolti.utils.rikolti_storage import RikoltiStorage def get_collection(collection_id): @@ -39,34 +38,19 @@ def check_for_missing_enrichments(collection): def get_vernacular_pages(collection_id): - page_list = [] + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata") - if settings.DATA_SRC["STORE"] == 'file': - vernacular_path = settings.local_path( - collection_id, 'vernacular_metadata') - try: - page_list = [f for f in os.listdir(vernacular_path) - if os.path.isfile(os.path.join(vernacular_path, f))] - children_path = os.path.join(vernacular_path, 'children') - if os.path.exists(children_path): - page_list += [os.path.join('children', f) - for f in os.listdir(children_path) - if os.path.isfile(os.path.join(children_path, f))] - except FileNotFoundError as e: - print( - f"{e} - have you fetched {collection_id}? " - f"looked in dir {e.filename}" - ) - raise(e) - elif settings.DATA_SRC["STORE"] == 's3': - s3_client = boto3.client('s3') - resp = s3_client.list_objects_v2( - Bucket=settings.DATA_SRC["BUCKET"], - Prefix=f"{collection_id}/vernacular_metadata" + try: + page_list = rikolti_data.list_pages() + except FileNotFoundError as e: + print( + f"{e} - have you fetched {collection_id}? 
" + f"looked in dir {e.filename} for vernacular pages" ) - # TODO: check resp['IsTruncated'] and use ContinuationToken if needed - page_list = [page['Key'] for page in resp['Contents']] - # TODO: split page_list into pages and children + raise(e) + + # TODO: split page_list into pages and children return page_list diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index 116b232ac..1bea9af1d 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -19,7 +19,6 @@ from .iso639_1 import iso_639_1 from .iso639_3 import iso_639_3, language_regexes, wb_language_regexes - class UCLDCWriter(object): def __init__(self, collection_id: int, page_filename: str): self.collection_id = collection_id @@ -56,36 +55,6 @@ def __init__(self, collection_id: int, page_filename: str) -> None: self.collection_id = collection_id self.page_filename = page_filename - def get_api_response(self) -> dict: - if settings.DATA_SRC["STORE"] == 'file': - return self.get_local_api_response() - else: - return self.get_s3_api_response() - - def get_local_api_response(self) -> str: - local_path = settings.local_path( - self.collection_id, 'vernacular_metadata') - page_path = os.sep.join([local_path, str(self.page_filename)]) - page = open(page_path, "r") - api_response = page.read() - return api_response - - def get_s3_api_response(self) -> str: - s3_client = boto3.client('s3') - if not self.page_filename.startswith( - f'{self.collection_id}/vernacular_metadata'): - self.page_filename = ( - f"{self.collection_id}/vernacular_metadata/" - f"{self.page_filename}" - ) - - page = s3_client.get_object( - Bucket=settings.DATA_SRC["BUCKET"], - Key=self.page_filename - ) - api_response = page['Body'].read() - return api_response - def get_records(self, records): return [ self.record_cls(self.collection_id, record) diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 0f84be0fe..4964b363a 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -25,10 +25,12 @@ def list_s3_pages(self) -> list: """ List all objects in s3_bucket with prefix s3_prefix """ - keys = self.s3.list_objects_v2( + s3_objects = self.s3.list_objects_v2( Bucket=self.data_bucket, Prefix=self.data_path ) + # TODO: check resp['IsTruncated'] and use ContinuationToken if needed + keys = [obj['Key'] for obj in s3_objects['Contents']] return keys def list_file_pages(self) -> list: @@ -43,12 +45,12 @@ def list_file_pages(self) -> list: def search_page(self, search_str: str, page: str) -> bool: if self.data_store == 's3': - return self.search_s3_contents(search_str, page) + return self.search_s3_page(search_str, page) elif self.data_store == 'file': - return self.search_file_contents(search_str, page) + return self.search_file_page(search_str, page) else: raise Exception(f"Unknown data store: {self.data_store}") - + def search_s3_page(self, search_str: str, s3_key: str) -> bool: """ Check if search_str is in the body of the object located at s3_key @@ -60,7 +62,7 @@ def search_s3_page(self, search_str: str, s3_key: str) -> bool: return True else: return False - + def search_file_page(self, search_str: str, file_path: str) -> bool: """ Check if search_str is in the body of the file located at file_path @@ -72,26 +74,26 @@ def search_file_page(self, search_str: str, file_path: str) -> bool: else: return False - def get_page_content(self, page: str): + def get_page_content(self): if self.data_store == 's3': - return self.get_s3_contents(page) + return self.get_s3_contents() elif 
self.data_store == 'file': - return self.get_file_contents(page) + return self.get_file_contents() else: raise Exception(f"Unknown data store: {self.data_store}") - - def get_s3_contents(self, s3_key: str): + + def get_s3_contents(self): """ Get the body of the object located at s3_key """ - obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) + obj = self.s3.get_object(Bucket=self.data_bucket, Key=self.data_path) return obj['Body'].read().decode('utf-8') - def get_file_contents(self, file_path: str): + def get_file_contents(self): """ Get the body of the file located at file_path """ - with open(file_path, 'r') as f: + with open(self.data_path, 'r') as f: return f.read() def put_page_content(self, content:str, relative_path: Optional[str]=None): From d36ba39e8da926e6e580244ce9eaa1ce2478b57b Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 12:12:50 -0800 Subject: [PATCH 04/42] factor out metadata_mapper.settings.local_path --- metadata_mapper/mappers/mapper.py | 16 ++++++++++------ metadata_mapper/mappers/oai/oai_mapper.py | 10 +++++++--- metadata_mapper/settings.py | 7 ------- metadata_mapper/utilities.py | 17 ++++++++++++++--- .../validate_registry_collections.py | 7 ++++++- 5 files changed, 37 insertions(+), 20 deletions(-) diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index 1bea9af1d..ee4abdbb9 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -25,13 +25,17 @@ def __init__(self, collection_id: int, page_filename: str): self.page_filename = page_filename def write_local_mapped_metadata(self, mapped_metadata): - local_path = settings.local_path( - self.collection_id, 'mapped_metadata') - if not os.path.exists(local_path): - os.makedirs(local_path) - page_path = os.sep.join([local_path, str(self.page_filename)]) + mapped_data_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(self.collection_id), + 'mapped_metadata', + ]) + + if not os.path.exists(mapped_data_path): + os.makedirs(mapped_data_path) + page_path = os.sep.join([mapped_data_path, str(self.page_filename)]) if 'children' in page_path: - local_children_path = os.path.join(local_path, 'children') + local_children_path = os.path.join(mapped_data_path, 'children') if not os.path.exists(local_children_path): os.makedirs(local_children_path) page = open(page_path, "w+") diff --git a/metadata_mapper/mappers/oai/oai_mapper.py b/metadata_mapper/mappers/oai/oai_mapper.py index 0472a2167..5c9f0d152 100644 --- a/metadata_mapper/mappers/oai/oai_mapper.py +++ b/metadata_mapper/mappers/oai/oai_mapper.py @@ -127,9 +127,13 @@ def strip_metadata(self, record_metadata): # lxml parser requires bytes input or XML fragments without declaration, # so use 'rb' mode def get_local_api_response(self): - local_path = settings.local_path( - self.collection_id, 'vernacular_metadata') - page_path = os.sep.join([local_path, str(self.page_filename)]) + vernacular_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(self.collection_id), + 'vernacular_metadata', + ]) + + page_path = os.sep.join([vernacular_path, str(self.page_filename)]) page = open(page_path, "rb") api_response = page.read() return api_response diff --git a/metadata_mapper/settings.py b/metadata_mapper/settings.py index adca3d6ab..ec5a989d8 100644 --- a/metadata_mapper/settings.py +++ b/metadata_mapper/settings.py @@ -26,10 +26,3 @@ SOLR_API_KEY = os.environ.get('UCLDC_SOLR_API_KEY', False) COUCH_URL = os.environ.get('UCLDC_COUCH_URL', False) -def local_path(collection_id, folder): - 
local_path = os.sep.join([ - DATA_SRC["PATH"], - str(collection_id), - folder, - ]) - return local_path diff --git a/metadata_mapper/utilities.py b/metadata_mapper/utilities.py index 03dd4e3fa..47f29a6e6 100644 --- a/metadata_mapper/utilities.py +++ b/metadata_mapper/utilities.py @@ -52,7 +52,12 @@ def get_files(collection_id: int, directory: str) -> list[str]: Gets a list of filenames in a given directory. """ if settings.DATA_SRC["STORE"] == "file": - path = settings.local_path(collection_id, directory) + path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + directory, + ]) + try: return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] @@ -103,7 +108,9 @@ def read_from_bucket(collection_id: int, directory: str, """ if settings.DATA_SRC["STORE"] == 'file': page_path = os.sep.join([ - settings.local_path(collection_id, directory), + settings.DATA_SRC["PATH"], + str(collection_id), + directory, str(file_name) ]) try: @@ -173,7 +180,11 @@ def write_to_bucket(collection_id: int, directory: str, content = json.dumps(content) if settings.DATA_SRC["STORE"] == 'file': - dir_path = settings.local_path(collection_id, directory) + dir_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + directory, + ]) if not os.path.exists(dir_path): os.makedirs(dir_path) page_path = os.sep.join([dir_path, str(file_name)]) diff --git a/metadata_mapper/validate_registry_collections.py b/metadata_mapper/validate_registry_collections.py index 1242f8361..df5a7f6d1 100644 --- a/metadata_mapper/validate_registry_collections.py +++ b/metadata_mapper/validate_registry_collections.py @@ -59,7 +59,12 @@ def validate_endpoint(url): continue results.append(collection_validation) - validation_path = settings.local_path(collection_id, 'validation') + validation_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'validation', + ]) + if not os.path.exists(validation_path): os.makedirs(validation_path) page_path = os.sep.join([ From ac36c0ad3970f4643804bbbc45ec062d3ceeed1b Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 16:38:57 -0800 Subject: [PATCH 05/42] Add recursive and relative flags to list_pages --- metadata_mapper/lambda_function.py | 2 +- metadata_mapper/lambda_shepherd.py | 2 +- utils/rikolti_storage.py | 45 ++++++++++++++++++++++++------ 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 2c2763480..2812d7905 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -78,10 +78,10 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) - source_vernacular = vernacular_reader(collection_id, page_filename) storage = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata/{page_filename}") api_resp = storage.get_page_content() + source_vernacular = vernacular_reader(collection_id, page_filename) source_metadata_records = source_vernacular.parse(api_resp) source_metadata_records = run_enrichments( diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index a371c2a1c..91cd5999d 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -42,7 +42,7 @@ def get_vernacular_pages(collection_id): f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata") try: - page_list = rikolti_data.list_pages() + 
page_list = rikolti_data.list_pages(relative=True) except FileNotFoundError as e: print( f"{e} - have you fetched {collection_id}? " diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 4964b363a..0c924e796 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -1,5 +1,8 @@ import os +import re + import boto3 + from urllib.parse import urlparse from typing import Optional @@ -13,15 +16,15 @@ def __init__(self, data_url: str): self.s3 = boto3.client('s3') - def list_pages(self) -> list: + def list_pages(self, recursive=True, relative=True) -> list: if self.data_store == 's3': - return self.list_s3_pages() + return self.list_s3_pages(recursive=recursive, relative=relative) elif self.data_store == 'file': - return self.list_file_pages() + return self.list_file_pages(recursive=recursive, relative=relative) else: raise Exception(f"Unknown data store: {self.data_store}") - def list_s3_pages(self) -> list: + def list_s3_pages(self, recursive=True, relative=True) -> list: """ List all objects in s3_bucket with prefix s3_prefix """ @@ -30,17 +33,41 @@ def list_s3_pages(self) -> list: Prefix=self.data_path ) # TODO: check resp['IsTruncated'] and use ContinuationToken if needed - keys = [obj['Key'] for obj in s3_objects['Contents']] + + keys = [f"s3://{self.data_bucket}/{obj['Key']}" for obj in s3_objects['Contents']] + prefix = "s3://{self.data_bucket}/{self.data_path}" + + if not recursive: + # prune deeper branches + leaf_regex = re.escape(prefix) + r"^\/?[\w!'_.*()-]+\/?$" + keys = [key for key in keys if re.match(leaf_regex, key)] + + if relative: + keys = [key[len(prefix):] for key in keys] + return keys - def list_file_pages(self) -> list: + def list_file_pages(self, recursive=True, relative=True) -> list: """ List all files in file_path """ file_objects = [] - for root, dirs, files in os.walk(self.data_path): - for file in files: - file_objects.append(os.path.join(root, file)) + if recursive: + for root, dirs, files in os.walk(self.data_path): + root_uri = "file://{root}/" if root[-1] != '/' else "file://{root}" + for file in files: + file_objects.append(f"{root_uri}{file}") + + if not recursive: + for file in os.listdir(self.data_path): + if os.path.isfile(os.path.join(self.data_path, file)): + root_uri = "file://{self.data_path}/" if self.data_path[-1] != '/' else "file://{self.data_path}" + file_objects.append(f"{root_uri}{file}") + + if relative: + prefix = "file://{self.data_path}/" + file_objects = [file[len(prefix):] for file in file_objects] + return file_objects def search_page(self, search_str: str, page: str) -> bool: From 80e5f0a279452568e25c5f6f131be1ce9c34d450 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 17:54:37 -0800 Subject: [PATCH 06/42] factor out metadata mapper source data from utilities --- metadata_mapper/lambda_function.py | 5 +- metadata_mapper/mappers/mapper.py | 2 +- metadata_mapper/mappers/oai/oai_mapper.py | 17 +--- metadata_mapper/utilities.py | 113 +++------------------- utils/rikolti_storage.py | 38 ++++++-- 5 files changed, 51 insertions(+), 124 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 2812d7905..566f282bb 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -78,7 +78,10 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) - storage = 
RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata/{page_filename}") + storage = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/" + f"vernacular_metadata/{page_filename}" + ) api_resp = storage.get_page_content() source_vernacular = vernacular_reader(collection_id, page_filename) diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index ee4abdbb9..a40da6dde 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -26,7 +26,7 @@ def __init__(self, collection_id: int, page_filename: str): def write_local_mapped_metadata(self, mapped_metadata): mapped_data_path = os.sep.join([ - settings.DATA_SRC["PATH"], + settings.DATA_DEST["PATH"], str(self.collection_id), 'mapped_metadata', ]) diff --git a/metadata_mapper/mappers/oai/oai_mapper.py b/metadata_mapper/mappers/oai/oai_mapper.py index 5c9f0d152..033f4a5bb 100644 --- a/metadata_mapper/mappers/oai/oai_mapper.py +++ b/metadata_mapper/mappers/oai/oai_mapper.py @@ -1,10 +1,8 @@ -import os from typing import Union from lxml import etree from sickle import models -from ... import settings from ..mapper import Record, Vernacular @@ -83,6 +81,7 @@ def map_is_shown_by(self): class OaiVernacular(Vernacular): def parse(self, api_response): + api_response = bytes(api_response, 'utf-8') namespace = {'oai2': 'http://www.openarchives.org/OAI/2.0/'} page = etree.XML(api_response) @@ -123,17 +122,3 @@ def strip_metadata(self, record_metadata): stripped[key] = value return stripped - - # lxml parser requires bytes input or XML fragments without declaration, - # so use 'rb' mode - def get_local_api_response(self): - vernacular_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(self.collection_id), - 'vernacular_metadata', - ]) - - page_path = os.sep.join([vernacular_path, str(self.page_filename)]) - page = open(page_path, "rb") - api_response = page.read() - return api_response diff --git a/metadata_mapper/utilities.py b/metadata_mapper/utilities.py index 47f29a6e6..6a2573751 100644 --- a/metadata_mapper/utilities.py +++ b/metadata_mapper/utilities.py @@ -1,11 +1,9 @@ import importlib import json -import os from typing import Callable, Union -import boto3 - from . import settings +from rikolti.utils.rikolti_storage import RikoltiStorage def returns_callable(func: Callable) -> Callable: @@ -51,43 +49,10 @@ def get_files(collection_id: int, directory: str) -> list[str]: """ Gets a list of filenames in a given directory. 
""" - if settings.DATA_SRC["STORE"] == "file": - path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - directory, - ]) - - try: - return [f for f in os.listdir(path) - if os.path.isfile(os.path.join(path, f))] - except Exception as e: - raise Exception( - f"{collection_id:<6}: Error listing files in {path}\n" - f"{collection_id:<6}: {e}" - ) - elif settings.DATA_SRC["STORE"] == "s3": - s3_client = boto3.client('s3') - try: - resp = s3_client.list_objects_v2( - Bucket=settings.DATA_SRC["BUCKET"], - Prefix=f"{collection_id}/{directory}" - ) - # TODO: check resp['IsTruncated'] and use ContinuationToken if needed - return [page['Key'] for page in resp['Contents']] - except Exception as e: - s3_url = ( - f"s3://{settings.DATA_SRC['BUCKET']}/{collection_id}/" - f"{directory}/") - url = ( - f"https://{settings.DATA_SRC['BUCKET']}.s3.us-west-2.amazonaws" - ".com/index.html#{collection_id}/" - ) - raise Exception( - f"{collection_id<6}: Error listing files at {s3_url}\n" - f"{collection_id<6}: Check that {directory} exists at {url}\n" - f"{collection_id<6}: {e}" - ) + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") + rikolti_data.list_pages(recursive=False, relative=True) + def read_from_bucket(collection_id: int, directory: str, file_name: Union[str, int]) -> str: @@ -106,40 +71,10 @@ def read_from_bucket(collection_id: int, directory: str, Returns: str The file contents """ - if settings.DATA_SRC["STORE"] == 'file': - page_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - directory, - str(file_name) - ]) - try: - with open(page_path, "r") as metadata_file: - return metadata_file.read() - except Exception as e: - raise Exception( - f"{collection_id:<6}: Error reading {page_path}\n" - f"{collection_id:<6}: {e}" - ) - elif settings.DATA_SRC["STORE"] == 's3': - s3_client = boto3.client('s3') - try: - s3_obj_summary = s3_client.get_object( - Bucket=settings.DATA_SRC["BUCKET"], - Key=f"{file_name}" - ) - return s3_obj_summary['Body'].read() - except Exception as e: - s3_url = (f"s3://{settings.DATA_SRC['BUCKET']}/{file_name}") - url = ( - f"https://{settings.DATA_SRC['BUCKET']}.s3.us-west-2.amazonaws" - ".com/index.html#{file_name}/" - ) - raise Exception( - f"{collection_id<6}: Error reading file at {s3_url}\n" - f"{collection_id<6}: Check {url}\n" - f"{collection_id<6}: {e}" - ) + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}") + return rikolti_data.get_page_content() + def read_mapped_metadata(collection_id: int, page_id: int) -> list[dict]: """ @@ -174,34 +109,12 @@ def read_vernacular_metadata(collection_id: int, page_id: int) -> list[dict]: def write_to_bucket(collection_id: int, directory: str, - file_name: Union[str, int], content: str, - append: bool = False) -> None: + file_name: Union[str, int], content: str) -> None: if isinstance(content, list) or isinstance(content, dict): content = json.dumps(content) - if settings.DATA_SRC["STORE"] == 'file': - dir_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - directory, - ]) - if not os.path.exists(dir_path): - os.makedirs(dir_path) - page_path = os.sep.join([dir_path, str(file_name)]) - - with open(page_path, "a" if append else "w") as file: - file.write(content) - file_location = f"file://{page_path}" - elif settings.DATA_SRC["STORE"] == 's3': - s3_client = boto3.client('s3') - key = ( - f"{collection_id}/{directory}/" - f"{file_name}" - ) - s3_client.put_object( - 
Bucket=settings.DATA_DEST["BUCKET"], - Key=key, - Body=content) - file_location = f"s3://{settings.DATA_DEST['BUCKET']}/{key}" + rikolti_data = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") + rikolti_data.put_page_content(content, str(file_name)) + file_location = f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}" return file_location diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 0c924e796..722d4ae5f 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -18,9 +18,22 @@ def __init__(self, data_url: str): def list_pages(self, recursive=True, relative=True) -> list: if self.data_store == 's3': - return self.list_s3_pages(recursive=recursive, relative=relative) + try: + return self.list_s3_pages(recursive=recursive, relative=relative) + except Exception as e: + url = ( + f"https://{self.data_bucket}.s3.us-west-2.amazonaws" + ".com/index.html#{self.data_path}/" + ) + raise Exception( + f"Error listing files at {self.data_url}\n" + f"Check that {self.data_path} exists at {url}\n{e}" + ) elif self.data_store == 'file': - return self.list_file_pages(recursive=recursive, relative=relative) + try: + return self.list_file_pages(recursive=recursive, relative=relative) + except Exception as e: + raise Exception(f"Error listing files in {path}\n{e}") else: raise Exception(f"Unknown data store: {self.data_store}") @@ -113,15 +126,28 @@ def get_s3_contents(self): """ Get the body of the object located at s3_key """ - obj = self.s3.get_object(Bucket=self.data_bucket, Key=self.data_path) - return obj['Body'].read().decode('utf-8') + try: + obj = self.s3.get_object(Bucket=self.data_bucket, Key=self.data_path) + return obj['Body'].read().decode('utf-8') + except Exception as e: + url = ( + f"https://{self.data_bucket}.s3.us-west-2.amazonaws.com/" + "index.html#{self.data_path}/" + ) + raise Exception( + f"Error reading file at {self.data_url}\nCheck: {url}\n{e}" + ) def get_file_contents(self): """ Get the body of the file located at file_path """ - with open(self.data_path, 'r') as f: - return f.read() + try: + with open(self.data_path, 'r') as f: + return f.read() + except Exception as e: + raise Exception(f"Error reading {self.data_path}\n{e}") + def put_page_content(self, content:str, relative_path: Optional[str]=None): """ From 29ced6a829f4495946c2c9903aae0f440809c033 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 17:55:42 -0800 Subject: [PATCH 07/42] remove validate_registry_collections and tests.py --- metadata_mapper/tests.py | 60 ------------- .../validate_registry_collections.py | 90 ------------------- 2 files changed, 150 deletions(-) delete mode 100644 metadata_mapper/tests.py delete mode 100644 metadata_mapper/validate_registry_collections.py diff --git a/metadata_mapper/tests.py b/metadata_mapper/tests.py deleted file mode 100644 index 9bc75e812..000000000 --- a/metadata_mapper/tests.py +++ /dev/null @@ -1,60 +0,0 @@ -import argparse -import json -import logging -import os - -from . 
import settings -from .lambda_shepherd import map_collection -from .map_registry_collections import map_endpoint -from .sample_data.islandora_harvests import islandora_harvests -from .sample_data.nuxeo_harvests import (nuxeo_complex_object_harvests, - nuxeo_harvests, - nuxeo_nested_complex_object_harvests) -from .sample_data.oac_harvests import oac_harvests -from .validate_mapping import validate_collection -from .validate_registry_collections import validate_endpoint - - -def main(): - vernacular_path = settings.DATA_SRC["PATH"] - urls = [ - f"https://registry.cdlib.org/api/v1/rikoltimapper/{f}/?format=json" - for f in os.listdir(vernacular_path) - ] - for url in urls: - map_endpoint(url) - - for url in urls: - validate_endpoint(url) - - -def test_static_samples(): - harvests = [ - oac_harvests[0], islandora_harvests[0], - nuxeo_harvests[0], nuxeo_complex_object_harvests[0], - nuxeo_nested_complex_object_harvests[0] - ] - - for harvest in harvests: - print(f"tests.py: {json.dumps(harvest)}") - status = map_collection(json.dumps(harvest), {}) - print(f"Map status: {status}") - - for harvest in harvests: - print(f"validate mapping: {json.dumps(harvest)}") - validate_collection(json.dumps(harvest)) - print(f"validated: {str(harvest)}") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '-log', - '--loglevel', - default='warning', - help='log level (default: warning)' - ) - args = parser.parse_args() - logging.basicConfig(level=args.loglevel.upper()) - logging.info('logging now set up') - main() diff --git a/metadata_mapper/validate_registry_collections.py b/metadata_mapper/validate_registry_collections.py deleted file mode 100644 index df5a7f6d1..000000000 --- a/metadata_mapper/validate_registry_collections.py +++ /dev/null @@ -1,90 +0,0 @@ -import argparse -import json -import logging -import os -import sys -from datetime import datetime - -import requests -import urllib3 - -from . 
import settings -from .validate_mapping import validate_collection - -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - -def validate_endpoint(url): - collection_page = url - results = [] - - while collection_page: - try: - response = requests.get(url=collection_page) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - msg = ( - f"[{collection_page}]: " - f"{err}; A valid collection id is required for validation" - ) - print(msg) - collection_page = None - break - - total_collections = response.json().get('meta', {}).get('total_count', 1) - print( - f">>> Validating {total_collections} collections " - f"described at {collection_page}" - ) - - collection_page = response.json().get('meta', {}).get('next') - if collection_page: - collection_page = f"https://registry.cdlib.org{collection_page}" - logging.debug(f"Next page: {collection_page}") - collections = response.json().get('objects', [response.json()]) - for collection in collections: - collection_id = collection['collection_id'] - log_msg = f"[{collection_id}]: " + "{}" - print(log_msg.format( - f"Validating collection {collection_id} - " - f"{collection['solr_count']} items in solr as of " - f"{collection['solr_last_updated']}" - )) - logging.debug(log_msg.format(f"lambda payload: {collection}")) - try: - collection_validation = validate_collection( - json.dumps(collection)) - except FileNotFoundError: - print(f"[{collection_id}]: not fetched yet") - continue - results.append(collection_validation) - - validation_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'validation', - ]) - - if not os.path.exists(validation_path): - os.makedirs(validation_path) - page_path = os.sep.join([ - validation_path, - f"{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv" - ]) - output = open(page_path, "w") - for field_validation in collection_validation: - output.write(field_validation) - output.write('\n') - output.close() - - return results - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Run mapper for registry endpoint") - parser.add_argument('endpoint', help='registry api endpoint') - args = parser.parse_args(sys.argv[1:]) - validation_errors = validate_endpoint(args.endpoint) - # print(validation_errors) - sys.exit(0) From c4a0b5091a2f2496969e0b70c74b93a379d3a435 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 18:06:31 -0800 Subject: [PATCH 08/42] factor out metadata mapper dest data considerations --- metadata_mapper/lambda_function.py | 8 ++------ metadata_mapper/mappers/mapper.py | 32 +++++------------------------- metadata_mapper/settings.py | 13 ------------ 3 files changed, 7 insertions(+), 46 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 566f282bb..b9509f8f5 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -96,8 +96,7 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str writer = UCLDCWriter(collection_id, page_filename) # TODO: write interim mapped but not enriched metadata to s3? 
- # if settings.DATA_DEST["STORE"] == 'file': - # writer.write_local_mapped_metadata( + # writer.write_mapped_metadata( # [record.to_dict() for record in mapped_records]) mapped_records = run_enrichments( @@ -126,10 +125,7 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str # for record in mapped_records] mapped_metadata = [record.to_dict() for record in mapped_records] - if settings.DATA_DEST["STORE"] == 'file': - writer.write_local_mapped_metadata(mapped_metadata) - else: - writer.write_s3_mapped_metadata(mapped_metadata) + writer.write_mapped_metadata(mapped_metadata) return { 'status': 'success', diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index a40da6dde..70e0c5d32 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -24,34 +24,12 @@ def __init__(self, collection_id: int, page_filename: str): self.collection_id = collection_id self.page_filename = page_filename - def write_local_mapped_metadata(self, mapped_metadata): - mapped_data_path = os.sep.join([ - settings.DATA_DEST["PATH"], - str(self.collection_id), - 'mapped_metadata', - ]) - - if not os.path.exists(mapped_data_path): - os.makedirs(mapped_data_path) - page_path = os.sep.join([mapped_data_path, str(self.page_filename)]) - if 'children' in page_path: - local_children_path = os.path.join(mapped_data_path, 'children') - if not os.path.exists(local_children_path): - os.makedirs(local_children_path) - page = open(page_path, "w+") - page.write(json.dumps(mapped_metadata)) - - def write_s3_mapped_metadata(self, mapped_metadata): - s3_client = boto3.client('s3') - key = ( - f"{self.collection_id}/mapped_metadata/" - f"{self.page_filename.split('/')[-1]}" + def write_mapped_metadata(self, mapped_metadata): + rikolti_data = RikoltiStorage( + f"{settings.DATA_DEST_URL}/{self.collection_id}/" + f"mapped_metadata/{self.page_filename}" ) - s3_client.put_object( - ACL='bucket-owner-full-control', - Bucket=settings.DATA_DEST["BUCKET"], - Key=key, - Body=json.dumps(mapped_metadata)) + rikolti_data.write_page_content(json.dumps(mapped_metadata)) class Vernacular(ABC, object): diff --git a/metadata_mapper/settings.py b/metadata_mapper/settings.py index ec5a989d8..aaecef5fc 100644 --- a/metadata_mapper/settings.py +++ b/metadata_mapper/settings.py @@ -1,24 +1,11 @@ import os -from urllib.parse import urlparse - from dotenv import load_dotenv load_dotenv() DATA_SRC_URL = os.environ.get('MAPPER_DATA_SRC', 'file:///tmp') -DATA_SRC = { - "STORE": urlparse(DATA_SRC_URL).scheme, - "BUCKET": urlparse(DATA_SRC_URL).netloc, - "PATH": urlparse(DATA_SRC_URL).path -} - DATA_DEST_URL = os.environ.get('MAPPER_DATA_DEST', 'file:///tmp') -DATA_DEST = { - "STORE": urlparse(DATA_DEST_URL).scheme, - "BUCKET": urlparse(DATA_DEST_URL).netloc, - "PATH": urlparse(DATA_DEST_URL).path -} SKIP_UNDEFINED_ENRICHMENTS = os.environ.get('SKIP_UNDEFINED_ENRICHMENTS', False) From 5b4317b374320b8582b04333817efe3d8305e834 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 18:10:25 -0800 Subject: [PATCH 09/42] UCLDCWriter no longer does anything --- metadata_mapper/lambda_function.py | 17 ++++++++++++----- metadata_mapper/mappers/mapper.py | 14 -------------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index b9509f8f5..71da9db65 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -6,7 +6,7 @@ from urllib.parse import parse_qs, 
urlparse from . import settings -from .mappers.mapper import Record, UCLDCWriter, Vernacular +from .mappers.mapper import Record, Vernacular from rikolti.utils.rikolti_storage import RikoltiStorage logger = logging.getLogger(__name__) @@ -94,10 +94,13 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str record.to_UCLDC() mapped_records = source_metadata_records - writer = UCLDCWriter(collection_id, page_filename) # TODO: write interim mapped but not enriched metadata to s3? - # writer.write_mapped_metadata( - # [record.to_dict() for record in mapped_records]) + # rikolti_data = RikoltiStorage( + # f"{settings.DATA_DEST_URL}/{collection_id}/" + # f"interim_mapped_metadata/{page_filename}" + # ) + # rikolti_data.put_page_content(json.dumps( + # [record.to_dict() for record in mapped_records])) mapped_records = run_enrichments( mapped_records, collection, 'rikolti__enrichments', page_filename) @@ -125,7 +128,11 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str # for record in mapped_records] mapped_metadata = [record.to_dict() for record in mapped_records] - writer.write_mapped_metadata(mapped_metadata) + rikolti_data = RikoltiStorage( + f"{settings.DATA_DEST_URL}/{collection_id}/" + f"mapped_metadata/{page_filename}" + ) + rikolti_data.put_page_content(json.dumps(mapped_metadata)) return { 'status': 'success', diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index 70e0c5d32..bb5ae3981 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -8,10 +8,8 @@ from datetime import timezone from typing import Any, Callable -import boto3 from markupsafe import Markup -from .. import settings from ..utilities import returns_callable from ..validator.validation_log import ValidationLog # noqa: F401 from ..validator.validator import Validator @@ -19,18 +17,6 @@ from .iso639_1 import iso_639_1 from .iso639_3 import iso_639_3, language_regexes, wb_language_regexes -class UCLDCWriter(object): - def __init__(self, collection_id: int, page_filename: str): - self.collection_id = collection_id - self.page_filename = page_filename - - def write_mapped_metadata(self, mapped_metadata): - rikolti_data = RikoltiStorage( - f"{settings.DATA_DEST_URL}/{self.collection_id}/" - f"mapped_metadata/{self.page_filename}" - ) - rikolti_data.write_page_content(json.dumps(mapped_metadata)) - class Vernacular(ABC, object): def __init__(self, collection_id: int, page_filename: str) -> None: From da89de336c680284ca22d39d77898824010fc872 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 18:17:28 -0800 Subject: [PATCH 10/42] factor out content_harvester.settings.local_path --- content_harvester/by_collection.py | 6 +++++- content_harvester/by_page.py | 30 +++++++++++++++++++++++++----- content_harvester/settings.py | 8 -------- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index ea420e6a4..a7987be3a 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -10,7 +10,11 @@ def get_mapped_pages(collection_id): page_list = [] if settings.DATA_SRC['STORE'] == 'file': - mapped_path = settings.local_path(collection_id, 'mapped_metadata') + mapped_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_metadata', + ]) try: page_list = [f for f in os.listdir(mapped_path) if os.path.isfile(os.path.join(mapped_path, f))] diff --git 
a/content_harvester/by_page.py b/content_harvester/by_page.py index a3e48f1e4..1bef1378e 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -26,8 +26,13 @@ class UnsupportedMimetype(Exception): def get_mapped_records(collection_id, page_filename, s3_client) -> list: mapped_records = [] if settings.DATA_SRC["STORE"] == 'file': - local_path = settings.local_path(collection_id, 'mapped_metadata') - page_path = os.path.join(local_path, str(page_filename)) + local_mapped_data_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_metadata', + ]) + + page_path = os.path.join(local_mapped_data_path, str(page_filename)) page = open(page_path, "r") mapped_records = json.loads(page.read()) else: @@ -41,7 +46,12 @@ def get_mapped_records(collection_id, page_filename, s3_client) -> list: def write_mapped_record(collection_id, record, s3_client): if settings.DATA_DEST["STORE"] == 'file': - local_path = settings.local_path(collection_id, 'mapped_with_content') + local_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_with_content', + ]) + if not os.path.exists(local_path): os.makedirs(local_path) @@ -67,7 +77,12 @@ def write_mapped_record(collection_id, record, s3_client): def write_mapped_page(collection_id, page, records): if settings.DATA_DEST["STORE"] == 'file': - local_path = settings.local_path(collection_id, 'mapped_with_content') + local_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_with_content', + ]) + if not os.path.exists(local_path): os.makedirs(local_path) page_path = os.path.join(local_path, page) @@ -78,7 +93,12 @@ def write_mapped_page(collection_id, page, records): def get_child_records(collection_id, parent_id, s3_client) -> list: mapped_child_records = [] if settings.DATA_SRC["STORE"] == 'file': - local_path = settings.local_path(collection_id, 'mapped_metadata') + local_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_metadata', + ]) + children_path = os.path.join(local_path, 'children') if os.path.exists(children_path): diff --git a/content_harvester/settings.py b/content_harvester/settings.py index 0a99cfc9c..6a614ad61 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -41,11 +41,3 @@ 'ffmpeg': '/usr/bin/ffmpeg', 'ffprobe': '/usr/bin/ffprobe', } - -def local_path(collection_id, folder): - local_path = os.sep.join([ - DATA_SRC["PATH"], - str(collection_id), - folder, - ]) - return local_path From bd758837c74bf0846c75eb0a76bd45dd31419dcd Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 21:21:56 -0800 Subject: [PATCH 11/42] factor out content harvester's data source concerns --- content_harvester/by_collection.py | 37 +++------- content_harvester/by_page.py | 114 +++++++---------------------- content_harvester/settings.py | 5 -- utils/rikolti_storage.py | 5 +- 4 files changed, 38 insertions(+), 123 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index a7987be3a..31dcc51f6 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -1,38 +1,19 @@ import json -import os - -import boto3 from . 
import settings from .by_page import harvest_page_content - +from rikolti.utils.rikolti_storage import RikoltiStorage def get_mapped_pages(collection_id): page_list = [] - if settings.DATA_SRC['STORE'] == 'file': - mapped_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_metadata', - ]) - try: - page_list = [f for f in os.listdir(mapped_path) - if os.path.isfile(os.path.join(mapped_path, f))] - except FileNotFoundError as e: - print(f"{e} - have you mapped {collection_id}?") - else: - s3_client = boto3.client( - 's3', - aws_access_key_id=settings.AWS_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, - aws_session_token=settings.AWS_SESSION_TOKEN, - region_name=settings.AWS_REGION - ) - response = s3_client.list_objects_v2( - Bucket=settings.DATA_SRC["BUCKET"], - Prefix=f'{collection_id}/mapped_metadata/' - ) - page_list = [obj['Key'].split('/')[-1] for obj in response['Contents']] + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata", + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, + aws_session_token=settings.AWS_SESSION_TOKEN, + region_name=settings.AWS_REGION + ) + page_list = rikolti_data.list_pages(recursive=False, relative=True) return page_list diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index 1bef1378e..acc064891 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,6 +14,7 @@ from . import derivatives from . import settings +from rikolti.utils.rikolti_storage import RikoltiStorage class DownloadError(Exception): pass @@ -23,103 +24,40 @@ class UnsupportedMimetype(Exception): pass -def get_mapped_records(collection_id, page_filename, s3_client) -> list: +def get_mapped_records(collection_id, page_filename) -> list: mapped_records = [] - if settings.DATA_SRC["STORE"] == 'file': - local_mapped_data_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_metadata', - ]) - - page_path = os.path.join(local_mapped_data_path, str(page_filename)) - page = open(page_path, "r") - mapped_records = json.loads(page.read()) - else: - page = s3_client.get_object( - Bucket=settings.DATA_SRC["BUCKET"], - Key=f"{collection_id}/mapped_metadata/{page_filename}" - ) - mapped_records = json.loads(page['Body'].read()) + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/{page_filename}") + mapped_records = json.loads(rikolti_data.get_page_content()) return mapped_records -def write_mapped_record(collection_id, record, s3_client): - if settings.DATA_DEST["STORE"] == 'file': - local_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_with_content', - ]) - - if not os.path.exists(local_path): - os.makedirs(local_path) - - # some ids have slashes - page_path = os.path.join( - local_path, - record.get('calisphere-id').replace(os.sep, '_') - ) - - page = open(page_path, "w") - page.write(json.dumps(record)) - else: - upload_status = s3_client.put_object( - Bucket=settings.DATA_DEST["BUCKET"], - Key=( - f"{collection_id}/mapped_with_content/" - f"{record.get('calisphere-id')}" - ), - Body=json.dumps(record) - ) - print(f"Upload status: {upload_status}") +def write_mapped_record(collection_id, record): + rikolti_data = RikoltiStorage( + f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/" + f"{record.get('calisphere-id').replace(os.sep, '_')}" + ) + rikolti_data.put_page_content(json.dumps(record)) def 
write_mapped_page(collection_id, page, records): - if settings.DATA_DEST["STORE"] == 'file': - local_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_with_content', - ]) - - if not os.path.exists(local_path): - os.makedirs(local_path) - page_path = os.path.join(local_path, page) - page = open(page_path, "w") - page.write(json.dumps(records)) + rikolti_data = RikoltiStorage( + f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/{page}" + ) + rikolti_data.put_page_content(json.dumps(records)) def get_child_records(collection_id, parent_id, s3_client) -> list: mapped_child_records = [] - if settings.DATA_SRC["STORE"] == 'file': - local_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_metadata', - ]) - - children_path = os.path.join(local_path, 'children') - - if os.path.exists(children_path): - child_pages = [file for file in os.listdir(children_path) - if file.startswith(parent_id)] - for child_page in child_pages: - child_page_path = os.path.join(children_path, child_page) - page = open(child_page_path, "r") - mapped_child_records.extend(json.loads(page.read())) - else: - child_pages = s3_client.list_objects_v2( - Bucket=settings.DATA_SRC["BUCKET"], - Prefix=f"{collection_id}/mapped_metadata/children/{parent_id}" - ) - for child_page in child_pages['Contents']: - page = s3_client.get_object( - Bucket=settings.DATA_SRC["BUCKET"], - Key=child_page['Key'] - ) - mapped_child_records.extend(json.loads(page['Body'].read())) - + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children") + children = rikolti_data.list_pages(recursive=False, relative=False) + if rikolti_data.data_store == 'file': + children = [page for page in children + if os.path.basename(page).startswith(parent_id)] + for child in children: + child_data = RikoltiStorage(child) + mapped_child_records.extend(json.loads(child_data.get_page_content())) return mapped_child_records @@ -423,7 +361,7 @@ def harvest_page_content(collection_id, page_filename, **kwargs): src_auth=auth ) - records = get_mapped_records(collection_id, page_filename, harvester.s3) + records = get_mapped_records(collection_id, page_filename) print( f"[{collection_id}, {page_filename}]: " f"Harvesting content for {len(records)} records" @@ -438,7 +376,7 @@ def harvest_page_content(collection_id, page_filename, **kwargs): try: record_with_content = harvester.harvest(record) # write_mapped_record( - # collection_id, record_with_content, harvester.s3) + # collection_id, record_with_content) if not record_with_content.get('thumbnail'): warn_level = "ERROR" if 'sound' in record.get('type', []): diff --git a/content_harvester/settings.py b/content_harvester/settings.py index 6a614ad61..e54c550d4 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -7,11 +7,6 @@ load_dotenv() DATA_SRC_URL = os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') -DATA_SRC = { - "STORE": urlparse(DATA_SRC_URL).scheme, - "BUCKET": urlparse(DATA_SRC_URL).netloc, - "PATH": urlparse(DATA_SRC_URL).path -} DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') DATA_DEST = { diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 722d4ae5f..1b419838f 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -7,14 +7,15 @@ from typing import Optional class RikoltiStorage(): - def __init__(self, data_url: str): + def __init__(self, data_url: str, **kwargs): self.data_url = data_url data_loc = urlparse(data_url) 
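A sketch of the new **kwargs pass-through added here, mirroring the call in by_collection.py; the bucket, collection id, and credential values are placeholders. The keyword arguments are only handed to boto3.client('s3', ...) when the data_url scheme is s3:

storage = RikoltiStorage(
    "s3://rikolti-data/3433/mapped_metadata",
    aws_access_key_id="AKIA...",        # placeholder credentials
    aws_secret_access_key="****",
    region_name="us-west-2",
)
mapped_pages = storage.list_pages(recursive=False, relative=True)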
self.data_store = data_loc.scheme self.data_bucket = data_loc.netloc self.data_path = data_loc.path - self.s3 = boto3.client('s3') + if self.data_store == 's3': + self.s3 = boto3.client('s3', **kwargs) def list_pages(self, recursive=True, relative=True) -> list: if self.data_store == 's3': From bb16c3fae44b5b983a3c6427de0868bc6f22490f Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 21:22:41 -0800 Subject: [PATCH 12/42] factor out content harvester's data dest concerns --- content_harvester/settings.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/content_harvester/settings.py b/content_harvester/settings.py index e54c550d4..60b825605 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -7,13 +7,7 @@ load_dotenv() DATA_SRC_URL = os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') - DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') -DATA_DEST = { - "STORE": urlparse(DATA_DEST_URL).scheme, - "BUCKET": urlparse(DATA_DEST_URL).netloc, - "PATH": urlparse(DATA_DEST_URL).path -} CONTENT_DEST_URL = os.environ.get("CONTENT_DEST", 'file:///tmp') CONTENT_DEST = { From 39a03a3cf581ff3cb4c218388cc4f4808a72f217 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 21:23:16 -0800 Subject: [PATCH 13/42] factor out content harvester's content dest --- content_harvester/settings.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/content_harvester/settings.py b/content_harvester/settings.py index 60b825605..f7bebe969 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -8,13 +8,7 @@ DATA_SRC_URL = os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') - CONTENT_DEST_URL = os.environ.get("CONTENT_DEST", 'file:///tmp') -CONTENT_DEST = { - "STORE": urlparse(CONTENT_DEST_URL).scheme, - "BUCKET": urlparse(CONTENT_DEST_URL).netloc, - "PATH": urlparse(CONTENT_DEST_URL).path -} AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', False) AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', False) From 4cd9471ece9ad419d82dbe29b7ad12b01c0f2af7 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 21:33:21 -0800 Subject: [PATCH 14/42] use registry_endpoint generator in content_harvester --- content_harvester/by_registry_endpoint.py | 65 ++++++++++------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/content_harvester/by_registry_endpoint.py b/content_harvester/by_registry_endpoint.py index b5b73b453..d60ac8761 100644 --- a/content_harvester/by_registry_endpoint.py +++ b/content_harvester/by_registry_endpoint.py @@ -4,50 +4,41 @@ from .by_collection import harvest_collection - -def harvest_endpoint(url): - registry_page = url - results = [] - - while registry_page: - try: - response = requests.get(url=registry_page) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - print( - f"[{registry_page}]: {err}" - ) - registry_page = None - break - - total_collections = response.json().get( - 'meta', {}).get('total_count', 1) - print( - f">>> Harvesting content for {total_collections} collections " - f"described at {registry_page}" - ) +def registry_endpoint(url): + page = url + while page: + response = requests.get(url=page) + response.raise_for_status() + page = response.json().get('meta', {}).get('next', None) + if page: + page = f"https://registry.cdlib.org{page}" collections = response.json().get('objects', [response.json()]) for collection in collections: - print( - f"> 
Harvesting content from collection "
-                f"{collection['collection_id']} - {collection['solr_count']} "
-                f"items in solr as of {collection['solr_last_updated']}"
-            )
+            yield collection
 
-            # TODO: what is return val?
-            collection_stats = harvest_collection(collection)
-            collection_stats.update({'solr_count': collection['solr_count']})
 
+def harvest_endpoint(url, limit=None):
+    response = requests.get(url=url)
+    response.raise_for_status()
+    total = response.json().get('meta', {}).get('total_count', 1)
+    if not limit:
+        limit = total
 
-            results.append(collection_stats)
+    print(
+        f">>> Content harvest for {limit}/{total} collections described at {url}"
+    )
+    results = []
 
-        print(f">>> Harvested {len(results)} collections")
+    for collection in registry_endpoint(url):
+        print(
+            f"{collection['id']:<6}: {collection['solr_count']} items in solr "
+            f"as of {collection['solr_last_updated']}"
+        )
 
-        registry_page = response.json().get('meta', {}).get('next')
-        if registry_page:
-            registry_page = f"https://registry.cdlib.org{registry_page}"
-            print(f">>> Next page: {registry_page}")
+        # TODO: what is return val?
+        collection_stats = harvest_collection(collection)
+        collection_stats.update({'solr_count': collection['solr_count']})
+        results.append(collection_stats)
 
     return results
 
From 3d062686eb4874dc88c31f187326945a7d117e4b Mon Sep 17 00:00:00 2001
From: amy wieliczka
Date: Wed, 8 Nov 2023 12:33:17 -0800
Subject: [PATCH 15/42] fetcher, mapper, content harvester all use RikoltiStorage now

---
 content_harvester/by_page.py | 4 ++--
 metadata_fetcher/fetchers/Fetcher.py | 9 +++++----
 metadata_fetcher/fetchers/ucd_json_fetcher.py | 4 +---
 metadata_mapper/utilities.py | 4 ++--
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py
index acc064891..ae3b9bf87 100644
--- a/content_harvester/by_page.py
+++ b/content_harvester/by_page.py
@@ -47,7 +47,7 @@ def write_mapped_page(collection_id, page, records):
     rikolti_data.put_page_content(json.dumps(records))
 
 
-def get_child_records(collection_id, parent_id, s3_client) -> list:
+def get_child_records(collection_id, parent_id) -> list:
     mapped_child_records = []
     rikolti_data = RikoltiStorage(
         f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children")
@@ -249,7 +249,7 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict:
 
         # Recurse through the record's children (if any)
         child_records = get_child_records(
-            self.collection_id, calisphere_id, self.s3)
+            self.collection_id, calisphere_id)
         if child_records:
             print(
                 f"[{self.collection_id}, {self.page_filename}, {calisphere_id}]: "
diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py
index 0d27e78d3..58e42a04c 100644
--- a/metadata_fetcher/fetchers/Fetcher.py
+++ b/metadata_fetcher/fetchers/Fetcher.py
@@ -26,7 +26,10 @@ def __init__(self, params):
         self.harvest_type = params.get('harvest_type')
         self.collection_id = params.get('collection_id')
         self.write_page = params.get('write_page', 0)
-        self.data_destination = RikoltiStorage(settings.DATA_DEST_URL)
+        self.data_destination = RikoltiStorage(
+            f"{settings.DATA_DEST_URL}/{self.collection_id}/"
+            "vernacular_metadata/"
+        )
 
         if not self.collection_id:
             raise CollectionIdRequired("collection_id is required")
@@ -49,9 +52,7 @@ def fetch_page(self):
             content = self.aggregate_vernacular_content(response.text)
             try:
                 self.data_destination.put_page_content(
-                    content, relative_path=(
-                        f"{self.collection_id}/vernacular_metadata/{self.write_page}"
) + content, relative_path=f"{self.write_page}" ) except Exception as e: print(f"Metadata Fetcher: {e}") diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index 7b646b192..9e3936be0 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -66,9 +66,7 @@ def fetch_all_pages(self, response: requests.Response) -> int: try: self.data_destination.put_page_content( - content, relative_path=( - f"{self.collection_id}/vernacular_metadata/{self.write_page}" - ) + content, relative_path=f"{self.write_page}" ) except Exception as e: print(f"Metadata Fetcher: {e}", file=sys.stderr) diff --git a/metadata_mapper/utilities.py b/metadata_mapper/utilities.py index 6a2573751..0ba9f45b8 100644 --- a/metadata_mapper/utilities.py +++ b/metadata_mapper/utilities.py @@ -51,7 +51,7 @@ def get_files(collection_id: int, directory: str) -> list[str]: """ rikolti_data = RikoltiStorage( f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") - rikolti_data.list_pages(recursive=False, relative=True) + return rikolti_data.list_pages(recursive=False, relative=True) def read_from_bucket(collection_id: int, directory: str, @@ -113,7 +113,7 @@ def write_to_bucket(collection_id: int, directory: str, if isinstance(content, list) or isinstance(content, dict): content = json.dumps(content) - rikolti_data = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") + rikolti_data = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/") rikolti_data.put_page_content(content, str(file_name)) file_location = f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}" From a00f75ba6e156eb0542feb47dd2cef52988df0a1 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 11:33:41 -0800 Subject: [PATCH 16/42] make paths more absolute throughout codebase --- content_harvester/by_collection.py | 10 +- content_harvester/by_page.py | 45 +-- dags/shared_tasks.py | 12 +- metadata_fetcher/fetchers/Fetcher.py | 20 +- metadata_fetcher/fetchers/ucd_json_fetcher.py | 39 +- metadata_fetcher/lambda_function.py | 19 +- metadata_mapper/lambda_function.py | 36 +- metadata_mapper/lambda_shepherd.py | 10 +- metadata_mapper/utilities.py | 75 ---- metadata_mapper/validate_mapping.py | 24 +- metadata_mapper/validator/validation_log.py | 15 +- utils/rikolti_storage.py | 351 +++++++++++------- 12 files changed, 340 insertions(+), 316 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index 31dcc51f6..9d7215b5d 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -2,18 +2,18 @@ from . 
import settings from .by_page import harvest_page_content -from rikolti.utils.rikolti_storage import RikoltiStorage +from rikolti.utils.rikolti_storage import list_pages def get_mapped_pages(collection_id): page_list = [] - rikolti_data = RikoltiStorage( + page_list = list_pages( f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata", + recursive=False, aws_access_key_id=settings.AWS_ACCESS_KEY_ID, aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, aws_session_token=settings.AWS_SESSION_TOKEN, region_name=settings.AWS_REGION ) - page_list = rikolti_data.list_pages(recursive=False, relative=True) return page_list @@ -32,8 +32,8 @@ def harvest_collection(collection): print(f"[{collection_id}]: Harvesting content for {len(page_list)} pages") collection_stats = {} - for page in page_list: - collection.update({'page_filename': page}) + for page_path in page_list: + collection.update({'page_path': page_path}) page_stats = harvest_page_content(**collection) # in some cases, value is int and in some cases, value is Counter diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index ae3b9bf87..4789a4cbd 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,7 +14,7 @@ from . import derivatives from . import settings -from rikolti.utils.rikolti_storage import RikoltiStorage +from rikolti.utils.rikolti_storage import list_pages, get_page_content, put_page_content class DownloadError(Exception): pass @@ -24,40 +24,40 @@ class UnsupportedMimetype(Exception): pass -def get_mapped_records(collection_id, page_filename) -> list: +def get_mapped_records(page_path) -> list: mapped_records = [] - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/{page_filename}") - mapped_records = json.loads(rikolti_data.get_page_content()) + mapped_records = json.loads(get_page_content(page_path)) return mapped_records def write_mapped_record(collection_id, record): - rikolti_data = RikoltiStorage( - f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/" - f"{record.get('calisphere-id').replace(os.sep, '_')}" - ) - rikolti_data.put_page_content(json.dumps(record)) + put_page_content( + json.dumps(record), + ( + f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/" + f"{record.get('calisphere-id').replace(os.sep, '_')}" + ) +) def write_mapped_page(collection_id, page, records): - rikolti_data = RikoltiStorage( + put_page_content( + json.dumps(records), f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/{page}" ) - rikolti_data.put_page_content(json.dumps(records)) def get_child_records(collection_id, parent_id) -> list: mapped_child_records = [] - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children") - children = rikolti_data.list_pages(recursive=False, relative=False) + children = list_pages( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children", + recursive=False + ) if rikolti_data.data_store == 'file': children = [page for page in children if os.path.basename(page).startswith(parent_id)] for child in children: - child_data = RikoltiStorage(child) - mapped_child_records.extend(json.loads(child_data.get_page_content())) + mapped_child_records.extend(json.loads(get_page_content(child))) return mapped_child_records @@ -346,9 +346,10 @@ def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = return dest_path -# {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo", "page_filename": "r-0"} -def 
harvest_page_content(collection_id, page_filename, **kwargs): +# {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo", "page_filename": "file:///rikolti_data/r-0"} +def harvest_page_content(collection_id, page_path, **kwargs): rikolti_mapper_type = kwargs.get('rikolti_mapper_type') + page_filename = os.path.basename(page_path) # Weird how we have to use username/pass to hit this endpoint # but we have to use auth token to hit API endpoint @@ -361,7 +362,7 @@ def harvest_page_content(collection_id, page_filename, **kwargs): src_auth=auth ) - records = get_mapped_records(collection_id, page_filename) + records = get_mapped_records(page_path) print( f"[{collection_id}, {page_filename}]: " f"Harvesting content for {len(records)} records" @@ -454,12 +455,12 @@ def harvest_page_content(collection_id, page_filename, **kwargs): parser = argparse.ArgumentParser( description="Harvest content using a page of mapped metadata") parser.add_argument('collection_id', help="Collection ID") - parser.add_argument('page_filename', help="Page Filename") + parser.add_argument('page_path', help="URI-formatted path to a mapped metadata page") parser.add_argument('--nuxeo', action="store_true", help="Use Nuxeo auth") args = parser.parse_args() arguments = { 'collection_id': args.collection_id, - 'page_filename': args.page_filename, + 'page_filename': args.page_path, } if args.nuxeo: arguments['rikolti_mapper_type'] = 'nuxeo.nuxeo' diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 76ce61f64..04e72d62c 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -39,10 +39,9 @@ def get_collection_fetchdata_task(params=None): @task() def fetch_collection_task(collection: dict): fetch_status = fetch_collection(collection, {}) - success = all([page['status'] == 'success' for page in fetch_status]) total_items = sum([page['document_count'] for page in fetch_status]) - total_pages = fetch_status[-1]['page'] + 1 + total_pages = len(fetch_status) diff_items = total_items - collection['solr_count'] date = datetime.strptime( collection['solr_last_updated'], @@ -67,9 +66,12 @@ def fetch_collection_task(collection: dict): f"{'more' if diff_items > 0 else 'fewer'} items." ) - return [ - str(page['page']) for page in fetch_status if page['status']=='success' - ] + vernacular_filepaths = [page['vernacular_filepath'] for page in fetch_status] + if not vernacular_filepaths or not success: + raise Exception( + 'vernacular metadata not successfully fetched\n{fetch_status}') + + return vernacular_filepaths @task() diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 58e42a04c..a6ecd15ef 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -1,7 +1,7 @@ import logging import requests +import os -from .. 
import settings from requests.adapters import HTTPAdapter, Retry from rikolti.utils.rikolti_storage import RikoltiStorage @@ -26,10 +26,7 @@ def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = RikoltiStorage( - f"{settings.DATA_DEST_URL}/{self.collection_id}/" - "vernacular_metadata/" - ) + self.data_destination = RikoltiStorage(self.collection_id) if not self.collection_id: raise CollectionIdRequired("collection_id is required") @@ -48,19 +45,24 @@ def fetch_page(self): f"[{self.collection_id}]: unable to fetch page {page}") record_count = self.check_page(response) + filepath = None if record_count: content = self.aggregate_vernacular_content(response.text) try: - self.data_destination.put_page_content( - content, relative_path=f"{self.write_page}" - ) + filepath = self.data_destination.save_fetched_content( + content, self.write_page) + print(filepath) except Exception as e: print(f"Metadata Fetcher: {e}") raise(e) self.increment(response) - return record_count + return { + 'document_count': record_count, + 'vernacular_filepath': filepath, + 'status': 'success' + } def aggregate_vernacular_content(self, response): return response diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index 9e3936be0..4db788462 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -1,13 +1,16 @@ import json +import math +import os import sys -from .Fetcher import Fetcher, FetchError + +from typing import Optional + import requests + from xml.etree import ElementTree from bs4 import BeautifulSoup -from .. import settings -import math -from typing import Optional +from .Fetcher import Fetcher, FetchError class UcdJsonFetcher(Fetcher): def __init__(self, params: dict[str]): @@ -21,7 +24,7 @@ def __init__(self, params: dict[str]): self.url = params.get("harvest_data").get("url") self.per_page = 10 - def fetch_page(self) -> int: + def fetch_page(self) -> dict[str, int or str]: """ UCD's harvesting endpoint gets us an XML document listing a URL for every record in a collection, but not the actual metadata records themselves. 
fetch_page @@ -43,7 +46,7 @@ def fetch_page(self) -> int: return self.fetch_all_pages(response) - def fetch_all_pages(self, response: requests.Response) -> int: + def fetch_all_pages(self, response: requests.Response) -> list: """ Parameters: response: requests.Response @@ -56,24 +59,28 @@ def fetch_all_pages(self, response: requests.Response) -> int: loc_nodes = xml.findall(".//ns:loc", ns) pages = math.ceil(len(loc_nodes) / self.per_page) + fetch_status = [] for page in range(pages): print(f"[{self.collection_id}]: Fetching URLs for page {page + 1} " f"({page + 1}/{pages})") - skip = self.write_page * self.per_page - urls = loc_nodes[skip:(skip + self.per_page)] - records = list(filter(None, [self.fetch_json_ld(url.text) for url in urls])) - content = json.dumps(records) - + offset = self.write_page * self.per_page + urls = loc_nodes[offset:(offset + self.per_page)] + urls = list(filter(None, [url.text for url in urls])) + records = [self.fetch_json_ld(url) for url in urls] + document_count = len(records) try: - self.data_destination.put_page_content( - content, relative_path=f"{self.write_page}" - ) + filepath = self.data_destination.save_fetched_content( + json.dumps(records), self.write_page) + fetch_status.append({ + 'document_count': document_count, + 'vernacular_filepath': filepath, + 'status': 'success' + }) except Exception as e: print(f"Metadata Fetcher: {e}", file=sys.stderr) raise(e) - self.write_page += 1 - return len(loc_nodes) + return fetch_status def fetch_json_ld(self, url: str) -> Optional[dict]: """ diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index db12b0bc6..e6b2f3376 100644 --- a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -28,28 +28,29 @@ def fetch_collection(payload, context): fetcher_class = import_fetcher(payload.get('harvest_type')) - fetch_status = {'page': payload.get('write_page', 0), 'document_count': 0} + fetch_status = [] try: fetcher = fetcher_class(payload) - fetch_status['document_count'] = fetcher.fetch_page() + fetch_status.append(fetcher.fetch_page()) except InvalidHarvestEndpoint as e: logger.error(e) - fetch_status.update({ + fetch_status.append({ 'status': 'error', 'body': json.dumps({ 'error': repr(e), 'payload': payload }) }) - return [fetch_status] + return fetch_status next_page = fetcher.json() - fetch_status.update({ - 'status': 'success', - 'next_page': next_page - }) - fetch_status = [fetch_status] + # this is a ucd json fetcher workaround + # TODO: could be cleaner to stash ucd's table of contents in a known + # location and have each iteration of the fetcher reference that location, + # then we could resolve this difference in return values + if len(fetch_status) == 1 and type(fetch_status[0]) == list: + fetch_status = fetch_status[0] if not json.loads(next_page).get('finished'): fetch_status.extend(fetch_collection(next_page, {})) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 71da9db65..1d63d85de 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -7,7 +7,7 @@ from . 
import settings
 from .mappers.mapper import Record, Vernacular
-from rikolti.utils.rikolti_storage import RikoltiStorage
+from rikolti.utils.rikolti_storage import get_page_content, put_page_content
 
 logger = logging.getLogger(__name__)
@@ -72,17 +72,14 @@ def run_enrichments(records, collection, enrichment_set, page_filename):
     return records
 
 
-def map_page(collection_id: int, page_filename: str, collection: Union[dict, str]):
+def map_page(collection_id: int, page_path: str, collection: Union[dict, str]):
     if isinstance(collection, str):
         collection = json.loads(collection)
 
     vernacular_reader = import_vernacular_reader(
        collection.get('rikolti_mapper_type'))
-    storage = RikoltiStorage(
-        f"{settings.DATA_SRC_URL}/{collection_id}/"
-        f"vernacular_metadata/{page_filename}"
-    )
-    api_resp = storage.get_page_content()
+    page_filename = os.path.basename(page_path)
+    api_resp = get_page_content(page_path)
     source_vernacular = vernacular_reader(collection_id, page_filename)
     source_metadata_records = source_vernacular.parse(api_resp)
 
@@ -95,12 +92,13 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str
         mapped_records = source_metadata_records
 
     # TODO: write interim mapped but not enriched metadata to s3?
-    # rikolti_data = RikoltiStorage(
-    #     f"{settings.DATA_DEST_URL}/{collection_id}/"
-    #     f"interim_mapped_metadata/{page_filename}"
+    # put_page_content(
+    #     json.dumps([record.to_dict() for record in mapped_records]),
+    #     (
+    #         f"{settings.DATA_DEST_URL}/{collection_id}/"
+    #         f"interim_mapped_metadata/{page_filename}"
+    #     )
     # )
-    # rikolti_data.put_page_content(json.dumps(
-    #     [record.to_dict() for record in mapped_records]))
 
     mapped_records = run_enrichments(
         mapped_records, collection, 'rikolti__enrichments', page_filename)
@@ -128,11 +126,13 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str
     #     for record in mapped_records]
     mapped_metadata = [record.to_dict() for record in mapped_records]
 
-    rikolti_data = RikoltiStorage(
-        f"{settings.DATA_DEST_URL}/{collection_id}/"
-        f"mapped_metadata/{page_filename}"
+    put_page_content(
+        json.dumps(mapped_metadata),
+        (
+            f"{settings.DATA_DEST_URL}/{collection_id}/"
+            f"mapped_metadata/{page_filename}"
+        )
     )
-    rikolti_data.put_page_content(json.dumps(mapped_metadata))
 
     return {
         'status': 'success',
@@ -147,11 +147,11 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str
     parser = argparse.ArgumentParser(
         description="Map metadata from the institution's vernacular")
     parser.add_argument('collection_id', help='collection id')
-    parser.add_argument('page_filename', help='vernauclar metadata page filename')
+    parser.add_argument('page_path', help='URI file path to vernacular metadata page filename')
    parser.add_argument('collection', help='json collection metadata from registry')
     args = parser.parse_args(sys.argv[1:])
 
-    mapped_page = map_page(args.collection_id, args.page_filename, args.collection)
+    mapped_page = map_page(args.collection_id, args.page_path, args.collection)
 
     print(f"{mapped_page.get('num_records_mapped')} records mapped")
 
diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py
index 91cd5999d..c40c9dd2f 100644
--- a/metadata_mapper/lambda_shepherd.py
+++ b/metadata_mapper/lambda_shepherd.py
@@ -8,7 +8,7 @@
 
 from . 
import settings, validate_mapping from .lambda_function import map_page from .mappers.mapper import Record -from rikolti.utils.rikolti_storage import RikoltiStorage +from rikolti.utils.rikolti_storage import list_pages def get_collection(collection_id): @@ -38,11 +38,11 @@ def check_for_missing_enrichments(collection): def get_vernacular_pages(collection_id): - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata") - try: - page_list = rikolti_data.list_pages(relative=True) + page_list = list_pages( + f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata", + recursive=True + ) except FileNotFoundError as e: print( f"{e} - have you fetched {collection_id}? " diff --git a/metadata_mapper/utilities.py b/metadata_mapper/utilities.py index 0ba9f45b8..af188f419 100644 --- a/metadata_mapper/utilities.py +++ b/metadata_mapper/utilities.py @@ -3,7 +3,6 @@ from typing import Callable, Union from . import settings -from rikolti.utils.rikolti_storage import RikoltiStorage def returns_callable(func: Callable) -> Callable: @@ -44,77 +43,3 @@ def import_vernacular_reader(mapper_type): exit() return vernacular_class - -def get_files(collection_id: int, directory: str) -> list[str]: - """ - Gets a list of filenames in a given directory. - """ - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") - return rikolti_data.list_pages(recursive=False, relative=True) - - -def read_from_bucket(collection_id: int, directory: str, - file_name: Union[str, int]) -> str: - """ - Reads the contents of a file from the appropriate content bucket. - - Data comes from local filesystem or S3, depending on ENV vars. - - Parameters: - directory: str - collection_id: str - Files are separated into directories by collection_id - file_name: Union[str, int] - The name of the file to read - - Returns: str - The file contents - """ - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}") - return rikolti_data.get_page_content() - - -def read_mapped_metadata(collection_id: int, page_id: int) -> list[dict]: - """ - Reads and parses the content of a mapped metadata file. - - Parameters: - collection_id: int - The collection ID - page_id: int - The page ID (filename) to read and parse - - Returns: list[dict] - The parsed data - """ - return json.loads(read_from_bucket(collection_id, "mapped_metadata", page_id)) - - -def read_vernacular_metadata(collection_id: int, page_id: int) -> list[dict]: - """ - Reads and parses the content of a vernacular (unmapped) metadata file. 
- - Parameters: - collection_id: int - The collection ID - page_id: int - The page ID (filename) to read and parse - - Returns: list[dict] - The parsed data - """ - return json.loads(read_from_bucket(collection_id, "vernacular_metadata", page_id)) - - -def write_to_bucket(collection_id: int, directory: str, - file_name: Union[str, int], content: str) -> None: - if isinstance(content, list) or isinstance(content, dict): - content = json.dumps(content) - - rikolti_data = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/") - rikolti_data.put_page_content(content, str(file_name)) - file_location = f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}" - - return file_location diff --git a/metadata_mapper/validate_mapping.py b/metadata_mapper/validate_mapping.py index 96779b0d3..584e53aa7 100644 --- a/metadata_mapper/validate_mapping.py +++ b/metadata_mapper/validate_mapping.py @@ -10,6 +10,7 @@ from .validator.validation_log import ValidationLogLevel from .validator.validation_mode import ValidationMode from .validator.validator import Validator +from rikolti.utils.rikolti_storage import list_pages, get_page_content urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -49,13 +50,18 @@ def validate_collection(collection_id: int, log_level = log_level, verbose = verbose) - for page_id in utilities.get_files(collection_id, "mapped_metadata"): - validate_page(collection_id, page_id, validator) + mapped_pages = list_pages( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/", + recursive=False + ) + + for page_path in mapped_pages: + validate_page(collection_id, page_path, validator) return validator -def validate_page(collection_id: int, page_id: int, +def validate_page(collection_id: int, page_path: str, validator: Validator) -> Validator: """ Validates a provided page of a provided collection of mapped data. @@ -63,8 +69,8 @@ def validate_page(collection_id: int, page_id: int, Parameters: collection_id: int The collection ID - page_id: int - The page number within the collection + page_path: str + The absolute path to a page within the collection validator: Validator The validator instance to use @@ -73,10 +79,10 @@ def validate_page(collection_id: int, page_id: int, """ context = { "collection_id": collection_id, - "page_id": page_id + "page_path": page_path } mapped_metadata = validator.generate_keys( - get_mapped_data(collection_id, page_id), + get_mapped_data(page_path), type="Rikolti", context=context ) @@ -117,8 +123,8 @@ def create_collection_validation_csv(collection_id: int, **options) -> tuple[int ## Private-ish -def get_mapped_data(collection_id: int, page_id: int) -> dict: - return utilities.read_mapped_metadata(collection_id, page_id) +def get_mapped_data(page_path: str) -> list[dict]: + return json.loads(get_page_content(page_path)) def get_comparison_data(collection_id: int, harvest_ids: list[str]) -> list[dict]: diff --git a/metadata_mapper/validator/validation_log.py b/metadata_mapper/validator/validation_log.py index 41ef2e9fc..72e6eeeae 100644 --- a/metadata_mapper/validator/validation_log.py +++ b/metadata_mapper/validator/validation_log.py @@ -2,7 +2,8 @@ from enum import Enum from typing import IO, Any -from .. import utilities +from .. 
import settings +from rikolti.utils.rikolti_storage import put_page_content class ValidationLogLevel(Enum): @@ -110,7 +111,7 @@ def output_csv_to_file(self, file: IO[str], append: bool = False, f.write(self._csv_content_string(include_fields, append)) def output_csv_to_bucket(self, collection_id: int, filename: str = None, - include_fields: list[str] = None) -> None: + include_fields: list[str] = None) -> str: """ Writes a CSV to the env-appropriate bucket (local or S3). @@ -126,12 +127,16 @@ def output_csv_to_bucket(self, collection_id: int, filename: str = None, if not filename: filename = f"{datetime.now().strftime('%m-%d-%YT%H:%M:%S')}.csv" - file_location = utilities.write_to_bucket( - collection_id, "validation", filename, - self._csv_content_string(include_fields)) + content = self._csv_content_string(include_fields) + if isinstance(content, list) or isinstance(content, dict): + content = json.dumps(content) + + file_location = f"{settings.DATA_SRC_URL}/{collection_id}/validation/{filename}" + put_page_content(content, file_location) return file_location + def _csv_content(self, include_fields: list[str] = None, include_headers: bool = True) -> list[list[str]]: """ diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 1b419838f..2be23c9ef 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -2,87 +2,236 @@ import re import boto3 +from datetime import datetime from urllib.parse import urlparse from typing import Optional +from collections import namedtuple -class RikoltiStorage(): - def __init__(self, data_url: str, **kwargs): - self.data_url = data_url - data_loc = urlparse(data_url) - self.data_store = data_loc.scheme - self.data_bucket = data_loc.netloc - self.data_path = data_loc.path +DataStorage = namedtuple( + "DateStorage", "uri, store, bucket, path" +) - if self.data_store == 's3': - self.s3 = boto3.client('s3', **kwargs) +def parse_data_uri(data_uri: str): + data_loc = urlparse(data_uri) + return DataStorage( + data_uri, data_loc.scheme, data_loc.netloc, data_loc.path) - def list_pages(self, recursive=True, relative=True) -> list: - if self.data_store == 's3': - try: - return self.list_s3_pages(recursive=recursive, relative=relative) - except Exception as e: - url = ( - f"https://{self.data_bucket}.s3.us-west-2.amazonaws" - ".com/index.html#{self.data_path}/" - ) - raise Exception( - f"Error listing files at {self.data_url}\n" - f"Check that {self.data_path} exists at {url}\n{e}" + +def list_dirs(data_uri: str, **kwargs) -> list[str]: + data = parse_data_uri(data_uri) + if data.store == 's3': + s3 = boto3.client('s3', **kwargs) + s3_objects = s3.list_objects_v2( + Bucket=data.bucket, + Prefix=data.path, + Delimiter='/' + ) + keys = [ + obj['Prefix'][len(data.path):-1] + for obj in s3_objects['CommonPrefixes'] + ] + return keys + elif data.store == 'file': + dir_contents = os.listdir(data.path) + dirs = [ + file for file in dir_contents + if os.path.isdir(os.path.join(data.path, file)) + ] + return dirs + else: + raise Exception(f"Unknown data store: {data.store}") + + +def list_pages(data_uri: str, recursive: bool=True, **kwargs) -> list: + data = parse_data_uri(data_uri) + + if data.store == 's3': + try: + return list_s3_pages(data, recursive=recursive, **kwargs) + except Exception as e: + url = ( + f"https://{data.bucket}.s3.us-west-2.amazonaws" + ".com/index.html#{data.path}/" ) - elif self.data_store == 'file': - try: - return self.list_file_pages(recursive=recursive, relative=relative) - except Exception as e: - raise 
Exception(f"Error listing files in {path}\n{e}") - else: - raise Exception(f"Unknown data store: {self.data_store}") + raise Exception( + f"Error listing files at {data.uri}\n" + f"Check that {data.path} exists at {url}\n{e}" + ) + elif data.store == 'file': + try: + return list_file_pages(data, recursive=recursive) + except Exception as e: + raise Exception(f"Error listing files in {data.path}\n{e}") + else: + raise Exception(f"Unknown data store: {data.store}") - def list_s3_pages(self, recursive=True, relative=True) -> list: - """ - List all objects in s3_bucket with prefix s3_prefix - """ - s3_objects = self.s3.list_objects_v2( - Bucket=self.data_bucket, - Prefix=self.data_path + +def list_s3_pages(data: DataStorage, recursive: bool=True, **kwargs) -> list: + """ + List all objects in s3_bucket with prefix s3_prefix + """ + s3 = boto3.client('s3', **kwargs) + + s3_objects = s3.list_objects_v2( + Bucket=data.bucket, + Prefix=data.path + ) + # TODO: check resp['IsTruncated'] and use ContinuationToken if needed + + keys = [f"s3://{data.bucket}/{obj['Key']}" for obj in s3_objects['Contents']] + prefix = f"s3://{data.bucket}/{data.path}" + + if not recursive: + # prune deeper branches + leaf_regex = re.escape(prefix) + r"^\/?[\w!'_.*()-]+\/?$" + keys = [key for key in keys if re.match(leaf_regex, key)] + + return keys + + +def list_file_pages(data: DataStorage, recursive: bool=True) -> list: + """ + List all files in file_path + """ + file_objects = [] + if recursive: + for root, dirs, files in os.walk(data.path): + root_uri = "file://{root}/" if root[-1] != '/' else "file://{root}" + for file in files: + file_objects.append(f"{root_uri}{file}") + + if not recursive: + for file in os.listdir(data.path): + if os.path.isfile(os.path.join(data.path, file)): + root_uri = "file://{data.path}/" if data.path[-1] != '/' else "file://{data.path}" + file_objects.append(f"{root_uri}{file}") + + return file_objects + + +def get_page_content(data_uri: str, **kwargs): + data = parse_data_uri(data_uri) + if data.store == 's3': + return get_s3_contents(data) + elif data.store == 'file': + return get_file_contents(data) + else: + raise Exception(f"Unknown data store: {data.store}") + + +def get_s3_contents(data: DataStorage, **kwargs): + """ + Get the body of the object located at data.path + """ + s3 = boto3.client('s3', **kwargs) + + try: + obj = s3.get_object(Bucket=data.bucket, Key=data.path) + return obj['Body'].read().decode('utf-8') + except Exception as e: + url = ( + f"https://{data.bucket}.s3.us-west-2.amazonaws.com/" + f"index.html#{data.path}/" + ) + raise Exception( + f"Error reading file at {data.uri}\nCheck: {url}\n{e}" ) - # TODO: check resp['IsTruncated'] and use ContinuationToken if needed - keys = [f"s3://{self.data_bucket}/{obj['Key']}" for obj in s3_objects['Contents']] - prefix = "s3://{self.data_bucket}/{self.data_path}" - if not recursive: - # prune deeper branches - leaf_regex = re.escape(prefix) + r"^\/?[\w!'_.*()-]+\/?$" - keys = [key for key in keys if re.match(leaf_regex, key)] +def get_file_contents(data: DataStorage): + """ + Get the body of the file located at file_path + """ + try: + with open(data.path, 'r') as f: + return f.read() + except Exception as e: + raise Exception(f"Error reading {data.path}\n{e}") - if relative: - keys = [key[len(prefix):] for key in keys] - return keys +def put_page_content(content:str, data_uri: str, **kwargs) -> str: + """ + Write content to a file at relative_path (relative to data_path). 
+ relative_path is a list of strings, each string is a directory name + representing a directory tree. + handle s3 or file storage, use '/' as separator for s3 key and os.sep + as separtors for file storage + """ + data = parse_data_uri(data_uri) + + if data.store == 's3': + return put_s3_content(data, content, **kwargs) + elif data.store == 'file': + return put_file_content(data, content) + else: + raise Exception(f"Unknown data store: {data.store}") - def list_file_pages(self, recursive=True, relative=True) -> list: - """ - List all files in file_path - """ - file_objects = [] - if recursive: - for root, dirs, files in os.walk(self.data_path): - root_uri = "file://{root}/" if root[-1] != '/' else "file://{root}" - for file in files: - file_objects.append(f"{root_uri}{file}") - if not recursive: - for file in os.listdir(self.data_path): - if os.path.isfile(os.path.join(self.data_path, file)): - root_uri = "file://{self.data_path}/" if self.data_path[-1] != '/' else "file://{self.data_path}" - file_objects.append(f"{root_uri}{file}") +def put_s3_content(data: DataStorage, content, **kwargs) -> str: + """ + Write content to an object named data.path + """ + s3 = boto3.client('s3', **kwargs) + s3.put_object( + ACL='bucket-owner-full-control', + Bucket=data.bucket, + Key=data.path, + Body=content + ) + return data.uri - if relative: - prefix = "file://{self.data_path}/" - file_objects = [file[len(prefix):] for file in file_objects] +def put_file_content(data: DataStorage, content) -> str: + """ + Write content to a file at data.path + """ + file_path = os.sep.join(data.path.split('/')) + directory_path = os.path.dirname(file_path) + if not os.path.exists(directory_path): + os.makedirs(directory_path) - return file_objects + with open(file_path, 'w') as f: + f.write(content) + return data.uri + +class RikoltiStorage(): + def __init__( + self, + collection_id: int or str, + vernacular_suffix: Optional[str] = None, + vernacular_path: Optional[str] = None, + **kwargs): + + self.collection_id = collection_id + + fetcher_data_dest = os.environ.get("FETCHER_DATA_DEST", "file:///tmp") + vernacular_root = ( + f"{fetcher_data_dest.rstrip('/')}/{collection_id}/" + ) + if not vernacular_path: + if not vernacular_suffix: + vernacular_suffix = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + vernacular_path = ( + f"vernacular_metadata_{vernacular_suffix}/" + ) + + self.vernacular = f"{vernacular_root}{vernacular_path.rstrip('/')}/" + + # mapped_data_src = os.environ.get("MAPPED_DATA_SRC", fetcher_data_dest) + # mapped_root = ( + # f"{mapped_data_src.rstrip('/')}/{self.collection_id}/" + # ) + + + def save_fetched_content(self, content: str, filename: str): + return put_page_content(content, f"{self.vernacular}data/{filename}") + + # def list_fetched_content(self, recursive: bool=True, **kwargs) -> list: + # return list_pages( + # f"{self.vernacular_data}/{self.collection_id}/" + # f"vernacular_metadata{self.suffix}/", + # recursive=recursive + # ) def search_page(self, search_str: str, page: str) -> bool: if self.data_store == 's3': @@ -115,80 +264,6 @@ def search_file_page(self, search_str: str, file_path: str) -> bool: else: return False - def get_page_content(self): - if self.data_store == 's3': - return self.get_s3_contents() - elif self.data_store == 'file': - return self.get_file_contents() - else: - raise Exception(f"Unknown data store: {self.data_store}") - - def get_s3_contents(self): - """ - Get the body of the object located at s3_key - """ - try: - obj = 
self.s3.get_object(Bucket=self.data_bucket, Key=self.data_path) - return obj['Body'].read().decode('utf-8') - except Exception as e: - url = ( - f"https://{self.data_bucket}.s3.us-west-2.amazonaws.com/" - "index.html#{self.data_path}/" - ) - raise Exception( - f"Error reading file at {self.data_url}\nCheck: {url}\n{e}" - ) - - def get_file_contents(self): - """ - Get the body of the file located at file_path - """ - try: - with open(self.data_path, 'r') as f: - return f.read() - except Exception as e: - raise Exception(f"Error reading {self.data_path}\n{e}") - - - def put_page_content(self, content:str, relative_path: Optional[str]=None): - """ - Write content to a file at relative_path (relative to data_path). - relative_path is a list of strings, each string is a directory name - representing a directory tree. - handle s3 or file storage, use '/' as separator for s3 key and os.sep - as separtors for file storage - """ - path = self.data_path - if relative_path: - path += relative_path - if self.data_store == 's3': - return self.put_s3_content(path, content) - elif self.data_store == 'file': - return self.put_file_content(path, content) - else: - raise Exception(f"Unknown data store: {self.data_store}") - def put_file_content(self, file_path, content): - """ - Write content to a file at file_path - """ - file_path = os.sep.join(file_path.split('/')) - directory_path = os.path.dirname(file_path) - if not os.path.exists(directory_path): - os.makedirs(directory_path) - - with open(file_path, 'w') as f: - f.write(content) - - def put_s3_content(self, s3_key, content): - """ - Write content to an object named s3_key - """ - self.s3.put_object( - ACL='bucket-owner-full-control', - Bucket=self.data_bucket, - Key=s3_key, - Body=content - ) From 59d11cc2c68a84be57c91beb5ca2dca62b75e981 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 12:01:26 -0800 Subject: [PATCH 17/42] moved RikoltiStorage init into create_vernacular_version --- metadata_fetcher/fetchers/Fetcher.py | 8 ++-- metadata_fetcher/fetchers/ucd_json_fetcher.py | 6 +-- utils/rikolti_storage.py | 43 +++++++++++++------ 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index a6ecd15ef..cfedbaf12 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -3,7 +3,7 @@ import os from requests.adapters import HTTPAdapter, Retry -from rikolti.utils.rikolti_storage import RikoltiStorage +from rikolti.utils.rikolti_storage import create_vernacular_version, put_page_content logger = logging.getLogger(__name__) @@ -26,7 +26,7 @@ def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = RikoltiStorage(self.collection_id) + self.data_destination = create_vernacular_version(self.collection_id) if not self.collection_id: raise CollectionIdRequired("collection_id is required") @@ -49,8 +49,8 @@ def fetch_page(self): if record_count: content = self.aggregate_vernacular_content(response.text) try: - filepath = self.data_destination.save_fetched_content( - content, self.write_page) + filepath = put_page_content( + content, f"{self.data_destination}data/{self.write_page}") print(filepath) except Exception as e: print(f"Metadata Fetcher: {e}") diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index 4db788462..860495116 
100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -1,6 +1,5 @@ import json import math -import os import sys from typing import Optional @@ -11,6 +10,7 @@ from bs4 import BeautifulSoup from .Fetcher import Fetcher, FetchError +from rikolti.utils.rikolti_storage import put_page_content class UcdJsonFetcher(Fetcher): def __init__(self, params: dict[str]): @@ -69,8 +69,8 @@ def fetch_all_pages(self, response: requests.Response) -> list: records = [self.fetch_json_ld(url) for url in urls] document_count = len(records) try: - filepath = self.data_destination.save_fetched_content( - json.dumps(records), self.write_page) + filepath = put_page_content( + json.dumps(records), f"{self.data_destination}data/{self.write_page}") fetch_status.append({ 'document_count': document_count, 'vernacular_filepath': filepath, diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 2be23c9ef..38e8fb8a7 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -194,33 +194,52 @@ def put_file_content(data: DataStorage, content) -> str: f.write(content) return data.uri + +def create_vernacular_version( + collection_id: int or str, + vernacular_suffix: Optional[str] = None + ): + fetcher_data_dest = os.environ.get( + "FETCHER_DATA_DEST", "file:///tmp") + vernacular_root = ( + f"{fetcher_data_dest.rstrip('/')}/{collection_id}/") + if not vernacular_suffix: + vernacular_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + vernacular_path = ( + f"{vernacular_root}vernacular_metadata_{vernacular_suffix}/") + return vernacular_path + + class RikoltiStorage(): def __init__( self, collection_id: int or str, vernacular_suffix: Optional[str] = None, vernacular_path: Optional[str] = None, + mapped_data_suffix: Optional[str] = None, + mapped_data_path: Optional[str] = None, **kwargs): self.collection_id = collection_id - fetcher_data_dest = os.environ.get("FETCHER_DATA_DEST", "file:///tmp") - vernacular_root = ( - f"{fetcher_data_dest.rstrip('/')}/{collection_id}/" - ) if not vernacular_path: + fetcher_data_dest = os.environ.get( + "FETCHER_DATA_DEST", "file:///tmp") + vernacular_root = ( + f"{fetcher_data_dest.rstrip('/')}/{collection_id}/") if not vernacular_suffix: - vernacular_suffix = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + vernacular_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) vernacular_path = ( - f"vernacular_metadata_{vernacular_suffix}/" - ) + f"{vernacular_root}vernacular_metadata_{vernacular_suffix}/") - self.vernacular = f"{vernacular_root}{vernacular_path.rstrip('/')}/" + self.vernacular = vernacular_path.rstrip('/')+"/" - # mapped_data_src = os.environ.get("MAPPED_DATA_SRC", fetcher_data_dest) - # mapped_root = ( - # f"{mapped_data_src.rstrip('/')}/{self.collection_id}/" - # ) + mapped_data_dest = os.environ.get("MAPPED_DATA_DEST", "file:///tmp") + mapped_root = ( + f"{mapped_data_dest.rstrip('/')}/{self.collection_id}/" + ) def save_fetched_content(self, content: str, filename: str): From 8f97daf4f82e36b471a60dfd725fbebc8166f5f6 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 14:34:50 -0800 Subject: [PATCH 18/42] create a vernacular version prior to fetching --- dags/fetcher_dag.py | 5 +- dags/harvest_dag.py | 5 +- dags/shared_tasks.py | 11 +- .../fetch_registry_collections.py | 5 +- metadata_fetcher/fetchers/Fetcher.py | 6 +- metadata_fetcher/lambda_function.py | 10 +- metadata_fetcher/tests.py | 4 +- utils/rikolti_storage.py | 139 +++++++++--------- 8 files changed, 102 
insertions(+), 83 deletions(-) diff --git a/dags/fetcher_dag.py b/dags/fetcher_dag.py index 25e510a30..410eecda2 100644 --- a/dags/fetcher_dag.py +++ b/dags/fetcher_dag.py @@ -5,6 +5,7 @@ from rikolti.dags.shared_tasks import get_collection_fetchdata_task from rikolti.dags.shared_tasks import fetch_collection_task +from rikolti.dags.shared_tasks import create_vernacular_version_task @dag( dag_id="fetch_collection", @@ -16,6 +17,8 @@ ) def fetcher_dag(): fetchdata = get_collection_fetchdata_task() - fetch_collection_task(collection=fetchdata) + vernacular_version = create_vernacular_version_task(collection=fetchdata) + fetch_collection_task( + collection=fetchdata, vernacular_version=vernacular_version) fetcher_dag() diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index 88c6352b4..78f814ce4 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -4,6 +4,7 @@ from airflow.models.param import Param +from rikolti.dags.shared_tasks import create_vernacular_version_task from rikolti.dags.shared_tasks import fetch_collection_task from rikolti.dags.shared_tasks import get_collection_fetchdata_task from rikolti.dags.shared_tasks import get_collection_metadata_task @@ -34,7 +35,9 @@ def harvest(): fetchdata = get_collection_fetchdata_task() collection = get_collection_metadata_task() - fetched_pages = fetch_collection_task(collection=fetchdata) + vernacular_version = create_vernacular_version_task(collection=fetchdata) + fetched_pages = fetch_collection_task( + collection=fetchdata, vernacular_version=vernacular_version) mapped_pages = ( map_page_task .partial(collection=collection) diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 04e72d62c..8102c7324 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -17,6 +17,7 @@ from rikolti.record_indexer.create_collection_index import get_index_name from rikolti.record_indexer.create_collection_index import delete_index from rikolti.record_indexer.move_index_to_prod import move_index_to_prod +from rikolti.utils.rikolti_storage import create_vernacular_version # TODO: remove the rikoltifetcher registry endpoint and restructure @@ -37,8 +38,14 @@ def get_collection_fetchdata_task(params=None): @task() -def fetch_collection_task(collection: dict): - fetch_status = fetch_collection(collection, {}) +def create_vernacular_version_task(collection): + vernacular_version = create_vernacular_version(collection.get('id')) + return vernacular_version + + +@task() +def fetch_collection_task(collection: dict, vernacular_version: str): + fetch_status = fetch_collection(collection, vernacular_version, {}) success = all([page['status'] == 'success' for page in fetch_status]) total_items = sum([page['document_count'] for page in fetch_status]) total_pages = len(fetch_status) diff --git a/metadata_fetcher/fetch_registry_collections.py b/metadata_fetcher/fetch_registry_collections.py index d8bf8ec0a..27a322b54 100644 --- a/metadata_fetcher/fetch_registry_collections.py +++ b/metadata_fetcher/fetch_registry_collections.py @@ -5,6 +5,7 @@ import requests from . 
import lambda_function +from rikolti.utils.rikolti_storage import create_vernacular_version logger = logging.getLogger(__name__) @@ -51,7 +52,9 @@ def fetch_endpoint(url, limit=None, job_logger=logger): job_logger.debug( f"{collection_id:<6}: call lambda with payload: {collection}") - fetch_result = lambda_function.fetch_collection(collection, None) + vernacular_version = create_vernacular_version(collection_id) + fetch_result = lambda_function.fetch_collection( + collection, vernacular_version, None) results[collection_id] = fetch_result success = all([page['status'] == 'success' for page in fetch_result]) diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index cfedbaf12..17f53490a 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -3,7 +3,7 @@ import os from requests.adapters import HTTPAdapter, Retry -from rikolti.utils.rikolti_storage import create_vernacular_version, put_page_content +from rikolti.utils.rikolti_storage import put_page_content logger = logging.getLogger(__name__) @@ -22,11 +22,11 @@ class FetchError(Exception): class Fetcher(object): - def __init__(self, params): + def __init__(self, params, vernacular_data_version): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = create_vernacular_version(self.collection_id) + self.data_destination = vernacular_data_version if not self.collection_id: raise CollectionIdRequired("collection_id is required") diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index e6b2f3376..5ebd2e1b3 100644 --- a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -4,6 +4,7 @@ import sys from .fetchers.Fetcher import Fetcher, InvalidHarvestEndpoint +from rikolti.utils.rikolti_storage import create_vernacular_version logger = logging.getLogger(__name__) @@ -20,7 +21,7 @@ def import_fetcher(harvest_type): # AWS Lambda entry point -def fetch_collection(payload, context): +def fetch_collection(payload, vernacular_version, context): if isinstance(payload, str): payload = json.loads(payload) @@ -30,7 +31,7 @@ def fetch_collection(payload, context): fetch_status = [] try: - fetcher = fetcher_class(payload) + fetcher = fetcher_class(payload, vernacular_version) fetch_status.append(fetcher.fetch_page()) except InvalidHarvestEndpoint as e: logger.error(e) @@ -53,7 +54,7 @@ def fetch_collection(payload, context): fetch_status = fetch_status[0] if not json.loads(next_page).get('finished'): - fetch_status.extend(fetch_collection(next_page, {})) + fetch_status.extend(fetch_collection(next_page, vernacular_version, {})) return fetch_status @@ -71,7 +72,8 @@ def fetch_collection(payload, context): encoding='utf-8', level=logging.DEBUG ) + vernacular_version = create_vernacular_version(payload.get('collection_id')) print(f"Starting to fetch collection {payload.get('collection_id')}") - fetch_collection(payload, {}) + fetch_collection(payload, vernacular_version, {}) print(f"Finished fetching collection {payload.get('collection_id')}") sys.exit(0) diff --git a/metadata_fetcher/tests.py b/metadata_fetcher/tests.py index 6f176780b..85f6cebb1 100644 --- a/metadata_fetcher/tests.py +++ b/metadata_fetcher/tests.py @@ -10,6 +10,7 @@ nuxeo_nested_complex_object_harvests) from .sample_data.oac_harvests import oac_harvests from .sample_data.oai_harvests import oai_harvests +from rikolti.utils.rikolti_storage import 
create_vernacular_version
 
 
 def main():
@@ -25,7 +26,8 @@ def main():
 
     for harvest in harvests:
         print(f"tests.py: {json.dumps(harvest)}")
-        status = fetch_collection(json.dumps(harvest), {})
+        vernacular_version = create_vernacular_version(harvest.get('collection_id'))
+        status = fetch_collection(json.dumps(harvest), vernacular_version, {})
         print(f"Harvest status: {status}")
 
     urls = [
diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py
index 38e8fb8a7..f885a3927 100644
--- a/utils/rikolti_storage.py
+++ b/utils/rikolti_storage.py
@@ -18,7 +18,7 @@ def parse_data_uri(data_uri: str):
         data_uri, data_loc.scheme, data_loc.netloc, data_loc.path)
 
 
-def list_dirs(data_uri: str, **kwargs) -> list[str]:
+def list_dirs(data_uri: str, recursive=False, **kwargs) -> list[str]:
     data = parse_data_uri(data_uri)
     if data.store == 's3':
         s3 = boto3.client('s3', **kwargs)
@@ -197,53 +197,52 @@ def put_file_content(data: DataStorage, content) -> str:
         f.write(content)
     return data.uri
 
+
 def create_vernacular_version(
     collection_id: int or str,
-    vernacular_suffix: Optional[str] = None
+    version_suffix: Optional[str] = None
     ):
     fetcher_data_dest = os.environ.get(
         "FETCHER_DATA_DEST", "file:///tmp")
-    vernacular_root = (
+    collection_path = (
         f"{fetcher_data_dest.rstrip('/')}/{collection_id}/")
-    if not vernacular_suffix:
-        vernacular_suffix = (
+    if not version_suffix:
+        version_suffix = (
             datetime.now().strftime('%Y-%m-%dT%H:%M:%S'))
-    vernacular_path = (
-        f"{vernacular_root}vernacular_metadata_{vernacular_suffix}/")
-    return vernacular_path
-
-
-class RikoltiStorage():
-    def __init__(
-        self,
-        collection_id: int or str,
-        vernacular_suffix: Optional[str] = None,
-        vernacular_path: Optional[str] = None,
-        mapped_data_suffix: Optional[str] = None,
-        mapped_data_path: Optional[str] = None,
-        **kwargs):
-
-        self.collection_id = collection_id
-
-        if not vernacular_path:
-            fetcher_data_dest = os.environ.get(
-                "FETCHER_DATA_DEST", "file:///tmp")
-            vernacular_root = (
-                f"{fetcher_data_dest.rstrip('/')}/{collection_id}/")
-            if not vernacular_suffix:
-                vernacular_suffix = (
-                    datetime.now().strftime('%Y-%m-%dT%H:%M:%S'))
-            vernacular_path = (
-                f"{vernacular_root}vernacular_metadata_{vernacular_suffix}/")
-
-        self.vernacular = vernacular_path.rstrip('/')+"/"
-
-        mapped_data_dest = os.environ.get("MAPPED_DATA_DEST", "file:///tmp")
-        mapped_root = (
-            f"{mapped_data_dest.rstrip('/')}/{self.collection_id}/"
-        )
+    vernacular_version_path = (
+        f"{collection_path}vernacular_metadata_{version_suffix}/")
+    return vernacular_version_path
+
+
+def get_most_recent_vernacular_version(collection_id: int or str):
+    mapper_data_src = os.environ.get("MAPPED_DATA_SRC")
+    vernacular_versions = list_dirs(f"{mapper_data_src}/{collection_id}/")
+    if not vernacular_versions:
+        raise Exception(
+            f"No vernacular metadata versions found for {collection_id}")
+    return sorted(vernacular_versions)[-1]
+
+
+def create_mapped_version(
+    collection_id: int or str,
+    vernacular_path: str,
+    mapped_data_suffix: Optional[str] = None,
+):
+    mapper_data_dest = os.environ.get("MAPPED_DATA_DEST")
+    # get path of the vernacular version, not the vernacular data
+    mapped_root = vernacular_path.rsplit('data', 1)[0]
+
+    if mapper_data_dest:
+        # get path relative to collection_id
+        vernacular_path = vernacular_path.split(str(collection_id))[-1]
+        mapped_root = (
+            f"{mapper_data_dest.rstrip('/')}/{collection_id}/{vernacular_path}"
+        )
+    if not mapped_data_suffix:
mapped_data_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + mapped_data_path = ( + f"{mapped_root.rstrip('/')}/mapped_metadata_{mapped_data_suffix}/") + return mapped_data_path + # def list_fetched_content(self, recursive: bool=True, **kwargs) -> list: # return list_pages( @@ -252,36 +251,36 @@ def save_fetched_content(self, content: str, filename: str): # recursive=recursive # ) - def search_page(self, search_str: str, page: str) -> bool: - if self.data_store == 's3': - return self.search_s3_page(search_str, page) - elif self.data_store == 'file': - return self.search_file_page(search_str, page) - else: - raise Exception(f"Unknown data store: {self.data_store}") - - def search_s3_page(self, search_str: str, s3_key: str) -> bool: - """ - Check if search_str is in the body of the object located at s3_key - Returns the s3_key of the object if so, otherwise returns None - """ - obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) - body = obj['Body'].read().decode('utf-8') - if search_str in body: - return True - else: - return False - - def search_file_page(self, search_str: str, file_path: str) -> bool: - """ - Check if search_str is in the body of the file located at file_path - """ - with open(file_path, 'r') as f: - body = f.read() - if search_str in body: - return True - else: - return False + # def search_page(self, search_str: str, page: str) -> bool: + # if self.data_store == 's3': + # return self.search_s3_page(search_str, page) + # elif self.data_store == 'file': + # return self.search_file_page(search_str, page) + # else: + # raise Exception(f"Unknown data store: {self.data_store}") + + # def search_s3_page(self, search_str: str, s3_key: str) -> bool: + # """ + # Check if search_str is in the body of the object located at s3_key + # Returns the s3_key of the object if so, otherwise returns None + # """ + # obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) + # body = obj['Body'].read().decode('utf-8') + # if search_str in body: + # return True + # else: + # return False + + # def search_file_page(self, search_str: str, file_path: str) -> bool: + # """ + # Check if search_str is in the body of the file located at file_path + # """ + # with open(file_path, 'r') as f: + # body = f.read() + # if search_str in body: + # return True + # else: + # return False From e81427241c021adb3870198591d840a1abbc4cdd Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 14:57:55 -0800 Subject: [PATCH 19/42] update metadata mapper to use versioning --- dags/harvest_dag.py | 7 +++++- dags/mapper_dag.py | 15 +++++++++--- dags/shared_tasks.py | 14 ++++++++--- metadata_mapper/lambda_function.py | 22 +++++------------ metadata_mapper/lambda_shepherd.py | 27 ++++++++++++--------- metadata_mapper/map_registry_collections.py | 3 +-- 6 files changed, 51 insertions(+), 37 deletions(-) diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index 78f814ce4..49a3f420a 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -8,6 +8,7 @@ from rikolti.dags.shared_tasks import fetch_collection_task from rikolti.dags.shared_tasks import get_collection_fetchdata_task from rikolti.dags.shared_tasks import get_collection_metadata_task +from rikolti.dags.shared_tasks import create_mapped_version_task from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from rikolti.dags.shared_tasks import validate_collection_task @@ -38,9 +39,13 @@ def harvest(): vernacular_version = 
create_vernacular_version_task(collection=fetchdata) fetched_pages = fetch_collection_task( collection=fetchdata, vernacular_version=vernacular_version) + mapped_data_version = create_mapped_version_task( + collection=collection, + vernacular_pages=fetched_pages + ) mapped_pages = ( map_page_task - .partial(collection=collection) + .partial(collection=collection, mapped_data_version=mapped_data_version) .expand(page=fetched_pages) ) diff --git a/dags/mapper_dag.py b/dags/mapper_dag.py index bb5963490..bcf639796 100644 --- a/dags/mapper_dag.py +++ b/dags/mapper_dag.py @@ -1,22 +1,27 @@ from datetime import datetime +from typing import Optional from airflow.decorators import dag, task from airflow.models.param import Param from rikolti.dags.shared_tasks import get_collection_metadata_task +from rikolti.dags.shared_tasks import create_mapped_version_task from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from rikolti.dags.shared_tasks import validate_collection_task from rikolti.metadata_mapper.lambda_shepherd import get_vernacular_pages +from rikolti.utils.rikolti_storage import get_most_recent_vernacular_version @task() -def get_vernacular_pages_task(collection: dict): +def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str] = None): collection_id = collection.get('id') + if not vernacular_version: + vernacular_version = get_most_recent_vernacular_version(collection_id) if not collection_id: raise ValueError( f"Collection ID not found in collection metadata: {collection}") - pages = get_vernacular_pages(collection_id) + pages = get_vernacular_pages(collection_id, vernacular_version) return pages # This is a functional duplicate of @@ -48,9 +53,13 @@ def get_vernacular_pages_task(collection: dict): def mapper_dag(): collection = get_collection_metadata_task() page_list = get_vernacular_pages_task(collection=collection) + mapped_data_version = create_mapped_version_task( + collection=collection, + vernacular_pages=page_list + ) mapped_pages = ( map_page_task - .partial(collection=collection) + .partial(collection=collection, mapped_data_version=mapped_data_version) .expand(page=page_list) ) diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 8102c7324..5975e5027 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -17,6 +17,7 @@ from rikolti.record_indexer.create_collection_index import get_index_name from rikolti.record_indexer.create_collection_index import delete_index from rikolti.record_indexer.move_index_to_prod import move_index_to_prod +from rikolti.utils.rikolti_storage import create_mapped_version from rikolti.utils.rikolti_storage import create_vernacular_version @@ -99,11 +100,11 @@ def get_collection_metadata_task(params=None): # max_active_tis_per_dag - setting on the task to restrict how many # instances can be running at the same time, *across all DAG runs* @task() -def map_page_task(page: str, collection: dict): +def map_page_task(page: str, collection: dict, mapped_data_version: str): collection_id = collection.get('id') - if not collection_id: + if not collection_id or not mapped_data_version: return False - mapped_page = map_page(collection_id, page, collection) + mapped_page = map_page(collection_id, page, mapped_data_version, collection) return mapped_page @@ -113,6 +114,13 @@ def get_mapping_status_task(collection: dict, mapped_pages: list): return mapping_status +@task() +def create_mapped_version_task(collection, vernacular_pages): + mapped_data_version = 
create_mapped_version( + collection.get('id'), vernacular_pages[0]) + return mapped_data_version + + @task() def validate_collection_task(collection_status: dict, params=None) -> str: if not params or not params.get('validate'): diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 1d63d85de..8e92784a6 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -1,6 +1,7 @@ import importlib import json import logging +import os import sys from typing import Union from urllib.parse import parse_qs, urlparse @@ -72,7 +73,7 @@ def run_enrichments(records, collection, enrichment_set, page_filename): return records -def map_page(collection_id: int, page_path: str, collection: Union[dict, str]): +def map_page(collection_id: int, page_path: str, mapped_data_version: str, collection: Union[dict, str]): if isinstance(collection, str): collection = json.loads(collection) @@ -91,15 +92,6 @@ def map_page(collection_id: int, page_path: str, collection: Union[dict, str]): record.to_UCLDC() mapped_records = source_metadata_records - # TODO: write interim mapped but not enriched metadata to s3? - # put_page_content( - # json.dumps([record.to_dict() for record in mapped_records]), - # ( - # f"{settings.DATA_DEST_URL}/{collection_id}/" - # f"interim_mapped_metadata/{page_filename}" - # ) - # ) - mapped_records = run_enrichments( mapped_records, collection, 'rikolti__enrichments', page_filename) @@ -128,10 +120,7 @@ def map_page(collection_id: int, page_path: str, collection: Union[dict, str]): mapped_metadata = [record.to_dict() for record in mapped_records] put_page_content( json.dumps(mapped_metadata), - ( - f"{settings.DATA_DEST_URL}/{collection_id}/" - f"mapped_metadata/{page_filename}" - ) + f"{mapped_data_version.rstrip('/')}/data/{page_filename}.jsonl" ) return { @@ -147,11 +136,12 @@ def map_page(collection_id: int, page_path: str, collection: Union[dict, str]): parser = argparse.ArgumentParser( description="Map metadata from the institution's vernacular") parser.add_argument('collection_id', help='collection id') - parser.add_argument('page_path', help='uri file path to vernauclar metadata page filename') + parser.add_argument('page_path', help='uri file path to vernauclar metadata page filename; ex: file:///rikolti_data_root/3433/vernacular_data_version_1/data/1') + parser.add_argument('mapped_data_version', help='uri file path to mapped data version; ex: file:///rikolti_data_root/3433/vernacular_data_version_1/mapped_data_version_1/') parser.add_argument('collection', help='json collection metadata from registry') args = parser.parse_args(sys.argv[1:]) - mapped_page = map_page(args.collection_id, args.page_path, args.collection) + mapped_page = map_page(args.collection_id, args.page_path, args.mapped_data_path, args.collection) print(f"{mapped_page.get('num_records_mapped')} records mapped") diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index c40c9dd2f..506c35385 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -5,10 +5,10 @@ from urllib.parse import urlparse -from . import settings, validate_mapping +from . 
import validate_mapping from .lambda_function import map_page from .mappers.mapper import Record -from rikolti.utils.rikolti_storage import list_pages +from rikolti.utils.rikolti_storage import list_pages, create_mapped_version, get_most_recent_vernacular_version def get_collection(collection_id): @@ -37,12 +37,9 @@ def check_for_missing_enrichments(collection): return not_yet_implemented -def get_vernacular_pages(collection_id): +def get_vernacular_pages(collection_id, vernacular_version): try: - page_list = list_pages( - f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata", - recursive=True - ) + page_list = list_pages(vernacular_version, recursive=True) except FileNotFoundError as e: print( f"{e} - have you fetched {collection_id}? " @@ -50,7 +47,7 @@ def get_vernacular_pages(collection_id): ) raise(e) - # TODO: split page_list into pages and children + # TODO: split page_list into pages and children? return page_list @@ -75,7 +72,7 @@ def get_mapping_status(collection, mapped_pages): 'group_exceptions': group_exceptions } -def map_collection(collection_id, validate=False): +def map_collection(collection_id, vernacular_version=None, validate=False): # This is a functional duplicate of rikolti.d*gs.mapper_d*g.mapper_d*g # Within an airflow runtime context, we take advantage of airflow's dynamic @@ -91,11 +88,16 @@ def map_collection(collection_id, validate=False): collection = get_collection(collection_id) - page_list = get_vernacular_pages(collection_id) + if not vernacular_version: + vernacular_version = get_most_recent_vernacular_version(collection_id) + page_list = get_vernacular_pages(collection_id, vernacular_version) + + mapped_data_version = create_mapped_version(collection_id, page_list[0]) mapped_pages = [] for page in page_list: try: - mapped_page = map_page(collection_id, page, collection) + mapped_page = map_page( + collection_id, page, mapped_data_version, collection) mapped_pages.append(mapped_page) except KeyError: print( @@ -126,8 +128,9 @@ def map_collection(collection_id, validate=False): parser.add_argument('collection_id', help='collection ID from registry') parser.add_argument('--validate', help='validate mapping; may provide json opts', const=True, nargs='?') + parser.add_argument('vernacular_version', help='URI to a folder of vernacular pages to map') args = parser.parse_args(sys.argv[1:]) - mapped_collection = map_collection(args.collection_id, args.validate) + mapped_collection = map_collection(args.collection_id, args.vernacular_version, args.validate) missing_enrichments = mapped_collection.get('missing_enrichments') if len(missing_enrichments) > 0: print( diff --git a/metadata_mapper/map_registry_collections.py b/metadata_mapper/map_registry_collections.py index 3cb241db1..584830b86 100644 --- a/metadata_mapper/map_registry_collections.py +++ b/metadata_mapper/map_registry_collections.py @@ -51,8 +51,7 @@ def map_endpoint(url, limit=None): f"{collection_id:<6}: call lambda with collection_id: {collection_id}") try: - map_result = lambda_shepherd.map_collection( - collection_id) + map_result = lambda_shepherd.map_collection(collection_id) except FileNotFoundError: print(f"{collection_id:<6}: not fetched yet", file=sys.stderr) continue From 6bee4c318852651ea65a57753c8ef251f93666a8 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 15:07:06 -0800 Subject: [PATCH 20/42] maintain Fetcher.__init__ function signature --- dags/shared_tasks.py | 3 +-- metadata_fetcher/fetchers/Fetcher.py | 6 +++--- metadata_fetcher/lambda_function.py | 3 ++- 
3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 5975e5027..c7b9cd463 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -40,8 +40,7 @@ def get_collection_fetchdata_task(params=None): @task() def create_vernacular_version_task(collection): - vernacular_version = create_vernacular_version(collection.get('id')) - return vernacular_version + return create_vernacular_version(collection.get('collection_id')) @task() diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 17f53490a..02085d18c 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -22,11 +22,12 @@ class FetchError(Exception): class Fetcher(object): - def __init__(self, params, vernacular_data_version): + def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = vernacular_data_version + self.data_destination = params.get('vernacular_version') + if not self.collection_id: raise CollectionIdRequired("collection_id is required") @@ -51,7 +52,6 @@ def fetch_page(self): try: filepath = put_page_content( content, f"{self.data_destination}data/{self.write_page}") - print(filepath) except Exception as e: print(f"Metadata Fetcher: {e}") raise(e) diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index 5ebd2e1b3..d4f78b68f 100644 --- a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -28,10 +28,11 @@ def fetch_collection(payload, vernacular_version, context): logger.debug(f"fetch_collection payload: {payload}") fetcher_class = import_fetcher(payload.get('harvest_type')) + payload.update({'vernacular_version': vernacular_version}) fetch_status = [] try: - fetcher = fetcher_class(payload, vernacular_version) + fetcher = fetcher_class(payload) fetch_status.append(fetcher.fetch_page()) except InvalidHarvestEndpoint as e: logger.error(e) From 9dc2e17f754b276afb00e4a74a4da3409ed42bf2 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 16:07:17 -0800 Subject: [PATCH 21/42] add full filepaths to map_page output --- dags/harvest_dag.py | 6 ++-- dags/shared_tasks.py | 3 +- dags/utils_by_mapper_type.py | 2 +- metadata_mapper/lambda_function.py | 11 +++---- metadata_mapper/lambda_shepherd.py | 5 +++- metadata_mapper/validate_mapping.py | 20 ++++++------- metadata_mapper/validator/validation_log.py | 9 ++---- utils/rikolti_storage.py | 33 +++++++++++++++++++++ 8 files changed, 61 insertions(+), 28 deletions(-) diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index 49a3f420a..eeac13dd8 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -17,7 +17,7 @@ @task() def get_mapped_page_filenames_task(mapped_pages): - return [mapped['page_filename'] for mapped in mapped_pages] + return [mapped['mapped_page_path'] for mapped in mapped_pages] @dag( dag_id="harvest_collection", @@ -51,7 +51,7 @@ def harvest(): mapping_status = get_mapping_status_task(collection, mapped_pages) validate_collection_task(mapping_status) - mapped_page_filenames = get_mapped_page_filenames_task(mapped_pages) + mapped_page_paths = get_mapped_page_filenames_task(mapped_pages) content_harvest_task = ( ContentHarvestOperator @@ -60,7 +60,7 @@ def harvest(): collection_id="{{ params.collection_id }}", ) .expand( - page=mapped_page_filenames + page=mapped_page_paths ) ) content_harvest_task diff --git 
a/dags/shared_tasks.py b/dags/shared_tasks.py index c7b9cd463..792875f21 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -131,7 +131,8 @@ def validate_collection_task(collection_status: dict, params=None) -> str: if collection_status.get('status') != 'success': raise Exception(f"Collection {collection_id} not successfully mapped") - num_rows, file_location = create_collection_validation_csv(collection_id) + num_rows, file_location = create_collection_validation_csv( + collection_id, collection_status['mapped_page_paths']) print(f"Output {num_rows} rows to {file_location}") # create a link to the file in the logs diff --git a/dags/utils_by_mapper_type.py b/dags/utils_by_mapper_type.py index bda78ff83..85d1b0386 100644 --- a/dags/utils_by_mapper_type.py +++ b/dags/utils_by_mapper_type.py @@ -66,7 +66,7 @@ def validate_endpoint_task(url, params=None): for collection in registry_endpoint(url): print(f"{collection['collection_id']:<6} Validating collection") num_rows, file_location = create_collection_validation_csv( - collection['collection_id']) + collection['collection_id'], mapped_page_paths) csv_paths.append(file_location) if file_location.startswith('s3://'): s3_path = urlparse(file_location) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 8e92784a6..a5922700e 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -73,14 +73,14 @@ def run_enrichments(records, collection, enrichment_set, page_filename): return records -def map_page(collection_id: int, page_path: str, mapped_data_version: str, collection: Union[dict, str]): +def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: str, collection: Union[dict, str]): if isinstance(collection, str): collection = json.loads(collection) vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) - page_filename = os.path.basename(page_path) - api_resp = get_page_content(page_path) + page_filename = os.path.basename(vernacular_page_path) + api_resp = get_page_content(vernacular_page_path) source_vernacular = vernacular_reader(collection_id, page_filename) source_metadata_records = source_vernacular.parse(api_resp) @@ -118,7 +118,7 @@ def map_page(collection_id: int, page_path: str, mapped_data_version: str, colle # for record in mapped_records] mapped_metadata = [record.to_dict() for record in mapped_records] - put_page_content( + mapped_page_path = put_page_content( json.dumps(mapped_metadata), f"{mapped_data_version.rstrip('/')}/data/{page_filename}.jsonl" ) @@ -127,7 +127,7 @@ def map_page(collection_id: int, page_path: str, mapped_data_version: str, colle 'status': 'success', 'num_records_mapped': len(mapped_records), 'page_exceptions': group_page_exceptions, - 'page_filename': page_filename, + 'mapped_page_path': mapped_page_path, } @@ -144,6 +144,7 @@ def map_page(collection_id: int, page_path: str, mapped_data_version: str, colle mapped_page = map_page(args.collection_id, args.page_path, args.mapped_data_path, args.collection) print(f"{mapped_page.get('num_records_mapped')} records mapped") + print(f"mapped page at {mapped_page.get('mapped_page_path')}") for report, couch_ids in mapped_page.get('exceptions', {}).items(): print(f"{len(couch_ids)} records report enrichments errors: {report}") diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index 506c35385..baa5da644 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -69,7 
+69,8 @@ def get_mapping_status(collection, mapped_pages): 'missing_enrichments': check_for_missing_enrichments(collection), 'count': count, 'page_count': page_count, - 'group_exceptions': group_exceptions + 'group_exceptions': group_exceptions, + 'mapped_page_paths': [page['mapped_page_path'] for page in mapped_pages], } def map_collection(collection_id, vernacular_version=None, validate=False): @@ -107,12 +108,14 @@ def map_collection(collection_id, vernacular_version=None, validate=False): continue collection_stats = get_mapping_status(collection, mapped_pages) + mapped_page_paths = [page['mapped_page_path'] for page in mapped_pages] if validate: opts = validate if isinstance(validate, dict) else {} num_rows, file_location = ( validate_mapping.create_collection_validation_csv( collection_id, + mapped_page_paths, **opts ) ) diff --git a/metadata_mapper/validate_mapping.py b/metadata_mapper/validate_mapping.py index 584e53aa7..f8b606164 100644 --- a/metadata_mapper/validate_mapping.py +++ b/metadata_mapper/validate_mapping.py @@ -10,12 +10,13 @@ from .validator.validation_log import ValidationLogLevel from .validator.validation_mode import ValidationMode from .validator.validator import Validator -from rikolti.utils.rikolti_storage import list_pages, get_page_content +from rikolti.utils.rikolti_storage import get_page_content urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) def validate_collection(collection_id: int, + mapped_page_paths: list[str], validator_class: Type[Validator] = None, validator: Validator = None, validation_mode = ValidationMode.STRICT, @@ -50,12 +51,7 @@ def validate_collection(collection_id: int, log_level = log_level, verbose = verbose) - mapped_pages = list_pages( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/", - recursive=False - ) - - for page_path in mapped_pages: + for page_path in mapped_page_paths: validate_page(collection_id, page_path, validator) return validator @@ -115,9 +111,11 @@ def validate_page(collection_id: int, page_path: str, return validator -def create_collection_validation_csv(collection_id: int, **options) -> tuple[int, str]: - result = validate_collection(collection_id, **options) - filename = result.log.output_csv_to_bucket(collection_id) +def create_collection_validation_csv( + collection_id: int, mapped_page_paths: list[str], **options) -> tuple[int, str]: + result = validate_collection(collection_id, mapped_page_paths, **options) + + filename = result.log.output_csv_to_bucket(collection_id, mapped_page_paths[0]) return len(result.log.log), filename ## Private-ish @@ -276,5 +274,5 @@ def get_validator_class(collection_id: int) -> Type[Validator]: print(kwargs) num_rows, file_location = create_collection_validation_csv( - args.collection_id, **kwargs) + args.collection_id, mapped_page_paths, **kwargs) print(f"Output {num_rows} rows to {file_location}") diff --git a/metadata_mapper/validator/validation_log.py b/metadata_mapper/validator/validation_log.py index 72e6eeeae..516eafaa3 100644 --- a/metadata_mapper/validator/validation_log.py +++ b/metadata_mapper/validator/validation_log.py @@ -3,7 +3,7 @@ from typing import IO, Any from .. 
import settings -from rikolti.utils.rikolti_storage import put_page_content +from rikolti.utils.rikolti_storage import put_page_content, create_validation_version class ValidationLogLevel(Enum): @@ -110,7 +110,7 @@ def output_csv_to_file(self, file: IO[str], append: bool = False, with open(file, "a" if append else "w") as f: f.write(self._csv_content_string(include_fields, append)) - def output_csv_to_bucket(self, collection_id: int, filename: str = None, + def output_csv_to_bucket(self, collection_id: int, mapped_data_path: str = None, include_fields: list[str] = None) -> str: """ Writes a CSV to the env-appropriate bucket (local or S3). @@ -124,14 +124,11 @@ def output_csv_to_bucket(self, collection_id: int, filename: str = None, include_fields: list[str] (default: None) A list of fields to include in the CSV. Defaults to all. """ - if not filename: - filename = f"{datetime.now().strftime('%m-%d-%YT%H:%M:%S')}.csv" - content = self._csv_content_string(include_fields) if isinstance(content, list) or isinstance(content, dict): content = json.dumps(content) - file_location = f"{settings.DATA_SRC_URL}/{collection_id}/validation/{filename}" + file_location = create_validation_version(collection_id, mapped_data_path) put_page_content(content, file_location) return file_location diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index f885a3927..8e217dc00 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -283,5 +283,38 @@ def create_mapped_version( # return False +def create_validation_version( + collection_id: int or str, + mapped_data_path: str, + validation_suffix: Optional[str] = None +): + validation_data_dest = os.environ.get("VALIDATION_DATA_DEST") + # get path of the mapped data version, not the mapped data + validation_root = mapped_data_path.rsplit('data', 1)[0] + + if validation_data_dest: + # get path relative to collection_id + mapped_data_path = mapped_data_path.split(str(collection_id))[-1] + validation_root = ( + f"{validation_data_dest.rstrip('/')}/{collection_id}/{mapped_data_path}" + ) + + if not validation_suffix: + validation_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + validation_data_path = ( + f"{validation_root.rstrip('/')}/validation_{validation_suffix}.csv") + return validation_data_path + + validation_data_dest = os.environ.get( + "VALIDATION_DATA_DEST", "file:///tmp") + collection_path = ( + f"{validation_data_dest.rstrip('/')}/{collection_id}/") + if not validation_suffix: + validation_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + validation_version_path = ( + f"{collection_path}validation_{validation_suffix}/") + return validation_version_path From df41fb0cef65c08b36c5e032266ba5e9e6769c50 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 15 Nov 2023 11:42:10 -0800 Subject: [PATCH 22/42] update content harvester to use versioning --- content_harvester/by_collection.py | 22 +++++---- content_harvester/by_page.py | 51 +++++++++++---------- utils/rikolti_storage.py | 71 ++++++++++-------------------- 3 files changed, 65 insertions(+), 79 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index 9d7215b5d..dc8bd5a39 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -2,12 +2,12 @@ from . 
import settings from .by_page import harvest_page_content -from rikolti.utils.rikolti_storage import list_pages +from rikolti.utils.rikolti_storage import list_pages, create_content_data_version -def get_mapped_pages(collection_id): +def get_mapped_pages(mapped_data_version:str): page_list = [] page_list = list_pages( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata", + f"{mapped_data_version.rstrip('/')}/data/", recursive=False, aws_access_key_id=settings.AWS_ACCESS_KEY_ID, aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, @@ -18,20 +18,25 @@ def get_mapped_pages(collection_id): # {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo"} -def harvest_collection(collection): +def harvest_collection(collection, mapped_data_version: str): if isinstance(collection, str): collection = json.loads(collection) collection_id = collection.get('collection_id') - if not collection_id: - print("ERROR ERROR ERROR\ncollection_id required") + if not collection_id or not mapped_data_version: + print("ERROR ERROR ERROR\ncollection_id and mapped_data_version required") exit() - page_list = get_mapped_pages(collection_id) + page_list = get_mapped_pages(mapped_data_version) print(f"[{collection_id}]: Harvesting content for {len(page_list)} pages") collection_stats = {} + collection.update({ + 'content_data_version': create_content_data_version( + collection_id, mapped_data_version) + }) + for page_path in page_list: collection.update({'page_path': page_path}) page_stats = harvest_page_content(**collection) @@ -56,6 +61,7 @@ def harvest_collection(collection): parser = argparse.ArgumentParser( description="Harvest content by collection using mapped metadata") parser.add_argument('collection_id', help="Collection ID") + parser.add_argument('mapped_data_version', help="URI to mapped data version: ex: s3://rikolti-data-root/3433/vernacular_data_version_1/mapped_data_version_2/") parser.add_argument('--nuxeo', action="store_true", help="Use Nuxeo auth") args = parser.parse_args() arguments = { @@ -63,4 +69,4 @@ def harvest_collection(collection): } if args.nuxeo: arguments['rikolti_mapper_type'] = 'nuxeo.nuxeo' - print(harvest_collection(arguments)) \ No newline at end of file + print(harvest_collection(arguments, args.mapped_data_version)) \ No newline at end of file diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index 4789a4cbd..0c2515638 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,7 +14,7 @@ from . import derivatives from . 
import settings -from rikolti.utils.rikolti_storage import list_pages, get_page_content, put_page_content +from rikolti.utils.rikolti_storage import list_pages, get_page_content, put_page_content, create_content_data_version class DownloadError(Exception): pass @@ -30,32 +30,33 @@ def get_mapped_records(page_path) -> list: return mapped_records -def write_mapped_record(collection_id, record): - put_page_content( +def write_mapped_record(content_data_version, record): + filename = put_page_content( json.dumps(record), ( - f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/" - f"{record.get('calisphere-id').replace(os.sep, '_')}" + f"{content_data_version.rstrip('/')}/data/" + f"{record.get('calisphere-id').replace(os.sep, '_')}.json" ) -) + ) + return filename -def write_mapped_page(collection_id, page, records): - put_page_content( +def write_mapped_page(content_data_version, page, records): + filename = put_page_content( json.dumps(records), - f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/{page}" + f"{content_data_version.rstrip('/')}/data/{page}" ) + return filename -def get_child_records(collection_id, parent_id) -> list: +def get_child_records(mapped_page_path, parent_id) -> list: mapped_child_records = [] children = list_pages( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children", + f"{mapped_page_path.rsplit('/', 1)[0]}/children/", recursive=False ) - if rikolti_data.data_store == 'file': - children = [page for page in children - if os.path.basename(page).startswith(parent_id)] + children = [page for page in children + if (page.rsplit('/')[-1]).startswith(parent_id)] for child in children: mapped_child_records.extend(json.loads(get_page_content(child))) return mapped_child_records @@ -179,7 +180,8 @@ def check_mimetype(self, mimetype): class ContentHarvester(object): # context = {'collection_id': '12345', 'page_filename': '1.jsonl'} - def __init__(self, collection_id, page_filename, src_auth=None): + def __init__(self, mapped_page_path, collection_id, page_filename, src_auth=None): + self.mapped_page_path = mapped_page_path self.http = requests.Session() retry_strategy = Retry( @@ -249,7 +251,7 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict: # Recurse through the record's children (if any) child_records = get_child_records( - self.collection_id, calisphere_id) + self.mapped_page_path, calisphere_id) if child_records: print( f"[{self.collection_id}, {self.page_filename}, {calisphere_id}]: " @@ -347,9 +349,9 @@ def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = # {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo", "page_filename": "file:///rikolti_data/r-0"} -def harvest_page_content(collection_id, page_path, **kwargs): +def harvest_page_content(collection_id, mapped_page_path, content_data_version, **kwargs): rikolti_mapper_type = kwargs.get('rikolti_mapper_type') - page_filename = os.path.basename(page_path) + page_filename = os.path.basename(mapped_page_path) # Weird how we have to use username/pass to hit this endpoint # but we have to use auth token to hit API endpoint @@ -357,12 +359,13 @@ def harvest_page_content(collection_id, page_path, **kwargs): if rikolti_mapper_type == 'nuxeo.nuxeo': auth = (settings.NUXEO_USER, settings.NUXEO_PASS) harvester = ContentHarvester( + mapped_page_path, collection_id=collection_id, page_filename=page_filename, src_auth=auth ) - records = get_mapped_records(page_path) + records = get_mapped_records(mapped_page_path) print( 
f"[{collection_id}, {page_filename}]: " f"Harvesting content for {len(records)} records" @@ -377,7 +380,7 @@ def harvest_page_content(collection_id, page_path, **kwargs): try: record_with_content = harvester.harvest(record) # write_mapped_record( - # collection_id, record_with_content) + # content_data_version, record_with_content) if not record_with_content.get('thumbnail'): warn_level = "ERROR" if 'sound' in record.get('type', []): @@ -396,7 +399,7 @@ def harvest_page_content(collection_id, page_path, **kwargs): print(f"Exiting after harvesting {i} of {len(records)} items " f"in page {page_filename} of collection {collection_id}") - write_mapped_page(collection_id, page_filename, records) + write_mapped_page(content_data_version, page_filename, records) media_source = [r for r in records if r.get('media_source')] media_harvested = [r for r in records if r.get('media')] @@ -455,12 +458,14 @@ def harvest_page_content(collection_id, page_path, **kwargs): parser = argparse.ArgumentParser( description="Harvest content using a page of mapped metadata") parser.add_argument('collection_id', help="Collection ID") - parser.add_argument('page_path', help="URI-formatted path to a mapped metadata page") + parser.add_argument('mapped_page_path', help="URI-formatted path to a mapped metadata page") parser.add_argument('--nuxeo', action="store_true", help="Use Nuxeo auth") args = parser.parse_args() arguments = { 'collection_id': args.collection_id, - 'page_filename': args.page_path, + 'mapped_page_path': args.mapped_page_path, + 'content_data_version': create_content_data_version( + args.collection_id, args.mapped_page_path.rsplit('data', 1)[0]) } if args.nuxeo: arguments['rikolti_mapper_type'] = 'nuxeo.nuxeo' diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 8e217dc00..4043e80ae 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -244,45 +244,6 @@ def create_mapped_version( return mapped_data_path - # def list_fetched_content(self, recursive: bool=True, **kwargs) -> list: - # return list_pages( - # f"{self.vernacular_data}/{self.collection_id}/" - # f"vernacular_metadata{self.suffix}/", - # recursive=recursive - # ) - - # def search_page(self, search_str: str, page: str) -> bool: - # if self.data_store == 's3': - # return self.search_s3_page(search_str, page) - # elif self.data_store == 'file': - # return self.search_file_page(search_str, page) - # else: - # raise Exception(f"Unknown data store: {self.data_store}") - - # def search_s3_page(self, search_str: str, s3_key: str) -> bool: - # """ - # Check if search_str is in the body of the object located at s3_key - # Returns the s3_key of the object if so, otherwise returns None - # """ - # obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) - # body = obj['Body'].read().decode('utf-8') - # if search_str in body: - # return True - # else: - # return False - - # def search_file_page(self, search_str: str, file_path: str) -> bool: - # """ - # Check if search_str is in the body of the file located at file_path - # """ - # with open(file_path, 'r') as f: - # body = f.read() - # if search_str in body: - # return True - # else: - # return False - - def create_validation_version( collection_id: int or str, mapped_data_path: str, @@ -306,15 +267,29 @@ def create_validation_version( f"{validation_root.rstrip('/')}/validation_{validation_suffix}.csv") return validation_data_path - validation_data_dest = os.environ.get( - "VALIDATION_DATA_DEST", "file:///tmp") - collection_path = ( - 
f"{validation_data_dest.rstrip('/')}/{collection_id}/") - if not validation_suffix: - validation_suffix = ( + +def create_content_data_version( + collection_id: int or str, + mapped_data_version: str, + content_data_suffix: Optional[str] = None +)-> str: + mapped_with_content_dest = os.environ.get('CONTENT_DATA_DEST') + # get path of the mapped data version, not the mapped data + content_data_root = mapped_data_version + + if mapped_with_content_dest: + # get path relative to collection_id + mapped_data_path = mapped_data_version.split(str(collection_id))[-1] + content_data_root = ( + f"{mapped_with_content_dest.rstrip('/')}/{collection_id}/{mapped_data_path}" + ) + + if not content_data_suffix: + content_data_suffix = ( datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - validation_version_path = ( - f"{collection_path}validation_{validation_suffix}/") - return validation_version_path + content_data_path = ( + f"{content_data_root.rstrip('/')}/content_data_{content_data_suffix}/") + ) + return content_data_path From 8abb2808a06535f96ddbc61c197c01af64423d4a Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 10 Nov 2023 10:50:29 -0800 Subject: [PATCH 23/42] Resolve simple content harvester versioning issues --- content_harvester/by_collection.py | 4 ++-- content_harvester/by_page.py | 29 +++++++++++++++-------------- content_harvester/settings.py | 5 +++++ dags/harvest_dag.py | 3 +++ dags/shared_content_harvester.py | 25 +++++++++++++++++++------ dags/shared_tasks.py | 8 ++++++++ utils/rikolti_storage.py | 13 +++++++------ 7 files changed, 59 insertions(+), 28 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index dc8bd5a39..398697094 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -2,7 +2,7 @@ from . import settings from .by_page import harvest_page_content -from rikolti.utils.rikolti_storage import list_pages, create_content_data_version +from .rikolti_storage import list_pages, create_content_data_version def get_mapped_pages(mapped_data_version:str): page_list = [] @@ -38,7 +38,7 @@ def harvest_collection(collection, mapped_data_version: str): }) for page_path in page_list: - collection.update({'page_path': page_path}) + collection.update({'mapped_page_path': page_path}) page_stats = harvest_page_content(**collection) # in some cases, value is int and in some cases, value is Counter diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index 0c2515638..ef163d715 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,7 +14,7 @@ from . import derivatives from . 
import settings -from rikolti.utils.rikolti_storage import list_pages, get_page_content, put_page_content, create_content_data_version +from .rikolti_storage import list_pages, get_page_content, put_page_content, create_content_data_version class DownloadError(Exception): pass @@ -51,10 +51,13 @@ def write_mapped_page(content_data_version, page, records): def get_child_records(mapped_page_path, parent_id) -> list: mapped_child_records = [] - children = list_pages( - f"{mapped_page_path.rsplit('/', 1)[0]}/children/", - recursive=False - ) + try: + children = list_pages( + f"{mapped_page_path.rsplit('/', 1)[0]}/children/", + recursive=False + ) + except FileNotFoundError: + return mapped_child_records children = [page for page in children if (page.rsplit('/')[-1]).startswith(parent_id)] for child in children: @@ -196,10 +199,6 @@ def __init__(self, mapped_page_path, collection_id, page_filename, src_auth=None self.collection_id = collection_id self.page_filename = page_filename - if settings.CONTENT_DEST["STORE"] == 's3': - self.s3 = boto3.client('s3') - else: - self.s3 = None # returns content = {thumbnail, media, children} where children # is an array of the self-same content dictionary @@ -332,9 +331,10 @@ def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = shutil.copyfile(filepath, dest_path) if settings.CONTENT_DEST["STORE"] == 's3': + s3 = boto3.client('s3') dest_path = ( f"{settings.CONTENT_DEST['PATH']}/{dest_prefix}/{dest_filename}") - self.s3.upload_file( + s3.upload_file( filepath, settings.CONTENT_DEST["BUCKET"], dest_path) # (mime, dimensions) = image_info(filepath) @@ -398,6 +398,7 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, ) print(f"Exiting after harvesting {i} of {len(records)} items " f"in page {page_filename} of collection {collection_id}") + raise(e) write_mapped_page(content_data_version, page_filename, records) @@ -444,9 +445,9 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, child_contents = [len(record.get('children', [])) for record in records] return { - 'thumb_source': Counter(thumb_src_mimetypes), + 'thumb_source_mimetypes': Counter(thumb_src_mimetypes), 'thumb_mimetypes': Counter(thumb_mimetypes), - 'media_source': Counter(media_src_mimetypes), + 'media_source_mimetypes': Counter(media_src_mimetypes), 'media_mimetypes': Counter(media_mimetypes), 'children': sum(child_contents), 'records': len(records) @@ -459,13 +460,13 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, description="Harvest content using a page of mapped metadata") parser.add_argument('collection_id', help="Collection ID") parser.add_argument('mapped_page_path', help="URI-formatted path to a mapped metadata page") + parser.add_argument('content_data_version', help="URI-formatted path to a content data version") parser.add_argument('--nuxeo', action="store_true", help="Use Nuxeo auth") args = parser.parse_args() arguments = { 'collection_id': args.collection_id, 'mapped_page_path': args.mapped_page_path, - 'content_data_version': create_content_data_version( - args.collection_id, args.mapped_page_path.rsplit('data', 1)[0]) + 'content_data_version': args.content_data_version } if args.nuxeo: arguments['rikolti_mapper_type'] = 'nuxeo.nuxeo' diff --git a/content_harvester/settings.py b/content_harvester/settings.py index f7bebe969..df924d801 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -9,6 +9,11 @@ DATA_SRC_URL = 
os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') CONTENT_DEST_URL = os.environ.get("CONTENT_DEST", 'file:///tmp') +CONTENT_DEST = { + "STORE": urlparse(CONTENT_DEST_URL).scheme, + "BUCKET": urlparse(CONTENT_DEST_URL).netloc, + "PATH": urlparse(CONTENT_DEST_URL).path, +} AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', False) AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', False) diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index eeac13dd8..a3181a878 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -12,6 +12,7 @@ from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from rikolti.dags.shared_tasks import validate_collection_task +from rikolti.dags.shared_tasks import create_content_data_version_task from rikolti.dags.shared_content_harvester import ContentHarvestOperator @@ -53,11 +54,13 @@ def harvest(): validate_collection_task(mapping_status) mapped_page_paths = get_mapped_page_filenames_task(mapped_pages) + content_data_version = create_content_data_version_task(collection, mapped_pages) content_harvest_task = ( ContentHarvestOperator .partial( task_id="content_harvest", collection_id="{{ params.collection_id }}", + content_data_version=content_data_version, ) .expand( page=mapped_page_paths diff --git a/dags/shared_content_harvester.py b/dags/shared_content_harvester.py index 4352d0e16..4c64943e2 100644 --- a/dags/shared_content_harvester.py +++ b/dags/shared_content_harvester.py @@ -38,7 +38,13 @@ def get_awsvpc_config(): class ContentHarvestEcsOperator(EcsRunTaskOperator): - def __init__(self, collection_id=None, page=None, **kwargs): + def __init__(self, collection_id=None, content_data_version=None, page=None, **kwargs): + container_name = "rikolti-content_harvester" + if page: + page_basename = page.split('/')[-1] + container_name = ( + f"content_harvester_{collection_id}_{page_basename.split('.')[0]}") + args = { "cluster": "rikolti-ecs-cluster", "launch_type": "FARGATE", @@ -47,8 +53,12 @@ def __init__(self, collection_id=None, page=None, **kwargs): "overrides": { "containerOverrides": [ { - "name": "rikolti-content_harvester", - "command": [f"{collection_id}", f"{page}"], + "name": container_name, + "command": [ + f"{collection_id}", + f"{page}", + f"{content_data_version}" + ], "environment": [ { "CONTENT_DATA_SRC": os.environ.get("CONTENT_DATA_SRC"), @@ -86,7 +96,7 @@ def execute(self, context): class ContentHarvestDockerOperator(DockerOperator): - def __init__(self, collection_id, page, **kwargs): + def __init__(self, collection_id, content_data_version, page, **kwargs): mounts = [] if os.environ.get("CONTENT_DATA_MOUNT"): mounts.append(Mount( @@ -109,11 +119,14 @@ def __init__(self, collection_id, page, **kwargs): ) container_version = os.environ.get( 'CONTENT_HARVEST_VERSION', 'latest') + page_basename = page.split('/')[-1] + container_name = ( + f"content_harvester_{collection_id}_{page_basename.split('.')[0]}") args = { "image": f"{container_image}:{container_version}", - "container_name": f"content_harvester_{collection_id}_{page}", - "command": [f"{collection_id}", f"{page}"], + "container_name": container_name, + "command": [f"{collection_id}", f"{page}", f"{content_data_version}"], "network_mode": "bridge", "auto_remove": 'force', "mounts": mounts, diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 792875f21..ab49035a1 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ 
-19,6 +19,7 @@ from rikolti.record_indexer.move_index_to_prod import move_index_to_prod from rikolti.utils.rikolti_storage import create_mapped_version from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.rikolti_storage import create_content_data_version # TODO: remove the rikoltifetcher registry endpoint and restructure @@ -146,6 +147,13 @@ def validate_collection_task(collection_status: dict, params=None) -> str: return file_location +@task() +def create_content_data_version_task(collection: dict, mapped_pages: list[dict]): + content_data_version = create_content_data_version( + collection['id'], mapped_pages[0]['mapped_page_path']) + return content_data_version + + @task() def create_stage_index_task(collection: dict, index_name: str): collection_id = collection.get('id') diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 4043e80ae..ff416d721 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -54,15 +54,17 @@ def list_pages(data_uri: str, recursive: bool=True, **kwargs) -> list: f"https://{data.bucket}.s3.us-west-2.amazonaws" ".com/index.html#{data.path}/" ) - raise Exception( + print( f"Error listing files at {data.uri}\n" f"Check that {data.path} exists at {url}\n{e}" - ) + ) + raise e elif data.store == 'file': try: return list_file_pages(data, recursive=recursive) except Exception as e: - raise Exception(f"Error listing files in {data.path}\n{e}") + print(f"Error listing files in {data.path}\n{e}") + raise e else: raise Exception(f"Unknown data store: {data.store}") @@ -97,14 +99,14 @@ def list_file_pages(data: DataStorage, recursive: bool=True) -> list: file_objects = [] if recursive: for root, dirs, files in os.walk(data.path): - root_uri = "file://{root}/" if root[-1] != '/' else "file://{root}" + root_uri = f"file://{root}/" if root[-1] != '/' else f"file://{root}" for file in files: file_objects.append(f"{root_uri}{file}") if not recursive: for file in os.listdir(data.path): if os.path.isfile(os.path.join(data.path, file)): - root_uri = "file://{data.path}/" if data.path[-1] != '/' else "file://{data.path}" + root_uri = f"file://{data.path}/" if data.path[-1] != '/' else f"file://{data.path}" file_objects.append(f"{root_uri}{file}") return file_objects @@ -289,7 +291,6 @@ def create_content_data_version( datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) content_data_path = ( f"{content_data_root.rstrip('/')}/content_data_{content_data_suffix}/") - ) return content_data_path From 927c6d541013e15be81f1d9b0a89642d9a29b49b Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 10 Nov 2023 13:40:21 -0800 Subject: [PATCH 24/42] detangle data root from version path for fetched data --- dags/mapper_dag.py | 7 +-- dags/shared_tasks.py | 21 ++++++- metadata_fetcher/fetchers/Fetcher.py | 9 ++- metadata_fetcher/fetchers/ucd_json_fetcher.py | 6 +- metadata_mapper/lambda_function.py | 6 +- metadata_mapper/lambda_shepherd.py | 17 +----- utils/rikolti_storage.py | 57 ++++++++++++++++--- 7 files changed, 84 insertions(+), 39 deletions(-) diff --git a/dags/mapper_dag.py b/dags/mapper_dag.py index bcf639796..dc5be8aa2 100644 --- a/dags/mapper_dag.py +++ b/dags/mapper_dag.py @@ -9,18 +9,15 @@ from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from rikolti.dags.shared_tasks import validate_collection_task -from rikolti.metadata_mapper.lambda_shepherd import get_vernacular_pages +from rikolti.utils.rikolti_storage import get_vernacular_pages from 
rikolti.utils.rikolti_storage import get_most_recent_vernacular_version @task() def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str] = None): - collection_id = collection.get('id') + collection_id = collection['id'] if not vernacular_version: vernacular_version = get_most_recent_vernacular_version(collection_id) - if not collection_id: - raise ValueError( - f"Collection ID not found in collection metadata: {collection}") pages = get_vernacular_pages(collection_id, vernacular_version) return pages diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index ab49035a1..47b98f07f 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -19,6 +19,7 @@ from rikolti.record_indexer.move_index_to_prod import move_index_to_prod from rikolti.utils.rikolti_storage import create_mapped_version from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.rikolti_storage import get_version from rikolti.utils.rikolti_storage import create_content_data_version @@ -46,6 +47,13 @@ def create_vernacular_version_task(collection): @task() def fetch_collection_task(collection: dict, vernacular_version: str): + """ + returns a list of the filepaths of the vernacular metadata relative to the + collection id, ex: [ + '3433/vernacular_metadata_2023-01-01T00:00:00/data/1', + '3433/vernacular_metadata_2023-01-01T00:00:00/data/2' + ] + """ fetch_status = fetch_collection(collection, vernacular_version, {}) success = all([page['status'] == 'success' for page in fetch_status]) total_items = sum([page['document_count'] for page in fetch_status]) @@ -116,8 +124,19 @@ def get_mapping_status_task(collection: dict, mapped_pages: list): @task() def create_mapped_version_task(collection, vernacular_pages): + """ + vernacular pages is a list of the filepaths of the vernacular metadata + relative to the collection id, ex: [ + '3433/vernacular_metadata_2023-01-01T00:00:00/data/1', + '3433/vernacular_metadata_2023-01-01T00:00:00/data/2' + ] + """ + vernacular_version = get_version(collection.get('id'), vernacular_pages[0]) + if not vernacular_version: + raise ValueError( + f"Vernacular version not found in {vernacular_pages[0]}") mapped_data_version = create_mapped_version( - collection.get('id'), vernacular_pages[0]) + collection.get('id'), vernacular_version) return mapped_data_version diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 02085d18c..86ab6ec45 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -1,9 +1,8 @@ import logging import requests -import os from requests.adapters import HTTPAdapter, Retry -from rikolti.utils.rikolti_storage import put_page_content +from rikolti.utils.rikolti_storage import put_vernacular_content logger = logging.getLogger(__name__) @@ -26,7 +25,7 @@ def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = params.get('vernacular_version') + self.vernacular_version = params.get('vernacular_version') if not self.collection_id: @@ -50,8 +49,8 @@ def fetch_page(self): if record_count: content = self.aggregate_vernacular_content(response.text) try: - filepath = put_page_content( - content, f"{self.data_destination}data/{self.write_page}") + filepath = put_vernacular_content( + content, self.write_page, self.vernacular_version) except Exception as e: print(f"Metadata Fetcher: {e}") raise(e) diff --git 
a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index 860495116..cbc15e0bc 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup from .Fetcher import Fetcher, FetchError -from rikolti.utils.rikolti_storage import put_page_content +from rikolti.utils.rikolti_storage import put_vernacular_content class UcdJsonFetcher(Fetcher): def __init__(self, params: dict[str]): @@ -69,8 +69,8 @@ def fetch_all_pages(self, response: requests.Response) -> list: records = [self.fetch_json_ld(url) for url in urls] document_count = len(records) try: - filepath = put_page_content( - json.dumps(records), f"{self.data_destination}data/{self.write_page}") + filepath = put_vernacular_content( + json.dumps(records), self.write_page, self.vernacular_version) fetch_status.append({ 'document_count': document_count, 'vernacular_filepath': filepath, diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index a5922700e..eaf7350d4 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -8,7 +8,7 @@ from . import settings from .mappers.mapper import Record, Vernacular -from rikolti.utils.rikolti_storage import get_page_content, put_page_content +from rikolti.utils.rikolti_storage import get_mapped_page, put_page_content logger = logging.getLogger(__name__) @@ -80,7 +80,7 @@ def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) page_filename = os.path.basename(vernacular_page_path) - api_resp = get_page_content(vernacular_page_path) + api_resp = get_mapped_page(vernacular_page_path) source_vernacular = vernacular_reader(collection_id, page_filename) source_metadata_records = source_vernacular.parse(api_resp) @@ -136,7 +136,7 @@ def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: parser = argparse.ArgumentParser( description="Map metadata from the institution's vernacular") parser.add_argument('collection_id', help='collection id') - parser.add_argument('page_path', help='uri file path to vernauclar metadata page filename; ex: file:///rikolti_data_root/3433/vernacular_data_version_1/data/1') + parser.add_argument('page_path', help='relative file path to vernauclar metadata page filename; ex: 3433/vernacular_data_version_1/data/1') parser.add_argument('mapped_data_version', help='uri file path to mapped data version; ex: file:///rikolti_data_root/3433/vernacular_data_version_1/mapped_data_version_1/') parser.add_argument('collection', help='json collection metadata from registry') diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index baa5da644..a376e0c14 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -8,7 +8,7 @@ from . 
import validate_mapping from .lambda_function import map_page from .mappers.mapper import Record -from rikolti.utils.rikolti_storage import list_pages, create_mapped_version, get_most_recent_vernacular_version +from rikolti.utils.rikolti_storage import get_vernacular_pages, create_mapped_version, get_most_recent_vernacular_version def get_collection(collection_id): @@ -37,19 +37,6 @@ def check_for_missing_enrichments(collection): return not_yet_implemented -def get_vernacular_pages(collection_id, vernacular_version): - try: - page_list = list_pages(vernacular_version, recursive=True) - except FileNotFoundError as e: - print( - f"{e} - have you fetched {collection_id}? " - f"looked in dir {e.filename} for vernacular pages" - ) - raise(e) - - # TODO: split page_list into pages and children? - return page_list - def get_mapping_status(collection, mapped_pages): count = sum([page['num_records_mapped'] for page in mapped_pages]) @@ -131,7 +118,7 @@ def map_collection(collection_id, vernacular_version=None, validate=False): parser.add_argument('collection_id', help='collection ID from registry') parser.add_argument('--validate', help='validate mapping; may provide json opts', const=True, nargs='?') - parser.add_argument('vernacular_version', help='URI to a folder of vernacular pages to map') + parser.add_argument('vernacular_version', help='relative path describing a vernacular version, ex: 3433/vernacular_data_version_1/') args = parser.parse_args(sys.argv[1:]) mapped_collection = map_collection(args.collection_id, args.vernacular_version, args.validate) missing_enrichments = mapped_collection.get('missing_enrichments') diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index ff416d721..150b67551 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -183,6 +183,7 @@ def put_s3_content(data: DataStorage, content, **kwargs) -> str: ) return data.uri + def put_file_content(data: DataStorage, content) -> str: """ Write content to a file at data.path @@ -197,34 +198,70 @@ def put_file_content(data: DataStorage, content) -> str: return data.uri +def get_version(collection_id, uri): + """ + From an arbitrary path, try to get the version string + """ + uri = uri.rstrip('/') + if collection_id not in uri or uri.endswith(collection_id): + return None + rikolti_data_root, relative_path = uri.split(f"/{collection_id}/") + path_list = relative_path.split('/') + if 'data' in path_list: + path_list = path_list[:path_list.index('data')] + path_list.insert(0, collection_id) + version = "/".join(path_list) + return version + + def create_vernacular_version( collection_id: int or str, version_suffix: Optional[str] = None ): - fetcher_data_dest = os.environ.get( - "FETCHER_DATA_DEST", "file:///tmp") - collection_path = ( - f"{fetcher_data_dest.rstrip('/')}/{collection_id}/") if not version_suffix: version_suffix = ( datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) vernacular_version_path = ( - f"{collection_path}vernacular_metadata_{version_suffix}/") + f"{collection_id}/vernacular_metadata_{version_suffix}/") return vernacular_version_path +def put_vernacular_content(content: str, page_name: int or str, version: str): + fetcher_data_dest = os.environ.get( + "FETCHER_DATA_DEST", "file:///tmp") + path = f"{fetcher_data_dest.rstrip('/')}/{version}/data/{page_name}" + put_page_content(content, path) + return f"{version}/data/{page_name}" + + def get_most_recent_vernacular_version(collection_id: int or str): mapper_data_src = os.environ.get("MAPPED_DATA_SRC") vernacular_versions = 
list_dirs(f"{mapper_data_src}/{collection_id}/") if not vernacular_versions: raise Exception( "No vernacular metadata versions found for {collection_id}") - return sorted(vernacular_versions)[-1] + return get_version(collection_id, sorted(vernacular_versions)[-1]) + + +def get_vernacular_pages(collection_id, vernacular_version): + mapper_data_src = os.environ.get("MAPPED_DATA_SRC", "file:///tmp").rstrip('/') + vernacular_path = f"{mapper_data_src}/{vernacular_version}/data/" + try: + page_list = list_pages(vernacular_path, recursive=True) + except FileNotFoundError as e: + print( + f"{e} - have you fetched {collection_id}? " + f"looked in dir {e.filename} for vernacular pages" + ) + raise(e) + + # TODO: split page_list into pages and children? + return page_list def create_mapped_version( collection_id: int or str, - vernacular_path: str, + vernacular_version: str, mapped_data_suffix: Optional[str] = None, ): mapper_data_dest = os.environ.get("MAPPED_DATA_DEST") @@ -246,6 +283,12 @@ def create_mapped_version( return mapped_data_path +def get_mapped_page(relative_vernacular_path): + mapper_data_src = os.environ.get("MAPPER_DATA_SRC", "file:///tmp").rstrip('/') + relative_vernacular_path = relative_vernacular_path.lstrip('/') + return get_page_content(f"{mapper_data_src}/{relative_vernacular_path}") + + def create_validation_version( collection_id: int or str, mapped_data_path: str, From 3509146dbf09485aa8631949a086a127daa0d060 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 13 Nov 2023 16:26:36 -0800 Subject: [PATCH 25/42] Rename rikolti_storage to storage and add versions module --- README.md | 24 ++- content_harvester/README.md | 2 +- content_harvester/by_collection.py | 27 ++- content_harvester/by_page.py | 87 ++++------ content_harvester/docker-compose.yml | 6 +- content_harvester/settings.py | 12 +- dags/docker_content_harvest.py | 2 + dags/harvest_dag.py | 7 +- dags/mapper_dag.py | 9 +- dags/shared_content_harvester.py | 12 +- dags/shared_tasks.py | 65 +++++-- dags/utils_by_mapper_type.py | 18 +- env.example | 10 +- .../fetch_registry_collections.py | 14 +- metadata_fetcher/fetchers/Fetcher.py | 23 ++- metadata_fetcher/fetchers/ucd_json_fetcher.py | 4 +- metadata_fetcher/lambda_function.py | 11 +- metadata_fetcher/settings.py | 4 - metadata_fetcher/tests.py | 2 +- metadata_mapper/lambda_function.py | 32 +++- metadata_mapper/lambda_shepherd.py | 28 ++- metadata_mapper/settings.py | 3 - metadata_mapper/validate_mapping.py | 25 ++- metadata_mapper/validator/validation_log.py | 25 +-- utils/{rikolti_storage.py => storage.py} | 141 +-------------- utils/versions.py | 160 ++++++++++++++++++ 26 files changed, 434 insertions(+), 319 deletions(-) rename utils/{rikolti_storage.py => storage.py} (53%) create mode 100644 utils/versions.py diff --git a/README.md b/README.md index 028d07850..186e02603 100644 --- a/README.md +++ b/README.md @@ -48,20 +48,18 @@ vi env.local Currently, I only use one virtual environment, even though each folder located at the root of this repository represents an isolated component. If dependency conflicts are encountered, I'll wind up creating separate environments. -Similarly, I also only use one env.local as well. Rikolti fetches data to your local system, maps that data, and then fetches relevant content files (media files, previews, and thumbnails). Set `FETCHER_DATA_DEST` to the URI where you would like Rikolti to store fetched data - Rikolti will create a folder (or s3 prefix) `/vernacular_metadata` at this location. 
Set `MAPPER_DATA_SRC` to the URI where Rikolti can find a `/vernacular_metadata` folder that contains the fetched data you're attempting to map. Set `MAPPER_DATA_DEST` to the URI where you would like Rikolti to store mapped data - Rikolti will create a folder (or s3 prefix) `/mapped_metadata` at this location. Set `CONTENT_DATA_SRC` to the URI where Rikolti can find a `/mapped_metadata` folder that contains the mapped metadata describing where to find content. Set `CONTENT_DATA_DEST` to the URI where you would like Rikolti to store mapped data that has been updated with pointers to content files - Rikolti will create a folder (or s3 prefix) `/mapped_with_content` at this location. Set `CONTENT_DEST` to the URI where you would like Rikolti to store content files. +Similarly, I also only use one env.local as well. Rikolti fetches data to your local system, maps that data, and then fetches relevant content files (media files, previews, and thumbnails). Set `VERNACULAR_DATA` to the URI where you would like Rikolti to store and retrieve fetched data - Rikolti will create a folder (or s3 prefix) `/vernacular_metadata` at this location. Set `MAPPED_DATA` to the URI where you would like Rikolti to store and retrieve mapped data - Rikolti will create a folder (or s3 prefix) `/mapped_metadata` at this location. Set `CONTENT_DATA` to the URI where you would like Rikolti to store mapped data that has been updated with pointers to content files - Rikolti will create a folder (or s3 prefix) `/mapped_with_content` at this location. Set `CONTENT_ROOT` to the URI where you would like Rikolti to store content files. For example, one way to configure `env.local` is: ``` -FETCHER_DATA_DEST=file:///Users/awieliczka/Projects/rikolti/rikolti_data -MAPPER_DATA_SRC=$FETCHER_DATA_DEST -MAPPER_DATA_DEST=$FETCHER_DATA_DEST -CONTENT_DATA_SRC=$FETCHER_DATA_DEST -CONTENT_DATA_DEST=$FETCHER_DATA_DEST -CONTENT_DEST=file:///Users/awieliczka/Projects/rikolti/rikolti_content +VERNACULAR_DATA=file:///Users/awieliczka/Projects/rikolti/rikolti_data +MAPPED_DATA=$VERNACULAR_DATA +CONTENT_DATA=$VERNACULAR_DATA +CONTENT_ROOT=file:///Users/awieliczka/Projects/rikolti/rikolti_content ``` -Each of these can be different locations, however. For example, if you're attempting to re-run a mapper locally off of previously fetched data stored on s3, you might set `MAPPER_DATA_SRC=s3://rikolti_data`. +Each of these can be different locations, however. For example, if you're attempting to re-run a mapper locally off of previously fetched data stored on s3, you might set `VERNACULAR_DATA=s3://rikolti_data`. In env.example you'll also see `CONTENT_DATA_MOUNT` and `CONTENT_MOUNT` environment variables. These are only relevant if you are running the content harvester using airflow, and want to set and of the CONTENT_ environment variables to the local filesystem. Their usage is described below in the Airflow Development section. @@ -172,9 +170,8 @@ The docker socket will typically be at `/var/run/docker.sock`. On Mac OS Docker Next, back in the Rikolti repository, create the `startup.sh` file by running `cp env.example dags/startup.sh`. 
Update the startup.sh file with Nuxeo, Flickr, and Solr keys as available, and make sure that the following environment variables are set: ``` -export FETCHER_DATA_DEST=file:///usr/local/airflow/rikolti_data -export MAPPER_DATA_SRC=file:///usr/local/airflow/rikolti_data -export MAPPER_DATA_DEST=file:///usr/local/airflow/rikolti_data +export VERNACULAR_DATA=file:///usr/local/airflow/rikolti_data +export MAPPED_DATA=file:///usr/local/airflow/rikolti_data ``` The folder located at `RIKOLTI_DATA_HOME` (set in `aws-mwaa-local-runner/docker/.env`) is mounted to `/usr/local/airflow/rikolti_data` on the airflow docker container. @@ -184,9 +181,8 @@ Please also make sure the following `CONTENT_*` variables are set - `CONTENT_DAT ``` export CONTENT_DATA_MOUNT=/Users/awieliczka/Projects/rikolti_data export CONTENT_MOUNT=/Users/awieliczka/Projects/rikolti_content -export CONTENT_DATA_SRC=file:///rikolti_data -export CONTENT_DATA_DEST=file:///rikolti_data -export CONTENT_DEST=file:///rikolti_content +export CONTENT_DATA=file:///rikolti_data +export CONTENT_ROOT=file:///rikolti_content ``` The folder located at `CONTENT_DATA_MOUNT` is mounted to `/rikolti_data` and the folder located at `CONTENT_MOUNT` is mounted to `/rikolti_content` on the content_harvester docker container. diff --git a/content_harvester/README.md b/content_harvester/README.md index e5f810588..badf0551f 100644 --- a/content_harvester/README.md +++ b/content_harvester/README.md @@ -34,7 +34,7 @@ The above media and thumbnail fetching processes are enacted upon child metadata # Settings -You can bypass uploading to s3 by setting `settings.CONTENT_DATA_DEST = "file://"` and `settings.CONTENT_DEST = "file://"`. This is useful for local development and testing. This will, however, set the metadata records' `media['media_filepath']` and `thumbnail['thumbnail_filepath']` to a local filepath. +You can bypass uploading to s3 by setting `settings.CONTENT_DATA = "file://"` and `settings.CONTENT_ROOT = "file://"`. This is useful for local development and testing. This will, however, set the metadata records' `media['media_filepath']` and `thumbnail['thumbnail_filepath']` to a local filepath. # Local Development diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index 398697094..bf0bc057a 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -1,20 +1,7 @@ import json -from . 
import settings from .by_page import harvest_page_content -from .rikolti_storage import list_pages, create_content_data_version - -def get_mapped_pages(mapped_data_version:str): - page_list = [] - page_list = list_pages( - f"{mapped_data_version.rstrip('/')}/data/", - recursive=False, - aws_access_key_id=settings.AWS_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, - aws_session_token=settings.AWS_SESSION_TOKEN, - region_name=settings.AWS_REGION - ) - return page_list +from .versions import get_mapped_pages, create_content_data_version # {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo"} @@ -28,13 +15,19 @@ def harvest_collection(collection, mapped_data_version: str): print("ERROR ERROR ERROR\ncollection_id and mapped_data_version required") exit() - page_list = get_mapped_pages(mapped_data_version) + page_list = get_mapped_pages( + mapped_data_version, + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, + aws_session_token=settings.AWS_SESSION_TOKEN, + region_name=settings.AWS_REGION + ) print(f"[{collection_id}]: Harvesting content for {len(page_list)} pages") collection_stats = {} + collection.update({ - 'content_data_version': create_content_data_version( - collection_id, mapped_data_version) + 'content_data_version': create_content_data_version(mapped_data_version) }) for page_path in page_list: diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index ef163d715..5700d65cb 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,7 +14,10 @@ from . import derivatives from . import settings -from .rikolti_storage import list_pages, get_page_content, put_page_content, create_content_data_version +from .versions import ( + get_mapped_page, get_child_directories, get_child_pages, get_child_page, + get_version, put_content_data_page +) class DownloadError(Exception): pass @@ -24,44 +27,13 @@ class UnsupportedMimetype(Exception): pass -def get_mapped_records(page_path) -> list: - mapped_records = [] - mapped_records = json.loads(get_page_content(page_path)) - return mapped_records - - -def write_mapped_record(content_data_version, record): - filename = put_page_content( - json.dumps(record), - ( - f"{content_data_version.rstrip('/')}/data/" - f"{record.get('calisphere-id').replace(os.sep, '_')}.json" - ) - ) - return filename - - -def write_mapped_page(content_data_version, page, records): - filename = put_page_content( - json.dumps(records), - f"{content_data_version.rstrip('/')}/data/{page}" - ) - return filename - - def get_child_records(mapped_page_path, parent_id) -> list: mapped_child_records = [] - try: - children = list_pages( - f"{mapped_page_path.rsplit('/', 1)[0]}/children/", - recursive=False - ) - except FileNotFoundError: - return mapped_child_records + children = get_child_pages(mapped_page_path) children = [page for page in children if (page.rsplit('/')[-1]).startswith(parent_id)] for child in children: - mapped_child_records.extend(json.loads(get_page_content(child))) + mapped_child_records.extend(get_child_page(child)) return mapped_child_records @@ -235,7 +207,7 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict: dest_filename = os.path.basename(content.derivative_filepath) content_s3_filepath = self._upload( - content.dest_prefix, dest_filename, content.derivative_filepath) + f"{content.dest_prefix}/{collection_id}", dest_filename, content.derivative_filepath) content.set_s3_filepath(content_s3_filepath) # 
print( @@ -249,14 +221,19 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict: } # Recurse through the record's children (if any) - child_records = get_child_records( - self.mapped_page_path, calisphere_id) - if child_records: - print( - f"[{self.collection_id}, {self.page_filename}, {calisphere_id}]: " - f"{len(child_records)} children found." - ) - record['children'] = [self.harvest(c, download_cache=download_cache) for c in child_records] + mapped_version = get_version( + self.collection_id, self.mapped_page_path) + child_directories = get_child_directories(mapped_version) + print(f"CHILD DIRECTORIES: {child_directories}") + if child_directories: + child_records = get_child_records( + self.mapped_page_path, calisphere_id) + if child_records: + print( + f"[{self.collection_id}, {self.page_filename}, {calisphere_id}]: " + f"{len(child_records)} children found." + ) + record['children'] = [self.harvest(c, download_cache=download_cache) for c in child_records] return record @@ -312,7 +289,7 @@ def _download(self, url: str, destination_file: str, cache: Optional[dict] = Non def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = None) -> str: ''' - upload file to CONTENT_DEST + upload file to CONTENT_ROOT ''' if not cache: cache = {} @@ -322,20 +299,20 @@ def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = dest_path = '' - if settings.CONTENT_DEST["STORE"] == 'file': + if settings.CONTENT_ROOT["STORE"] == 'file': dest_path = os.path.join( - settings.CONTENT_DEST["PATH"], dest_prefix) + settings.CONTENT_ROOT["PATH"], dest_prefix) if not os.path.exists(dest_path): os.makedirs(dest_path) dest_path = os.path.join(dest_path, dest_filename) shutil.copyfile(filepath, dest_path) - if settings.CONTENT_DEST["STORE"] == 's3': + if settings.CONTENT_ROOT["STORE"] == 's3': s3 = boto3.client('s3') dest_path = ( - f"{settings.CONTENT_DEST['PATH']}/{dest_prefix}/{dest_filename}") + f"{settings.CONTENT_ROOT['PATH']}/{dest_prefix}/{dest_filename}") s3.upload_file( - filepath, settings.CONTENT_DEST["BUCKET"], dest_path) + filepath, settings.CONTENT_ROOT["BUCKET"], dest_path) # (mime, dimensions) = image_info(filepath) cache_updates = { @@ -365,7 +342,7 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, src_auth=auth ) - records = get_mapped_records(mapped_page_path) + records = json.loads(get_mapped_page(mapped_page_path)) print( f"[{collection_id}, {page_filename}]: " f"Harvesting content for {len(records)} records" @@ -379,8 +356,11 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, # spit out progress so far if an error has been encountered try: record_with_content = harvester.harvest(record) - # write_mapped_record( - # content_data_version, record_with_content) + # put_content_data_page( + # json.dumps(record_with_content), + # record_with_content.get('calisphere-id').replace(os.sep, '_') + ".json", + # content_data_version + # ) if not record_with_content.get('thumbnail'): warn_level = "ERROR" if 'sound' in record.get('type', []): @@ -400,7 +380,8 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, f"in page {page_filename} of collection {collection_id}") raise(e) - write_mapped_page(content_data_version, page_filename, records) + put_content_data_page( + json.dumps(records), page_filename, content_data_version) media_source = [r for r in records if r.get('media_source')] media_harvested = [r for r in records if r.get('media')] 
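# A minimal, illustrative sketch of how the relative-path version helpers used
# above (get_version, create_content_data_version, get_mapped_page,
# put_content_data_page) are expected to compose. The collection id, example
# paths, and page name are hypothetical; data roots resolve from the
# MAPPED_DATA / CONTENT_DATA environment variables as in utils/versions.py.
import json
from rikolti.utils.versions import (
    get_version, create_content_data_version,
    get_mapped_page, put_content_data_page)

mapped_page_path = "3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl"

# derive the mapped version from an arbitrary page path, then branch a new
# content_data version beneath it, e.g.
# "3433/vernacular_metadata_v1/mapped_metadata_v1/content_data_2023-11-13T17:00:00/"
mapped_version = get_version(3433, mapped_page_path)
content_data_version = create_content_data_version(mapped_version)

# read the mapped page relative to MAPPED_DATA, then write the records back
# out relative to CONTENT_DATA under the new content_data version
records = json.loads(get_mapped_page(mapped_page_path))
put_content_data_page(json.dumps(records), "1.jsonl", content_data_version)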
diff --git a/content_harvester/docker-compose.yml b/content_harvester/docker-compose.yml index af1aef4d0..7d28ead0c 100644 --- a/content_harvester/docker-compose.yml +++ b/content_harvester/docker-compose.yml @@ -17,8 +17,8 @@ services: - ../rikolti_content:/rikolti_content - ./:/content_harvester environment: - - CONTENT_DATA_SRC=file:///rikolti_data - - CONTENT_DATA_DEST=file:///rikolti_data - - CONTENT_DEST=file:///rikolti_content + - MAPPED_DATA=file:///rikolti_data + - CONTENT_DATA=file:///rikolti_data + - CONTENT_ROOT=file:///rikolti_content - NUXEO_USER=${NUXEO_USER} - NUXEO_PASS=${NUXEO_PASS} diff --git a/content_harvester/settings.py b/content_harvester/settings.py index df924d801..56aeae3fb 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -6,13 +6,11 @@ load_dotenv() -DATA_SRC_URL = os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') -DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') -CONTENT_DEST_URL = os.environ.get("CONTENT_DEST", 'file:///tmp') -CONTENT_DEST = { - "STORE": urlparse(CONTENT_DEST_URL).scheme, - "BUCKET": urlparse(CONTENT_DEST_URL).netloc, - "PATH": urlparse(CONTENT_DEST_URL).path, +CONTENT_ROOT_URL = os.environ.get("CONTENT_ROOT", 'file:///tmp') +CONTENT_ROOT = { + "STORE": urlparse(CONTENT_ROOT_URL).scheme, + "BUCKET": urlparse(CONTENT_ROOT_URL).netloc, + "PATH": urlparse(CONTENT_ROOT_URL).path, } AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', False) diff --git a/dags/docker_content_harvest.py b/dags/docker_content_harvest.py index 73974c87f..19a23484d 100644 --- a/dags/docker_content_harvest.py +++ b/dags/docker_content_harvest.py @@ -22,6 +22,7 @@ def docker_content_harvest(): harvest_content_for_page_task = ContentHarvestDockerOperator( task_id="page_content_harvester_on_local_docker", collection_id="{{ params.collection_id }}", + content_data_version="{{ params.content_data_version }}", page="{{ params.page_filename }}", ) harvest_content_for_page_task @@ -31,6 +32,7 @@ def docker_content_harvest(): entrypoint="python3 -m content_harvester.by_collection", command=["{{ params.collection_id }}"], collection_id="{{ params.collection_id }}", + content_data_version="{{ params.content_data_version }}", page="all", ) harvest_content_for_collection_task diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index a3181a878..5365a730f 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -1,5 +1,5 @@ from datetime import datetime - +import os from airflow.decorators import dag, task from airflow.models.param import Param @@ -47,7 +47,7 @@ def harvest(): mapped_pages = ( map_page_task .partial(collection=collection, mapped_data_version=mapped_data_version) - .expand(page=fetched_pages) + .expand(vernacular_page=fetched_pages) ) mapping_status = get_mapping_status_task(collection, mapped_pages) @@ -55,12 +55,13 @@ def harvest(): mapped_page_paths = get_mapped_page_filenames_task(mapped_pages) content_data_version = create_content_data_version_task(collection, mapped_pages) + content_harvest_task = ( ContentHarvestOperator .partial( task_id="content_harvest", collection_id="{{ params.collection_id }}", - content_data_version=content_data_version, + content_data_version=content_data_version ) .expand( page=mapped_page_paths diff --git a/dags/mapper_dag.py b/dags/mapper_dag.py index dc5be8aa2..ece23aa63 100644 --- a/dags/mapper_dag.py +++ b/dags/mapper_dag.py @@ -9,8 +9,8 @@ from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from 
rikolti.dags.shared_tasks import validate_collection_task -from rikolti.utils.rikolti_storage import get_vernacular_pages -from rikolti.utils.rikolti_storage import get_most_recent_vernacular_version +from rikolti.utils.versions import get_most_recent_vernacular_version +from rikolti.utils.versions import get_vernacular_pages @task() @@ -18,7 +18,8 @@ def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str collection_id = collection['id'] if not vernacular_version: vernacular_version = get_most_recent_vernacular_version(collection_id) - pages = get_vernacular_pages(collection_id, vernacular_version) + pages = get_vernacular_pages(vernacular_version) + # TODO: split page_list into pages and children? return pages # This is a functional duplicate of @@ -57,7 +58,7 @@ def mapper_dag(): mapped_pages = ( map_page_task .partial(collection=collection, mapped_data_version=mapped_data_version) - .expand(page=page_list) + .expand(vernacular_page=page_list) ) mapping_status = get_mapping_status_task(collection, mapped_pages) diff --git a/dags/shared_content_harvester.py b/dags/shared_content_harvester.py index 4c64943e2..6b2f238d3 100644 --- a/dags/shared_content_harvester.py +++ b/dags/shared_content_harvester.py @@ -61,9 +61,9 @@ def __init__(self, collection_id=None, content_data_version=None, page=None, **k ], "environment": [ { - "CONTENT_DATA_SRC": os.environ.get("CONTENT_DATA_SRC"), - "CONTENT_DATA_DEST": os.environ.get("CONTENT_DATA_DEST"), - "CONTENT_DEST": os.environ.get("CONTENT_DEST"), + "MAPPED_DATA": os.environ.get("CONTENT_DATA"), + "CONTENT_DATA": os.environ.get("CONTENT_DATA"), + "CONTENT_ROOT": os.environ.get("CONTENT_ROOT"), "NUXEO_USER": os.environ.get("NUXEO_USER"), "NUXEO_PASS": os.environ.get("NUXEO_PASS") } @@ -132,9 +132,9 @@ def __init__(self, collection_id, content_data_version, page, **kwargs): "mounts": mounts, "mount_tmp_dir": False, "environment": { - "CONTENT_DATA_SRC": os.environ.get("CONTENT_DATA_SRC"), - "CONTENT_DATA_DEST": os.environ.get("CONTENT_DATA_DEST"), - "CONTENT_DEST": os.environ.get("CONTENT_DEST"), + "MAPPED_DATA": os.environ.get("CONTENT_DATA"), + "CONTENT_DATA": os.environ.get("CONTENT_DATA"), + "CONTENT_ROOT": os.environ.get("CONTENT_ROOT"), "NUXEO_USER": os.environ.get("NUXEO_USER"), "NUXEO_PASS": os.environ.get("NUXEO_PASS") }, diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 47b98f07f..96fd2b072 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -17,10 +17,10 @@ from rikolti.record_indexer.create_collection_index import get_index_name from rikolti.record_indexer.create_collection_index import delete_index from rikolti.record_indexer.move_index_to_prod import move_index_to_prod -from rikolti.utils.rikolti_storage import create_mapped_version -from rikolti.utils.rikolti_storage import create_vernacular_version -from rikolti.utils.rikolti_storage import get_version -from rikolti.utils.rikolti_storage import create_content_data_version +from rikolti.utils.versions import create_vernacular_version +from rikolti.utils.versions import get_version +from rikolti.utils.versions import create_mapped_version +from rikolti.utils.versions import create_content_data_version # TODO: remove the rikoltifetcher registry endpoint and restructure @@ -41,7 +41,8 @@ def get_collection_fetchdata_task(params=None): @task() -def create_vernacular_version_task(collection): +def create_vernacular_version_task(collection) -> str: + # returns: '3433/vernacular_metadata_v1/' return 
create_vernacular_version(collection.get('collection_id')) @@ -108,16 +109,43 @@ def get_collection_metadata_task(params=None): # max_active_tis_per_dag - setting on the task to restrict how many # instances can be running at the same time, *across all DAG runs* @task() -def map_page_task(page: str, collection: dict, mapped_data_version: str): +def map_page_task(vernacular_page: str, collection: dict, mapped_data_version: str): + """ + vernacular_page is a filepath relative to the collection id, ex: + 3433/vernacular_metadata_2023-01-01T00:00:00/data/1 + mapped_data_version is a path relative to the collection id, ex: + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/ + returns a dictionary with the following keys: + status: success + num_records_mapped: int + page_exceptions: TODO + mapped_page_path: str, ex: + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/1.jsonl + """ collection_id = collection.get('id') if not collection_id or not mapped_data_version: return False - mapped_page = map_page(collection_id, page, mapped_data_version, collection) + mapped_page = map_page( + collection_id, vernacular_page, mapped_data_version, collection) return mapped_page @task() def get_mapping_status_task(collection: dict, mapped_pages: list): + """ + mapped_pages is a list of dicts with the following keys: + status: success + num_records_mapped: int + page_exceptions: TODO + mapped_page_path: str, ex: + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/1.jsonl + returns a dict with the following keys: + mapped_page_paths: ex: [ + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/1.jsonl, + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/2.jsonl, + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/3.jsonl + ] + """ mapping_status = get_mapping_status(collection, mapped_pages) return mapping_status @@ -130,18 +158,27 @@ def create_mapped_version_task(collection, vernacular_pages): '3433/vernacular_metadata_2023-01-01T00:00:00/data/1', '3433/vernacular_metadata_2023-01-01T00:00:00/data/2' ] + returns the path to a new mapped version, ex: + "3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/" """ vernacular_version = get_version(collection.get('id'), vernacular_pages[0]) if not vernacular_version: raise ValueError( f"Vernacular version not found in {vernacular_pages[0]}") - mapped_data_version = create_mapped_version( - collection.get('id'), vernacular_version) + mapped_data_version = create_mapped_version(vernacular_version) return mapped_data_version @task() def validate_collection_task(collection_status: dict, params=None) -> str: + """ + collection_status is a dict containing the following keys: + mapped_page_paths: ex: [ + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/1.jsonl, + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/2.jsonl, + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/3.jsonl + ] + """ if not params or not params.get('validate'): raise ValueError("Validate flag not found in params") @@ -156,8 +193,10 @@ def validate_collection_task(collection_status: dict, params=None) -> str: print(f"Output {num_rows} rows to {file_location}") # create a link to the file in the logs - if file_location.startswith('s3://'): - parsed_loc = urlparse(file_location) + mapper_data_dest = 
os.environ.get("MAPPED_DATA", "file:///tmp") + if mapper_data_dest.startswith("s3"): + parsed_loc = urlparse( + f"{mapper_data_dest.rstrip('/')}/{file_location}") file_location = ( f"https://{parsed_loc.netloc}.s3.us-west-2." f"amazonaws.com{parsed_loc.path}" @@ -168,9 +207,9 @@ def validate_collection_task(collection_status: dict, params=None) -> str: @task() def create_content_data_version_task(collection: dict, mapped_pages: list[dict]): - content_data_version = create_content_data_version( + mapped_version = get_version( collection['id'], mapped_pages[0]['mapped_page_path']) - return content_data_version + return create_content_data_version(mapped_version) @task() diff --git a/dags/utils_by_mapper_type.py b/dags/utils_by_mapper_type.py index 85d1b0386..a4781afd2 100644 --- a/dags/utils_by_mapper_type.py +++ b/dags/utils_by_mapper_type.py @@ -1,5 +1,6 @@ import requests import logging +import os from urllib.parse import urlparse @@ -29,6 +30,18 @@ def make_mapper_type_endpoint(params=None): @task() def fetch_endpoint_task(endpoint, params=None): + """ + TODO: map the output of this job to the input of the map_endpoint_task + re: versioning + 3433: [ + { + document_count: int + vernacular_filepath: path relative to collection id + ex: "3433/vernacular_version_1/data/1" + status: 'success' or 'error' + } + ] + """ limit = params.get('limit', None) if params else None fetcher_job_result = fetch_endpoint(endpoint, limit, logger) for collection_id in fetcher_job_result.keys(): @@ -68,8 +81,9 @@ def validate_endpoint_task(url, params=None): num_rows, file_location = create_collection_validation_csv( collection['collection_id'], mapped_page_paths) csv_paths.append(file_location) - if file_location.startswith('s3://'): - s3_path = urlparse(file_location) + validation_data_dest = os.environ.get("MAPPED_DATA", "file:///tmp") + if validation_data_dest.startswith("s3"): + s3_path = urlparse(f"{validation_data_dest.rstrip('/')}/{file_location}") s3_paths.append(f"https://{s3_path.netloc}.s3.amazonaws.com{s3_path.path}") print(f"Output {num_rows} rows to {file_location}") diff --git a/env.example b/env.example index 40252ca62..65a09f7d5 100644 --- a/env.example +++ b/env.example @@ -1,11 +1,10 @@ # metadata_fetcher -export FETCHER_DATA_DEST=file:///usr/local/airflow/rikolti_data +export VERNACULAR_DATA=file:///usr/local/airflow/rikolti_data export NUXEO= # ask for a key - required to run the NuxeoFetcher export FLICKR_API_KEY= # ask for a key - required to run the FlickrFetcher # metadata_mapper -export MAPPER_DATA_SRC=file:///usr/local/airflow/rikolti_data -export MAPPER_DATA_DEST=file:///usr/local/airflow/rikolti_data +export MAPPED_DATA=file:///usr/local/airflow/rikolti_data export SKIP_UNDEFINED_ENRICHMENTS=True # validator @@ -14,9 +13,8 @@ export UCLDC_SOLR_URL="https://solr.calisphere.org/solr" # this is so export UCLDC_SOLR_API_KEY= # ask for a key # content_harvester -export CONTENT_DATA_SRC=file:///rikolti_data -export CONTENT_DATA_DEST=file:///rikolti_data -export CONTENT_DEST=file:///rikolti_content +export CONTENT_DATA=file:///rikolti_data +export CONTENT_ROOT=file:///rikolti_content # content_harvester when run locally via aws_mwaa_local_runner export CONTENT_DATA_MOUNT=/Users/awieliczka/Projects/rikolti_data diff --git a/metadata_fetcher/fetch_registry_collections.py b/metadata_fetcher/fetch_registry_collections.py index 27a322b54..eb39438a5 100644 --- a/metadata_fetcher/fetch_registry_collections.py +++ b/metadata_fetcher/fetch_registry_collections.py @@ -5,7 +5,7 @@ import requests 
from . import lambda_function -from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.versions import create_vernacular_version logger = logging.getLogger(__name__) @@ -24,6 +24,18 @@ def registry_endpoint(url): def fetch_endpoint(url, limit=None, job_logger=logger): + """ + returns a dictionary of collection ids and fetch results, where + fetch results are a list of of dictionaries with the following keys: + ex: 3433: [ + { + document_count: int + vernacular_filepath: path relative to collection id + ex: "3433/vernacular_version_1/data/1" + status: 'success' or 'error' + } + ] + """ response = requests.get(url=url) response.raise_for_status() total = response.json().get('meta', {}).get('total_count', 1) diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 86ab6ec45..13b9c55d5 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -2,7 +2,7 @@ import requests from requests.adapters import HTTPAdapter, Retry -from rikolti.utils.rikolti_storage import put_vernacular_content +from rikolti.utils.versions import put_vernacular_page logger = logging.getLogger(__name__) @@ -21,17 +21,32 @@ class FetchError(Exception): class Fetcher(object): - def __init__(self, params): + def __init__(self, params: dict): + """ + params: dict + harvest_type: str + collection_id: str or int + write_page: str or int filename of the page to write to + vernacular_version: path relative to collection id + ex: "3433/vernacular_version_1" + """ self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.vernacular_version = params.get('vernacular_version') + self.vernacular_version = params['vernacular_version'] if not self.collection_id: raise CollectionIdRequired("collection_id is required") def fetch_page(self): + """ + returns a dict with the following keys: + document_count: int + vernacular_filepath: path relative to collection id + ex: "3433/vernacular_version_1/data/1" + status: 'success' or 'error' + """ page = self.build_fetch_request() logger.debug( f"[{self.collection_id}]: fetching page {self.write_page} " @@ -49,7 +64,7 @@ def fetch_page(self): if record_count: content = self.aggregate_vernacular_content(response.text) try: - filepath = put_vernacular_content( + filepath = put_vernacular_page( content, self.write_page, self.vernacular_version) except Exception as e: print(f"Metadata Fetcher: {e}") diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index cbc15e0bc..1c91fc4f6 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup from .Fetcher import Fetcher, FetchError -from rikolti.utils.rikolti_storage import put_vernacular_content +from rikolti.utils.versions import put_vernacular_page class UcdJsonFetcher(Fetcher): def __init__(self, params: dict[str]): @@ -69,7 +69,7 @@ def fetch_all_pages(self, response: requests.Response) -> list: records = [self.fetch_json_ld(url) for url in urls] document_count = len(records) try: - filepath = put_vernacular_content( + filepath = put_vernacular_page( json.dumps(records), self.write_page, self.vernacular_version) fetch_status.append({ 'document_count': document_count, diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index d4f78b68f..17489fb51 100644 --- 
a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -4,7 +4,7 @@ import sys from .fetchers.Fetcher import Fetcher, InvalidHarvestEndpoint -from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.versions import create_vernacular_version logger = logging.getLogger(__name__) @@ -21,7 +21,14 @@ def import_fetcher(harvest_type): # AWS Lambda entry point -def fetch_collection(payload, vernacular_version, context): +def fetch_collection(payload, vernacular_version, context) -> list[dict]: + """ + returns a list of dicts with the following keys: + document_count: int + vernacular_version: path relative to collection id + ex: "3433/vernacular_version_1/data/1" + status: 'success' or 'error' + """ if isinstance(payload, str): payload = json.loads(payload) diff --git a/metadata_fetcher/settings.py b/metadata_fetcher/settings.py index e18110918..3baf224a6 100644 --- a/metadata_fetcher/settings.py +++ b/metadata_fetcher/settings.py @@ -1,8 +1,6 @@ import logging import os -from urllib.parse import urlparse - from dotenv import load_dotenv logger = logging.getLogger(__name__) @@ -12,7 +10,5 @@ NUXEO_TOKEN = os.environ.get('NUXEO') FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY') -DATA_DEST_URL = os.environ.get("FETCHER_DATA_DEST", "file:///tmp") - for key, value in os.environ.items(): logger.debug(f"{key}={value}") diff --git a/metadata_fetcher/tests.py b/metadata_fetcher/tests.py index 85f6cebb1..f6594fe86 100644 --- a/metadata_fetcher/tests.py +++ b/metadata_fetcher/tests.py @@ -10,7 +10,7 @@ nuxeo_nested_complex_object_harvests) from .sample_data.oac_harvests import oac_harvests from .sample_data.oai_harvests import oai_harvests -from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.versions import create_vernacular_version def main(): diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index eaf7350d4..b62271999 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -8,7 +8,7 @@ from . 
import settings from .mappers.mapper import Record, Vernacular -from rikolti.utils.rikolti_storage import get_mapped_page, put_page_content +from rikolti.utils.versions import get_vernacular_page, put_mapped_page logger = logging.getLogger(__name__) @@ -73,14 +73,32 @@ def run_enrichments(records, collection, enrichment_set, page_filename): return records -def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: str, collection: Union[dict, str]): +def map_page( + collection_id: int, + vernacular_page_path: str, + mapped_data_version: str, + collection: Union[dict, str] + ): + """ + vernacular_page_path is a filepath relative to the collection id, ex: + 3433/vernacular_metadata_v1/data/1 + mapped_data_version is a version path relative to the collection id, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/ + + returns a dict with the following keys: + status: success + num_records_mapped: int + page_exceptions: TODO + mapped_page_path: str, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl + """ if isinstance(collection, str): collection = json.loads(collection) vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) page_filename = os.path.basename(vernacular_page_path) - api_resp = get_mapped_page(vernacular_page_path) + api_resp = get_vernacular_page(vernacular_page_path) source_vernacular = vernacular_reader(collection_id, page_filename) source_metadata_records = source_vernacular.parse(api_resp) @@ -118,10 +136,8 @@ def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: # for record in mapped_records] mapped_metadata = [record.to_dict() for record in mapped_records] - mapped_page_path = put_page_content( - json.dumps(mapped_metadata), - f"{mapped_data_version.rstrip('/')}/data/{page_filename}.jsonl" - ) + mapped_page_path = put_mapped_page( + json.dumps(mapped_metadata), page_filename, mapped_data_version) return { 'status': 'success', @@ -144,7 +160,7 @@ def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: mapped_page = map_page(args.collection_id, args.page_path, args.mapped_data_path, args.collection) print(f"{mapped_page.get('num_records_mapped')} records mapped") - print(f"mapped page at {mapped_page.get('mapped_page_path')}") + print(f"mapped page at {os.environ.get('MAPPED_DATA')}/{mapped_page.get('mapped_page_path')}") for report, couch_ids in mapped_page.get('exceptions', {}).items(): print(f"{len(couch_ids)} records report enrichments errors: {report}") diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index a376e0c14..264f948f1 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -8,7 +8,10 @@ from . 
import validate_mapping from .lambda_function import map_page from .mappers.mapper import Record -from rikolti.utils.rikolti_storage import get_vernacular_pages, create_mapped_version, get_most_recent_vernacular_version +from rikolti.utils.versions import ( + get_most_recent_vernacular_version, get_vernacular_pages, + get_version, create_mapped_version +) def get_collection(collection_id): @@ -39,6 +42,20 @@ def check_for_missing_enrichments(collection): def get_mapping_status(collection, mapped_pages): + """ + mapped_pages is a list of dicts with the following keys: + status: success + num_records_mapped: int + page_exceptions: TODO + mapped_page_path: str, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl + returns a dict, one of the keys is mapped_page_paths: + mapped_page_paths: ex: [ + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl, + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/2.jsonl, + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/3.jsonl + ] + """ count = sum([page['num_records_mapped'] for page in mapped_pages]) page_count = len(mapped_pages) collection_exceptions = [page.get('page_exceptions', {}) for page in mapped_pages] @@ -78,9 +95,11 @@ def map_collection(collection_id, vernacular_version=None, validate=False): if not vernacular_version: vernacular_version = get_most_recent_vernacular_version(collection_id) - page_list = get_vernacular_pages(collection_id, vernacular_version) + page_list = get_vernacular_pages(vernacular_version) + # TODO: split page_list into pages and children? - mapped_data_version = create_mapped_version(collection_id, page_list[0]) + vernacular_version = get_version(collection_id, page_list[0]) + mapped_data_version = create_mapped_version(vernacular_version) mapped_pages = [] for page in page_list: try: @@ -95,14 +114,13 @@ def map_collection(collection_id, vernacular_version=None, validate=False): continue collection_stats = get_mapping_status(collection, mapped_pages) - mapped_page_paths = [page['mapped_page_path'] for page in mapped_pages] if validate: opts = validate if isinstance(validate, dict) else {} num_rows, file_location = ( validate_mapping.create_collection_validation_csv( collection_id, - mapped_page_paths, + collection_stats['mapped_page_paths'], **opts ) ) diff --git a/metadata_mapper/settings.py b/metadata_mapper/settings.py index aaecef5fc..d1dfd6cf8 100644 --- a/metadata_mapper/settings.py +++ b/metadata_mapper/settings.py @@ -4,9 +4,6 @@ load_dotenv() -DATA_SRC_URL = os.environ.get('MAPPER_DATA_SRC', 'file:///tmp') -DATA_DEST_URL = os.environ.get('MAPPER_DATA_DEST', 'file:///tmp') - SKIP_UNDEFINED_ENRICHMENTS = os.environ.get('SKIP_UNDEFINED_ENRICHMENTS', False) SOLR_URL = os.environ.get('UCLDC_SOLR_URL', False) diff --git a/metadata_mapper/validate_mapping.py b/metadata_mapper/validate_mapping.py index f8b606164..0580b2c07 100644 --- a/metadata_mapper/validate_mapping.py +++ b/metadata_mapper/validate_mapping.py @@ -10,7 +10,8 @@ from .validator.validation_log import ValidationLogLevel from .validator.validation_mode import ValidationMode from .validator.validator import Validator -from rikolti.utils.rikolti_storage import get_page_content +from rikolti.utils.versions import ( + get_mapped_page, get_version, get_mapped_pages) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -29,6 +30,13 @@ def validate_collection(collection_id: int, Parameters: collection_id: int The collection ID + mapped_page_paths: list[str] + A list of the relative paths to pages of vernacular 
metadata, ex: + [ + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl, + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/2.jsonl, + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/3.jsonl + ] validator_class: Type[Validator] (default: None) The validator class to use. Can be derived if not provided. validator: Validator (default: None) @@ -66,7 +74,8 @@ def validate_page(collection_id: int, page_path: str, collection_id: int The collection ID page_path: str - The absolute path to a page within the collection + The relative path to a specific page of mapped metadata, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl validator: Validator The validator instance to use @@ -78,7 +87,7 @@ def validate_page(collection_id: int, page_path: str, "page_path": page_path } mapped_metadata = validator.generate_keys( - get_mapped_data(page_path), + json.loads(get_mapped_page(page_path)), type="Rikolti", context=context ) @@ -115,16 +124,13 @@ def create_collection_validation_csv( collection_id: int, mapped_page_paths: list[str], **options) -> tuple[int, str]: result = validate_collection(collection_id, mapped_page_paths, **options) - filename = result.log.output_csv_to_bucket(collection_id, mapped_page_paths[0]) + mapped_version = get_version(collection_id, mapped_page_paths[0]) + filename = result.log.output_csv_to_bucket(collection_id, mapped_version) return len(result.log.log), filename ## Private-ish -def get_mapped_data(page_path: str) -> list[dict]: - return json.loads(get_page_content(page_path)) - - def get_comparison_data(collection_id: int, harvest_ids: list[str]) -> list[dict]: solr_data = get_solr_data(collection_id, harvest_ids) couch_data = get_couch_db_data(collection_id, harvest_ids) @@ -255,6 +261,7 @@ def get_validator_class(collection_id: int) -> Type[Validator]: description="Validate mapped metadata against SOLR") parser.add_argument('collection_id', help='Collection ID') + parser.add_argument('mapped_data_version', help="Mapped data version, ex: 3433/vernacular_data_1/mapped_data_1") parser.add_argument("--log-level", dest="log_level", help="Log level - can be ERROR, WARNING, INFO, or DEBUG") parser.add_argument('-v', '--verbose', action="store_true", help="Verbose mode") @@ -273,6 +280,8 @@ def get_validator_class(collection_id: int) -> Type[Validator]: print(f"Generating validations for collection {args.collection_id} with options:") print(kwargs) + mapped_page_paths = get_mapped_pages(args.mapped_data_version) + num_rows, file_location = create_collection_validation_csv( args.collection_id, mapped_page_paths, **kwargs) print(f"Output {num_rows} rows to {file_location}") diff --git a/metadata_mapper/validator/validation_log.py b/metadata_mapper/validator/validation_log.py index 516eafaa3..df455fa18 100644 --- a/metadata_mapper/validator/validation_log.py +++ b/metadata_mapper/validator/validation_log.py @@ -1,10 +1,8 @@ -from datetime import datetime from enum import Enum +import json from typing import IO, Any - -from .. 
import settings -from rikolti.utils.rikolti_storage import put_page_content, create_validation_version - +from rikolti.utils.versions import ( + create_validation_version, put_validation_report) class ValidationLogLevel(Enum): DEBUG = "DEBUG" @@ -110,7 +108,7 @@ def output_csv_to_file(self, file: IO[str], append: bool = False, with open(file, "a" if append else "w") as f: f.write(self._csv_content_string(include_fields, append)) - def output_csv_to_bucket(self, collection_id: int, mapped_data_path: str = None, + def output_csv_to_bucket(self, collection_id: int, mapped_version: str = None, include_fields: list[str] = None) -> str: """ Writes a CSV to the env-appropriate bucket (local or S3). @@ -118,19 +116,22 @@ def output_csv_to_bucket(self, collection_id: int, mapped_data_path: str = None, Parameters: collection_id: int The collection ID (for finding appropriate folder) - filename: str (default: None) - The name of the created file. If not provided, defaults to - timestamp + mapped_version: str (default: None) + the mapped_data version, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/ include_fields: list[str] (default: None) A list of fields to include in the CSV. Defaults to all. + + Returns: str + the relative path to the created file, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/validation_v1.csv """ content = self._csv_content_string(include_fields) if isinstance(content, list) or isinstance(content, dict): content = json.dumps(content) - file_location = create_validation_version(collection_id, mapped_data_path) - put_page_content(content, file_location) - + file_location = create_validation_version(mapped_version) + put_validation_report(content, file_location) return file_location diff --git a/utils/rikolti_storage.py b/utils/storage.py similarity index 53% rename from utils/rikolti_storage.py rename to utils/storage.py index 150b67551..467d79d49 100644 --- a/utils/rikolti_storage.py +++ b/utils/storage.py @@ -5,7 +5,7 @@ from datetime import datetime from urllib.parse import urlparse -from typing import Optional +from typing import Optional, Union from collections import namedtuple DataStorage = namedtuple( @@ -198,142 +198,3 @@ def put_file_content(data: DataStorage, content) -> str: return data.uri -def get_version(collection_id, uri): - """ - From an arbitrary path, try to get the version string - """ - uri = uri.rstrip('/') - if collection_id not in uri or uri.endswith(collection_id): - return None - rikolti_data_root, relative_path = uri.split(f"/{collection_id}/") - path_list = relative_path.split('/') - if 'data' in path_list: - path_list = path_list[:path_list.index('data')] - path_list.insert(0, collection_id) - version = "/".join(path_list) - return version - - -def create_vernacular_version( - collection_id: int or str, - version_suffix: Optional[str] = None - ): - if not version_suffix: - version_suffix = ( - datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - vernacular_version_path = ( - f"{collection_id}/vernacular_metadata_{version_suffix}/") - return vernacular_version_path - - -def put_vernacular_content(content: str, page_name: int or str, version: str): - fetcher_data_dest = os.environ.get( - "FETCHER_DATA_DEST", "file:///tmp") - path = f"{fetcher_data_dest.rstrip('/')}/{version}/data/{page_name}" - put_page_content(content, path) - return f"{version}/data/{page_name}" - - -def get_most_recent_vernacular_version(collection_id: int or str): - mapper_data_src = os.environ.get("MAPPED_DATA_SRC") - vernacular_versions = 
list_dirs(f"{mapper_data_src}/{collection_id}/") - if not vernacular_versions: - raise Exception( - "No vernacular metadata versions found for {collection_id}") - return get_version(collection_id, sorted(vernacular_versions)[-1]) - - -def get_vernacular_pages(collection_id, vernacular_version): - mapper_data_src = os.environ.get("MAPPED_DATA_SRC", "file:///tmp").rstrip('/') - vernacular_path = f"{mapper_data_src}/{vernacular_version}/data/" - try: - page_list = list_pages(vernacular_path, recursive=True) - except FileNotFoundError as e: - print( - f"{e} - have you fetched {collection_id}? " - f"looked in dir {e.filename} for vernacular pages" - ) - raise(e) - - # TODO: split page_list into pages and children? - return page_list - - -def create_mapped_version( - collection_id: int or str, - vernacular_version: str, - mapped_data_suffix: Optional[str] = None, -): - mapper_data_dest = os.environ.get("MAPPED_DATA_DEST") - # get path of the vernacular version, not the vernacular data - mapped_root = vernacular_path.rsplit('data', 1)[0] - - if mapper_data_dest: - # get path relative to collection_id - vernacular_path = vernacular_path.split(str(collection_id))[-1] - mapped_root = ( - f"{mapper_data_dest.rstrip('/')}/{collection_id}/{vernacular_path}" - ) - - if not mapped_data_suffix: - mapped_data_suffix = ( - datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - mapped_data_path = ( - f"{mapped_root.rstrip('/')}/mapped_metadata_{mapped_data_suffix}/") - return mapped_data_path - - -def get_mapped_page(relative_vernacular_path): - mapper_data_src = os.environ.get("MAPPER_DATA_SRC", "file:///tmp").rstrip('/') - relative_vernacular_path = relative_vernacular_path.lstrip('/') - return get_page_content(f"{mapper_data_src}/{relative_vernacular_path}") - - -def create_validation_version( - collection_id: int or str, - mapped_data_path: str, - validation_suffix: Optional[str] = None -): - validation_data_dest = os.environ.get("VALIDATION_DATA_DEST") - # get path of the mapped data version, not the mapped data - validation_root = mapped_data_path.rsplit('data', 1)[0] - - if validation_data_dest: - # get path relative to collection_id - mapped_data_path = mapped_data_path.split(str(collection_id))[-1] - validation_root = ( - f"{validation_data_dest.rstrip('/')}/{collection_id}/{mapped_data_path}" - ) - - if not validation_suffix: - validation_suffix = ( - datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - validation_data_path = ( - f"{validation_root.rstrip('/')}/validation_{validation_suffix}.csv") - return validation_data_path - - -def create_content_data_version( - collection_id: int or str, - mapped_data_version: str, - content_data_suffix: Optional[str] = None -)-> str: - mapped_with_content_dest = os.environ.get('CONTENT_DATA_DEST') - # get path of the mapped data version, not the mapped data - content_data_root = mapped_data_version - - if mapped_with_content_dest: - # get path relative to collection_id - mapped_data_path = mapped_data_version.split(str(collection_id))[-1] - content_data_root = ( - f"{mapped_with_content_dest.rstrip('/')}/{collection_id}/{mapped_data_path}" - ) - - if not content_data_suffix: - content_data_suffix = ( - datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - content_data_path = ( - f"{content_data_root.rstrip('/')}/content_data_{content_data_suffix}/") - return content_data_path - - diff --git a/utils/versions.py b/utils/versions.py new file mode 100644 index 000000000..f53bebf78 --- /dev/null +++ b/utils/versions.py @@ -0,0 +1,160 @@ +import os +from datetime import datetime 
+import json
+from typing import Union, Optional +from . import storage + +def get_version(collection_id: Union[int, str], uri: str) -> str: + """ + From an arbitrary path, try to get the version string + """ + collection_id = str(collection_id) + uri = uri.rstrip('/') + if str(collection_id) not in uri or uri.endswith(str(collection_id)): + raise Exception("Not a valid version path") + rikolti_data_root, relative_path = uri.split(f"{collection_id}/") + path_list = relative_path.split('/') + if 'data' in path_list: + path_list = path_list[:path_list.index('data')] + path_list.insert(0, str(collection_id)) + version = "/".join(path_list) + return version + +def create_version( + base_version: str, + pipeline_step: str, + suffix: Optional[str] = None +): + """ + Given a path to a version, ex: 3433/vernacular_metadata_v1/, + compose a new version path, ex: 3433/vernacular_metadata_v1/mapped_metadata_v1/ + + base_version: str + a version path + pipeline_step: str + a name for the branch indicating metadata state, ex: mapped_metadata + branch_suffix: str + a uniquely identifying suffix for this branch + """ + if not suffix: + suffix = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + base_version = base_version.rstrip('/') + branch_version = ( + f"{base_version}/{pipeline_step}_{suffix}/" + ) + return branch_version + +def create_vernacular_version( + collection_id: Union[int, str], + suffix: Optional[str] = None + ) -> str: + version_path = f"{collection_id}/" + return create_version(version_path, 'vernacular_metadata', suffix) + +def create_mapped_version( + vernacular_version: str, suffix: Optional[str] = None) -> str: + return create_version(vernacular_version, 'mapped_metadata', suffix) + +def create_validation_version( + mapped_version: str, + suffix: Optional[str] = None +): + validation_version = create_version(mapped_version, 'validation', suffix) + return validation_version.rstrip('/') + ".csv" + +def create_content_data_version( + mapped_version: str, suffix: Optional[str] = None) -> str: + return create_version(mapped_version, 'content_data', suffix) + + +def get_most_recent_vernacular_version(collection_id: Union[int, str]): + mapper_data_src = os.environ.get("VERNACULAR_DATA") + vernacular_versions = storage.list_dirs(f"{mapper_data_src}/{collection_id}/") + if not vernacular_versions: + raise Exception( + "No vernacular metadata versions found for {collection_id}") + return get_version(collection_id, sorted(vernacular_versions)[-1]) + +def get_vernacular_pages(version): + data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp") + data_path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/" + try: + page_list = storage.list_pages(data_path, recursive=True) + except FileNotFoundError as e: + print( + f"\n\nNo vernacular pages found in {e.filename}\n\n" + ) + raise(e) + return [path[len(data_root)+1:] for path in page_list] + +def get_mapped_pages(version, **kwargs): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp") + data_path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/" + try: + page_list = storage.list_pages(data_path, recursive=True, **kwargs) + except FileNotFoundError as e: + print( + f"\n\nNo mapped pages found in {e.filename}\n\n" + ) + raise(e) + return [path[len(data_root)+1:] for path in page_list] + +def get_child_directories(version, **kwargs): + data_root = os.environ.get('MAPPED_DATA', "file:///tmp") + child_directories = storage.list_dirs( + f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/", + recursive=False + ) + return child_directories + +def 
get_child_pages(version, **kwargs): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp") + data_path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/children/" + try: + page_list = storage.list_pages(data_path, recursive=False, **kwargs) + except FileNotFoundError: + return [] + except OSError: + return [] + return [path[len(data_root)+1:] for path in page_list] + +def get_vernacular_page(version_page): + data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp").rstrip('/') + return storage.get_page_content(f"{data_root.rstrip('/')}/{version_page}") + +# TODO: check if this is always json.loads +def get_mapped_page(version_page): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp").rstrip('/') + return storage.get_page_content(f"{data_root.rstrip('/')}/{version_page}") + +def get_child_page(version_page): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp").rstrip('/') + content = storage.get_page_content(f"{data_root.rstrip('/')}/{version_page}") + return json.loads(content) + +def put_vernacular_page(content: str, page_name: Union[int, str], version: str): + data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp") + path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/{page_name}" + storage.put_page_content(content, path) + return f"{version.rstrip('/')}/data/{page_name}" + +def put_mapped_page(content, page_name, version): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp") + path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/{page_name}.jsonl" + storage.put_page_content(content, path) + return f"{version.rstrip('/')}/data/{page_name}.jsonl" + +def put_validation_report(content, version_page): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp") + path = f"{data_root.rstrip('/')}/{version_page}" + storage.put_page_content(content, path) + return version_page + +def put_content_data_page(content, page_name, version): + data_root = os.environ.get("CONTENT_DATA", "file:///tmp") + path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/{page_name}" + storage.put_page_content(content, path) + return f"{version.rstrip('/')}/data/{page_name}" + + + + From b98f63eb45953e7149b265a5165943d4b962ade3 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 13 Nov 2023 17:11:16 -0800 Subject: [PATCH 26/42] Use storage utilities in content harvest also --- content_harvester/by_page.py | 40 ++++++++++++----------------------- content_harvester/settings.py | 9 -------- utils/storage.py | 32 +++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 36 deletions(-) diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index 5700d65cb..169dc8040 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -1,11 +1,9 @@ import hashlib import json import os -import shutil from collections import Counter from typing import Optional -import boto3 import requests from requests.adapters import HTTPAdapter, Retry @@ -13,6 +11,7 @@ from . import derivatives from . 
import settings +from .storage import upload_file from .versions import ( get_mapped_page, get_child_directories, get_child_pages, get_child_page, @@ -206,8 +205,9 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict: else: dest_filename = os.path.basename(content.derivative_filepath) - content_s3_filepath = self._upload( - f"{content.dest_prefix}/{collection_id}", dest_filename, content.derivative_filepath) + dest_path = f"{content.dest_prefix}/{collection_id}/{dest_filename}" + content_s3_filepath = self._upload(dest_path, content.derivative_filepath) + content.set_s3_filepath(content_s3_filepath) # print( @@ -287,42 +287,30 @@ def _download(self, url: str, destination_file: str, cache: Optional[dict] = Non return md5 - def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = None) -> str: + def _upload(self, dest_filepath, src_filepath, cache: Optional[dict] = None) -> str: ''' upload file to CONTENT_ROOT ''' if not cache: cache = {} - if cache.get(dest_filename, {}).get('path'): - return cache[dest_filename]['path'] - - dest_path = '' - - if settings.CONTENT_ROOT["STORE"] == 'file': - dest_path = os.path.join( - settings.CONTENT_ROOT["PATH"], dest_prefix) - if not os.path.exists(dest_path): - os.makedirs(dest_path) - dest_path = os.path.join(dest_path, dest_filename) - shutil.copyfile(filepath, dest_path) + filename = os.path.basename(dest_filepath) + if cache.get(filename, {}).get('path'): + return cache[filename]['path'] - if settings.CONTENT_ROOT["STORE"] == 's3': - s3 = boto3.client('s3') - dest_path = ( - f"{settings.CONTENT_ROOT['PATH']}/{dest_prefix}/{dest_filename}") - s3.upload_file( - filepath, settings.CONTENT_ROOT["BUCKET"], dest_path) + content_root = os.environ.get("CONTENT_ROOT", 'file:///tmp') + content_path = f"{content_root.rstrip('/')}/{dest_filepath}" + upload_file(src_filepath, content_path) # (mime, dimensions) = image_info(filepath) cache_updates = { # 'mime': mime, # 'dimensions': dimensions, - 'path': dest_path + 'path': content_path } - cache[dest_filename] = cache_updates + cache[filename] = cache_updates - return dest_path + return content_path # {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo", "page_filename": "file:///rikolti_data/r-0"} diff --git a/content_harvester/settings.py b/content_harvester/settings.py index 56aeae3fb..880d2eea7 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -1,18 +1,9 @@ import os -from urllib.parse import urlparse - from dotenv import load_dotenv load_dotenv() -CONTENT_ROOT_URL = os.environ.get("CONTENT_ROOT", 'file:///tmp') -CONTENT_ROOT = { - "STORE": urlparse(CONTENT_ROOT_URL).scheme, - "BUCKET": urlparse(CONTENT_ROOT_URL).netloc, - "PATH": urlparse(CONTENT_ROOT_URL).path, -} - AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', False) AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', False) AWS_SESSION_TOKEN = os.environ.get('AWS_SESSION_TOKEN', False) diff --git a/utils/storage.py b/utils/storage.py index 467d79d49..3c553f8e9 100644 --- a/utils/storage.py +++ b/utils/storage.py @@ -2,7 +2,7 @@ import re import boto3 -from datetime import datetime +import shutil from urllib.parse import urlparse from typing import Optional, Union @@ -198,3 +198,33 @@ def put_file_content(data: DataStorage, content) -> str: return data.uri +def upload_file(filepath:str, data_uri: str, **kwargs): + data = parse_data_uri(data_uri) + + if data.store == 's3': + return upload_s3_file(data, filepath, **kwargs) + elif data.store == 
'file': + return move_file(data, filepath) + else: + raise Exception(f"Unknown data store: {data.store}") + +def upload_s3_file(data: DataStorage, filepath, **kwargs): + """ + Upload a file to s3 at data.path + """ + s3 = boto3.client('s3', **kwargs) + s3.upload_file( + filepath, + data.bucket, + data.path + ) + return data.uri + +def move_file(data: DataStorage, filepath): + destination_path = os.sep.join(data.path.split('/')) + directory_path = os.path.dirname(destination_path) + if not os.path.exists(directory_path): + os.makedirs(directory_path) + + shutil.copyfile(filepath, destination_path) + return data.uri From 727bf2adbac97ba3848985992b586b1dc9aa5063 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 13 Nov 2023 17:35:31 -0800 Subject: [PATCH 27/42] mapper: accept optional vernacular version arg, defaults to most recent --- dags/mapper_dag.py | 4 +++- utils/versions.py | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/dags/mapper_dag.py b/dags/mapper_dag.py index ece23aa63..824603607 100644 --- a/dags/mapper_dag.py +++ b/dags/mapper_dag.py @@ -14,8 +14,9 @@ @task() -def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str] = None): +def get_vernacular_pages_task(collection: dict, params: Optional[dict]=None): collection_id = collection['id'] + vernacular_version = params.get('vernacular_version') if params else None if not vernacular_version: vernacular_version = get_most_recent_vernacular_version(collection_id) pages = get_vernacular_pages(vernacular_version) @@ -45,6 +46,7 @@ def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str params={ 'collection_id': Param(None, description="Collection ID to map"), 'validate': Param(True, description="Validate mapping?"), + 'vernacular_version': Param(None, description="Vernacular version to map, ex: 3433/vernacular_metadata_v1/") }, tags=["rikolti"], ) diff --git a/utils/versions.py b/utils/versions.py index f53bebf78..90dc9ee33 100644 --- a/utils/versions.py +++ b/utils/versions.py @@ -65,14 +65,14 @@ def create_content_data_version( mapped_version: str, suffix: Optional[str] = None) -> str: return create_version(mapped_version, 'content_data', suffix) - def get_most_recent_vernacular_version(collection_id: Union[int, str]): - mapper_data_src = os.environ.get("VERNACULAR_DATA") - vernacular_versions = storage.list_dirs(f"{mapper_data_src}/{collection_id}/") - if not vernacular_versions: + data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp") + versions = storage.list_dirs(f"{data_root.rstrip('/')}/{collection_id}/") + if not versions: raise Exception( "No vernacular metadata versions found for {collection_id}") - return get_version(collection_id, sorted(vernacular_versions)[-1]) + recent_version = sorted(versions)[-1] + return f"{collection_id}/{recent_version}/" def get_vernacular_pages(version): data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp") From 1ccde3fb54c6b6f14f2e70a206f0d5abad03e391 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 14 Nov 2023 11:16:52 -0800 Subject: [PATCH 28/42] Update by_mapper_type jobs to use versioning --- dags/dev_validate_by_mapper_type_to_gdrive.py | 9 ++--- dags/utils_by_mapper_type.py | 35 ++++++++++++++----- dags/validate_by_mapper_type.py | 9 ++--- .../fetch_registry_collections.py | 2 +- metadata_mapper/map_registry_collections.py | 8 +++-- 5 files changed, 38 insertions(+), 25 deletions(-) diff --git a/dags/dev_validate_by_mapper_type_to_gdrive.py 
b/dags/dev_validate_by_mapper_type_to_gdrive.py index 9cade3a27..168a87de1 100644 --- a/dags/dev_validate_by_mapper_type_to_gdrive.py +++ b/dags/dev_validate_by_mapper_type_to_gdrive.py @@ -28,12 +28,9 @@ ) def dev_validate_by_mapper_type(): endpoint=make_mapper_type_endpoint() - validation_reports = validate_endpoint_task(endpoint) - ( - fetch_endpoint_task(endpoint) >> - map_endpoint_task(endpoint) >> - validation_reports - ) + fetched_versions = fetch_endpoint_task(endpoint) + mapped_versions = map_endpoint_task(endpoint, fetched_versions) + validation_reports = validate_endpoint_task(endpoint, mapped_versions) local_filepaths = s3_to_localfilesystem.expand( s3_url=validation_reports) diff --git a/dags/utils_by_mapper_type.py b/dags/utils_by_mapper_type.py index a4781afd2..0aab734c8 100644 --- a/dags/utils_by_mapper_type.py +++ b/dags/utils_by_mapper_type.py @@ -10,6 +10,7 @@ from rikolti.metadata_mapper.map_registry_collections import map_endpoint from rikolti.metadata_mapper.map_registry_collections import registry_endpoint from rikolti.metadata_mapper.validate_mapping import create_collection_validation_csv +from rikolti.utils.versions import get_version, get_mapped_pages logger = logging.getLogger("airflow.task") @@ -31,8 +32,6 @@ def make_mapper_type_endpoint(params=None): @task() def fetch_endpoint_task(endpoint, params=None): """ - TODO: map the output of this job to the input of the map_endpoint_task - re: versioning 3433: [ { document_count: int @@ -44,26 +43,41 @@ def fetch_endpoint_task(endpoint, params=None): """ limit = params.get('limit', None) if params else None fetcher_job_result = fetch_endpoint(endpoint, limit, logger) + fetched_versions = {} for collection_id in fetcher_job_result.keys(): + version = get_version( + collection_id, + fetcher_job_result[collection_id][0]['vernacular_filepath'] + ) print( "Review fetched data at: https://rikolti-data.s3.us-west-2." - f"amazonaws.com/index.html#{collection_id}/" + f"amazonaws.com/index.html#{version}" ) - return fetcher_job_result + fetched_versions[collection_id] = version + return fetched_versions @task() -def map_endpoint_task(endpoint, params=None): +def map_endpoint_task(endpoint, fetched_versions, params=None): limit = params.get('limit', None) if params else None - mapper_job_results = map_endpoint(endpoint, limit) + mapper_job_results = map_endpoint(endpoint, fetched_versions, limit) for mapper_job in mapper_job_results: print( "Review mapped data at: https://rikolti-data.s3.us-west-2." 
f"amazonaws.com/index.html#{mapper_job['collection_id']}/" ) - return mapper_job_results + mapped_versions = {} + for mapper_job_result in mapper_job_results: + print(mapper_job_result.keys()) + mapped_version = get_version( + mapper_job_result['collection_id'], + mapper_job_result['mapped_page_paths'][0] + ) + mapped_versions[mapper_job_result['collection_id']] = mapped_version + + return mapped_versions @task() -def validate_endpoint_task(url, params=None): +def validate_endpoint_task(url, mapped_versions, params=None): limit = params.get('limit', None) if params else None response = requests.get(url=url) @@ -78,8 +92,11 @@ def validate_endpoint_task(url, params=None): s3_paths = [] for collection in registry_endpoint(url): print(f"{collection['collection_id']:<6} Validating collection") + collection_id = collection['collection_id'] + mapped_version = mapped_versions.get(str(collection_id)) + mapped_pages = get_mapped_pages(mapped_version) num_rows, file_location = create_collection_validation_csv( - collection['collection_id'], mapped_page_paths) + collection_id, mapped_pages) csv_paths.append(file_location) validation_data_dest = os.environ.get("MAPPED_DATA", "file:///tmp") if validation_data_dest.startswith("s3"): diff --git a/dags/validate_by_mapper_type.py b/dags/validate_by_mapper_type.py index 560f4ea1d..54d678325 100644 --- a/dags/validate_by_mapper_type.py +++ b/dags/validate_by_mapper_type.py @@ -27,11 +27,8 @@ ) def validate_by_mapper_type(): endpoint=make_mapper_type_endpoint() - validation_reports = validate_endpoint_task(endpoint) - ( - fetch_endpoint_task(endpoint) >> - map_endpoint_task(endpoint) >> - validation_reports - ) + fetched_versions = fetch_endpoint_task(endpoint) + mapped_versions = map_endpoint_task(endpoint, fetched_versions) + validation_reports = validate_endpoint_task(endpoint, mapped_versions) validate_by_mapper_type() \ No newline at end of file diff --git a/metadata_fetcher/fetch_registry_collections.py b/metadata_fetcher/fetch_registry_collections.py index eb39438a5..7637ea3d4 100644 --- a/metadata_fetcher/fetch_registry_collections.py +++ b/metadata_fetcher/fetch_registry_collections.py @@ -71,7 +71,7 @@ def fetch_endpoint(url, limit=None, job_logger=logger): success = all([page['status'] == 'success' for page in fetch_result]) total_items = sum([page['document_count'] for page in fetch_result]) - total_pages = fetch_result[-1]['page'] + 1 + total_pages = len(fetch_result) diff_items = total_items - collection['solr_count'] diff_items_label = "" if diff_items > 0: diff --git a/metadata_mapper/map_registry_collections.py b/metadata_mapper/map_registry_collections.py index 584830b86..93fdcac20 100644 --- a/metadata_mapper/map_registry_collections.py +++ b/metadata_mapper/map_registry_collections.py @@ -23,7 +23,7 @@ def registry_endpoint(url): yield collection -def map_endpoint(url, limit=None): +def map_endpoint(url, fetched_versions, limit=None): response = requests.get(url=url) response.raise_for_status() total = response.json().get('meta', {}).get('total_count', 1) @@ -51,7 +51,9 @@ def map_endpoint(url, limit=None): f"{collection_id:<6}: call lambda with collection_id: {collection_id}") try: - map_result = lambda_shepherd.map_collection(collection_id) + vernacular_version = fetched_versions[str(collection_id)] + map_result = lambda_shepherd.map_collection( + collection_id, vernacular_version) except FileNotFoundError: print(f"{collection_id:<6}: not fetched yet", file=sys.stderr) continue @@ -112,10 +114,10 @@ def map_endpoint(url, limit=None): 
f"solr count last updated: {collection['solr_last_updated']}" ) print(map_report_row) + map_report.append(map_result) if limit and progress >= limit: break - map_report.append(map_result) return map_report From f580e47833aee92655fe94d4343b30eb74952c38 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 15 Nov 2023 10:46:48 -0800 Subject: [PATCH 29/42] Add utilities to the content harvester docker image --- .../Dockerfile => Dockerfile.content_harvester | 5 +++-- README.md | 2 +- content_harvester/README.md | 6 ++++-- content_harvester/by_collection.py | 3 ++- content_harvester/by_page.py | 4 ++-- content_harvester/docker-compose.yml | 3 ++- 6 files changed, 14 insertions(+), 9 deletions(-) rename content_harvester/Dockerfile => Dockerfile.content_harvester (86%) diff --git a/content_harvester/Dockerfile b/Dockerfile.content_harvester similarity index 86% rename from content_harvester/Dockerfile rename to Dockerfile.content_harvester index aa0ba15e4..8a485c917 100644 --- a/content_harvester/Dockerfile +++ b/Dockerfile.content_harvester @@ -9,11 +9,12 @@ RUN sed -i 's//