diff --git a/metadata_fetcher/fetchers/preservica_api_fetcher.py b/metadata_fetcher/fetchers/preservica_api_fetcher.py new file mode 100644 index 000000000..28485a8a7 --- /dev/null +++ b/metadata_fetcher/fetchers/preservica_api_fetcher.py @@ -0,0 +1,239 @@ +import json +from xml.etree import ElementTree +from .Fetcher import Fetcher +from requests.auth import HTTPBasicAuth +from requests.adapters import Retry +from requests.adapters import HTTPAdapter +import requests +import re + + +class PreservicaApiFetcher(Fetcher): + BASE_URL: str = "https://us.preservica.com/api/entity/v6.0" + + NAMESPACES: dict = { + "pra": "http://preservica.com/EntityAPI/v6.0", + "xip": "http://preservica.com/XIP/v6.0" + } + + def __init__(self, params: dict[str]): + """ + Parameters: + params: dict[str] + """ + super(PreservicaApiFetcher, self).__init__(params) + + # Tracks where we're at processing record sub-requests + self.record_index = 1 + self.record_total = 0 + + # If `next_url` is a param, we know that this is not + # the fetch of the first page, so skip setting those + # attributes + if "next_url" in params: + for key in params: + setattr(self, key, params[key]) + return + + credentials = params.get("harvest_data").get("harvest_extra_data") + self.basic_auth_credentials = ([v.strip() for v in credentials.split(',')]) + self.internal_collection_id = \ + re.search( + r"(?<=SO_)[0-9a-f-]+", + params.get("harvest_data").get("url") + ).group(0) + + self.next_url = ( + f"{self.BASE_URL}/structural-objects/" + f"{self.internal_collection_id}/children?max=1" + ) + + def build_fetch_request(self) -> dict[str]: + """ + Generates arguments for `requests.get()`. + + Returns: dict[str] + """ + return self.build_url_request(self.next_url) + + def aggregate_vernacular_content(self, response: str) -> str: + """ + TODO: at time of dev, response is a requests.Response, but will be str when + merged + + Parameters: + response: str + + Returns: str + """ + + response_body = response.text + + # Starting with a list of `information-objects` URLs + object_url = ElementTree.fromstring(response_body).\ + find("pra:Children/pra:Child", self.NAMESPACES).text + + # Getting an individual `information-object`, extracting the URL + # for the oai_dc metadata fragment + metadata_url = self.get_metadata_url_from_object(object_url) + + self.record_total = 1 if metadata_url else 0 + + # Getting the metadata + return self.get_metadata_from_url(metadata_url) + + def get_object_type(self, response_body: str): + io_tag = ElementTree.fromstring(response_body).\ + find("xip:InformationObject", self.NAMESPACES) + + if io_tag: + return "information" + + so_tag = ElementTree.fromstring(response_body).\ + find("xip:StructuralObject", self.NAMESPACES) + + if so_tag: + return "structural" + + def get_information_response(self, response_body): + """ + Takes an information object response and performs two additional requests + to get to the structural object. This function is optimistic about the + structure of the XML responses. + + Parameters: + response_body: string + Returns: string + """ + object_type = self.get_object_type(response_body) + + if object_type == "information": + return response_body + + path = "pra:AdditionalInformation/pra:Children" + url = ElementTree.fromstring(response_body).find(path, self.NAMESPACES).text + + request = self.build_url_request(url) + response_body = self.build_retry_session().get(**request).text + + path = "pra:Children/pra:Child" + url = ElementTree.fromstring(response_body).find(path, self.NAMESPACES).text + + request = self.build_url_request(url) + return self.build_retry_session().get(**request).text + + def get_metadata_url_from_object(self, url: str): + """ + Second request. It may get a structural object or a information object. With + the former, a couple additional requests have to take place. + + Parameters: + url: str + + Returns: str + """ + request = self.build_url_request(url) + response_body = self.build_retry_session().get(**request).text + + if self.get_object_type(response_body) == "structural": + response_body = self.get_information_response(response_body) + + root = ElementTree.fromstring(response_body) + + path = ".//pra:Fragment[@schema='http://www.openarchives.org/OAI/2.0/oai_dc/']" + fragment = root.find(path, self.NAMESPACES) + + return fragment.text if fragment is not None else None + + def get_metadata_from_url(self, url: str) -> str: + """ + Final request. It returns the metadata. + + Parameters: + url: str + + Returns: str + """ + print( + f"[{self.collection_id}]: Fetching record " + f"({self.record_index} of {self.record_total}) at {url}" + ) + + self.record_index += 1 + + request = self.build_url_request(url) + response = self.build_retry_session().get(**request) + + return response.text + + def check_page(self, response: requests.Response) -> int: + """ + Parameters: + response: requests.Response + + Return: int + """ + hits = len(ElementTree.fromstring(response.content). + findall(".//pra:Child", self.NAMESPACES)) + + print( + f"[{self.collection_id}]: Fetched page {self.write_page} " + f"at {response.url} with {hits} hits" + ) + + return hits + + def increment(self, response: requests.Response): + """ + Sets the `next_url` to fetch and increments the page number. + + Parameters: + response: requests.Response + """ + super(PreservicaApiFetcher, self).increment(response) + + next_element = ElementTree.fromstring(response.content).\ + find("pra:Paging/pra:Next", self.NAMESPACES) + self.next_url = next_element.text if next_element is not None else None + + def json(self) -> str: + """ + Generates JSON for the next page of results. + + Returns: str + """ + current_state = { + "harvest_type": self.harvest_type, + "basic_auth_credentials": self.basic_auth_credentials, + "collection_id": self.collection_id, + "internal_collection_id": self.internal_collection_id, + "next_url": self.next_url, + "write_page": self.write_page + } + if not self.next_url: + current_state.update({"finished": True}) + + return json.dumps(current_state) + + def build_url_request(self, url: str) -> dict: + """ + Creates a dictionary of parameters needed for requests for this fetcher + + Parameters: + url: str + + Returns: dict + """ + return {"url": url, "auth": HTTPBasicAuth(*self.basic_auth_credentials)} + + @staticmethod + def build_retry_session() -> requests.Session: + """ + Creates and returns request sessions that will retry requests + + Requests: requests.Session + """ + session = requests.Session() + retries = Retry(total=3, backoff_factor=2) + session.mount("https://", HTTPAdapter(max_retries=retries)) + return session diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index 49ca654d4..74e016f1a 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -283,7 +283,7 @@ def select_cmis_atom_id(self): return self def select_preservica_id(self): - calisphere_id = self.mapped_data.get("preservica_id", {}).get('$') + calisphere_id = self.source_metadata.get("entity_id") self.legacy_couch_db_id = f"{self.collection_id}--{calisphere_id}" return self diff --git a/metadata_mapper/mappers/preservica/preservica_mapper.py b/metadata_mapper/mappers/preservica/preservica_mapper.py new file mode 100644 index 000000000..6c2d4e6cf --- /dev/null +++ b/metadata_mapper/mappers/preservica/preservica_mapper.py @@ -0,0 +1,74 @@ +import re +from xml.etree import ElementTree + +from ..mapper import Record, Vernacular + + +class PreservicaRecord(Record): + BASE_URL = "https://oakland.access.preservica.com" + + FILE_PREPEND = "sdb:digitalFile%7C" + + def UCLDC_map(self): + entity_id = self.source_metadata.get("entity_id") + + return { + "calisphere-id": entity_id, + "contributor": self.source_metadata.get("contributor"), + "coverage": self.source_metadata.get("spatial"), + "creator": self.source_metadata.get("creator"), + "date": self.source_metadata.get("date"), + "description": self.source_metadata.get("description"), + "format": self.source_metadata.get("format"), + "identifier": self.source_metadata.get("identifier"), + "isShownAt": ( + f"{self.BASE_URL}/file/{self.FILE_PREPEND}{entity_id}/" + ), + "isShownBy": ( + f"{self.BASE_URL}/download/thumbnail/{self.FILE_PREPEND}{entity_id}" + ), + "language": self.source_metadata.get("language"), + "publisher": self.source_metadata.get("publisher"), + "relation": self.source_metadata.get("relation"), + "rights": self.source_metadata.get("rights"), + "source": self.source_metadata.get("source"), + "state_located_in": {"stateLocatedIn": "California"}, + "subject": self.source_metadata.get("subject"), + "title": self.source_metadata.get("title"), + "type": self.source_metadata.get("type"), + } + + +class PreservicaVernacular(Vernacular): + record_cls = PreservicaRecord + + # TODO: consider putting this namespace mapping in a place that can be imported + # into both the mapper and fetcher + NAMESPACES: dict = { + "pra": "http://preservica.com/EntityAPI/v6.0", + "xip": "http://preservica.com/XIP/v6.0", + "oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/" + } + + def parse(self, response_body): + """ + We expect only one record per file for preservica. Minor changes will need to + be made if we begin importing more per page. + """ + et = ElementTree.fromstring(response_body) + container = et.find(".//xip:MetadataContainer", self.NAMESPACES) + + dc_record = container.find("xip:Content", self.NAMESPACES).\ + find("oai_dc:dc", self.NAMESPACES) + + record = { + "entity_id": container.find("xip:Entity", self.NAMESPACES).text, + } + for element in dc_record: + key = re.sub(r"{\S+}", "", element.tag) # Strip the namespace off the tag + value = element.text + if key not in record: + record[key] = [] + record[key].append(value) + + return self.get_records([record])