Skip to content

Commit

Permalink
Merge branch 'main' into calisphere-etl
Browse files Browse the repository at this point in the history
  • Loading branch information
barbarahui committed Jan 27, 2024
2 parents da142b1 + 209402d commit e76a597
Show file tree
Hide file tree
Showing 5 changed files with 202 additions and 43 deletions.
4 changes: 2 additions & 2 deletions metadata_mapper/mappers/nuxeo/nuxeo_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ def is_shown_by_validation(validation_def: dict,
return

if not comparison_value:
return "Solr value is empty"
return "CouchDB value is empty"
if not rikolti_value or not isinstance(rikolti_value, dict):
return "Invalid Rikolti value type"

Expand All @@ -291,7 +291,7 @@ def is_shown_by_validation(validation_def: dict,
"ucldc-nuxeo-thumb-media/"
)
if not comparison_value.startswith(legacy_location):
return "Unusual Solr value"
return "Unusual CouchDB value"

expected_keys = ['url', 'mimetype', 'filename', 'nuxeo_type']
if not set(rikolti_value.keys()).issubset(set(expected_keys)):
Expand Down
76 changes: 39 additions & 37 deletions metadata_mapper/mappers/oai/islandora_mapper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import requests
from requests.adapters import HTTPAdapter, Retry

from .oai_mapper import OaiRecord, OaiVernacular

Expand All @@ -19,52 +20,53 @@ def map_subject(self):
return None

def map_is_shown_at(self):
    """
    Build the Islandora object page URL for this record.

    The site root is the record's OAI `request_url` with '/oai2' removed.
    The object id is the portion of the record `id` after its last ':',
    with the first '_' replaced by the percent-encoded colon '%3A'.

    Returns:
        The URL string, or None when the request URL is missing, the id
        is missing, or the id contains no ':' separator.
    """
    oai_url = self.source_metadata.get('request_url') or ''
    request_url = oai_url.replace('/oai2', '')

    identifier = self.source_metadata.get('id') or ''
    # rpartition never raises: an id without ':' yields an empty separator
    # instead of the unpacking ValueError that rsplit(':', 1) produces.
    _, separator, record_id = identifier.rpartition(':')
    if not separator:
        return None
    new_id = record_id.replace('_', '%3A', 1)

    if request_url and new_id:
        return f"{request_url}/islandora/object/{new_id}"
    return None

def map_is_shown_by(self):
    """
    Build the image URL used to represent this record.

    The default is the object's thumbnail datastream
    (`.../datastream/TN/view`). When the record's `type` mentions 'image'
    or 'StillImage', the larger JPG datastream is probed with an HTTP GET
    and used instead if the response status is 200.

    Returns:
        The URL string, or None when the request URL is missing, the id
        is missing, or the id contains no ':' separator.
    """
    oai_url = self.source_metadata.get('request_url') or ''
    request_url = oai_url.replace('/oai2', '')

    identifier = self.source_metadata.get('id') or ''
    # rpartition never raises: an id without ':' yields an empty separator
    # instead of the unpacking ValueError that rsplit(':', 1) produces.
    _, separator, record_id = identifier.rpartition(':')
    new_id = record_id.replace('_', '%3A', 1) if separator else ''

    if not (request_url and new_id):
        return None

    thumb_url = (
        f"{request_url}/islandora/object/{new_id}/datastream/TN/view")

    # `'image' or 'StillImage' in x` is always truthy; test each keyword
    # explicitly. `in` works for both a string type (substring) and a list
    # of types (membership).
    record_type = self.source_metadata.get('type') or ''
    if not any(kind in record_type for kind in ('image', 'StillImage')):
        return thumb_url

    # Change URL from 'TN' to 'JPG' for larger versions of image
    # objects & test to make sure the link resolves
    jpg_url = thumb_url.replace("/TN/", "/JPG/")
    # TODO: should figure out a way to punt a request
    # to minimize the mapper's reliance on external systems
    retry_strategy = Retry(
        total=3,
        status_forcelist=[413, 429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    # The context manager closes the session's pooled connections.
    with requests.Session() as http:
        http.mount("https://", adapter)
        http.mount("http://", adapter)
        # requests waits indefinitely without a timeout; 30 seconds is a
        # reviewer-chosen bound -- adjust to the harvest's needs.
        response = http.get(jpg_url, timeout=30)

    if response.status_code == 200:
        thumb_url = jpg_url

    return thumb_url


class IslandoraVernacular(OaiVernacular):
    """OaiVernacular subclass whose record class is IslandoraRecord."""
    record_cls = IslandoraRecord
29 changes: 29 additions & 0 deletions metadata_mapper/mappers/oai/quartex_mapper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,36 @@
from typing import Union
from .oai_mapper import OaiRecord, OaiVernacular


class QuartexRecord(OaiRecord):
def UCLDC_map(self):
    """Return this mapper's field map: 'spatial' is mapped by map_spatial."""
    return dict(spatial=self.map_spatial)

def map_spatial(self) -> Union[list[str], None]:
    """
    Collate the 'coverage' and 'spatial' source fields and split each
    value on ';' into individual place names.

    Each segment is stripped of surrounding whitespace *before* empties
    are discarded, so inputs such as 'a; ;b;' do not leave '' entries in
    the result.

    Returns:
        A list of non-empty, stripped strings (empty when there is
        nothing to map).
    """
    # Guard against the collation returning None -- its exact return
    # contract is not visible from this method.
    spatial = self.collate_fields(["coverage", "spatial"])() or []

    places = []
    for value in spatial:
        places.extend(part.strip() for part in value.split(';'))

    return [place for place in places if place]

def map_subject(self) -> Union[list[dict[str, str]], None]:
    """
    Split the source 'subject' value(s) on ';' into subject dicts.

    A single string is treated as a one-item list. Each segment is
    stripped of surrounding whitespace *before* empties are discarded, so
    whitespace-only segments do not produce {"name": ""} entries.

    Returns:
        None when the source has no subject; otherwise a list of
        {"name": <subject>} dicts.
    """
    # https://github.com/calisphere-legacy-harvester/dpla-ingestion/blob/ucldc/lib/mappers/dublin_core_mapper.py#L117-L127 # noqa: E501
    value = self.source_metadata.get("subject")
    if not value:
        return None

    if isinstance(value, str):
        value = [value]

    names = (part.strip() for v in value for part in v.split(';'))
    return [{"name": name} for name in names if name]

def map_is_shown_at(self):
if "identifier" not in self.source_metadata:
return
Expand Down
134 changes: 131 additions & 3 deletions metadata_mapper/mappers/oai/samvera_mapper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import re
from datetime import datetime
from typing import Any, Optional

from ..mapper import Validator
from ...validator import ValidationLogLevel
from ...validator import ValidationLogLevel, ValidationMode
from .oai_mapper import OaiRecord, OaiVernacular


Expand Down Expand Up @@ -39,6 +41,29 @@ def setup(self):
"field": "contributor",
"validations": [SamveraValidator.contributor_match],
"level": ValidationLogLevel.WARNING
},
{
"field": "rights",
"validations": [
SamveraValidator.rights_match,
Validator.verify_type(Validator.list_of(str))
]
},
{
"field": "date",
"validations": [SamveraValidator.date_match],
"level": ValidationLogLevel.WARNING,
},
{
"field": "source",
"validations": [SamveraValidator.source_match],
"level": ValidationLogLevel.WARNING
},
{
"field": "description",
"validations": [Validator.content_match],
"level": ValidationLogLevel.WARNING,
"validation_mode": ValidationMode.ORDER_INSENSITIVE_IF_LIST
}
])

Expand All @@ -50,7 +75,10 @@ def replace_ursus_with_digital(validation_def: dict,
return

if comparison_value.startswith("https://ursus.library.ucla.edu"):
comparison_value.replace("https://ursus.library.ucla.edu", "https://digital.library.ucla.edu")
comparison_value = comparison_value.replace(
"https://ursus.library.ucla.edu",
"https://digital.library.ucla.edu"
)

if rikolti_value == comparison_value:
return
Expand All @@ -65,12 +93,112 @@ def contributor_match(validation_def: dict,
if rikolti_value == comparison_value:
return

comparison_value[0] = comparison_value[0] + '.'
if comparison_value:
comparison_value = [c + '.' for c in comparison_value]
if rikolti_value == comparison_value:
return

return "Content mismatch"

@staticmethod
def rights_match(validation_def: dict,
                 rikolti_value: Any,
                 comparison_value: Any) -> Optional[str]:
    """
    Match rights values that differ only in the contact phone number.

    In legacy collection 153 the phone number in the second rights entry
    appears to auto-increment with each record ('(310) 825-4987',
    '(310) 825-4986', ...), while Rikolti always has '(310) 825-4988'.
    Example:
        legacy:  ['US', '... Phone: (310) 825-4987']
        rikolti: ['US', '... Phone: (310) 825-4988']

    Any '(310) 825-NNNN' number in the second legacy entry is normalized
    to the new number before comparing. The caller's `comparison_value`
    is never modified.

    Returns:
        None when the values match (directly or after normalization),
        otherwise "Content mismatch".
    """
    if rikolti_value == comparison_value:
        return None

    new_phone_number = '(310) 825-4988'
    if (
        isinstance(comparison_value, list)
        and len(comparison_value) == 2
        and isinstance(comparison_value[1], str)
    ):
        # Compare against a normalized copy rather than assigning into
        # comparison_value, which would alter the caller's legacy record.
        normalized = [
            comparison_value[0],
            re.sub(
                r'\(310\) 825-\d{4}',  # old phone number regex
                new_phone_number,
                comparison_value[1]
            ),
        ]
        if rikolti_value == normalized:
            return None

    return "Content mismatch"

@staticmethod
def date_match(validation_def: dict,
               rikolti_value: Any,
               comparison_value: Any) -> Optional[str]:
    """
    Accept date values that are equal, equal ignoring order, or where
    Rikolti adds an ISO rendering of the single legacy date.

    The last case requires the comparison value to hold one
    'Month DD, YYYY' string, and the Rikolti value to hold that same
    string followed by a 'YYYY-MM-DD' string for the same calendar date.
        comparison value example: ['August 20, 1951']
        rikolti value example: ['August 20, 1951', '1951-08-20']

    Returns None on a match, otherwise "Content mismatch".
    """
    mismatch = "Content mismatch"

    if comparison_value == rikolti_value:
        return None

    if not (comparison_value and rikolti_value):
        return mismatch

    if sorted(rikolti_value) == sorted(comparison_value):
        return None

    if len(comparison_value) != 1 or len(rikolti_value) != 2:
        return mismatch

    legacy_date = comparison_value[0]
    if legacy_date != rikolti_value[0]:
        return mismatch

    try:
        legacy_parsed = datetime.strptime(legacy_date, '%B %d, %Y')
        iso_parsed = datetime.strptime(rikolti_value[1], '%Y-%m-%d')
    except ValueError:
        # either string is not in its expected format
        return mismatch

    return None if legacy_parsed == iso_parsed else mismatch

@staticmethod
def source_match(validation_def: dict,
                 rikolti_value: Any,
                 comparison_value: Any) -> Optional[str]:
    """
    Accept source values that are equal, or where Rikolti adds
    'OpenUCLA Collections' alongside the single legacy source.

    matches
        comparison: ['Los Angeles Times Photographic Collection']
        rikolti:    ['OpenUCLA Collections',
                     'Los Angeles Times Photographic Collection']

    Returns:
        None on a match, otherwise "Content mismatch".
    """
    if rikolti_value == comparison_value:
        return None

    legacy_source = "Los Angeles Times Photographic Collection"
    if (
        comparison_value and rikolti_value and
        len(comparison_value) == 1 and len(rikolti_value) == 2 and
        comparison_value[0] == legacy_source
        and 'OpenUCLA Collections' in rikolti_value and
        legacy_source in rikolti_value
    ):
        return None

    # Without this explicit return the function falls through to an
    # implicit None, and every mismatch is silently treated as a match.
    return "Content mismatch"

class SamveraVernacular(OaiVernacular):
    """OaiVernacular subclass pairing SamveraRecord with SamveraValidator."""
    record_cls = SamveraRecord
    validator = SamveraValidator
2 changes: 1 addition & 1 deletion metadata_mapper/validate_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def validate_page(collection_id: int, page_path: str,
if not solr_record:
validator.log.add(
key=harvest_id,
field="missing record",
field="new record",
description="No Solr data found",
expected=None,
actual=rikolti_record,
Expand Down

0 comments on commit e76a597

Please sign in to comment.