Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Calisphere etl #714

Merged
merged 14 commits into from
Jan 30, 2024
Merged
1 change: 1 addition & 0 deletions env.example
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export MERGED_DATA=$VERNACULAR_DATA
# metadata_fetcher
export NUXEO= # ask for a key - required to run the NuxeoFetcher
export FLICKR_API_KEY= # ask for a key - required to run the FlickrFetcher
export CALISPHERE_ETL_TOKEN= # ask for token - required to run Calisphere Solr Fetcher

# metadata_mapper
export SKIP_UNDEFINED_ENRICHMENTS=True
Expand Down
5 changes: 4 additions & 1 deletion metadata_fetcher/fetchers/Fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ def fetch_page(self):
'status': 'success'
}

def check_page(self, response: requests.Response) -> int:
    """
    Placeholder for a per-fetcher page check; always raises here.

    Parameters:
        response: requests.Response

    Returns: int (per the annotation; this base version never returns)

    Raises: NotImplementedError
    """
    raise NotImplementedError()

def aggregate_vernacular_content(self, response: requests.Response):
    """
    Returns the body of the HTTP response as text (`response.text`).

    Parameters:
        response: requests.Response
    """
    body_text = response.text
    return body_text

Expand All @@ -103,7 +106,7 @@ def increment(self, http_resp):
"""increment internal state for fetching the next page

takes as an argument the http_resp from institution API call
https://docs.aiohttp.org/en/stable/client_reference.html#aiohttp.ClientResponse
https://docs.python-requests.org/en/latest/api/#requests.Response
"""
self.write_page = self.write_page + 1

Expand Down
99 changes: 99 additions & 0 deletions metadata_fetcher/fetchers/calisphere_solr_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import json
import requests

from .Fetcher import Fetcher
from ..settings import CALISPHERE_ETL_TOKEN

class CalisphereSolrFetcher(Fetcher):
    """
    Fetches one collection's records from the Calisphere Solr index.

    Paging is cursor based: each request sends the current `cursorMark`, and
    the `nextCursorMark` of the response becomes the cursor for the next page.
    """

    def __init__(self, params: dict[str, str]):
        """
        Sets up the paging state from `params`.

        Parameters:
            params: dict[str, str]
                read for `collection_id`, `cursor_mark` (default "*"),
                `num_found` (default 0) and `num_fetched` (default 0);
                also handed to the parent constructor.
        """
        super().__init__(params)
        self.collection_id = params.get("collection_id")
        self.cursor_mark = params.get("cursor_mark", "*")
        self.num_found = params.get("num_found", 0)
        self.num_fetched = params.get("num_fetched", 0)

    def build_fetch_request(self) -> dict[str, str]:
        """
        Generates arguments for `requests.get()`.

        The query is filtered to this collection's registry URL, asks for
        100 rows per page sorted by id, and carries the current cursor mark.

        Returns: dict[str, str]
        """
        solr_url = "https://solr.calisphere.org/solr/query"
        registry_url = (
            "https://registry.cdlib.org/api/v1/"
            f"collection/{self.collection_id}/"
        )
        params = {
            "fq": f'collection_url:"{registry_url}"',
            "rows": 100,
            "cursorMark": self.cursor_mark,
            "wt": "json",
            "sort": "id asc"
        }

        # the auth header is deliberately left out of the log line below
        print(
            f"[{self.collection_id}]: Fetching page {self.write_page} "
            f"at {solr_url} with params {params}")

        return {
            "url": solr_url,
            "headers": {'X-Authentication-Token': CALISPHERE_ETL_TOKEN},
            "params": params
        }

    def check_page(self, http_resp: requests.Response) -> int:
        """
        Counts and logs the documents returned in a page.

        Parameters:
            http_resp: requests.Response

        Returns: int: number of records in the response
        """
        docs = http_resp.json()["response"]["docs"]
        hits = len(docs)

        print(
            f"[{self.collection_id}]: Fetched page {self.write_page} "
            f"at {http_resp.url} with {hits} hits"
        )

        return hits

    def increment(self, http_resp: requests.Response):
        """
        Advances the paging state after a page has been fetched.

        Calls the parent `increment`, updates the found/fetched counters
        from the response, then either moves the cursor forward or marks
        the fetch as finished.

        Parameters:
            http_resp: requests.Response
        """
        super().increment(http_resp)
        payload = http_resp.json()

        self.num_found = payload["response"]["numFound"]
        self.num_fetched += len(payload["response"]["docs"])

        # this is a workaround for solr giving us an extra page
        # with zero docs after the last page of results: stop as soon as
        # everything has been fetched, not only when the cursor stops moving
        next_mark = payload["nextCursorMark"]
        cursor_unchanged = self.cursor_mark == next_mark
        self.finished = (
            cursor_unchanged or self.num_fetched == self.num_found
        )
        if not self.finished:
            self.cursor_mark = next_mark

    def json(self) -> str:
        """
        Serializes the fetcher state needed to request the next page.

        Returns: str
        """
        return json.dumps({
            "harvest_type": self.harvest_type,
            "collection_id": self.collection_id,
            "write_page": self.write_page,
            "cursor_mark": self.cursor_mark,
            "num_found": self.num_found,
            "num_fetched": self.num_fetched,
            "finished": self.finished
        })
4 changes: 2 additions & 2 deletions metadata_fetcher/fetchers/solr_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def build_fetch_request(self) -> dict[str]:

return request

def check_page(self, http_resp: requests.Response) -> bool:
def check_page(self, http_resp: requests.Response) -> int:
"""
Parameters:
http_resp: requests.Response
Expand All @@ -73,7 +73,7 @@ def check_page(self, http_resp: requests.Response) -> bool:
f"at {http_resp.url} with {hits} hits"
)

return hits > 0
return hits

def increment(self, http_resp: requests.Response):
"""
Expand Down
1 change: 1 addition & 0 deletions metadata_fetcher/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

NUXEO_TOKEN = os.environ.get('NUXEO')
FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY')
CALISPHERE_ETL_TOKEN = os.environ.get('CALISPHERE_ETL_TOKEN')

# Dump the environment at debug level to help diagnose configuration
# problems. Variables whose names suggest they hold credentials (API keys,
# tokens, and the NUXEO variable, which is read above as a token) have
# their values masked so secrets never end up in the logs.
_SECRET_NAME_MARKERS = ("TOKEN", "KEY", "SECRET", "PASSWORD", "NUXEO")

for key, value in os.environ.items():
    if any(marker in key.upper() for marker in _SECRET_NAME_MARKERS):
        value = "<redacted>"
    logger.debug(f"{key}={value}")
6 changes: 4 additions & 2 deletions metadata_mapper/lambda_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ def parse_enrichment_url(enrichment_url):


def run_enrichments(records, collection, enrichment_set, page_filename):
for enrichment_url in collection.get(enrichment_set, []):
enrichment_urls = collection.get(enrichment_set) or []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You've done this in several places, replacing collection.get(some_value, []) with collection.get(some_value) or []. Is there a difference in behavior between those two? It isn't obvious to me what that is. No issue with it, just wondering.

Copy link
Collaborator Author

@barbarahui barbarahui Jan 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@amywieliczka @bibliotechy so yeah, it turns out that if the key exists and the value is explicitly None then this is what happens:

Python 3.9.0 (default, May 23 2023, 15:28:21)
[Clang 13.0.0 (clang-1300.0.27.3)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> mydict = {"foo": None}
>>> for x in mydict.get("foo", []):
...    print(x)
...
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: 'NoneType' object is not iterable

Whereas:

>>> mydict.get("foo") or []
[]

for enrichment_url in enrichment_urls:
enrichment_func, kwargs = parse_enrichment_url(enrichment_url)
if not enrichment_func and settings.SKIP_UNDEFINED_ENRICHMENTS:
continue
Expand Down Expand Up @@ -116,7 +117,8 @@ def map_page(

# TODO: analyze and simplify this straight port of the
# solr updater module into the Rikolti framework
mapped_records = [record.solr_updater() for record in mapped_records]
if collection.get('rikolti_mapper_type') != 'calisphere_solr.calisphere_solr':
mapped_records = [record.solr_updater() for record in mapped_records]
mapped_records = [record.remove_none_values() for record in mapped_records]

group_page_exceptions = {}
Expand Down
2 changes: 1 addition & 1 deletion metadata_mapper/lambda_shepherd.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def check_for_missing_enrichments(collection):
not_yet_implemented = []
collection_enrichments = (
(collection.get('rikolti__pre_mapping') or []) +
collection.get('rikolti__enrichments')
(collection.get('rikolti__enrichments') or [])
)
for e_url in collection_enrichments:
e_path = urlparse(e_url).path
Expand Down
4 changes: 2 additions & 2 deletions metadata_mapper/map_registry_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,14 @@ def map_endpoint(url, fetched_versions, limit=None):
print(f"{ collection_id:<6}: not fetched yet", file=sys.stderr)
continue

pre_mapping = map_result.get('pre_mapping', [])
pre_mapping = map_result.get('pre_mapping') or []
if len(pre_mapping) > 0:
print(
f"{collection_id:<6}: {'pre-mapping enrichments':<24}: "
f"\"{pre_mapping}\""
)

enrichments = map_result.get('enrichments', [])
enrichments = map_result.get('enrichments') or []
if len(enrichments) > 0:
print(
f"{collection_id:<6}, {'post-mapping enrichments':<24}: "
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import json

from ..mapper import Record, Vernacular, Validator

class CalisphereSolrRecord(Record):
    """
    Maps a Calisphere Solr document to the UCLDC record fields.

    This mapper does not handle Nuxeo record complexities, meaning:
      - it ignores structmap* solr fields for complex objects
      - it does not map media_source
    """

    def UCLDC_map(self) -> dict:
        """
        Builds the mapped record.

        Four fields are computed or renamed; every other field is copied
        from the source document under the same name, or None when the
        source document lacks it.

        Returns: dict
        """
        source = self.source_metadata

        # copied one-to-one from the source doc, in output order
        same_name_fields = (
            "alternative_title",
            "contributor",
            "coverage",
            "creator",
            "date",
            "extent",
            "format",
            "genre",
            "identifier",
            "language",
            "location",
            "publisher",
            "relation",
            "rights",
            "rights_holder",
            "rights_note",
            "rights_date",
            "source",
            "spatial",
            "subject",
            "temporal",
            "type",
            "sort_title",
            "description",
            "provenance",
            "transcription",
            "id",
            "campus_name",
            "campus_data",
            "collection_name",
            "collection_data",
            "collection_url",
            "sort_collection_data",
            "repository_name",
            "repository_data",
            "repository_url",
            "rights_uri",
            "manifest",
            "object_template",
            "url_item",
            "created",
            "last_modified",
            "sort_date_start",
            "sort_date_end",
            "campus_id",
            "collection_id",
            "repository_id",
            "item_count",
            "reference_image_md5",
            "reference_image_dimensions",
        )

        mapped = {
            "calisphere-id": self.map_calisphere_id(),
            "is_shown_at": source.get("url_item"),
            "thumbnail_source": self.map_thumbnail_source(),
            "title": source.get("title"),
        }
        mapped.update(
            (field, source.get(field, None)) for field in same_name_fields
        )
        return mapped

    def map_calisphere_id(self):
        """
        Returns the second "--"-separated segment of `harvest_id_s`.

        NOTE(review): raises if `harvest_id_s` is missing or contains no
        "--" separator - confirm every source doc carries it.
        """
        segments = self.source_metadata.get('harvest_id_s').split("--")
        return segments[1]

    def map_thumbnail_source(self):
        """
        Returns the harvested-image URL for the doc's `reference_image_md5`,
        or None when the doc has no (or an empty) md5.
        """
        image_md5 = self.source_metadata.get("reference_image_md5", None)
        if not image_md5:
            return None
        return f"https://static-ucldc-cdlib-org.s3.us-west-2.amazonaws.com/harvested_images/{image_md5}"

class CalisphereSolrValidator(Validator):
    """Validator for Calisphere Solr records; does not validate `is_shown_by`."""

    def setup(self):
        """Removes `is_shown_by` from the set of validatable fields."""
        skipped_field = "is_shown_by"
        self.remove_validatable_field(field=skipped_field)

class CalisphereSolrVernacular(Vernacular):
    """Parses a fetched Calisphere Solr page into CalisphereSolrRecord objects."""

    record_cls = CalisphereSolrRecord
    validator = CalisphereSolrValidator

    def parse(self, api_response):
        """
        Extracts the docs from a Solr JSON response and builds records.

        Parameters:
            api_response: JSON text of a Solr query response

        Returns: the result of `get_records` for the docs under
            `response.docs` (an empty list when either key is absent)
        """
        solr_page = json.loads(api_response)
        docs = solr_page.get("response", {}).get("docs", [])
        return self.get_records(list(docs))
Loading