[WIP] Rough out PRA fetcher
lthurston committed Mar 29, 2023
1 parent 5c13baa commit 3c6d140
Showing 3 changed files with 172 additions and 7 deletions.
15 changes: 10 additions & 5 deletions metadata_fetcher/fetchers/Fetcher.py
@@ -78,29 +78,34 @@ def fetch_page(self):
            raise FetchError(
                f"[{self.collection_id}]: unable to fetch page {page}")

        if self.check_page(response):
            text = self.get_text_from_response(response)
            if settings.DATA_DEST == 'local':
                self.fetchtolocal(text)
            else:
                self.fetchtos3(text)

        self.increment(response)

        return self.json()

    def get_text_from_response(self, response):
        """hook for subclasses that need to transform the response body
        before it is written; defaults to the raw response text"""
        return response.text

    def build_fetch_request(self):
        """build parameters for the institution's requests.get()
        this should minimally return {'url': str} but may also include
        {'headers': {}, 'params': {}} or any other options accepted by
        https://docs.python-requests.org/en/latest/api/#requests.get
        """
        pass

    def get_records(self, http_resp):
        """parses http_resp from institutional API into a list of records
        should return a list of dictionaries which can easily be serialized
        by json.dumps into json line format; takes as an argument:
        https://docs.python-requests.org/en/latest/api/#requests.Response
        """
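
The new `get_text_from_response` hook gives subclasses a place to rewrite a page's body before it is written to local disk or S3. A minimal sketch of an override, assuming a hypothetical subclass (the class name and transformation are illustrative, not part of this commit):

from .Fetcher import Fetcher


class ExampleFetcher(Fetcher):
    def get_text_from_response(self, response):
        # hypothetical: normalize line endings before the page is persisted
        return response.text.replace("\r\n", "\n")

PraFetcher below overrides the same hook to do something much heavier: it rewrites the page's XML before the page is saved.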
4 changes: 2 additions & 2 deletions metadata_fetcher/fetchers/oac_fetcher.py
@@ -43,9 +43,9 @@ def __init__(self, params):
        # https://stackoverflow.com/questions/20129996/why-does-boolxml-etree-elementtree-element-evaluate-to-false
        counts = {
            'total': total.attrib['totalDocs'] if total else 0,
            'image': int(image_group.attrib['totalDocs'])
            if image_group is not None else 0,
            'text': int(text_group.attrib['totalDocs'])
            if text_group is not None else 0,
            'harvested': 0,
            'harvested_image': 0,
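
The Stack Overflow link in that comment concerns an ElementTree gotcha: an Element with no children is falsy, so a bare `if total` can treat a found element as missing, which is why the `image` and `text` keys test `is not None` instead. A self-contained illustration (the sample XML is made up for the demo):

from xml.etree import ElementTree

root = ElementTree.fromstring('<facets><total totalDocs="42"/></facets>')
total = root.find('total')

print(bool(total))        # False -- a childless Element is falsy
print(total is not None)  # True -- the element was found
print(total.attrib['totalDocs'] if total is not None else 0)  # prints 42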
160 changes: 160 additions & 0 deletions metadata_fetcher/fetchers/pra_fetcher.py
@@ -0,0 +1,160 @@
import json
import re
from xml.etree import ElementTree

import requests
from requests.auth import HTTPBasicAuth

from .Fetcher import Fetcher


class PraFetcher(Fetcher):

    BASE_URL = "https://us.preservica.com/api/entity/v6.0"

    NAMESPACES = {"pra": "http://preservica.com/EntityAPI/v6.0"}

    def __init__(self, params):
        super(PraFetcher, self).__init__(params)

        # If `next_url` is a param, we know this is not the fetch of the
        # first page, so skip deriving the first-page attributes below
        if "next_url" in params:
            for key in params:
                setattr(self, key, params[key])
            return

        # harvest_extra_data is expected to hold HTTP Basic auth
        # credentials as "username,password"
        credentials = params.get("harvest_data").get("harvest_extra_data")
        self.basic_auth_credentials = [v.strip() for v in credentials.split(',')]
        self.internal_collection_id = re.search(
            r"(?<=SO_)[0-9a-f-]+",
            params.get("harvest_data").get("url")).group(0)
        self.original_url = (f"{self.BASE_URL}/structural-objects/"
                             f"{self.internal_collection_id}/children")
        self.next_url = self.get_first_page_url()

    def get_first_page_url(self):
        """
        Two possibilities exist:
        1) `original_url` returns a list of IO children and is itself the
           first page of results, or
        2) `original_url` returns a list of SO children (hopefully containing
           a single item), and reaching the first page of items takes two
           requests: the first, to `original_url`, returns a ChildrenResponse,
           from which the first Child's URL is extracted; the second returns
           an EntityResponse, from which the first-page URL is taken from the
           AdditionalInformation/Children text node.
        """
        request = self.build_url_request(self.original_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        # If we have IO (Information Object) children, then this is the
        # first page. Otherwise, we have to continue digging.
        io_children = root.findall(".//pra:Child[@type='IO']", self.NAMESPACES)
        if len(io_children) > 0:
            return self.original_url

        child_url = root.find(".//pra:Child[@type='SO']", self.NAMESPACES).text

        request = self.build_url_request(child_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        return root.find(".//pra:AdditionalInformation/pra:Children",
                         self.NAMESPACES).text
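
    # For reference, roughly the response shapes the method above assumes;
    # the element contents are hypothetical, not captured from the API:
    #
    #   ChildrenResponse:
    #     <ChildrenResponse xmlns="http://preservica.com/EntityAPI/v6.0">
    #       <Children>
    #         <Child type="SO">https://.../structural-objects/...</Child>
    #       </Children>
    #     </ChildrenResponse>
    #
    #   EntityResponse:
    #     <EntityResponse xmlns="http://preservica.com/EntityAPI/v6.0">
    #       <AdditionalInformation>
    #         <Children>https://.../structural-objects/.../children</Children>
    #       </AdditionalInformation>
    #     </EntityResponse>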

    def build_fetch_request(self):
        request = self.build_url_request(self.next_url)

        print(
            f"[{self.collection_id}]: Fetching page {self.write_page} "
            f"at {request.get('url')}")

        return request

    def get_text_from_response(self, response):
        # Starting with a list of `information-objects` URLs
        object_url_elements = ElementTree.fromstring(response.text).findall(
            "pra:Children/pra:Child", self.NAMESPACES)

        object_urls = [element.text for element in object_url_elements]

        # Getting an individual `information-object`, extracting the URL
        # for the oai_dc metadata fragment
        metadata_urls = {object_url: self.get_metadata_url_from_object(object_url)
                         for object_url in object_urls}

        # Getting the metadata
        items = {object_url: self.get_metadata_from_url(metadata_url)
                 for (object_url, metadata_url) in metadata_urls.items()
                 if metadata_url is not None}
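
        # Illustration with hypothetical values: `items` might map an object
        # URL such as "https://.../information-objects/<ref>" to that
        # object's oai_dc metadata XML, so the loop below swaps each object
        # URL text node in the page for its metadata.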

        # TODO: replace this string-based XML generation with an ET-based
        # approach; replace each XML text node with the metadata response
        output_document = response.text

        for search, replace in items.items():
            output_document = output_document.replace(search, replace)

        return output_document

    def get_metadata_url_from_object(self, url):
        request = self.build_url_request(url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)
        fragment = root.find(
            ".//pra:Fragment[@schema='http://www.openarchives.org/OAI/2.0/oai_dc/']",
            self.NAMESPACES)

        return fragment.text if fragment is not None else None

    def get_metadata_from_url(self, url):
        request = self.build_url_request(url)
        response = requests.get(**request)
        return response.text

        # TODO: strip superfluous junk out of metadata XML
        # root = ElementTree.fromstring(response.text)
        # metadata_response = root.find(".//pra:MetadataResponse", self.NAMESPACES)
        #
        # return metadata_response.text if metadata_response is not None else None

    def check_page(self, http_resp):
        """
        TODO: review other fetchers, do what they do
        """
        hits = len(ElementTree.fromstring(http_resp.content)
                   .findall(".//pra:Child", self.NAMESPACES))

        print(
            f"[{self.collection_id}]: Fetched page {self.write_page} "
            f"at {http_resp.url} with {hits} hits"
        )

        return True

    def increment(self, http_resp):
        """
        Sets `next_url` from the page's Paging/Next element, if present
        """
        super(PraFetcher, self).increment(http_resp)

        next_element = ElementTree.fromstring(http_resp.content).find(
            "pra:Paging/pra:Next", self.NAMESPACES)
        self.next_url = next_element.text if next_element is not None else None

    def json(self):
        """
        Serializes the fetcher's state for the next invocation, marking the
        fetch finished when there is no `next_url`
        """
        current_state = {
            "harvest_type": self.harvest_type,
            "basic_auth_credentials": self.basic_auth_credentials,
            "collection_id": self.collection_id,
            "internal_collection_id": self.internal_collection_id,
            "original_url": self.original_url,
            "next_url": self.next_url,
            "write_page": self.write_page
        }

        if not self.next_url:
            current_state.update({"finished": True})

        return json.dumps(current_state)

    def build_url_request(self, url):
        return {"url": url, "auth": HTTPBasicAuth(*self.basic_auth_credentials)}
