Showing 3 changed files with 174 additions and 10 deletions.
@@ -0,0 +1,159 @@
import json
import re
from xml.etree import ElementTree

import requests
from requests.auth import HTTPBasicAuth

from .Fetcher import Fetcher

class PraFetcher(Fetcher):

    BASE_URL = "https://us.preservica.com/api/entity/v6.0"

    NAMESPACES = {"pra": "http://preservica.com/EntityAPI/v6.0"}

    def __init__(self, params):
        super().__init__(params)

        # If `next_url` is a param, this is not the fetch of the first page:
        # restore the saved attributes and skip the first-page setup below.
        if "next_url" in params:
            for key in params:
                setattr(self, key, params[key])
            return

        credentials = params.get("harvest_data").get("harvest_extra_data")
        self.basic_auth_credentials = [v.strip() for v in credentials.split(",")]
        self.internal_collection_id = re.search(
            r"(?<=SO_)[0-9a-f-]+",
            params.get("harvest_data").get("url")).group(0)
        self.original_url = (f"{self.BASE_URL}/structural-objects/"
                             f"{self.internal_collection_id}/children")
        self.next_url = self.get_first_page_url()

    def get_first_page_url(self):
        """
        Two possibilities exist:
        1) `original_url` contains a list of IO children and is itself the
           first page of results, or
        2) `original_url` is a list of SO children (hopefully containing a
           single item), and reaching the list of IO children takes a second
           request: the request to `original_url` returns a ChildrenResponse,
           from which the first SO Child's URL is extracted; a request to
           that URL returns an EntityResponse, from which the first-page URL
           is taken from the AdditionalInformation/Children text node.
        """
        request = self.build_url_request(self.original_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        # If we have IO (Information Object) children, then this is the
        # first page. Otherwise, we have to continue digging.
        io_children = root.findall(".//pra:Child[@type='IO']", self.NAMESPACES)
        if len(io_children) > 0:
            return self.original_url

        child_url = root.find(".//pra:Child[@type='SO']", self.NAMESPACES).text

        request = self.build_url_request(child_url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)

        return root.find(".//pra:AdditionalInformation/pra:Children",
                         self.NAMESPACES).text
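
    # For reference, a hedged sketch of the two response shapes the method
    # above walks. The element names come from the XPath queries in this
    # class; any nesting beyond that is an assumption, not taken from
    # Preservica's documentation:
    #
    #   ChildrenResponse (from original_url):
    #     <ChildrenResponse xmlns="http://preservica.com/EntityAPI/v6.0">
    #       <Children>
    #         <Child type="SO">https://.../structural-objects/{id}</Child>
    #       </Children>
    #     </ChildrenResponse>
    #
    #   EntityResponse (from the SO Child's URL):
    #     <EntityResponse xmlns="http://preservica.com/EntityAPI/v6.0">
    #       <AdditionalInformation>
    #         <Children>https://.../structural-objects/{id}/children</Children>
    #       </AdditionalInformation>
    #     </EntityResponse>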

    def build_fetch_request(self):
        request = self.build_url_request(self.next_url)

        print(
            f"[{self.collection_id}]: Fetching page {self.write_page} "
            f"at {request.get('url')}")

        return request

    def get_text_from_response(self, response):
        # Starting with a list of `information-objects` URLs
        object_url_elements = ElementTree.fromstring(response.text).findall(
            "pra:Children/pra:Child", self.NAMESPACES)

        object_urls = [element.text for element in object_url_elements]

        # Getting each individual `information-object` and extracting the
        # URL of its oai_dc metadata fragment
        metadata_urls = {object_url: self.get_metadata_url_from_object(object_url)
                         for object_url in object_urls}

        # Getting the metadata itself
        items = {object_url: self.get_metadata_from_url(metadata_url)
                 for (object_url, metadata_url) in metadata_urls.items()
                 if metadata_url is not None}

        # Replace each object URL text node with the metadata fetched for it
        output_document = response.text

        for search, replace in items.items():
            output_document = output_document.replace(search, replace)

        return output_document
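
    # The loop above splices metadata into the page by plain string
    # substitution, so each <Child> text node that held an object URL ends
    # up holding that object's metadata response instead. A hedged
    # illustration (URL and metadata shapes assumed; the MetadataResponse
    # name comes from the TODO'd code below):
    #   before: <Child type="IO">https://.../information-objects/{id}</Child>
    #   after:  <Child type="IO"><MetadataResponse>...oai_dc...</MetadataResponse></Child>
    # The returned text is therefore no longer the literal API response.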

    def get_metadata_url_from_object(self, url):
        request = self.build_url_request(url)
        response = requests.get(**request)
        root = ElementTree.fromstring(response.text)
        fragment = root.find(
            ".//pra:Fragment[@schema='http://www.openarchives.org/OAI/2.0/oai_dc/']",
            self.NAMESPACES)

        return fragment.text if fragment is not None else None

    def get_metadata_from_url(self, url):
        request = self.build_url_request(url)
        response = requests.get(**request)
        return response.text

        # TODO: strip superfluous junk out of metadata XML
        # root = ElementTree.fromstring(response.text)
        # metadata_response = root.find(".//pra:MetadataResponse", self.NAMESPACES)
        #
        # return metadata_response.text if metadata_response is not None else None

    def check_page(self, http_resp):
        """
        Log the hit count for the fetched page and accept it.

        TODO: review the other fetchers and validate pages the way they do.
        """
        hits = len(ElementTree.fromstring(http_resp.content)
                   .findall(".//pra:Child", self.NAMESPACES))

        print(
            f"[{self.collection_id}]: Fetched page {self.write_page} "
            f"at {http_resp.url} with {hits} hits"
        )

        return True

    def increment(self, http_resp):
        """
        Advance to the next page: let the base class do its bookkeeping,
        then read the next page's URL from the response's Paging/Next
        element (None when no more pages remain).
        """
        super().increment(http_resp)

        next_element = ElementTree.fromstring(http_resp.content).find(
            "pra:Paging/pra:Next", self.NAMESPACES)
        self.next_url = next_element.text if next_element is not None else None
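
    # Hedged sketch of the paging element read above (names come from the
    # XPath query; the exact shape and query string are assumptions):
    #   <Paging>
    #     <Next>https://.../structural-objects/{id}/children?start=100</Next>
    #   </Paging>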

    def json(self):
        """
        Serialize the fetcher's current state so the harvest can resume
        with the next page; the state is marked finished when there is no
        next_url left.
        """
        current_state = {
            "harvest_type": self.harvest_type,
            "basic_auth_credentials": self.basic_auth_credentials,
            "collection_id": self.collection_id,
            "internal_collection_id": self.internal_collection_id,
            "original_url": self.original_url,
            "next_url": self.next_url,
            "write_page": self.write_page
        }

        if not self.next_url:
            current_state.update({"finished": True})

        return json.dumps(current_state)
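
    # The state above round-trips through __init__: when it comes back as
    # params (with "next_url" present), __init__ restores the saved
    # attributes and skips first-page discovery. A hedged illustration:
    #   resumed = PraFetcher(json.loads(fetcher.json()))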

    def build_url_request(self, url):
        return {"url": url, "auth": HTTPBasicAuth(*self.basic_auth_credentials)}