Skip to content

Commit

Permalink
Merge pull request #32 from openeduhub/niedersachsen_abi
Browse files Browse the repository at this point in the history
niedersachsen_abi_spider v0.0.2
  • Loading branch information
torsten-simon authored Oct 5, 2021
2 parents 8a4aa6a + 2c0171b commit 498ccfd
Show file tree
Hide file tree
Showing 9 changed files with 743 additions and 7 deletions.
29 changes: 26 additions & 3 deletions converter/es_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,27 @@ def setPermissions(self, uuid, permissions) -> bool:
return True
except ApiException as e:
return False
def setNodeBinaryData(self, uuid, item) -> bool:
if "binary" in item:
logging.info(get_project_settings().get("EDU_SHARING_BASE_URL")
+ "rest/node/v1/nodes/-home-/"
+ uuid
+ "/content?mimetype="
+ item["lom"]["technical"]["format"]
)
files = {"file": item["binary"]}
response = requests.post(
get_project_settings().get("EDU_SHARING_BASE_URL")
+ "rest/node/v1/nodes/-home-/"
+ uuid
+ "/content?mimetype="
+ item["lom"]["technical"]["format"],
headers=self.getHeaders(None),
files=files,
)
return response.status_code == 200
else:
return False

def setNodePreview(self, uuid, item) -> bool:
if "thumbnail" in item:
Expand Down Expand Up @@ -243,8 +264,9 @@ def transformItem(self, uuid, spider, item):
"ccm:objecttype": item["type"],
"ccm:replicationsourceuuid": uuid,
"cm:name": item["lom"]["general"]["title"],
"ccm:wwwurl": item["lom"]["technical"]["location"],
"cclom:location": item["lom"]["technical"]["location"],
"ccm:wwwurl": item["lom"]["technical"]["location"] if "location" in item["lom"]["technical"] else None,
"cclom:location": item["lom"]["technical"]["location"] if "location" in item["lom"]["technical"] else None,
"cclom:format": item["lom"]["technical"]["format"] if "format" in item["lom"]["technical"] else None,
"cclom:title": item["lom"]["general"]["title"],
}
if "notes" in item:
Expand Down Expand Up @@ -491,7 +513,8 @@ def insertItem(self, spider, uuid, item):
node = self.syncNode(spider, "ccm:io", self.transformItem(uuid, spider, item))
self.setNodePermissions(node["ref"]["id"], item)
self.setNodePreview(node["ref"]["id"], item)
self.setNodeText(node["ref"]["id"], item)
if not self.setNodeBinaryData(node["ref"]["id"], item):
self.setNodeText(node["ref"]["id"], item)

def updateItem(self, spider, uuid, item):
self.insertItem(spider, uuid, item)
Expand Down
5 changes: 4 additions & 1 deletion converter/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,18 @@ class BaseItem(Item):
ranking = Field()
fulltext = Field()
thumbnail = Field()
"thumbnail data in base64"
lastModified = Field()
lom = Field(serializer=LomBaseItem)
valuespaces = Field(serializer=ValuespaceItem)
permissions = Field(serializer=PermissionItem)
"permissions (access rights) for this entry"
license = Field(serializer=LicenseItem)
publisher = Field()
# editorial notes
notes = Field()
"editorial notes"
binary = Field()
"binary data which should be uploaded (raw data)"


class BaseItemLoader(ItemLoader):
Expand Down
6 changes: 3 additions & 3 deletions converter/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,9 @@ def process_item(self, raw_item, spider):
except KeyError:
raise DropItem(f'Item {item} has no lom.technical.location')
try:
if "location" not in item["lom"]["technical"]:
if "location" not in item["lom"]["technical"] and not "binary" in item:
raise DropItem(
"Entry {} has no technical location".format(item["lom"]["general"]["title"])
"Entry {} has no technical location or binary data".format(item["lom"]["general"]["title"])
)
except KeyError:
raise DropItem(f'Item {item} has no lom.technical.location')
Expand Down Expand Up @@ -519,7 +519,7 @@ def process_item(self, raw_item, spider):
title = "<no title>"
if "title" in item["lom"]["general"]:
title = str(item["lom"]["general"]["title"])
entryUUID = EduSharing.buildUUID(item["response"]["url"])
entryUUID = EduSharing.buildUUID(item["response"]["url"] if "url" in item["response"] else item["hash"])
self.insertItem(spider, entryUUID, item)
logging.info("item " + entryUUID + " inserted/updated")

Expand Down
209 changes: 209 additions & 0 deletions converter/spiders/niedersachsen_abi_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
import logging
import os
from _datetime import datetime

import scrapy

from .base_classes import LomBase
from .scripts.lower_saxony_abi.directory_routine import DirectoryInitializer, UnZipper, \
DirectoryScanner
from .scripts.lower_saxony_abi.keyword_mapper import LoSaxKeywordMapper
from ..constants import Constants
from ..items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \
LomLifecycleItemloader, LomEducationalItemLoader, LicenseItemLoader, ResponseItemLoader, \
ValuespaceItemLoader


class NiedersachsenAbiSpider(scrapy.Spider, LomBase):
name = 'niedersachsen_abi_spider'

allowed_domains = ['za-aufgaben.nibis.de']
start_urls = ['https://za-aufgaben.nibis.de']
version = "0.0.2"
# Default values for the 2 expected parameters. Parameter "filename" is always required, "skip_unzip" is optional.
filename = None
skip_unzip = False
pdf_dictionary_general = dict()
pdf_dictionary_additional = dict()

# Running the crawler from the command line with the exact filename as a parameter:
# scrapy crawl niedersachsen_abi_spider -a filename="za-download-6e05cbbb6e07250c69ebe95ae972fe8a.zip"
# -a skip_unzip="yes"
# Make sure that there is a corresponding .zip file inside the /zip_download/-folder in the project root

# def start_requests(self):
# yield self.parse(None)

def __init__(self, **kwargs):
super().__init__(**kwargs)
# logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s')
# logging.disable(logging.DEBUG)
directory_paths = DirectoryInitializer()
zip_file_dictionary = directory_paths.check_download_folder_for_zip_files()

# only extract files if a "filename"-parameter was given:
if self.filename is not None:
zip_selection = self.filename

logging.debug(f"Selected .zip file by CLI-parameter: {zip_selection}")
logging.debug(f"User wants to skip the unzipping? {self.skip_unzip}")
# by default, the script should always unzip the desired .zip file,
# but unzipping the nested .zip files is only done when requested by parameter
if self.skip_unzip == "no":
self.skip_unzip = False
if self.skip_unzip == "yes":
self.skip_unzip = True
logging.debug(f"skip_unzip variable: {self.skip_unzip}")

if self.skip_unzip is False:
un_zipper = UnZipper()
un_zipper.directory_paths = directory_paths.get_path_storage()
un_zipper.zip_file_dictionary = zip_file_dictionary
zip_file_chosen_by_user = \
un_zipper.compare_selected_zip_file_with_recognized_files(zip_selection=zip_selection)

if zip_file_chosen_by_user is not None:
un_zipper.unzip_all_zips_within_the_initial_zip(zip_file=zip_file_chosen_by_user,
skip_unzip=self.skip_unzip)

logging.debug(f"Extracted the following zip files:")
logging.debug(un_zipper.zip_files_already_extracted)

# always scan the /zip_extract/-directory for pdfs and try to extract metadata
print(
f"Analyzing file paths for '.pdf'-files inside "
f"{directory_paths.path_storage.path_to_extraction_directory}")
pdfs_in_directory: dict = \
DirectoryScanner.scan_directory_for_pdfs(directory_paths.path_storage.path_to_extraction_directory)
# logging.debug(pp.pformat(pdfs_in_directory))
print(f"Total .pdf items in the above mentioned directory: {len(pdfs_in_directory.keys())}")
if len(pdfs_in_directory.keys()) == 0:
raise Exception(f"No .pdf files found inside {directory_paths.path_storage.path_to_extraction_directory}. "
f"Please make sure that you've run the crawler with '-a filename=<zip filename>' "
f"parameter first and that there's actual .pdf files inside the extraction directory")
kw_mapper = LoSaxKeywordMapper()
pdf_dict1, pdf_dict2 = kw_mapper.extract_pdf_metadata(pdfs_in_directory)
self.pdf_dictionary_general = pdf_dict1
self.pdf_dictionary_additional = pdf_dict2

def getId(self, response=None) -> str:
pass

def getHash(self, response=None) -> str:
pass

def parse(self, response, **kwargs):
# print(f"filename = {self.filename}")
# print(f"skip_unzip = {self.skip_unzip}")
logging.debug(f"The .pdf (general) dictionary has {len(self.pdf_dictionary_general.keys())} files")
logging.debug(f"The dictionary for additional .pdf files has "
f"{len(self.pdf_dictionary_additional.keys())} entries")

# first we're scraping all the .pdf files that follow the more general RegEx syntax
for pdf_item in self.pdf_dictionary_general:
current_dict: dict = self.pdf_dictionary_general.get(pdf_item)
# pprint.pprint(current_dict)
base = BaseItemLoader()
base.add_value('sourceId', pdf_item)
hash_temp = str(f"{datetime.now().isoformat()}{self.version}")
base.add_value('hash', hash_temp)
base.add_value('type', Constants.TYPE_MATERIAL)
base.add_value('binary', self.get_binary(current_dict, pdf_item))

lom = LomBaseItemloader()

general = LomGeneralItemloader()
title_long: str = ' '.join(current_dict.get('keywords'))
general.add_value('title', title_long)
general.add_value('identifier', pdf_item)
general.add_value('keyword', current_dict.get('keywords'))
lom.add_value('general', general.load_item())

technical = LomTechnicalItemLoader()
technical.add_value('format', 'application/pdf')
lom.add_value('technical', technical.load_item())

lifecycle = LomLifecycleItemloader()
lifecycle.add_value('role', 'publisher')
lifecycle.add_value('organization', 'Niedersächsisches Kultusministerium')
lom.add_value('lifecycle', lifecycle.load_item())

educational = LomEducationalItemLoader()
lom.add_value('educational', educational.load_item())

base.add_value('lom', lom.load_item())

vs = ValuespaceItemLoader()
if current_dict.get('discipline') is not None:
vs.add_value('discipline', current_dict.get('discipline'))
if current_dict.get('intendedEndUserRole') is not None:
vs.add_value('intendedEndUserRole', current_dict.get('intendedEndUserRole'))
base.add_value('valuespaces', vs.load_item())

lic = LicenseItemLoader()
base.add_value('license', lic.load_item())

permissions = LomBase.getPermissions(self)
base.add_value('permissions', permissions.load_item())

response_loader = ResponseItemLoader()
base.add_value('response', response_loader.load_item())

yield base.load_item()

# Making sure that we also grab the additional .pdf files that don't follow the general filename syntax
for pdf_item in self.pdf_dictionary_additional:
current_dict: dict = self.pdf_dictionary_additional.get(pdf_item)
# pprint.pprint(current_dict)
base = BaseItemLoader()
base.add_value('sourceId', pdf_item)
hash_temp = str(f"{datetime.now().isoformat()}{self.version}")
base.add_value('hash', hash_temp)
base.add_value('type', Constants.TYPE_MATERIAL)
base.add_value('binary', self.get_binary(current_dict, pdf_item))

lom = LomBaseItemloader()

general = LomGeneralItemloader()
general.add_value('title', pdf_item.split('.')[:-1])
general.add_value('identifier', pdf_item)
general.add_value('keyword', current_dict.get('keywords'))
lom.add_value('general', general.load_item())

technical = LomTechnicalItemLoader()
technical.add_value('format', 'application/pdf')
lom.add_value('technical', technical.load_item())

lifecycle = LomLifecycleItemloader()
lifecycle.add_value('role', 'publisher')
lifecycle.add_value('organization', 'Niedersächsisches Kultusministerium')
lom.add_value('lifecycle', lifecycle.load_item())

educational = LomEducationalItemLoader()
lom.add_value('educational', educational.load_item())

base.add_value('lom', lom.load_item())

vs = ValuespaceItemLoader()
if current_dict.get('discipline') is not None:
vs.add_value('discipline', current_dict.get('discipline'))
base.add_value('valuespaces', vs.load_item())

lic = LicenseItemLoader()
base.add_value('license', lic.load_item())

permissions = LomBase.getPermissions(self)
base.add_value('permissions', permissions.load_item())

response_loader = ResponseItemLoader()
base.add_value('response', response_loader.load_item())

yield base.load_item()

@staticmethod
def get_binary(current_dict, pdf_item):
filepath_full = current_dict.get('pdf_path') + os.path.sep + pdf_item
file = open(filepath_full, mode='rb')
binary = file.read()
file.close()
return binary
Empty file.
Empty file.
Loading

0 comments on commit 498ccfd

Please sign in to comment.