Merge pull request #32 from openeduhub/niedersachsen_abi

niedersachsen_abi_spider v0.0.2
hpi-schul-cloud · Oct 5, 2021 · 498ccfd · 498ccfd
2 parents 8a4aa6a + 2c0171b
commit 498ccfd
Show file tree

Hide file tree

Showing 9 changed files with 743 additions and 7 deletions.
diff --git a/converter/es_connector.py b/converter/es_connector.py
@@ -168,6 +168,27 @@ def setPermissions(self, uuid, permissions) -> bool:
             return True
         except ApiException as e:
             return False
+    def setNodeBinaryData(self, uuid, item) -> bool:
+        """Upload the raw ``binary`` payload of *item* as the content of node *uuid*.
+
+        Returns True only when the item carries a ``binary`` entry and the POST
+        request is answered with HTTP status 200; returns False otherwise.
+        """
+        if "binary" not in item:
+            return False
+        # the same URL is logged and used as the POST target, so build it once
+        upload_url = (
+            get_project_settings().get("EDU_SHARING_BASE_URL")
+            + "rest/node/v1/nodes/-home-/"
+            + uuid
+            + "/content?mimetype="
+            + item["lom"]["technical"]["format"]
+        )
+        logging.info(upload_url)
+        response = requests.post(
+            upload_url,
+            headers=self.getHeaders(None),
+            files={"file": item["binary"]},
+        )
+        return response.status_code == 200
 
     def setNodePreview(self, uuid, item) -> bool:
         if "thumbnail" in item:
@@ -243,8 +264,9 @@ def transformItem(self, uuid, spider, item):
             "ccm:objecttype": item["type"],
             "ccm:replicationsourceuuid": uuid,
             "cm:name": item["lom"]["general"]["title"],
-            "ccm:wwwurl": item["lom"]["technical"]["location"],
-            "cclom:location": item["lom"]["technical"]["location"],
+            "ccm:wwwurl": item["lom"]["technical"]["location"] if "location" in item["lom"]["technical"] else None,
+            "cclom:location": item["lom"]["technical"]["location"] if "location" in item["lom"]["technical"] else None,
+            "cclom:format": item["lom"]["technical"]["format"] if "format" in item["lom"]["technical"] else None,
             "cclom:title": item["lom"]["general"]["title"],
         }
         if "notes" in item:
@@ -491,7 +513,8 @@ def insertItem(self, spider, uuid, item):
         node = self.syncNode(spider, "ccm:io", self.transformItem(uuid, spider, item))
         self.setNodePermissions(node["ref"]["id"], item)
         self.setNodePreview(node["ref"]["id"], item)
-        self.setNodeText(node["ref"]["id"], item)
+        if not self.setNodeBinaryData(node["ref"]["id"], item):
+            self.setNodeText(node["ref"]["id"], item)
 
     def updateItem(self, spider, uuid, item):
         self.insertItem(spider, uuid, item)

diff --git a/converter/items.py b/converter/items.py
@@ -181,15 +181,18 @@ class BaseItem(Item):
     ranking = Field()
     fulltext = Field()
     thumbnail = Field()
+    "thumbnail data in base64"
     lastModified = Field()
     lom = Field(serializer=LomBaseItem)
     valuespaces = Field(serializer=ValuespaceItem)
     permissions = Field(serializer=PermissionItem)
     "permissions (access rights) for this entry"
     license = Field(serializer=LicenseItem)
     publisher = Field()
-    # editorial notes
     notes = Field()
+    "editorial notes"
+    binary = Field()
+    "binary data which should be uploaded (raw data)"
 
 
 class BaseItemLoader(ItemLoader):

diff --git a/converter/pipelines.py b/converter/pipelines.py
@@ -112,9 +112,9 @@ def process_item(self, raw_item, spider):
         except KeyError:
             raise DropItem(f'Item {item} has no lom.technical.location')
         try:
-            if "location" not in item["lom"]["technical"]:
+            if "location" not in item["lom"]["technical"] and not "binary" in item:
                 raise DropItem(
-                    "Entry {} has no technical location".format(item["lom"]["general"]["title"])
+                    "Entry {} has no technical location or binary data".format(item["lom"]["general"]["title"])
                 )
         except KeyError:
             raise DropItem(f'Item {item} has no lom.technical.location')
@@ -519,7 +519,7 @@ def process_item(self, raw_item, spider):
         title = "<no title>"
         if "title" in item["lom"]["general"]:
             title = str(item["lom"]["general"]["title"])
-        entryUUID = EduSharing.buildUUID(item["response"]["url"])
+        entryUUID = EduSharing.buildUUID(item["response"]["url"] if "url" in item["response"] else item["hash"])
         self.insertItem(spider, entryUUID, item)
         logging.info("item " + entryUUID + " inserted/updated")
 

diff --git a/converter/spiders/niedersachsen_abi_spider.py b/converter/spiders/niedersachsen_abi_spider.py
@@ -0,0 +1,209 @@
+import logging
+import os
+from _datetime import datetime
+
+import scrapy
+
+from .base_classes import LomBase
+from .scripts.lower_saxony_abi.directory_routine import DirectoryInitializer, UnZipper, \
+    DirectoryScanner
+from .scripts.lower_saxony_abi.keyword_mapper import LoSaxKeywordMapper
+from ..constants import Constants
+from ..items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \
+    LomLifecycleItemloader, LomEducationalItemLoader, LicenseItemLoader, ResponseItemLoader, \
+    ValuespaceItemLoader
+
+
+class NiedersachsenAbiSpider(scrapy.Spider, LomBase):
+    """Spider that turns locally extracted .pdf files into items with attached binary data.
+
+    The .pdf files are taken from a .zip file in the download folder (selected via the
+    ``filename`` spider argument); ``__init__`` extracts and indexes them, ``parse`` yields
+    one item per indexed .pdf file.
+    """
+    name = 'niedersachsen_abi_spider'
+
+    allowed_domains = ['za-aufgaben.nibis.de']
+    start_urls = ['https://za-aufgaben.nibis.de']
+    version = "0.0.2"
+    # Default values for the 2 expected parameters. Parameter "filename" is always required, "skip_unzip" is optional.
+    filename = None
+    skip_unzip = False
+    # NOTE(review): these class-level dicts are only placeholders; __init__ rebinds both
+    # attributes on the instance with the metadata returned by the keyword mapper.
+    pdf_dictionary_general = dict()
+    pdf_dictionary_additional = dict()
+
+    # Running the crawler from the command line with the exact filename as a parameter:
+    #   scrapy crawl niedersachsen_abi_spider -a filename="za-download-6e05cbbb6e07250c69ebe95ae972fe8a.zip"
+    #   -a skip_unzip="yes"
+    # Make sure that there is a corresponding .zip file inside the /zip_download/-folder in the project root
+
+    # def start_requests(self):
+    #    yield self.parse(None)
+
+    def __init__(self, **kwargs):
+        """Optionally unzip the selected download, then index all extracted .pdf files.
+
+        Spider arguments (``-a name=value`` on the command line) are read from the
+        instance after ``super().__init__``:
+
+        * ``filename``: name of the .zip file that should be extracted
+        * ``skip_unzip``: "yes" skips the extraction step, anything else extracts
+
+        :raises FileNotFoundError: if the extraction directory contains no .pdf files
+        """
+        super().__init__(**kwargs)
+        directory_paths = DirectoryInitializer()
+        zip_file_dictionary = directory_paths.check_download_folder_for_zip_files()
+
+        # only extract files if a "filename"-parameter was given:
+        if self.filename is not None:
+            zip_selection = self.filename
+
+            logging.debug(f"Selected .zip file by CLI-parameter: {zip_selection}")
+            logging.debug(f"User wants to skip the unzipping? {self.skip_unzip}")
+            # Spider arguments arrive as strings. Normalise them to a real bool so that
+            # spellings such as "No" or "false" no longer fall through as a non-empty
+            # string, which used to disable the extraction by accident.
+            if isinstance(self.skip_unzip, str):
+                self.skip_unzip = self.skip_unzip.strip().lower() in ("yes", "y", "true", "1")
+            logging.debug(f"skip_unzip variable: {self.skip_unzip}")
+
+            if not self.skip_unzip:
+                un_zipper = UnZipper()
+                un_zipper.directory_paths = directory_paths.get_path_storage()
+                un_zipper.zip_file_dictionary = zip_file_dictionary
+                zip_file_chosen_by_user = \
+                    un_zipper.compare_selected_zip_file_with_recognized_files(zip_selection=zip_selection)
+
+                if zip_file_chosen_by_user is not None:
+                    un_zipper.unzip_all_zips_within_the_initial_zip(zip_file=zip_file_chosen_by_user,
+                                                                    skip_unzip=self.skip_unzip)
+
+                    logging.debug("Extracted the following zip files:")
+                    logging.debug(un_zipper.zip_files_already_extracted)
+                else:
+                    # make the skipped extraction visible instead of continuing silently
+                    logging.warning(f"No recognized .zip file matches the selection: {zip_selection}")
+
+        # always scan the /zip_extract/-directory for pdfs and try to extract metadata
+        extraction_directory = directory_paths.path_storage.path_to_extraction_directory
+        print(
+            f"Analyzing file paths for '.pdf'-files inside "
+            f"{extraction_directory}")
+        pdfs_in_directory: dict = DirectoryScanner.scan_directory_for_pdfs(extraction_directory)
+        print(f"Total .pdf items in the above mentioned directory: {len(pdfs_in_directory.keys())}")
+        if not pdfs_in_directory:
+            raise FileNotFoundError(
+                f"No .pdf files found inside {extraction_directory}. "
+                f"Please make sure that you've run the crawler with '-a filename=<zip filename>' "
+                f"parameter first and that there's actual .pdf files inside the extraction directory")
+        kw_mapper = LoSaxKeywordMapper()
+        pdf_dict1, pdf_dict2 = kw_mapper.extract_pdf_metadata(pdfs_in_directory)
+        self.pdf_dictionary_general = pdf_dict1
+        self.pdf_dictionary_additional = pdf_dict2
+
+    def getId(self, response=None) -> str:
+        """Placeholder: ``parse`` sets 'sourceId' on the item loader itself. Always returns None."""
+        return None
+
+    def getHash(self, response=None) -> str:
+        """Placeholder: ``parse`` sets 'hash' on the item loader itself. Always returns None."""
+        return None
+
+    def parse(self, response, **kwargs):
+        """Yield one item per .pdf file indexed in ``__init__``.
+
+        *response* is not read: all data comes from ``pdf_dictionary_general`` and
+        ``pdf_dictionary_additional``.
+        """
+        logging.debug(f"The .pdf (general) dictionary has {len(self.pdf_dictionary_general.keys())} files")
+        logging.debug(f"The dictionary for additional .pdf files has "
+                      f"{len(self.pdf_dictionary_additional.keys())} entries")
+
+        # first we're scraping all the .pdf files that follow the more general RegEx syntax
+        for pdf_item, current_dict in self.pdf_dictionary_general.items():
+            title_long: str = ' '.join(current_dict.get('keywords'))
+            yield self._build_item(pdf_item, current_dict, title=title_long, with_end_user_role=True)
+
+        # Making sure that we also grab the additional .pdf files that don't follow the general filename syntax
+        for pdf_item, current_dict in self.pdf_dictionary_additional.items():
+            # The title is the filename without its extension. splitext returns a single
+            # string and keeps dots inside the name, whereas split('.')[:-1] produced a
+            # list of fragments (and an empty list for names without an extension).
+            title_short: str = os.path.splitext(pdf_item)[0]
+            yield self._build_item(pdf_item, current_dict, title=title_short, with_end_user_role=False)
+
+    def _build_item(self, pdf_item, current_dict, title, with_end_user_role):
+        """Assemble the item for a single .pdf file; shared by both loops of ``parse``.
+
+        :param pdf_item: filename of the .pdf, used as 'sourceId' and LOM identifier
+        :param current_dict: metadata of that file ('keywords', 'discipline', 'pdf_path', ...)
+        :param title: value for the LOM general title
+        :param with_end_user_role: also copy 'intendedEndUserRole' into the valuespaces
+        :return: the loaded base item
+        """
+        base = BaseItemLoader()
+        base.add_value('sourceId', pdf_item)
+        # NOTE(review): the hash contains the current timestamp, so it differs on every run
+        base.add_value('hash', f"{datetime.now().isoformat()}{self.version}")
+        base.add_value('type', Constants.TYPE_MATERIAL)
+        base.add_value('binary', self.get_binary(current_dict, pdf_item))
+
+        lom = LomBaseItemloader()
+
+        general = LomGeneralItemloader()
+        general.add_value('title', title)
+        general.add_value('identifier', pdf_item)
+        general.add_value('keyword', current_dict.get('keywords'))
+        lom.add_value('general', general.load_item())
+
+        technical = LomTechnicalItemLoader()
+        technical.add_value('format', 'application/pdf')
+        lom.add_value('technical', technical.load_item())
+
+        lifecycle = LomLifecycleItemloader()
+        lifecycle.add_value('role', 'publisher')
+        lifecycle.add_value('organization', 'Niedersächsisches Kultusministerium')
+        lom.add_value('lifecycle', lifecycle.load_item())
+
+        educational = LomEducationalItemLoader()
+        lom.add_value('educational', educational.load_item())
+
+        base.add_value('lom', lom.load_item())
+
+        vs = ValuespaceItemLoader()
+        if current_dict.get('discipline') is not None:
+            vs.add_value('discipline', current_dict.get('discipline'))
+        if with_end_user_role and current_dict.get('intendedEndUserRole') is not None:
+            vs.add_value('intendedEndUserRole', current_dict.get('intendedEndUserRole'))
+        base.add_value('valuespaces', vs.load_item())
+
+        lic = LicenseItemLoader()
+        base.add_value('license', lic.load_item())
+
+        permissions = LomBase.getPermissions(self)
+        base.add_value('permissions', permissions.load_item())
+
+        response_loader = ResponseItemLoader()
+        base.add_value('response', response_loader.load_item())
+
+        return base.load_item()
+
+    @staticmethod
+    def get_binary(current_dict, pdf_item):
+        """Return the raw bytes of the file *pdf_item* inside ``current_dict['pdf_path']``.
+
+        :param current_dict: metadata dictionary holding the directory under 'pdf_path'
+        :param pdf_item: filename of the .pdf inside that directory
+        :return: the complete file content as bytes
+        """
+        filepath_full = os.path.join(current_dict.get('pdf_path'), pdf_item)
+        # the context manager closes the file even if read() raises
+        with open(filepath_full, mode='rb') as file:
+            return file.read()
diff --git a/converter/spiders/scripts/__init__.py b/converter/spiders/scripts/__init__.py
diff --git a/converter/spiders/scripts/lower_saxony_abi/__init__.py b/converter/spiders/scripts/lower_saxony_abi/__init__.py