From 9ae70ca096003a0356c845a6bb30113a43f7495f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Wed, 15 Sep 2021 18:43:40 +0200 Subject: [PATCH 01/10] first draft of niedersachsen_abi_spider.py --- converter/spiders/niedersachsen_abi_spider.py | 172 ++++++++++++ converter/spiders/scripts/__init__.py | 0 .../scripts/lower_saxony_abi/__init__.py | 0 .../lower_saxony_abi/directory_routine.py | 219 +++++++++++++++ .../lower_saxony_abi/keyword_mapper.py | 257 ++++++++++++++++++ 5 files changed, 648 insertions(+) create mode 100644 converter/spiders/niedersachsen_abi_spider.py create mode 100644 converter/spiders/scripts/__init__.py create mode 100644 converter/spiders/scripts/lower_saxony_abi/__init__.py create mode 100644 converter/spiders/scripts/lower_saxony_abi/directory_routine.py create mode 100644 converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py diff --git a/converter/spiders/niedersachsen_abi_spider.py b/converter/spiders/niedersachsen_abi_spider.py new file mode 100644 index 00000000..b6f69c06 --- /dev/null +++ b/converter/spiders/niedersachsen_abi_spider.py @@ -0,0 +1,172 @@ +import os +from _datetime import datetime +import logging +import pprint + +import scrapy + +from .scripts.lower_saxony_abi.directory_routine import DirectoryInitializer, UnZipper, \ + DirectoryScanner +from .scripts.lower_saxony_abi.keyword_mapper import LoSaxKeywordMapper +from ..items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \ + LomLifecycleItemloader, LomEducationalItemLoader, LicenseItemLoader, PermissionItemLoader, ResponseItemLoader, \ + ValuespaceItemLoader + + +class NiedersachsenAbiSpider(scrapy.Spider): + name = 'niedersachsen_abi_spider' + + allowed_domains = ['https://za-aufgaben.nibis.de'] + start_urls = ['https://za-aufgaben.nibis.de'] + version = "0.0.1" + # Default values for the 2 expected parameters. filename is always required, skip_unzip optional. + filename = None + skip_unzip = False + pdf_dictionary_general = dict() + pdf_dictionary_additional = dict() + + def __init__(self, **kwargs): + super().__init__(**kwargs) + logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') + # logging.disable(logging.DEBUG) + if self.filename is not None: + zip_selection = self.filename + + logging.debug(f"Selected .zip file by CLI-parameter: {zip_selection}") + logging.debug(f"User wants to skip the unzipping? 
{self.skip_unzip}") + # by default, the script should always unzip the desired .zip file + if self.skip_unzip == "no": + self.skip_unzip = False + if self.skip_unzip == "yes": + self.skip_unzip = True + logging.debug(f"skip_unzip variable: {self.skip_unzip}") + + directory_paths = DirectoryInitializer() + zip_file_dictionary = directory_paths.check_download_folder_for_zip_files() + + if self.skip_unzip is False: + un_zipper = UnZipper() + un_zipper.directory_paths = directory_paths.get_path_storage() + un_zipper.zip_file_dictionary = zip_file_dictionary + zip_file_chosen_by_user = un_zipper.show_zip_list(zip_selection=zip_selection) + + if zip_file_chosen_by_user is not None: + un_zipper.unzip_all_zips_within_the_initial_zip(zip_file=zip_file_chosen_by_user, + skip_unzip=self.skip_unzip) + + logging.debug(f"Extracted the following zip files:") + logging.debug(un_zipper.zip_files_already_extracted) + + print( + f"Analyzing file paths for '.pdf'-files inside " + f"{directory_paths.path_storage.path_to_extraction_directory}") + pdfs_in_directory: dict = \ + DirectoryScanner.scan_directory_for_pdfs(directory_paths.path_storage.path_to_extraction_directory) + # logging.debug(pp.pformat(pdfs_in_directory)) + print(f"Total .pdf items in the above mentioned directory: {len(pdfs_in_directory.keys())}") + + kw_mapper = LoSaxKeywordMapper() + pdf_dict1, pdf_dict2 = kw_mapper.extract_pdf_metadata(pdfs_in_directory) + self.pdf_dictionary_general = pdf_dict1 + self.pdf_dictionary_additional = pdf_dict2 + + def parse(self, response, **kwargs): + print(f"Hello world!") + print(f"filename = {self.filename}") + print(f"skip_unzip = {self.skip_unzip}") + print(f"The .pdf (general) dictionary has {len(self.pdf_dictionary_general.keys())} files") + print(f"The dictionary for additional .pdf files has {len(self.pdf_dictionary_additional.keys())} entries") + + # first we're scraping all the .pdf files that follow the more general RegEx syntax + for pdf_item in self.pdf_dictionary_general: + current_dict: dict = self.pdf_dictionary_general.get(pdf_item) + pprint.pprint(current_dict) + base = BaseItemLoader() + base.add_value('sourceId', pdf_item) + hash_temp = str(f"{datetime.now().isoformat()}{self.version}") + base.add_value('hash', hash_temp) + + lom = LomBaseItemloader() + + general = LomGeneralItemloader() + general.add_value('title', pdf_item) + general.add_value('identifier', pdf_item) + general.add_value('keyword', current_dict.get('keywords')) + lom.add_value('general', general.load_item()) + + technical = LomTechnicalItemLoader() + filepath_full = current_dict.get('pdf_path') + os.path.sep + pdf_item + technical.add_value('location', filepath_full) + lom.add_value('technical', technical.load_item()) + + lifecycle = LomLifecycleItemloader() + lom.add_value('lifecycle', lifecycle.load_item()) + + educational = LomEducationalItemLoader() + lom.add_value('educational', educational.load_item()) + + base.add_value('lom', lom.load_item()) + + vs = ValuespaceItemLoader() + if current_dict.get('discipline') is not None: + vs.add_value('discipline', current_dict.get('discipline')) + if current_dict.get('intendedEndUserRole') is not None: + vs.add_value('intendedEndUserRole', current_dict.get('intendedEndUserRole')) + base.add_value('valuespaces', vs.load_item()) + + lic = LicenseItemLoader() + base.add_value('license', lic.load_item()) + + permissions = PermissionItemLoader() + base.add_value('permissions', permissions.load_item()) + + response_loader = ResponseItemLoader() + base.add_value('response', 
response_loader.load_item()) + + yield base.load_item() + + # Making sure that we also grab the additional .pdf files that don't follow the general filename syntax + for pdf_item in self.pdf_dictionary_additional: + current_dict: dict = self.pdf_dictionary_additional.get(pdf_item) + pprint.pprint(current_dict) + base = BaseItemLoader() + base.add_value('sourceId', pdf_item) + hash_temp = str(f"{datetime.now().isoformat()}{self.version}") + base.add_value('hash', hash_temp) + + lom = LomBaseItemloader() + + general = LomGeneralItemloader() + general.add_value('title', pdf_item) + general.add_value('identifier', pdf_item) + general.add_value('keyword', current_dict.get('keywords')) + lom.add_value('general', general.load_item()) + + technical = LomTechnicalItemLoader() + filepath_full = current_dict.get('pdf_path') + os.path.sep + pdf_item + technical.add_value('location', filepath_full) + lom.add_value('technical', technical.load_item()) + + lifecycle = LomLifecycleItemloader() + lom.add_value('lifecycle', lifecycle.load_item()) + + educational = LomEducationalItemLoader() + lom.add_value('educational', educational.load_item()) + + base.add_value('lom', lom.load_item()) + + vs = ValuespaceItemLoader() + if current_dict.get('discipline') is not None: + vs.add_value('discipline', current_dict.get('discipline')) + base.add_value('valuespaces', vs.load_item()) + + lic = LicenseItemLoader() + base.add_value('license', lic.load_item()) + + permissions = PermissionItemLoader() + base.add_value('permissions', permissions.load_item()) + + response_loader = ResponseItemLoader() + base.add_value('response', response_loader.load_item()) + + yield base.load_item() diff --git a/converter/spiders/scripts/__init__.py b/converter/spiders/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/converter/spiders/scripts/lower_saxony_abi/__init__.py b/converter/spiders/scripts/lower_saxony_abi/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/converter/spiders/scripts/lower_saxony_abi/directory_routine.py b/converter/spiders/scripts/lower_saxony_abi/directory_routine.py new file mode 100644 index 00000000..5186dcd2 --- /dev/null +++ b/converter/spiders/scripts/lower_saxony_abi/directory_routine.py @@ -0,0 +1,219 @@ +import logging +import os +import pprint +import zipfile +from dataclasses import dataclass + + +@dataclass +class PathStorage: + parent_directory: str = None + path_to_download_directory: str = None + path_to_extraction_directory: str = None + + pp = pprint.PrettyPrinter(indent=4) + + def print_all_directories(self): + self.pp.pprint("Working-directories that this script will be using:") + self.pp.pprint(self.parent_directory) + self.pp.pprint(self.path_to_download_directory) + self.pp.pprint(self.path_to_extraction_directory) + + pass + + +class DirectoryInitializer: + """ + This class makes sure that the 3 directories that will be frequently used actually exist - and if they don't will + create those directories and save them to our 'PathStorage'-dataclass. 
+ After the DirectoryInitializer class is done with its work, the folder structure should look like this: + // + //zip_download <- this is where the 'to be extracted' .zips should be + //zip_download/zip_extract/ <- this is where the extracted files end up + """ + path_storage = PathStorage() + + def __init__(self): + self.initialize_required_directories() + + def check_download_folder_for_zip_files(self) -> dict: + file_dict = dict() + os.chdir(self.path_storage.path_to_download_directory) + logging.debug("Checking " + os.getcwd() + " for zip files") + if os.getcwd().endswith('zip_download'): + temp_list = os.listdir(os.getcwd()) + # since the temp_list will hold folder names as well, we're checking for files only: + file_list = list() + for file_entry in temp_list: + if os.path.isfile(file_entry): + if file_entry.endswith('.zip'): + file_list.append(file_entry) + file_number: int = 1 + for file in file_list: + file_size_temp = os.path.getsize(file) + file_size_megabyte = file_size_temp / (1000 * 1000) + file_size_megabyte = str(file_size_megabyte) + "MB" + # file size in Mebibyte: + # file_size_mebibyte = file_size_temp / (1024 * 1024) + file_dict_entry = { + file: file_size_megabyte + } + file_dict.update(file_dict_entry) + file_number += 1 + logging.debug(".zip files detected inside the '/zip_download/'-directory: ") + logging.debug(file_dict) + return file_dict + + def create_zip_download_directory(self): + os.chdir(self.path_storage.parent_directory) + logging.debug("Creating '/zip_download/-directory ...") + os.mkdir('zip_download') + if os.path.exists('zip_download'): + print("Please provide a suitable .zip-file inside the '/zip_download/'-directory and rerun the script") + self.path_storage.path_to_download_directory = os.path.join(os.getcwd(), 'zip_download') + + def create_zip_extraction_directory(self): + os.chdir(self.path_storage.path_to_download_directory) + logging.debug("Creating '/zip_extract/'-directory ...") + os.mkdir('zip_extract') + os.chdir('zip_extract') + self.path_storage.path_to_extraction_directory = os.getcwd() + os.chdir('..') + + def detect_extraction_directory(self): + logging.debug("Detecting 'zip_extract'-sub-folder ...") + os.chdir(self.path_storage.path_to_download_directory) + if os.path.exists('zip_extract'): + logging.debug("SUCCESS! Detected '/zip_extract/'-directory, continuing ...") + os.chdir('zip_extract') + self.path_storage.path_to_extraction_directory = os.getcwd() + os.chdir('..') + else: + self.create_zip_extraction_directory() + + def detect_zip_directory(self) -> bool: + if os.path.exists('zip_download'): + os.chdir('zip_download') + zip_directory = os.path.join(os.getcwd()) + logging.debug("SUCCESS! 
Detected 'zip_download'-directory in: " + zip_directory) + self.path_storage.path_to_download_directory = zip_directory + return True + else: + self.create_zip_download_directory() + return False + + def get_path_storage(self): + return self.path_storage + + def initialize_folders(self): + logging.debug("Looking for 'zip_download/'-directory ...") + if self.detect_zip_directory(): + self.detect_extraction_directory() + + def initialize_required_directories(self): + self.path_storage.parent_directory = os.getcwd() + self.initialize_folders() + self.path_storage.print_all_directories() + return self + + +class UnZipper: + directory_paths = None + zip_file_dictionary = None + zip_files_already_extracted = set() + zip_files_to_extract = set() + zip_files_to_extract_dict = dict() + + pp = pprint.PrettyPrinter(indent=4) + + def show_zip_list(self, zip_selection=None): + # TODO: prettify the zip list output + self.pp.pprint(f"The following .zip files were recognized by the script: {self.zip_file_dictionary}") + if zip_selection is not None: + if zip_selection in self.zip_file_dictionary.keys(): + zip_file_name = zip_selection + zip_file_size_megabytes = self.zip_file_dictionary.get(zip_selection) + print(f"Selected the following file:\t {zip_file_name} \t size: {zip_file_size_megabytes}") + zip_file = zipfile.ZipFile(zip_file_name) + return zip_file + else: + logging.warning(f"Selected .zip file '{zip_selection}' not found in " + f"'{self.directory_paths.path_to_download_directory}'!\n" + f"These are the available .zip files: {self.zip_file_dictionary}.\n" + f"Please make sure that your CLI-parameter input for --filename='file.zip' is valid.") + + def unzip_all_zips_within_the_initial_zip(self, zip_file: zipfile, skip_unzip=False): + zips_inside_zip: list = list() + zip_files_list: list = zip_file.namelist() + zip_file.extractall(path='zip_extract') + self.zip_files_already_extracted.add(zip_file.filename) + + for zip_item in zip_files_list: + if zip_item.endswith('.zip'): + zips_inside_zip.append(zip_item) + + if len(zips_inside_zip) > 0: + logging.debug(f"Found additional .zip files inside {zip_file.filename}:") + logging.debug(zips_inside_zip) + if skip_unzip is False: + self.unzip_everything(self.directory_paths.path_to_extraction_directory) + elif skip_unzip is True: + print(f"Okay. 
Skipping extraction of nested .zip files within {zip_file.filename}") + elif len(zips_inside_zip) == 0: + return zips_inside_zip + + def unzip_everything(self, directory_as_string): + extract_dir = directory_as_string + os.chdir(extract_dir) + zip_inside_zip_counter = 0 + for folder_name, sub_folder, filenames in os.walk(extract_dir): + if len(sub_folder) == 0 and folder_name.endswith('zip_extract'): + for filename_top_level in filenames: + if filename_top_level.endswith( + '.zip') and filename_top_level not in self.zip_files_already_extracted: + print(folder_name) + print(filename_top_level) + self.zip_files_already_extracted.add(filename_top_level) + current_zip = zipfile.ZipFile(filename_top_level) + zip_files_inside = current_zip.namelist() + for zip_file_inside in zip_files_inside: + if zip_file_inside.endswith('.zip'): + zip_inside_zip_counter += 1 + current_zip.extractall() + if zip_inside_zip_counter > 0: + if extract_dir is not None: + self.unzip_everything(extract_dir) + else: + extract_dir = self.directory_paths.path_to_extraction_directory + self.unzip_everything(extract_dir) + for _ in sub_folder: + for filename in filenames: + if filename.endswith('.zip') and filename not in self.zip_files_already_extracted: + self.zip_files_to_extract.add(filename) + self.zip_files_to_extract_dict.update({filename: folder_name}) + + for item in self.zip_files_to_extract_dict.keys(): + if item not in self.zip_files_already_extracted: + print(f"Unzipping: {item}") + temp_filepath_full = self.zip_files_to_extract_dict.get(item) + os.path.sep + item + temp_path = self.zip_files_to_extract_dict.get(item) + temp_zip: zipfile = zipfile.ZipFile(temp_filepath_full) + temp_zip.extractall(path=temp_path) + self.zip_files_already_extracted.add(item) + pass + + +class DirectoryScanner: + + @staticmethod + def scan_directory_for_pdfs(target_directory): + directory_to_scan = target_directory + pdf_list = set() + pdf_dictionary_temp = dict() + for folder_name, sub_folders, filenames in os.walk(directory_to_scan): + for _ in sub_folders: + for filename in filenames: + if filename.endswith('.pdf') and filename not in pdf_list: + pdf_list.add(filename) + pdf_dictionary_temp.update({filename: folder_name}) + return pdf_dictionary_temp diff --git a/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py b/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py new file mode 100644 index 00000000..c6f871a2 --- /dev/null +++ b/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py @@ -0,0 +1,257 @@ +import logging +import os +import pprint +import re + + +class LoSaxKeywordMapper: + """ + KeywordMapper for 'Abituraufgaben' from Lower Saxony + see: https://za-aufgaben.nibis.de + + Provides discipline- and keyword-mapping for the abbreviations found in the to be parsed '.pdf'-filenames. 
+ """ + discipline_mapping = { + # SkoHub discipline Mapping, see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/discipline.ttl + 'BRC': 'Wirtschaft und Verwaltung', + 'BVW': 'Wirtschaftskunde', + 'Ernaehrung': 'Ernährung und Hauswirtschaft', + 'EvReligion': 'Religion', + 'Franz': 'Französisch', + 'GesPfl': 'Gesundheit', + 'KathReligion': 'Religion', + 'Mathe': 'Mathematik', + 'MatheTech': 'Mathematik', + 'MatheWirt': 'Mathematik', + 'PaedPsych': 'Pädagogik', + 'PolitikWirtschaft': 'Politik', + 'VW': 'Wirtschaftskunde', + 'WerteNormen': 'Ethik', + } + + keyword_mapping = { + # additional discipline information, specific for Lower Saxony: + 'BRC': 'Betriebswirtschaft mit Rechnungswesen-Controlling', + 'BVW': 'Betriebs- und Volkswirtschaft', + 'Ernaehrung': 'Ernährung und Hauswirtschaft', + 'EvReligion': 'Evangelische Religion', + 'Franz': 'Französisch', + 'GesPfl': 'Gesundheit-Pflege', + 'KathReligion': 'Katholische Religion', + 'Mathe': 'Mathematik', + 'MatheTech': 'Mathematik - Berufliches Gymnasium - Technik', + 'MatheWirt': 'Mathematik - Berufliches Gymnasium - Wirtschaft / Gesundheit und Soziales', + 'PaedPsych': 'Pädagogik-Psychologie', + 'PolitikWirtschaft': 'Politik-Wirtschaft', + 'VW': 'Volkswirtschaft', + 'WerteNormen': 'Werte und Normen', + # additional keywords + 'Neu': 'Neubeginn', + 'BG': 'Berufsgymnasium (BG)', + 'ZBW': 'Zweiter Bildungsweg (ZBW) / Freie Waldorfschulen / Nichtschüler', + 'CAS': 'Computer Algebra System (CAS)', + 'GTR': 'Grafikfähiger Taschenrechner (GTR)', + 'WTR': 'Wissenschaftlicher Taschenrechner', + 'EA': 'Kurs auf erhöhtem Anforderungsniveau (eA)', + 'GA': 'Kurs auf grundlegendem Anforderungsniveau (gA)', + 'HV': 'Hörverständnis', + 'ME': 'Material', # for students or teachers + 'mitExp': 'mit Experimentieren', + 'ohneExp': 'ohne Experimentieren', + 'mitExpElektrik': 'mit Experimentieren - Elektrik', + 'mitExpOptik': 'mit Experimentieren - Optik', + 'mitExpWellen': 'mit Experimentieren - Wellen', + '_ALLGE': 'Allgemein (ALLGE)', + '_LA': 'Lineare Algebra (LA)', + '_LA_AG': 'Lineare Algebra / Analytische Geometrie (LA_AG)', + '_STOCH': 'Stochastik (STOCH)', + 'AnlagenTSP': 'Anlagen - Thematische Schwerpunkte', + 'TS': 'Thematische Schwerpunkte / Themenschwerpunkte', + 'TSP': 'Thematische Schwerpunkte / Themenschwerpunkte' + } + # For Debugging: + logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') + pp = pprint.PrettyPrinter(indent=4) + + def extract_pdf_metadata(self, pdf_dictionary): + """ + expects a pdf_dictionary consisting of two strings: {'filename': 'path_to_file'} + then does a 3 step conversion: + + - sorting the pdf_entries into either 'general' or 'additional' .pdf files + - using RegEx to extract metadata from the filename into a pdf dictionary + - cleaning up the dictionary of 'None'-Types + - mapping keywords + + afterwards returns two final pdf_dictionary for 'normal' and 'additional' .pdf files, where + + - key = 'unique_filename_of_a_pdf_file.pdf' + - values = nested dictionary (with keys like 'discipline', 'year', 'pdf_path', 'keywords' + + :param pdf_dictionary: dict + :return: pdf_dictionary_general, pdf_dictionary_additional_files + """ + pdf_dictionary_raw = pdf_dictionary + pdf_temp = dict() + pdf_additional_files = dict() + for pdf_item in pdf_dictionary_raw.keys(): + logging.debug(self.pp.pformat(pdf_item)) + if pdf_item.startswith('Anlage') or pdf_item.startswith('TSP'): + logging.debug(f"Filtered out {pdf_item} from {pdf_dictionary_raw.get(pdf_item)}") + 
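# 'Anlage ...'- and 'TSP...'-files follow a different naming scheme and are parsed separately below:
+                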
pdf_additional_files.update({pdf_item: pdf_dictionary_raw.get(pdf_item)})
+            else:
+                # note: only the 'year', 'discipline' and 'teacher' groups are accessed by name below,
+                # the remaining group names are purely descriptive
+                regex_general = re.compile(r'(?P<year>\d{4})'
+                                           r'(?P<discipline>.+?)'
+                                           r'(?P<neubeginner>Neu)?'  # Neubeginner (2. Fremdsprache als Neubeginner)
+                                           r'(?P<school_type>BG|ZBW|FWS)?'
+                                           # Berufsgymnasium / Zweiter Bildungsweg / Freie Waldorfschulen?
+                                           r'(?P<pflichtfach>Pflicht)?'  # Pflichtfach
+                                           r'(?P<calculator>CAS|GTR|WTR)?'
+                                           # ComputerAlgebraSystem / Grafikfähiger Taschenrechner /
+                                           # Wissenschaftlicher TR
+                                           r'(?P<level>EA|GA)?'
+                                           r'(?P<hv>HV)?'  # Hörverständnis
+                                           r'(?P<material>M|ME)?'
+                                           # Material (für Schüler) oder Erwartungshorizont (für Lehrer)
+                                           r'(?P<experiment>(mitExp)?(Elektrik|Optik|Wellen)?'
+                                           r'|ohneExp)?'
+                                           r'(?P<neu>Neu)?'
+                                           r'(?P<math_topic>_ALLGE|_LA|_LA_AG|_STOCH)?'
+                                           # Allgemein / LinAlg / analytische Geometrie / Stochastik
+                                           r'(?P<tsp>Anlagen|AnlagenTSP|TS|TS\d{4})?'
+                                           # TSP bzw. TS = Thematische Schwerpunkte / Themenschwerpunkte
+                                           r'(?P<task>Aufg\d)?'
+                                           r'(?P<teacher>Lehrer)?'
+                                           r'(.pdf)')
+                regex_result = regex_general.search(pdf_item)
+                if regex_result is not None:
+                    regex_result_dict = regex_result.groupdict()
+
+                    # For Debugging - In case we want to see the individual (raw) RegEx results:
+                    logging.debug(self.pp.pformat(regex_result_dict))
+
+                    # filtering out the invalid (NoneType) values from the initial regex results with a temporary list:
+                    only_valid_values = list()
+                    for value in regex_result_dict.values():
+                        if value is not None and value != '':
+                            only_valid_values.append(value)
+
+                    # Discipline-Mapping to SkoHub vocabulary:
+                    if regex_result_dict.get('discipline') in self.discipline_mapping.keys():
+                        regex_result_dict.update(
+                            {'discipline': self.discipline_mapping.get(regex_result_dict.get('discipline'))})
+                    # Mapping 'Lehrer.pdf' to SkoHub intendedEndUserRole:
+                    if regex_result_dict.get('teacher') is None:
+                        regex_result_dict.update({'intendedEndUserRole': 'learner'})
+                    elif regex_result_dict.get('teacher') == "Lehrer":
+                        regex_result_dict.update({'intendedEndUserRole': 'teacher'})
+
+                    # For Debugging - this is the 'working list' of keywords without any of the 'None'-types:
+                    logging.debug(f"PDF File: {pdf_item} // only_valid_keywords: {only_valid_values}")
+
+                    keywords_cleaned_and_mapped = list()
+                    for potential_keyword in only_valid_values:
+                        if potential_keyword in self.keyword_mapping:
+                            potential_keyword = self.keyword_mapping.get(potential_keyword)
+                        if potential_keyword.startswith('Aufg'):
+                            potential_keyword = potential_keyword.replace('Aufg', 'Aufgabe ')
+                        keywords_cleaned_and_mapped.append(potential_keyword)
+                    keywords_cleaned_and_mapped.append('Schriftliche Abituraufgaben Niedersachsen')
+                    logging.debug(self.pp.pformat(keywords_cleaned_and_mapped))
+
+                    # TODO: keywords
+                    #  - Erwartungshorizont für Lehrer
+                    #  - relative / absolute path?
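+
+                    # assemble the nested metadata dictionary, keyed by the bare filename: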
+                    dict_of_current_pdf = {
+                        pdf_item.split(os.path.sep)[-1]: {
+                            'discipline': regex_result_dict.get('discipline'),
+                            'year': regex_result_dict.get('year'),
+                            'pdf_path': pdf_dictionary_raw.get(pdf_item),
+                            'keywords': keywords_cleaned_and_mapped,
+                            'intendedEndUserRole': regex_result_dict.get('intendedEndUserRole')
+                        }
+                    }
+                    pdf_temp.update(dict_of_current_pdf)
+
+        logging.debug(self.pp.pformat(pdf_temp))
+        logging.debug(f"length of pdf_temp: {len(pdf_temp)}")
+        logging.debug(f"amount of filtered out (additional) pdfs: {len(pdf_additional_files)}")
+        logging.debug(f"Filtered out pdf items: {pdf_additional_files.items()}")
+        # self.pp.pprint(pdf_additional_files)
+        if len(pdf_additional_files) > 0:
+            pdf_additional_files = self.extract_pdf_metadata_from_additional_files(pdf_dictionary=pdf_additional_files)
+        return pdf_temp, pdf_additional_files
+
+    def extract_pdf_metadata_from_additional_files(self, pdf_dictionary):
+        """
+        Since not all '.pdf' filenames follow the same naming syntax, this method processes the filenames that
+        can't be parsed by the more generic extract_pdf_metadata()-method.
+
+        Expects a pdf_dictionary whose entries map a filename (str) to its directory path (str),
+        e.g. {'filename.pdf': 'path_to_file'}, then runs a three-step conversion:
+
+        - using RegEx to extract metadata from the filename into a pdf dictionary
+        - cleaning up the dictionary of 'None'-types
+        - mapping keywords
+
+        :param pdf_dictionary: dict
+        :return: nested dict = { '.pdf filename': {
+                                    'discipline': '...',
+                                    'year': '...',
+                                    'pdf_path': '...',
+                                    'keywords': '...' }
+                                }
+        """
+        pdf_working_dict = pdf_dictionary
+        pdf_filenames_and_metadata_dict = dict()
+        for pdf_filename in pdf_working_dict.keys():
+            # only the 'discipline' and 'year' groups are accessed by name below:
+            regex_additional_files = re.compile(r'(?P<anlage>Anlage .+ im Fach|TSP)?'
+                                                r'(?P<discipline>.+?)'
+                                                r'(?P<ts>TS)?'
+                                                r'(?P<year>\d{4})?'
+                                                r'(?P<anlagen>Anlagen)?'
+ r'(.pdf)') + if regex_additional_files.search(pdf_filename) is not None: + regex_result_dict_temporary: dict = regex_additional_files.search(pdf_filename).groupdict() + logging.debug(self.pp.pformat(regex_result_dict_temporary)) + + # extract and clean up the keyword-list: + only_valid_values = list() + for value in regex_result_dict_temporary.values(): + if value is not None and value != '': + only_valid_values.append(value) + logging.debug(only_valid_values) + keywords_cleaned_and_mapped = list() + for potential_keyword in only_valid_values: + if potential_keyword in self.keyword_mapping: + potential_keyword = self.keyword_mapping.get(potential_keyword) + keywords_cleaned_and_mapped.append(potential_keyword) + keywords_cleaned_and_mapped.append('Schriftliche Abituraufgaben Niedersachsen') + + logging.debug(self.pp.pformat(keywords_cleaned_and_mapped)) + dict_of_current_pdf = { + pdf_filename: { + 'discipline': regex_result_dict_temporary.get('discipline'), + 'year': regex_result_dict_temporary.get('year'), + 'pdf_path': pdf_working_dict.get(pdf_filename), + 'keywords': keywords_cleaned_and_mapped + } + } + pdf_filenames_and_metadata_dict.update(dict_of_current_pdf) + return pdf_filenames_and_metadata_dict + + +if __name__ == '__main__': + debug_additional_files = { + 'Anlage zum Abitur im Fach Informatik.pdf': '/home/criamos/PycharmProjects/pythonScriptTestingArea/zip_download/zip_extract/2016/2016Informatik', + 'TSPInformatik2020Anlagen.pdf': '/home/criamos/PycharmProjects/pythonScriptTestingArea/zip_download/zip_extract/2020Informatik/2020InformatikEA', + 'TSPInformatikTS2019Anlagen.pdf': '/home/criamos/PycharmProjects/pythonScriptTestingArea/zip_download/zip_extract/2019/2019Informatik/2019InformatikEA' + } + kw_mapper = LoSaxKeywordMapper() + pdf_result_dictionary = kw_mapper.extract_pdf_metadata_from_additional_files(pdf_dictionary=debug_additional_files) + pprint.pprint(pdf_result_dictionary) From 744d4050b19ac8de208b350c4936158dd75f6354 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Wed, 15 Sep 2021 18:49:51 +0200 Subject: [PATCH 02/10] add .gitignore to /zip_download/-folder --- zip_download/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 zip_download/.gitignore diff --git a/zip_download/.gitignore b/zip_download/.gitignore new file mode 100644 index 00000000..1a0e445b --- /dev/null +++ b/zip_download/.gitignore @@ -0,0 +1,3 @@ +./zip_extract +*.zip +*.json \ No newline at end of file From 691701f0e230eafbb5e6600207bf2556500112c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Wed, 15 Sep 2021 18:56:32 +0200 Subject: [PATCH 03/10] add: explanations for the two expected CLI parameters --- converter/spiders/niedersachsen_abi_spider.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/converter/spiders/niedersachsen_abi_spider.py b/converter/spiders/niedersachsen_abi_spider.py index b6f69c06..df601cf5 100644 --- a/converter/spiders/niedersachsen_abi_spider.py +++ b/converter/spiders/niedersachsen_abi_spider.py @@ -19,12 +19,17 @@ class NiedersachsenAbiSpider(scrapy.Spider): allowed_domains = ['https://za-aufgaben.nibis.de'] start_urls = ['https://za-aufgaben.nibis.de'] version = "0.0.1" - # Default values for the 2 expected parameters. filename is always required, skip_unzip optional. + # Default values for the 2 expected parameters. Parameter "filename" is always required, "skip_unzip" is optional. 
filename = None skip_unzip = False pdf_dictionary_general = dict() pdf_dictionary_additional = dict() + # Running the crawler from the command line with the exact filename as a parameter: + # scrapy crawl niedersachsen_abi_spider -a filename="za-download-6e05cbbb6e07250c69ebe95ae972fe8a.zip" + # -a skip_unzip="yes" + # Make sure that there is a corresponding .zip file inside the /zip_download/-folder in the project root + def __init__(self, **kwargs): super().__init__(**kwargs) logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') @@ -80,7 +85,7 @@ def parse(self, response, **kwargs): # first we're scraping all the .pdf files that follow the more general RegEx syntax for pdf_item in self.pdf_dictionary_general: current_dict: dict = self.pdf_dictionary_general.get(pdf_item) - pprint.pprint(current_dict) + # pprint.pprint(current_dict) base = BaseItemLoader() base.add_value('sourceId', pdf_item) hash_temp = str(f"{datetime.now().isoformat()}{self.version}") @@ -128,7 +133,7 @@ def parse(self, response, **kwargs): # Making sure that we also grab the additional .pdf files that don't follow the general filename syntax for pdf_item in self.pdf_dictionary_additional: current_dict: dict = self.pdf_dictionary_additional.get(pdf_item) - pprint.pprint(current_dict) + # pprint.pprint(current_dict) base = BaseItemLoader() base.add_value('sourceId', pdf_item) hash_temp = str(f"{datetime.now().isoformat()}{self.version}") From 0bdf5e17365f12ee6b5f77f887c69df8d9c18a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Wed, 15 Sep 2021 19:06:17 +0200 Subject: [PATCH 04/10] minor code cleanup, disable/remove pretty prints --- converter/spiders/niedersachsen_abi_spider.py | 13 ++++++------- .../scripts/lower_saxony_abi/keyword_mapper.py | 17 ++++++----------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/converter/spiders/niedersachsen_abi_spider.py b/converter/spiders/niedersachsen_abi_spider.py index df601cf5..06ed7982 100644 --- a/converter/spiders/niedersachsen_abi_spider.py +++ b/converter/spiders/niedersachsen_abi_spider.py @@ -1,7 +1,6 @@ +import logging import os from _datetime import datetime -import logging -import pprint import scrapy @@ -76,11 +75,11 @@ def __init__(self, **kwargs): self.pdf_dictionary_additional = pdf_dict2 def parse(self, response, **kwargs): - print(f"Hello world!") - print(f"filename = {self.filename}") - print(f"skip_unzip = {self.skip_unzip}") - print(f"The .pdf (general) dictionary has {len(self.pdf_dictionary_general.keys())} files") - print(f"The dictionary for additional .pdf files has {len(self.pdf_dictionary_additional.keys())} entries") + # print(f"filename = {self.filename}") + # print(f"skip_unzip = {self.skip_unzip}") + logging.debug(f"The .pdf (general) dictionary has {len(self.pdf_dictionary_general.keys())} files") + logging.debug(f"The dictionary for additional .pdf files has " + f"{len(self.pdf_dictionary_additional.keys())} entries") # first we're scraping all the .pdf files that follow the more general RegEx syntax for pdf_item in self.pdf_dictionary_general: diff --git a/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py b/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py index c6f871a2..1d914354 100644 --- a/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py +++ b/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py @@ -226,6 +226,12 @@ def extract_pdf_metadata_from_additional_files(self, 
pdf_dictionary): if value is not None and value != '': only_valid_values.append(value) logging.debug(only_valid_values) + + # Discipline-Mapping to SkoHub vocabulary: + if regex_result_dict_temporary.get('discipline') in self.discipline_mapping.keys(): + regex_result_dict_temporary.update( + {'discipline': self.discipline_mapping.get(regex_result_dict_temporary.get('discipline'))}) + keywords_cleaned_and_mapped = list() for potential_keyword in only_valid_values: if potential_keyword in self.keyword_mapping: @@ -244,14 +250,3 @@ def extract_pdf_metadata_from_additional_files(self, pdf_dictionary): } pdf_filenames_and_metadata_dict.update(dict_of_current_pdf) return pdf_filenames_and_metadata_dict - - -if __name__ == '__main__': - debug_additional_files = { - 'Anlage zum Abitur im Fach Informatik.pdf': '/home/criamos/PycharmProjects/pythonScriptTestingArea/zip_download/zip_extract/2016/2016Informatik', - 'TSPInformatik2020Anlagen.pdf': '/home/criamos/PycharmProjects/pythonScriptTestingArea/zip_download/zip_extract/2020Informatik/2020InformatikEA', - 'TSPInformatikTS2019Anlagen.pdf': '/home/criamos/PycharmProjects/pythonScriptTestingArea/zip_download/zip_extract/2019/2019Informatik/2019InformatikEA' - } - kw_mapper = LoSaxKeywordMapper() - pdf_result_dictionary = kw_mapper.extract_pdf_metadata_from_additional_files(pdf_dictionary=debug_additional_files) - pprint.pprint(pdf_result_dictionary) From a0d7c94b4359d314390ed3787b1db5eb749cffa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Thu, 16 Sep 2021 12:13:27 +0200 Subject: [PATCH 05/10] add: lifecycle.role and lifecycle.organization --- converter/spiders/niedersachsen_abi_spider.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/converter/spiders/niedersachsen_abi_spider.py b/converter/spiders/niedersachsen_abi_spider.py index 06ed7982..915cc1a4 100644 --- a/converter/spiders/niedersachsen_abi_spider.py +++ b/converter/spiders/niedersachsen_abi_spider.py @@ -104,6 +104,8 @@ def parse(self, response, **kwargs): lom.add_value('technical', technical.load_item()) lifecycle = LomLifecycleItemloader() + lifecycle.add_value('role', 'publisher') + lifecycle.add_value('organization', 'Niedersächsisches Kultusministerium') lom.add_value('lifecycle', lifecycle.load_item()) educational = LomEducationalItemLoader() @@ -152,6 +154,8 @@ def parse(self, response, **kwargs): lom.add_value('technical', technical.load_item()) lifecycle = LomLifecycleItemloader() + lifecycle.add_value('role', 'publisher') + lifecycle.add_value('organization', 'Niedersächsisches Kultusministerium') lom.add_value('lifecycle', lifecycle.load_item()) educational = LomEducationalItemLoader() From 7d66a5bf000a5c9792059c0945ca2f40251a1dec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Thu, 16 Sep 2021 14:10:54 +0200 Subject: [PATCH 06/10] remove unnecessary debug code, clarify methods and docs --- converter/spiders/niedersachsen_abi_spider.py | 3 +- .../lower_saxony_abi/directory_routine.py | 35 +++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/converter/spiders/niedersachsen_abi_spider.py b/converter/spiders/niedersachsen_abi_spider.py index 915cc1a4..3115ffee 100644 --- a/converter/spiders/niedersachsen_abi_spider.py +++ b/converter/spiders/niedersachsen_abi_spider.py @@ -52,7 +52,8 @@ def __init__(self, **kwargs): un_zipper = UnZipper() un_zipper.directory_paths = directory_paths.get_path_storage() 
un_zipper.zip_file_dictionary = zip_file_dictionary - zip_file_chosen_by_user = un_zipper.show_zip_list(zip_selection=zip_selection) + zip_file_chosen_by_user = \ + un_zipper.compare_selected_zip_file_with_recognized_files(zip_selection=zip_selection) if zip_file_chosen_by_user is not None: un_zipper.unzip_all_zips_within_the_initial_zip(zip_file=zip_file_chosen_by_user, diff --git a/converter/spiders/scripts/lower_saxony_abi/directory_routine.py b/converter/spiders/scripts/lower_saxony_abi/directory_routine.py index 5186dcd2..cc538168 100644 --- a/converter/spiders/scripts/lower_saxony_abi/directory_routine.py +++ b/converter/spiders/scripts/lower_saxony_abi/directory_routine.py @@ -37,6 +37,9 @@ def __init__(self): self.initialize_required_directories() def check_download_folder_for_zip_files(self) -> dict: + """ + Checks the /zip_download/-folder for .zip files and returns a list with their filenames and size in megabyte. + """ file_dict = dict() os.chdir(self.path_storage.path_to_download_directory) logging.debug("Checking " + os.getcwd() + " for zip files") @@ -48,7 +51,6 @@ def check_download_folder_for_zip_files(self) -> dict: if os.path.isfile(file_entry): if file_entry.endswith('.zip'): file_list.append(file_entry) - file_number: int = 1 for file in file_list: file_size_temp = os.path.getsize(file) file_size_megabyte = file_size_temp / (1000 * 1000) @@ -59,7 +61,6 @@ def check_download_folder_for_zip_files(self) -> dict: file: file_size_megabyte } file_dict.update(file_dict_entry) - file_number += 1 logging.debug(".zip files detected inside the '/zip_download/'-directory: ") logging.debug(file_dict) return file_dict @@ -81,6 +82,10 @@ def create_zip_extraction_directory(self): os.chdir('..') def detect_extraction_directory(self): + """ + Checks if there is a /zip_extract/-subdirectory inside the /zip_download/ folder and saves the folder path to + the class attributes. If there isn't a subdirectory, it'll create one by calling the corresponding method. + """ logging.debug("Detecting 'zip_extract'-sub-folder ...") os.chdir(self.path_storage.path_to_download_directory) if os.path.exists('zip_extract'): @@ -118,16 +123,16 @@ def initialize_required_directories(self): class UnZipper: - directory_paths = None - zip_file_dictionary = None + directory_paths: PathStorage = None + zip_file_dictionary: dict = None zip_files_already_extracted = set() zip_files_to_extract = set() zip_files_to_extract_dict = dict() pp = pprint.PrettyPrinter(indent=4) - def show_zip_list(self, zip_selection=None): - # TODO: prettify the zip list output + def compare_selected_zip_file_with_recognized_files(self, zip_selection=None): + # TODO: maybe prettify the zip list output self.pp.pprint(f"The following .zip files were recognized by the script: {self.zip_file_dictionary}") if zip_selection is not None: if zip_selection in self.zip_file_dictionary.keys(): @@ -143,6 +148,14 @@ def show_zip_list(self, zip_selection=None): f"Please make sure that your CLI-parameter input for --filename='file.zip' is valid.") def unzip_all_zips_within_the_initial_zip(self, zip_file: zipfile, skip_unzip=False): + """ + Unzips the initially selected .zip file and checks if the user wants to also extract all .zip files in its + subdirectories. + Keeps track of which files were already extracted by using a set() of their filenames. 
+ :param zip_file: the user-specified zip file that needs extraction + :param skip_unzip: in case the user wants to only unzip the initial .zip file and nothing else + :return: a list() of all .zip files that were found within the initial .zip file + """ zips_inside_zip: list = list() zip_files_list: list = zip_file.namelist() zip_file.extractall(path='zip_extract') @@ -163,6 +176,10 @@ def unzip_all_zips_within_the_initial_zip(self, zip_file: zipfile, skip_unzip=Fa return zips_inside_zip def unzip_everything(self, directory_as_string): + """ + Tries to recursively unzip all .zip files within a directory. + :param directory_as_string: the filepath in which to look for .zip files + """ extract_dir = directory_as_string os.chdir(extract_dir) zip_inside_zip_counter = 0 @@ -207,6 +224,12 @@ class DirectoryScanner: @staticmethod def scan_directory_for_pdfs(target_directory): + """ + Returns a dict() of .pdf files and their filepath. + :param target_directory: the directory in which to look for .pdf files + :return: a dictionary consisting of two strings: a unique filename and the corresponding directory, e.g.: + dict() = { filename : directory } + """ directory_to_scan = target_directory pdf_list = set() pdf_dictionary_temp = dict() From 57432af8e666711a7ca9ebb0c8e8d9f278f81aca Mon Sep 17 00:00:00 2001 From: Torsten Simon Date: Fri, 17 Sep 2021 15:43:42 +0200 Subject: [PATCH 07/10] binary handling niedersachsen spider --- converter/es_connector.py | 20 +++++++++-- converter/items.py | 5 ++- converter/pipelines.py | 6 ++-- converter/spiders/niedersachsen_abi_spider.py | 34 +++++++++++++------ 4 files changed, 49 insertions(+), 16 deletions(-) diff --git a/converter/es_connector.py b/converter/es_connector.py index b6b229c5..86268295 100644 --- a/converter/es_connector.py +++ b/converter/es_connector.py @@ -168,6 +168,21 @@ def setPermissions(self, uuid, permissions) -> bool: return True except ApiException as e: return False + def setNodeBinaryData(self, uuid, item) -> bool: + if "binary" in item: + logging.info('set binary') + files = {"file": item["binary"]} + response = requests.post( + get_project_settings().get("EDU_SHARING_BASE_URL") + + "rest/node/v1/nodes/-home-/" + + uuid + + "/content?mimetype=" + + item["lom"]["technical"]["format"], + headers=self.getHeaders(None), + files=files, + ) + logging.info(response) + return response.status_code == 200 def setNodePreview(self, uuid, item) -> bool: if "thumbnail" in item: @@ -243,8 +258,8 @@ def transformItem(self, uuid, spider, item): "ccm:objecttype": item["type"], "ccm:replicationsourceuuid": uuid, "cm:name": item["lom"]["general"]["title"], - "ccm:wwwurl": item["lom"]["technical"]["location"], - "cclom:location": item["lom"]["technical"]["location"], + "ccm:wwwurl": item["lom"]["technical"]["location"] if "location" in item["lom"]["technical"] else None, + "cclom:location": item["lom"]["technical"]["location"] if "location" in item["lom"]["technical"] else None, "cclom:title": item["lom"]["general"]["title"], } if "notes" in item: @@ -491,6 +506,7 @@ def insertItem(self, spider, uuid, item): node = self.syncNode(spider, "ccm:io", self.transformItem(uuid, spider, item)) self.setNodePermissions(node["ref"]["id"], item) self.setNodePreview(node["ref"]["id"], item) + self.setNodeBinaryData(node["ref"]["id"], item) self.setNodeText(node["ref"]["id"], item) def updateItem(self, spider, uuid, item): diff --git a/converter/items.py b/converter/items.py index 83d266dc..41dd58a6 100644 --- a/converter/items.py +++ b/converter/items.py @@ -181,6 
+181,7 @@ class BaseItem(Item): ranking = Field() fulltext = Field() thumbnail = Field() + "thumbnail data in base64" lastModified = Field() lom = Field(serializer=LomBaseItem) valuespaces = Field(serializer=ValuespaceItem) @@ -188,8 +189,10 @@ class BaseItem(Item): "permissions (access rights) for this entry" license = Field(serializer=LicenseItem) publisher = Field() - # editorial notes notes = Field() + "editorial notes" + binary = Field() + "binary data which should be uploaded (raw data)" class BaseItemLoader(ItemLoader): diff --git a/converter/pipelines.py b/converter/pipelines.py index 60aea79c..1523a7d7 100644 --- a/converter/pipelines.py +++ b/converter/pipelines.py @@ -112,9 +112,9 @@ def process_item(self, raw_item, spider): except KeyError: raise DropItem(f'Item {item} has no lom.technical.location') try: - if "location" not in item["lom"]["technical"]: + if "location" not in item["lom"]["technical"] and not "binary" in item: raise DropItem( - "Entry {} has no technical location".format(item["lom"]["general"]["title"]) + "Entry {} has no technical location or binary data".format(item["lom"]["general"]["title"]) ) except KeyError: raise DropItem(f'Item {item} has no lom.technical.location') @@ -519,7 +519,7 @@ def process_item(self, raw_item, spider): title = "" if "title" in item["lom"]["general"]: title = str(item["lom"]["general"]["title"]) - entryUUID = EduSharing.buildUUID(item["response"]["url"]) + entryUUID = EduSharing.buildUUID(item["response"]["url"] if "url" in item["response"] else item["hash"]) self.insertItem(spider, entryUUID, item) logging.info("item " + entryUUID + " inserted/updated") diff --git a/converter/spiders/niedersachsen_abi_spider.py b/converter/spiders/niedersachsen_abi_spider.py index 3115ffee..6eeeee6d 100644 --- a/converter/spiders/niedersachsen_abi_spider.py +++ b/converter/spiders/niedersachsen_abi_spider.py @@ -4,19 +4,21 @@ import scrapy +from .base_classes import LomBase from .scripts.lower_saxony_abi.directory_routine import DirectoryInitializer, UnZipper, \ DirectoryScanner from .scripts.lower_saxony_abi.keyword_mapper import LoSaxKeywordMapper +from ..constants import Constants from ..items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \ LomLifecycleItemloader, LomEducationalItemLoader, LicenseItemLoader, PermissionItemLoader, ResponseItemLoader, \ ValuespaceItemLoader -class NiedersachsenAbiSpider(scrapy.Spider): +class NiedersachsenAbiSpider(scrapy.Spider, LomBase): name = 'niedersachsen_abi_spider' allowed_domains = ['https://za-aufgaben.nibis.de'] - start_urls = ['https://za-aufgaben.nibis.de'] + start_urls = ['https://za-aufgaben.nibis.de'] version = "0.0.1" # Default values for the 2 expected parameters. Parameter "filename" is always required, "skip_unzip" is optional. 
filename = None @@ -29,9 +31,12 @@ class NiedersachsenAbiSpider(scrapy.Spider): # -a skip_unzip="yes" # Make sure that there is a corresponding .zip file inside the /zip_download/-folder in the project root + # def start_requests(self): + # yield self.parse(None) + def __init__(self, **kwargs): super().__init__(**kwargs) - logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') + # logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') # logging.disable(logging.DEBUG) if self.filename is not None: zip_selection = self.filename @@ -90,6 +95,8 @@ def parse(self, response, **kwargs): base.add_value('sourceId', pdf_item) hash_temp = str(f"{datetime.now().isoformat()}{self.version}") base.add_value('hash', hash_temp) + base.add_value('type', Constants.TYPE_MATERIAL) + base.add_value('binary', self.getBinary(current_dict, pdf_item)) lom = LomBaseItemloader() @@ -100,8 +107,7 @@ def parse(self, response, **kwargs): lom.add_value('general', general.load_item()) technical = LomTechnicalItemLoader() - filepath_full = current_dict.get('pdf_path') + os.path.sep + pdf_item - technical.add_value('location', filepath_full) + technical.add_value('format', 'application/pdf') lom.add_value('technical', technical.load_item()) lifecycle = LomLifecycleItemloader() @@ -124,7 +130,7 @@ def parse(self, response, **kwargs): lic = LicenseItemLoader() base.add_value('license', lic.load_item()) - permissions = PermissionItemLoader() + permissions = LomBase.getPermissions(self) base.add_value('permissions', permissions.load_item()) response_loader = ResponseItemLoader() @@ -140,18 +146,19 @@ def parse(self, response, **kwargs): base.add_value('sourceId', pdf_item) hash_temp = str(f"{datetime.now().isoformat()}{self.version}") base.add_value('hash', hash_temp) + base.add_value('type', Constants.TYPE_MATERIAL) + base.add_value('binary', self.getBinary(current_dict, pdf_item)) lom = LomBaseItemloader() general = LomGeneralItemloader() - general.add_value('title', pdf_item) + general.add_value('title', pdf_item.split('.')[:-1]) general.add_value('identifier', pdf_item) general.add_value('keyword', current_dict.get('keywords')) lom.add_value('general', general.load_item()) technical = LomTechnicalItemLoader() - filepath_full = current_dict.get('pdf_path') + os.path.sep + pdf_item - technical.add_value('location', filepath_full) + technical.add_value('format', 'application/pdf') lom.add_value('technical', technical.load_item()) lifecycle = LomLifecycleItemloader() @@ -172,10 +179,17 @@ def parse(self, response, **kwargs): lic = LicenseItemLoader() base.add_value('license', lic.load_item()) - permissions = PermissionItemLoader() + permissions = LomBase.getPermissions(self) base.add_value('permissions', permissions.load_item()) response_loader = ResponseItemLoader() base.add_value('response', response_loader.load_item()) yield base.load_item() + + def getBinary(self, current_dict, pdf_item): + filepath_full = current_dict.get('pdf_path') + os.path.sep + pdf_item + file = open(filepath_full, mode='rb') + binary = file.read() + file.close() + return binary From d8825900422225e9dfd192c038cbed1358ef8c0e Mon Sep 17 00:00:00 2001 From: Torsten Simon Date: Fri, 17 Sep 2021 16:16:39 +0200 Subject: [PATCH 08/10] fix mimetype --- converter/es_connector.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/converter/es_connector.py b/converter/es_connector.py index 86268295..c4a1f530 100644 --- a/converter/es_connector.py +++ 
b/converter/es_connector.py @@ -170,7 +170,12 @@ def setPermissions(self, uuid, permissions) -> bool: return False def setNodeBinaryData(self, uuid, item) -> bool: if "binary" in item: - logging.info('set binary') + logging.info(get_project_settings().get("EDU_SHARING_BASE_URL") + + "rest/node/v1/nodes/-home-/" + + uuid + + "/content?mimetype=" + + item["lom"]["technical"]["format"] + ) files = {"file": item["binary"]} response = requests.post( get_project_settings().get("EDU_SHARING_BASE_URL") @@ -181,8 +186,9 @@ def setNodeBinaryData(self, uuid, item) -> bool: headers=self.getHeaders(None), files=files, ) - logging.info(response) return response.status_code == 200 + else: + return False def setNodePreview(self, uuid, item) -> bool: if "thumbnail" in item: @@ -260,6 +266,7 @@ def transformItem(self, uuid, spider, item): "cm:name": item["lom"]["general"]["title"], "ccm:wwwurl": item["lom"]["technical"]["location"] if "location" in item["lom"]["technical"] else None, "cclom:location": item["lom"]["technical"]["location"] if "location" in item["lom"]["technical"] else None, + "cclom:format": item["lom"]["technical"]["format"] if "format" in item["lom"]["technical"] else None, "cclom:title": item["lom"]["general"]["title"], } if "notes" in item: @@ -506,8 +513,8 @@ def insertItem(self, spider, uuid, item): node = self.syncNode(spider, "ccm:io", self.transformItem(uuid, spider, item)) self.setNodePermissions(node["ref"]["id"], item) self.setNodePreview(node["ref"]["id"], item) - self.setNodeBinaryData(node["ref"]["id"], item) - self.setNodeText(node["ref"]["id"], item) + if not self.setNodeBinaryData(node["ref"]["id"], item): + self.setNodeText(node["ref"]["id"], item) def updateItem(self, spider, uuid, item): self.insertItem(spider, uuid, item) From 7fffe6fe41a94003de2ed10a8b3c0d60dc0d078b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Tue, 21 Sep 2021 14:45:31 +0200 Subject: [PATCH 09/10] add: requested features for "niedersachsen_abi_spider.py" v0.0.2 - if the spider is started without using the "filename"-parameter, it will always scan for pdfs and try to extract metadata -- only if the user wants to extract specific .zip files or nested zips the "filename"-parameter is required - making sure that the current working directory doesn't change after directory initialization (the output.json and .log should now end up in the project root again) - recognize already extracted files by their full filepath+filename instead of filename only - use cleaned up keyword list to generate "general.title" - reorder keyword list for better re-use in title string - rename getBinary to get_binary -- made get_binary static since 'self' isn't used anywhere - fix allowed_domains (was URL, needed to be domain) - remove unnecessary import of PermissionLoader --- converter/spiders/niedersachsen_abi_spider.py | 57 +++++++++++-------- .../lower_saxony_abi/directory_routine.py | 12 ++-- .../lower_saxony_abi/keyword_mapper.py | 2 +- 3 files changed, 43 insertions(+), 28 deletions(-) diff --git a/converter/spiders/niedersachsen_abi_spider.py b/converter/spiders/niedersachsen_abi_spider.py index 6eeeee6d..91398d41 100644 --- a/converter/spiders/niedersachsen_abi_spider.py +++ b/converter/spiders/niedersachsen_abi_spider.py @@ -10,16 +10,16 @@ from .scripts.lower_saxony_abi.keyword_mapper import LoSaxKeywordMapper from ..constants import Constants from ..items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \ - 
LomLifecycleItemloader, LomEducationalItemLoader, LicenseItemLoader, PermissionItemLoader, ResponseItemLoader, \ + LomLifecycleItemloader, LomEducationalItemLoader, LicenseItemLoader, ResponseItemLoader, \ ValuespaceItemLoader class NiedersachsenAbiSpider(scrapy.Spider, LomBase): name = 'niedersachsen_abi_spider' - allowed_domains = ['https://za-aufgaben.nibis.de'] - start_urls = ['https://za-aufgaben.nibis.de'] - version = "0.0.1" + allowed_domains = ['za-aufgaben.nibis.de'] + start_urls = ['https://za-aufgaben.nibis.de'] + version = "0.0.2" # Default values for the 2 expected parameters. Parameter "filename" is always required, "skip_unzip" is optional. filename = None skip_unzip = False @@ -38,21 +38,23 @@ def __init__(self, **kwargs): super().__init__(**kwargs) # logging.basicConfig(level=logging.DEBUG, format=' %(asctime)s - %(levelname)s - %(message)s') # logging.disable(logging.DEBUG) + directory_paths = DirectoryInitializer() + zip_file_dictionary = directory_paths.check_download_folder_for_zip_files() + + # only extract files if a "filename"-parameter was given: if self.filename is not None: zip_selection = self.filename logging.debug(f"Selected .zip file by CLI-parameter: {zip_selection}") logging.debug(f"User wants to skip the unzipping? {self.skip_unzip}") - # by default, the script should always unzip the desired .zip file + # by default, the script should always unzip the desired .zip file, + # but unzipping the nested .zip files is only done when requested by parameter if self.skip_unzip == "no": self.skip_unzip = False if self.skip_unzip == "yes": self.skip_unzip = True logging.debug(f"skip_unzip variable: {self.skip_unzip}") - directory_paths = DirectoryInitializer() - zip_file_dictionary = directory_paths.check_download_folder_for_zip_files() - if self.skip_unzip is False: un_zipper = UnZipper() un_zipper.directory_paths = directory_paths.get_path_storage() @@ -67,18 +69,25 @@ def __init__(self, **kwargs): logging.debug(f"Extracted the following zip files:") logging.debug(un_zipper.zip_files_already_extracted) - print( - f"Analyzing file paths for '.pdf'-files inside " - f"{directory_paths.path_storage.path_to_extraction_directory}") - pdfs_in_directory: dict = \ - DirectoryScanner.scan_directory_for_pdfs(directory_paths.path_storage.path_to_extraction_directory) - # logging.debug(pp.pformat(pdfs_in_directory)) - print(f"Total .pdf items in the above mentioned directory: {len(pdfs_in_directory.keys())}") + # always scan the /zip_extract/-directory for pdfs and try to extract metadata + print( + f"Analyzing file paths for '.pdf'-files inside " + f"{directory_paths.path_storage.path_to_extraction_directory}") + pdfs_in_directory: dict = \ + DirectoryScanner.scan_directory_for_pdfs(directory_paths.path_storage.path_to_extraction_directory) + # logging.debug(pp.pformat(pdfs_in_directory)) + print(f"Total .pdf items in the above mentioned directory: {len(pdfs_in_directory.keys())}") + + kw_mapper = LoSaxKeywordMapper() + pdf_dict1, pdf_dict2 = kw_mapper.extract_pdf_metadata(pdfs_in_directory) + self.pdf_dictionary_general = pdf_dict1 + self.pdf_dictionary_additional = pdf_dict2 + + def getId(self, response=None) -> str: + pass - kw_mapper = LoSaxKeywordMapper() - pdf_dict1, pdf_dict2 = kw_mapper.extract_pdf_metadata(pdfs_in_directory) - self.pdf_dictionary_general = pdf_dict1 - self.pdf_dictionary_additional = pdf_dict2 + def getHash(self, response=None) -> str: + pass def parse(self, response, **kwargs): # print(f"filename = {self.filename}") @@ -96,12 +105,13 @@ def 
parse(self, response, **kwargs): hash_temp = str(f"{datetime.now().isoformat()}{self.version}") base.add_value('hash', hash_temp) base.add_value('type', Constants.TYPE_MATERIAL) - base.add_value('binary', self.getBinary(current_dict, pdf_item)) + base.add_value('binary', self.get_binary(current_dict, pdf_item)) lom = LomBaseItemloader() general = LomGeneralItemloader() - general.add_value('title', pdf_item) + title_long: str = ' '.join(current_dict.get('keywords')) + general.add_value('title', title_long) general.add_value('identifier', pdf_item) general.add_value('keyword', current_dict.get('keywords')) lom.add_value('general', general.load_item()) @@ -147,7 +157,7 @@ def parse(self, response, **kwargs): hash_temp = str(f"{datetime.now().isoformat()}{self.version}") base.add_value('hash', hash_temp) base.add_value('type', Constants.TYPE_MATERIAL) - base.add_value('binary', self.getBinary(current_dict, pdf_item)) + base.add_value('binary', self.get_binary(current_dict, pdf_item)) lom = LomBaseItemloader() @@ -187,7 +197,8 @@ def parse(self, response, **kwargs): yield base.load_item() - def getBinary(self, current_dict, pdf_item): + @staticmethod + def get_binary(current_dict, pdf_item): filepath_full = current_dict.get('pdf_path') + os.path.sep + pdf_item file = open(filepath_full, mode='rb') binary = file.read() diff --git a/converter/spiders/scripts/lower_saxony_abi/directory_routine.py b/converter/spiders/scripts/lower_saxony_abi/directory_routine.py index cc538168..cd869528 100644 --- a/converter/spiders/scripts/lower_saxony_abi/directory_routine.py +++ b/converter/spiders/scripts/lower_saxony_abi/directory_routine.py @@ -119,6 +119,7 @@ def initialize_required_directories(self): self.path_storage.parent_directory = os.getcwd() self.initialize_folders() self.path_storage.print_all_directories() + os.chdir(self.path_storage.parent_directory) return self @@ -159,7 +160,8 @@ def unzip_all_zips_within_the_initial_zip(self, zip_file: zipfile, skip_unzip=Fa zips_inside_zip: list = list() zip_files_list: list = zip_file.namelist() zip_file.extractall(path='zip_extract') - self.zip_files_already_extracted.add(zip_file.filename) + filename_full_path = os.path.abspath(zip_file.filename) + self.zip_files_already_extracted.add(filename_full_path) for zip_item in zip_files_list: if zip_item.endswith('.zip'): @@ -186,11 +188,12 @@ def unzip_everything(self, directory_as_string): for folder_name, sub_folder, filenames in os.walk(extract_dir): if len(sub_folder) == 0 and folder_name.endswith('zip_extract'): for filename_top_level in filenames: + current_full_path = os.path.abspath(filename_top_level) if filename_top_level.endswith( - '.zip') and filename_top_level not in self.zip_files_already_extracted: + '.zip') and current_full_path not in self.zip_files_already_extracted: print(folder_name) print(filename_top_level) - self.zip_files_already_extracted.add(filename_top_level) + self.zip_files_already_extracted.add(current_full_path) current_zip = zipfile.ZipFile(filename_top_level) zip_files_inside = current_zip.namelist() for zip_file_inside in zip_files_inside: @@ -205,7 +208,8 @@ def unzip_everything(self, directory_as_string): self.unzip_everything(extract_dir) for _ in sub_folder: for filename in filenames: - if filename.endswith('.zip') and filename not in self.zip_files_already_extracted: + current_full_path = os.path.abspath(filename) + if filename.endswith('.zip') and current_full_path not in self.zip_files_already_extracted: self.zip_files_to_extract.add(filename) 
self.zip_files_to_extract_dict.update({filename: folder_name}) diff --git a/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py b/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py index 1d914354..1ca04752 100644 --- a/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py +++ b/converter/spiders/scripts/lower_saxony_abi/keyword_mapper.py @@ -149,13 +149,13 @@ def extract_pdf_metadata(self, pdf_dictionary): logging.debug(f"PDF File: {pdf_item} // only_valid_keywords: {only_valid_values}") keywords_cleaned_and_mapped = list() + keywords_cleaned_and_mapped.append('Schriftliche Abituraufgaben Niedersachsen') for potential_keyword in only_valid_values: if potential_keyword in self.keyword_mapping: potential_keyword = self.keyword_mapping.get(potential_keyword) if potential_keyword.startswith('Aufg'): potential_keyword = potential_keyword.replace('Aufg', 'Aufgabe ') keywords_cleaned_and_mapped.append(potential_keyword) - keywords_cleaned_and_mapped.append('Schriftliche Abituraufgaben Niedersachsen') logging.debug(self.pp.pformat(keywords_cleaned_and_mapped)) # TODO: keywords From 2c0171b0292fb577a3734563168348b908c60d54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Tue, 21 Sep 2021 14:55:22 +0200 Subject: [PATCH 10/10] add: spider Exception when there's no .pdf files found within /zip_extract/-directory --- converter/spiders/niedersachsen_abi_spider.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/converter/spiders/niedersachsen_abi_spider.py b/converter/spiders/niedersachsen_abi_spider.py index 91398d41..de895dd4 100644 --- a/converter/spiders/niedersachsen_abi_spider.py +++ b/converter/spiders/niedersachsen_abi_spider.py @@ -77,7 +77,10 @@ def __init__(self, **kwargs): DirectoryScanner.scan_directory_for_pdfs(directory_paths.path_storage.path_to_extraction_directory) # logging.debug(pp.pformat(pdfs_in_directory)) print(f"Total .pdf items in the above mentioned directory: {len(pdfs_in_directory.keys())}") - + if len(pdfs_in_directory.keys()) == 0: + raise Exception(f"No .pdf files found inside {directory_paths.path_storage.path_to_extraction_directory}. " + f"Please make sure that you've run the crawler with '-a filename=' " + f"parameter first and that there's actual .pdf files inside the extraction directory") kw_mapper = LoSaxKeywordMapper() pdf_dict1, pdf_dict2 = kw_mapper.extract_pdf_metadata(pdfs_in_directory) self.pdf_dictionary_general = pdf_dict1
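
To sanity-check the keyword mapper in isolation (similar in spirit to the '__main__'-block that patch 04 removed), a minimal sketch could look like this. The sample filename and path are made up for illustration, and the expected values assume the regex group names as shown in the patches above:

    import pprint

    from converter.spiders.scripts.lower_saxony_abi.keyword_mapper import LoSaxKeywordMapper

    # hypothetical input: one filename following the general naming scheme,
    # mapped to a made-up extraction path
    sample_pdfs = {
        '2019MatheWirtGTREAAufg1Lehrer.pdf': '/tmp/zip_download/zip_extract/2019/2019MatheWirt',
    }

    kw_mapper = LoSaxKeywordMapper()
    general_dict, additional_dict = kw_mapper.extract_pdf_metadata(sample_pdfs)
    # the filename matches the general RegEx, so it ends up in the first dictionary:
    pprint.pprint(general_dict)
    # roughly expected: 'discipline' mapped to 'Mathematik', 'year' = '2019',
    # 'intendedEndUserRole' = 'teacher', and keywords such as
    # 'Grafikfähiger Taschenrechner (GTR)', 'Kurs auf erhöhtem Anforderungsniveau (eA)' and 'Aufgabe 1'

Since extract_pdf_metadata() only reads the filenames and paths of the dictionary, no actual .pdf files need to exist for this kind of smoke test; only the crawler itself touches the files on disk when it builds the 'binary' field via get_binary().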