From df170766130463e278d674522bec51eed6a33931 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Thu, 7 Oct 2021 15:06:47 +0200 Subject: [PATCH 01/10] add umwelt_im_unterricht_spider.py v0.0.1 (WIP!) - works in local 'json'-mode - rough first draft; still missing a bunch of metadata-fields (see ToDos) --- .../spiders/umwelt_im_unterricht_spider.py | 277 ++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 converter/spiders/umwelt_im_unterricht_spider.py diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py new file mode 100644 index 00000000..db625b09 --- /dev/null +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -0,0 +1,277 @@ +import logging + +import scrapy +import w3lib.html +from scrapy.spiders import CrawlSpider + +from converter.constants import Constants +from converter.items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \ + LomLifecycleItemloader, LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, ResponseItemLoader, \ + PermissionItemLoader +from converter.spiders.base_classes import LomBase + + +class UmweltImUnterrichtSpider(CrawlSpider, LomBase): + """ + Crawler for Umwelt-im-Unterricht.de + (Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit) + """ + name = "umwelt_im_unterricht_spider" + friendlyName = "Umwelt im Unterricht" + start_urls = [ + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", + # # Typ: Thema der Woche + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", + # # Typ: Unterrichtsvorschlag + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", + # # Typ: Hintergrund (Kontext) + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", + # # Typ: Arbeitsmaterial + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", + # Typ: Video + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", + # # Typ: Bilderserie + ] + version = "0.0.1" # last update: 2021-10-07 + topic_urls = set() # urls that need to be parsed will be added here + topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls + + EDUCATIONAL_CONTEXT_MAPPING: dict = { + # There's only 2 "Zielgruppen": 'Grundschule' and 'Sekundarstufe' + # ToDo: either map Sekundarstufe to both or neither + 'Sekundarstufe': ['Sekundarstufe I', 'Sekundarstufe II'] + } + DISCIPLINE_MAPPING: dict = { + 'Arbeit, Wirtschaft, Technik': 'Arbeitslehre', + 'Ethik, Philosophie, Religion': ['Ethik', 'Philosophie', 'Religion'], + # 'Fächerübergreifend', # ToDo: no mapping available + 'Politik, SoWi, Gesellschaft': ['Politik', 'Sozialkunde', 'Gesellschaftskunde'], + # 'Verbraucherbildung' # ToDo: no mapping available + } + + def getId(self, response=None) -> str: + return response.url + + def getHash(self, response=None) -> str: + date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get() + date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw) + hash_temp = str(date_cleaned_up + self.version) + return hash_temp + + def parse_start_url(self, response, **kwargs): + for url in self.start_urls: + yield scrapy.Request(url=url, callback=self.parse_category_overview_for_individual_topic_urls) + + def 
parse_category_overview_for_individual_topic_urls(self, response, **kwargs): + # logging.debug(f"INSIDE PARSE CATEGORY METHOD: {response.url}") + topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() + # logging.debug(f"TOPIC URLS (RAW) ={topic_urls_raw}") + + for url_ending in topic_urls_raw: + self.topic_urls.add(response.urljoin(url_ending)) + # logging.debug(f"TOPIC URLS ({len(self.topic_urls)}) = {self.topic_urls}") + + # if there's a "Letzte"-Button in the overview, there's more topic_urls to be gathered than the initially + # displayed 10 elements + last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get() + if last_page_button_url is not None: + last_page_button_url = response.urljoin(last_page_button_url) + # Using the "next page"-button until we reach the last page: + if last_page_button_url != response.url: + next_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last next"]/a/@href').get() + if next_page_button_url is not None: + next_url_to_parse = response.urljoin(next_page_button_url) + yield scrapy.Request(url=next_url_to_parse, + callback=self.parse_category_overview_for_individual_topic_urls) + # if last_page_button_url == response.url: + # logging.debug(f"Reached the last page: {response.url}") + # logging.debug(f"{len(self.topic_urls)} individual topic_urls were found: {self.topic_urls}") + for url in self.topic_urls: + # making sure that we don't accidentally crawl individual pages more than once + if url not in self.topic_urls_already_parsed: + yield scrapy.Request(url=url, callback=self.parse) + self.topic_urls_already_parsed.add(url) + # logging.debug(f"topic_urls after yielding them: {len(self.topic_urls)} --- " + # f"topic_urls_already_parsed: {len(self.topic_urls_already_parsed)}") + + def parse(self, response, **kwargs): + base = BaseItemLoader() + # ALL possible keys for the different Item and ItemLoader-classes can be found inside converter/items.py + + # TODO: fill "base"-keys with values for + # - thumbnail recommended (let splash handle it) + # - publisher optional + base.add_value('sourceId', response.url) + date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get() + date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw) + base.add_value('lastModified', date_cleaned_up) + base.add_value('type', Constants.TYPE_MATERIAL) + # base.add_value('thumbnail', thumbnail_url) + + lom = LomBaseItemloader() + + general = LomGeneralItemloader() + # TODO: fill "general"-keys with values for + # - coverage optional + # - structure optional + # - aggregationLevel optional + general.add_value('identifier', response.url) + title = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get() + general.add_value('title', title) + keywords = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall() + if len(keywords) >= 1: + general.add_value('keyword', keywords) + description = response.xpath('/html/head/meta[@name="description"]/@content').get() + general.add_value('description', description) + general.add_value('language', 'de') + + lom.add_value('general', general.load_item()) + + technical = LomTechnicalItemLoader() + # TODO: fill "technical"-keys with values for + # - size optional + # - requirement optional + # - installationRemarks optional + # - otherPlatformRequirements optional + technical.add_value('format', 'text/html') + technical.add_value('location', response.url) + lom.add_value('technical', technical.load_item()) + 
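+        # the 'lifecycle'-loader below holds publisher-related metadata: since all topics on Umwelt-im-Unterricht.de are published by the BMU, the 'organization'-value further down is hardcoded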
+ lifecycle = LomLifecycleItemloader() + # TODO: fill "lifecycle"-keys with values for + # - url recommended + # - email optional + # - uuid optional + lifecycle.add_value('role', 'publisher') + lifecycle.add_value('date', date_cleaned_up) + lifecycle.add_value('organization', 'Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit (BMU)') + lom.add_value('lifecycle', lifecycle.load_item()) + + educational = LomEducationalItemLoader() + # TODO: fill "educational"-keys with values for + # - description recommended (= "Comments on how this learning object is to be used") + # - interactivityType optional + # - interactivityLevel optional + # - semanticDensity optional + # - typicalAgeRange optional + # - difficulty optional + # - typicalLearningTime optional + educational.add_value('language', 'de') + lom.add_value('educational', educational.load_item()) + + # once you've filled "general", "technical", "lifecycle" and "educational" with values, + # the LomBaseItem is loaded into the "base"-BaseItemLoader + base.add_value('lom', lom.load_item()) + + vs = ValuespaceItemLoader() + # for possible values, either consult https://vocabs.openeduhub.de + # or take a look at https://github.com/openeduhub/oeh-metadata-vocabs + # TODO: fill "valuespaces"-keys with values for + # - discipline recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/discipline.ttl) + # - intendedEndUserRole recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/intendedEndUserRole.ttl) + # - learningResourceType recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/learningResourceType.ttl) + # - conditionsOfAccess recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/conditionsOfAccess.ttl) + # - containsAdvertisement recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/containsAdvertisement.ttl) + # - price recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/price.ttl) + # - educationalContext optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/educationalContext.ttl) + # - sourceContentType optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/sourceContentType.ttl) + # - toolCategory optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/toolCategory.ttl) + # - accessibilitySummary optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/accessibilitySummary.ttl) + # - dataProtectionConformity optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/dataProtectionConformity.ttl) + # - fskRating optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/fskRating.ttl) + # - oer optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/oer.ttl) + disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall() + if len(disciplines_raw) >= 1: + disciplines = list() + for discipline_value in disciplines_raw: + # self.debug_discipline_values.add(discipline_value) + if discipline_value in self.DISCIPLINE_MAPPING.keys(): + discipline_value = self.DISCIPLINE_MAPPING.get(discipline_value) + if type(discipline_value) is list: + disciplines.extend(discipline_value) + else: + disciplines.append(discipline_value) + if len(disciplines) >= 1: + vs.add_value('discipline', disciplines) + + educational_context_raw = 
response.xpath('//div[@class="b-cpsuiu-show-targets"]/ul/li/a/text()').getall() + if len(educational_context_raw) >= 1: + educational_context = list() + for educational_context_value in educational_context_raw: + # self.debug_educational_context_values.add(educational_context_value) + if educational_context_value in self.EDUCATIONAL_CONTEXT_MAPPING.keys(): + educational_context_value = self.EDUCATIONAL_CONTEXT_MAPPING.get(educational_context_value) + if type(educational_context_value) is list: + educational_context.extend(educational_context_value) + else: + educational_context.append(educational_context_value) + if len(educational_context) >= 1: + vs.add_value('educationalContext', educational_context) + + base.add_value('valuespaces', vs.load_item()) + + lic = LicenseItemLoader() + # TODO: fill "license"-keys with values for + # - oer recommended ('oer' is automatically set if the 'url'-field above + # is recognized in LICENSE_MAPPINGS: for possible url-mapping values, please take a look at + # LICENSE_MAPPINGS in converter/constants.py) + # - author recommended + # - internal optional + # - expirationDate optional (for content that expires, e.g. ÖR-Mediatheken) + license_url = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() + if license_url is not None: + lic.add_value('url', license_url) + + license_description_raw = response.xpath('//div[@class="cc-licence-info"]').get() + if license_description_raw is not None: + license_description_raw = w3lib.html.remove_tags(license_description_raw) + license_description_raw = w3lib.html.replace_escape_chars(license_description_raw, which_ones="\n", + replace_by=" ") + license_description_raw = w3lib.html.replace_escape_chars(license_description_raw) + license_description = " ".join(license_description_raw.split()) + lic.add_value('description', license_description) + base.add_value('license', lic.load_item()) + + # Either fill the PermissionItemLoader manually (not necessary most of the times) + permissions = PermissionItemLoader() + # or (preferably) call the inherited getPermissions(response)-method + # from converter/spiders/base_classes/lom_base.py by using super().: + # permissions = super().getPermissions(response) + # TODO: if necessary, add/replace values for the following "permissions"-keys + # - public optional + # - groups optional + # - mediacenters optional + # - autoCreateGroups optional + # - autoCreateMediacenters optional + base.add_value('permissions', permissions.load_item()) + + # Either fill the ResponseItemLoader manually (not necessary most of the time) + # response_loader = ResponseItemLoader() + # or (preferably) call the inherited mapResponse(response)-method + # from converter/spiders/base_classes/lom_base.py by using super().: + response_loader = super().mapResponse(response) + # TODO: if necessary, add/replace values for the following "response"-keys + # - url required + # - status optional + # - html optional + # - text optional + # - headers optional + # - cookies optional + # - har optional + base.add_value('response', response_loader.load_item()) + + # once all scrapy.Item are loaded into our "base", we yield the BaseItem by calling the .load_item() method + yield base.load_item() From ea688d56d7b78129c9e533655a940ea54bea26a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Thu, 7 Oct 2021 19:45:55 +0200 Subject: [PATCH 02/10] add LomClassificationItem to sample_spider_alternative.py - initially forgot to add 
'classification' to the spider blueprint since it was never used anywhere --- converter/spiders/sample_spider_alternative.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/converter/spiders/sample_spider_alternative.py b/converter/spiders/sample_spider_alternative.py index 8b1cd07f..5eee54ec 100644 --- a/converter/spiders/sample_spider_alternative.py +++ b/converter/spiders/sample_spider_alternative.py @@ -4,7 +4,7 @@ from converter.constants import Constants from converter.items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \ LomLifecycleItemloader, LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, ResponseItemLoader, \ - PermissionItemLoader + PermissionItemLoader, LomClassificationItemLoader from converter.spiders.base_classes import LomBase @@ -70,6 +70,7 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader: # - LomTechnicalItem required # - LomLifeCycleItem required (multiple possible) # - LomEducationalItem required + # - LomClassificationItem optional general = LomGeneralItemloader() # TODO: fill "general"-keys with values for @@ -132,6 +133,15 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader: # - typicalLearningTime optional lom.add_value('educational', educational.load_item()) + classification = LomClassificationItemLoader() + # TODO: fill "classification"-keys with values for + # - cost optional + # - purpose optional + # - taxonPath optional + # - description optional + # - keyword optional + lom.add_value('classification', classification.load_item()) + # once you've filled "general", "technical", "lifecycle" and "educational" with values, # the LomBaseItem is loaded into the "base"-BaseItemLoader base.add_value('lom', lom.load_item()) From 251dd53aa711ac3d9ca83d99d0c6cc5499a6ce1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Thu, 7 Oct 2021 21:03:55 +0200 Subject: [PATCH 03/10] umwelt_im_unterricht_spider.py (WIP!) 
- fill up most of the remaining metadata-fields -- ToDo: doublecheck the remaining questionable fields - add classification.description (for classification.purpose:'competency' and purpose:'educational objective') --- .../spiders/umwelt_im_unterricht_spider.py | 145 ++++++++---------- 1 file changed, 61 insertions(+), 84 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index db625b09..40ca2d15 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -1,13 +1,11 @@ -import logging - import scrapy import w3lib.html from scrapy.spiders import CrawlSpider from converter.constants import Constants from converter.items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \ - LomLifecycleItemloader, LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, ResponseItemLoader, \ - PermissionItemLoader + LomLifecycleItemloader, LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, \ + LomClassificationItemLoader from converter.spiders.base_classes import LomBase @@ -21,24 +19,22 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): start_urls = [ # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", # # Typ: Thema der Woche - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", - # # Typ: Unterrichtsvorschlag + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", + # Typ: Unterrichtsvorschlag # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", # # Typ: Hintergrund (Kontext) # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", # # Typ: Arbeitsmaterial - "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", - # Typ: Video + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", + # # Typ: Video # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", # # Typ: Bilderserie ] - version = "0.0.1" # last update: 2021-10-07 + version = "0.0.1" # last update: 2021-10-07 topic_urls = set() # urls that need to be parsed will be added here - topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls + topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls EDUCATIONAL_CONTEXT_MAPPING: dict = { - # There's only 2 "Zielgruppen": 'Grundschule' and 'Sekundarstufe' - # ToDo: either map Sekundarstufe to both or neither 'Sekundarstufe': ['Sekundarstufe I', 'Sekundarstufe II'] } DISCIPLINE_MAPPING: dict = { @@ -50,19 +46,16 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): } def getId(self, response=None) -> str: - return response.url + pass def getHash(self, response=None) -> str: - date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get() - date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw) - hash_temp = str(date_cleaned_up + self.version) - return hash_temp + pass def parse_start_url(self, response, **kwargs): for url in self.start_urls: yield scrapy.Request(url=url, callback=self.parse_category_overview_for_individual_topic_urls) - def parse_category_overview_for_individual_topic_urls(self, response, **kwargs): + def parse_category_overview_for_individual_topic_urls(self, response): # 
logging.debug(f"INSIDE PARSE CATEGORY METHOD: {response.url}") topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() # logging.debug(f"TOPIC URLS (RAW) ={topic_urls_raw}") @@ -95,15 +88,14 @@ def parse_category_overview_for_individual_topic_urls(self, response, **kwargs): # f"topic_urls_already_parsed: {len(self.topic_urls_already_parsed)}") def parse(self, response, **kwargs): + current_url: str = response.url base = BaseItemLoader() - # ALL possible keys for the different Item and ItemLoader-classes can be found inside converter/items.py - # TODO: fill "base"-keys with values for - # - thumbnail recommended (let splash handle it) - # - publisher optional base.add_value('sourceId', response.url) date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get() date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw) + hash_temp = str(date_cleaned_up + self.version) + base.add_value('hash', hash_temp) base.add_value('lastModified', date_cleaned_up) base.add_value('type', Constants.TYPE_MATERIAL) # base.add_value('thumbnail', thumbnail_url) @@ -112,7 +104,6 @@ def parse(self, response, **kwargs): general = LomGeneralItemloader() # TODO: fill "general"-keys with values for - # - coverage optional # - structure optional # - aggregationLevel optional general.add_value('identifier', response.url) @@ -128,22 +119,14 @@ def parse(self, response, **kwargs): lom.add_value('general', general.load_item()) technical = LomTechnicalItemLoader() - # TODO: fill "technical"-keys with values for - # - size optional - # - requirement optional - # - installationRemarks optional - # - otherPlatformRequirements optional technical.add_value('format', 'text/html') technical.add_value('location', response.url) lom.add_value('technical', technical.load_item()) lifecycle = LomLifecycleItemloader() - # TODO: fill "lifecycle"-keys with values for - # - url recommended - # - email optional - # - uuid optional lifecycle.add_value('role', 'publisher') lifecycle.add_value('date', date_cleaned_up) + lifecycle.add_value('url', "https://www.umwelt-im-unterricht.de/impressum/") lifecycle.add_value('organization', 'Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit (BMU)') lom.add_value('lifecycle', lifecycle.load_item()) @@ -159,40 +142,56 @@ def parse(self, response, **kwargs): educational.add_value('language', 'de') lom.add_value('educational', educational.load_item()) - # once you've filled "general", "technical", "lifecycle" and "educational" with values, - # the LomBaseItem is loaded into the "base"-BaseItemLoader + # ToDo: didactic_comment / competencies + classification = LomClassificationItemLoader() + + if "/wochenthemen/" in current_url: + classification.add_value('purpose', 'educational objective') + # didactic comments are only part of "Thema der Woche" + didactic_comment = response.xpath('//div[@class="c-collapse-content js-collapse-content"]').get() + if didactic_comment is not None: + didactic_comment = w3lib.html.remove_tags(didactic_comment) + # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment, which_ones='\t', replace_by=" ") + # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment) + didactic_comment = " ".join(didactic_comment.split()) + if didactic_comment.endswith(".mehr lesenweniger lesen"): + didactic_comment = didactic_comment.replace("mehr lesenweniger lesen", "") + # ToDo: make sure which string format looks best in edu-sharing (cleaned up <-> with escape chars) + 
classification.add_value('description', didactic_comment) + + if "/unterrichtsvorschlaege/" in current_url: + classification.add_value('purpose', 'competency') + competency_description: list = response.xpath('//div[@class="b-cpsuiu-show-description"]/*[not(' + '@class="cc-licence-info")]').getall() + # competency_description will grab the whole div-element, but EXCLUDE the "license"-container + if len(competency_description) >= 1: + # only if the list of strings is not empty, we'll try to type-convert it to a string (and clean its + # formatting up) + competency_description: str = " ".join(competency_description) + competency_description = w3lib.html.remove_tags(competency_description) + classification.add_value('description', competency_description) + + lom.add_value('classification', classification.load_item()) + base.add_value('lom', lom.load_item()) vs = ValuespaceItemLoader() # for possible values, either consult https://vocabs.openeduhub.de # or take a look at https://github.com/openeduhub/oeh-metadata-vocabs # TODO: fill "valuespaces"-keys with values for - # - discipline recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/discipline.ttl) - # - intendedEndUserRole recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/intendedEndUserRole.ttl) # - learningResourceType recommended # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/learningResourceType.ttl) - # - conditionsOfAccess recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/conditionsOfAccess.ttl) - # - containsAdvertisement recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/containsAdvertisement.ttl) - # - price recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/price.ttl) - # - educationalContext optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/educationalContext.ttl) - # - sourceContentType optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/sourceContentType.ttl) - # - toolCategory optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/toolCategory.ttl) - # - accessibilitySummary optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/accessibilitySummary.ttl) - # - dataProtectionConformity optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/dataProtectionConformity.ttl) - # - fskRating optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/fskRating.ttl) - # - oer optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/oer.ttl) + vs.add_value('price', 'no') + vs.add_value('containsAdvertisement', 'no') + vs.add_value('conditionsOfAccess', 'no login') + vs.add_value('intendedEndUserRole', 'teacher') + vs.add_value('sourceContentType', 'Unterrichtsmaterial- und Aufgaben-Sammlung') + vs.add_value('accessibilitySummary', 'Not tested') # ToDo: check if the accessibility has changed + # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/ + vs.add_value('dataProtectionConformity', 'Sensible data collection') # ToDo: DSGVO-compliant? + # see: https://www.umwelt-im-unterricht.de/datenschutz/ + vs.add_value('oer', 'partly OER') # ToDo: everything OER? only partly? how should it be set?
+ # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/ disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall() if len(disciplines_raw) >= 1: disciplines = list() for discipline_value in disciplines_raw: # self.debug_discipline_values.add(discipline_value) if discipline_value in self.DISCIPLINE_MAPPING.keys(): discipline_value = self.DISCIPLINE_MAPPING.get(discipline_value) if type(discipline_value) is list: disciplines.extend(discipline_value) else: disciplines.append(discipline_value) if len(disciplines) >= 1: vs.add_value('discipline', disciplines) @@ -228,9 +227,6 @@ def parse(self, response, **kwargs): # - oer recommended ('oer' is automatically set if the 'url'-field above # is recognized in LICENSE_MAPPINGS: for possible url-mapping values, please take a look at # LICENSE_MAPPINGS in converter/constants.py) - # - author recommended - # - internal optional - # - expirationDate optional (for content that expires, e.g. ÖR-Mediatheken) license_url = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() if license_url is not None: lic.add_value('url', license_url) @@ -240,37 +236,18 @@ def parse(self, response, **kwargs): license_description_raw = w3lib.html.remove_tags(license_description_raw) license_description_raw = w3lib.html.replace_escape_chars(license_description_raw, which_ones="\n", replace_by=" ") + # if we would replace_escape_chars() straight away, there would be words stuck together that don't belong + # together. just replacing \n with a whitespace is enough to keep the structure of the string intact. license_description_raw = w3lib.html.replace_escape_chars(license_description_raw) license_description = " ".join(license_description_raw.split()) + # making sure that there's only 1 whitespace between words, not 4+ when the original string had several \t lic.add_value('description', license_description) base.add_value('license', lic.load_item()) - # Either fill the PermissionItemLoader manually (not necessary most of the times) - permissions = PermissionItemLoader() - # or (preferably) call the inherited getPermissions(response)-method - # from converter/spiders/base_classes/lom_base.py by using super().: - # permissions = super().getPermissions(response) - # TODO: if necessary, add/replace values for the following "permissions"-keys - # - public optional - # - groups optional - # - mediacenters optional - # - autoCreateGroups optional - # - autoCreateMediacenters optional + permissions = super().getPermissions(response) base.add_value('permissions', permissions.load_item()) - # Either fill the ResponseItemLoader manually (not necessary most of the time) - # response_loader = ResponseItemLoader() - # or (preferably) call the inherited mapResponse(response)-method - # from converter/spiders/base_classes/lom_base.py by using super().: response_loader = super().mapResponse(response) - # TODO: if necessary, add/replace values for the following "response"-keys - # - url required - # - status optional - # - html optional - # - text optional - # - headers optional - # - cookies optional - # - har optional base.add_value('response', response_loader.load_item()) # once all scrapy.Item are loaded into our "base", we yield the BaseItem by calling the .load_item() method From 798b80723e925cb5bec032a1be3eecf689b8068d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Fri, 8 Oct 2021 11:46:14 +0200 Subject: [PATCH 04/10] umwelt_im_unterricht_spider.py (WIP!)
- move 'didactic_comment' to educational.description - after feedback: clarified the remaining metadata questions from the ToDo-list - next ToDo: set learningResourceType depending on which material-type is currently getting crawled --- .../spiders/umwelt_im_unterricht_spider.py | 53 ++++++------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 40ca2d15..abdfeb9d 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -19,8 +19,8 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): start_urls = [ # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", # # Typ: Thema der Woche - "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", - # Typ: Unterrichtsvorschlag + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", + # # Typ: Unterrichtsvorschlag # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", # # Typ: Hintergrund (Kontext) # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", @@ -40,9 +40,8 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): DISCIPLINE_MAPPING: dict = { 'Arbeit, Wirtschaft, Technik': 'Arbeitslehre', 'Ethik, Philosophie, Religion': ['Ethik', 'Philosophie', 'Religion'], - # 'Fächerübergreifend', # ToDo: no mapping available - 'Politik, SoWi, Gesellschaft': ['Politik', 'Sozialkunde', 'Gesellschaftskunde'], - # 'Verbraucherbildung' # ToDo: no mapping available + 'Fächerübergreifend': 'Allgemein', # ToDo: no mapping available + 'Politik, SoWi, Gesellschaft': ['Politik', 'Sozialkunde', 'Gesellschaftskunde'] } def getId(self, response=None) -> str: @@ -103,9 +102,6 @@ def parse(self, response, **kwargs): lom = LomBaseItemloader() general = LomGeneralItemloader() - # TODO: fill "general"-keys with values for - # - structure optional - # - aggregationLevel optional general.add_value('identifier', response.url) title = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get() general.add_value('title', title) @@ -131,22 +127,12 @@ def parse(self, response, **kwargs): lom.add_value('lifecycle', lifecycle.load_item()) educational = LomEducationalItemLoader() - # TODO: fill "educational"-keys with values for - # - description recommended (= "Comments on how this learning object is to be used") - # - interactivityType optional - # - interactivityLevel optional - # - semanticDensity optional - # - typicalAgeRange optional - # - difficulty optional - # - typicalLearningTime optional educational.add_value('language', 'de') - lom.add_value('educational', educational.load_item()) - - # ToDo: didactic_comment / competencies - classification = LomClassificationItemLoader() + # TODO: didactic comment could be either one of these: + # - educational.description + # - classification.description (with classification.purpose set to 'educational objective') if "/wochenthemen/" in current_url: - classification.add_value('purpose', 'educational objective') # didactic comments are only part of "Thema der Woche" didactic_comment = response.xpath('//div[@class="c-collapse-content js-collapse-content"]').get() if didactic_comment is not None: @@ -154,11 +140,15 @@ def parse(self, response, **kwargs): # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment, which_ones='\t', replace_by=" ") # 
didactic_comment = w3lib.html.replace_escape_chars(didactic_comment) didactic_comment = " ".join(didactic_comment.split()) - if didactic_comment.endswith(".mehr lesenweniger lesen"): + if didactic_comment.endswith("mehr lesenweniger lesen"): + # the button-description of the expandable info-box ends up in the string, therefore removing it: didactic_comment = didactic_comment.replace("mehr lesenweniger lesen", "") # ToDo: make sure which string format looks best in edu-sharing (cleaned up <-> with escape chars) - classification.add_value('description', didactic_comment) + educational.add_value('description', didactic_comment) + lom.add_value('educational', educational.load_item()) + + classification = LomClassificationItemLoader() if "/unterrichtsvorschlaege/" in current_url: classification.add_value('purpose', 'competency') competency_description: list = response.xpath('//div[@class="b-cpsuiu-show-description"]/*[not(' '@class="cc-licence-info")]').getall() @@ -172,25 +162,18 @@ def parse(self, response, **kwargs): classification.add_value('description', competency_description) lom.add_value('classification', classification.load_item()) - base.add_value('lom', lom.load_item()) vs = ValuespaceItemLoader() - # for possible values, either consult https://vocabs.openeduhub.de - # or take a look at https://github.com/openeduhub/oeh-metadata-vocabs - # TODO: fill "valuespaces"-keys with values for - # - learningResourceType recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/learningResourceType.ttl) + # ToDo: Set 'learningResourceType' depending on the material that's being crawled, recognize it by url vs.add_value('price', 'no') vs.add_value('containsAdvertisement', 'no') vs.add_value('conditionsOfAccess', 'no login') vs.add_value('intendedEndUserRole', 'teacher') - vs.add_value('sourceContentType', 'Unterrichtsmaterial- und Aufgaben-Sammlung') - vs.add_value('accessibilitySummary', 'Not tested') # ToDo: check if the accessibility has changed + vs.add_value('accessibilitySummary', 'Not tested') # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/ - vs.add_value('dataProtectionConformity', 'Sensible data collection') # ToDo: DSGVO-compliant? + vs.add_value('dataProtectionConformity', 'Sensible data collection') # see: https://www.umwelt-im-unterricht.de/datenschutz/ - vs.add_value('oer', 'partly OER') # ToDo: everything OER? only partly? how should it be set? # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/ disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall() if len(disciplines_raw) >= 1: disciplines = list() @@ -223,9 +206,6 @@ def parse(self, response, **kwargs): base.add_value('valuespaces', vs.load_item()) lic = LicenseItemLoader() - # TODO: fill "license"-keys with values for - # - oer recommended ('oer' is automatically set if the 'url'-field above - # is recognized in LICENSE_MAPPINGS: for possible url-mapping values, please take a look at - # LICENSE_MAPPINGS in converter/constants.py) license_url = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() if license_url is not None: lic.add_value('url', license_url) From bf227ecb29d76c9d47ea763a023b8b5a5c141e5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Fri, 8 Oct 2021 12:41:26 +0200 Subject: [PATCH 05/10] umwelt_im_unterricht_spider.py (WIP!)
- set learningResourceType depending on the URL-structure --- .../spiders/umwelt_im_unterricht_spider.py | 43 +++++++++++++------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index abdfeb9d..5ec0cf85 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -17,18 +17,18 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): name = "umwelt_im_unterricht_spider" friendlyName = "Umwelt im Unterricht" start_urls = [ - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", - # # Typ: Thema der Woche - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", - # # Typ: Unterrichtsvorschlag - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", - # # Typ: Hintergrund (Kontext) - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", - # # Typ: Arbeitsmaterial - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", - # # Typ: Video - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", - # # Typ: Bilderserie + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", + # Typ: Thema der Woche + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", + # Typ: Unterrichtsvorschlag + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", + # Typ: Hintergrund (Kontext) + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", + # Typ: Arbeitsmaterial + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", + # Typ: Video + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", + # Typ: Bilderserie ] version = "0.0.1" # last update: 2021-10-07 topic_urls = set() # urls that need to be parsed will be added here @@ -129,7 +129,7 @@ def parse(self, response, **kwargs): educational = LomEducationalItemLoader() educational.add_value('language', 'de') - # TODO: didactic comment could be either one of these: + # TODO: a didactic comment could fit into either one of these: # - educational.description # - classification.description (with classification.purpose set to 'educational objective') if "/wochenthemen/" in current_url: @@ -165,7 +165,22 @@ def parse(self, response, **kwargs): base.add_value('lom', lom.load_item()) vs = ValuespaceItemLoader() - # ToDo: Set 'learningResourceType' depending on the material that's being crawled, recognize it by url + + # depending on the website-category, we need to set a specific learningResourceType + # because the value 'website' for all crawled items would not be helpful enough + if "/wochenthemen/" in current_url or "/unterrichtsvorschlaege/" in current_url: + vs.add_value('learningResourceType', 'lesson plan') + if "/hintergrund/" in current_url: + vs.add_value('learningResourceType', 'Text') + if "/medien/dateien/" in current_url: + # topics categorized as "Arbeitsmaterial" offer customizable worksheets to teachers + vs.add_value('learningResourceType', 'worksheet') + if "/medien/videos/" in current_url: + vs.add_value('learningResourceType', 'video') + if "/medien/bilder/" in current_url: + # topics categorized as "Bilderserie" hold several images in a 
gallery (with individual licenses) + vs.add_value('learningResourceType', 'image') + vs.add_value('price', 'no') vs.add_value('containsAdvertisement', 'no') vs.add_value('conditionsOfAccess', 'no login') From b57cb0640a89bef787c876dd16de8a8ab66ebabc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Fri, 8 Oct 2021 13:20:27 +0200 Subject: [PATCH 06/10] umwelt_im_unterricht_spider.py v0.0.2 - fix license_url (replace "http://" by "https://") --- converter/spiders/umwelt_im_unterricht_spider.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 5ec0cf85..976c83f3 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -30,7 +30,7 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", # Typ: Bilderserie ] - version = "0.0.1" # last update: 2021-10-07 + version = "0.0.2" # last update: 2021-10-08 topic_urls = set() # urls that need to be parsed will be added here topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls @@ -221,8 +221,10 @@ def parse(self, response, **kwargs): base.add_value('valuespaces', vs.load_item()) lic = LicenseItemLoader() - license_url = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() + license_url: str = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() if license_url is not None: + if license_url.startswith("http://"): + license_url = license_url.replace("http://", "https://") lic.add_value('url', license_url) license_description_raw = response.xpath('//div[@class="cc-licence-info"]').get() From 5b3a69209d4eaa075445c5664d54fcd064a431d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Mon, 11 Oct 2021 13:23:24 +0200 Subject: [PATCH 07/10] add documentation / scrapy contracts --- .../spiders/umwelt_im_unterricht_spider.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 976c83f3..015d859f 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -40,7 +40,7 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): DISCIPLINE_MAPPING: dict = { 'Arbeit, Wirtschaft, Technik': 'Arbeitslehre', 'Ethik, Philosophie, Religion': ['Ethik', 'Philosophie', 'Religion'], - 'Fächerübergreifend': 'Allgemein', # ToDo: no mapping available + 'Fächerübergreifend': 'Allgemein', 'Politik, SoWi, Gesellschaft': ['Politik', 'Sozialkunde', 'Gesellschaftskunde'] } @@ -55,6 +55,12 @@ def parse_start_url(self, response, **kwargs): yield scrapy.Request(url=url, callback=self.parse_category_overview_for_individual_topic_urls) def parse_category_overview_for_individual_topic_urls(self, response): + """ + + Scrapy Contracts: + @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons + @returns requests 10 + """ # logging.debug(f"INSIDE PARSE CATEGORY METHOD: {response.url}") topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() # logging.debug(f"TOPIC URLS (RAW) ={topic_urls_raw}") @@ -68,10 +74,12 @@ def 
parse_category_overview_for_individual_topic_urls(self, response): last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get() if last_page_button_url is not None: last_page_button_url = response.urljoin(last_page_button_url) - # Using the "next page"-button until we reach the last page: + # Using the "next page"-button to navigate through all individual topics until we reach the last page: if last_page_button_url != response.url: next_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last next"]/a/@href').get() if next_page_button_url is not None: + # ToDo: optimize the page navigation by making it independent of the 'next'-button + # (by manually 'building' the url_strings from 1 to "last-page" with RegEx) next_url_to_parse = response.urljoin(next_page_button_url) yield scrapy.Request(url=next_url_to_parse, callback=self.parse_category_overview_for_individual_topic_urls) @@ -87,6 +95,12 @@ def parse_category_overview_for_individual_topic_urls(self, response): # f"topic_urls_already_parsed: {len(self.topic_urls_already_parsed)}") def parse(self, response, **kwargs): + """ + + Scrapy Contracts: + @url https://www.umwelt-im-unterricht.de/hintergrund/generationengerechtigkeit-klimaschutz-und-eine-lebenswerte-zukunft/ + @returns item 1 + """ current_url: str = response.url base = BaseItemLoader() @@ -107,6 +121,7 @@ def parse(self, response, **kwargs): general.add_value('title', title) keywords = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall() if len(keywords) >= 1: + # only add keywords if the list isn't empty general.add_value('keyword', keywords) description = response.xpath('/html/head/meta[@name="description"]/@content').get() general.add_value('description', description) @@ -141,8 +156,10 @@ def parse(self, response, **kwargs): # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment) didactic_comment = " ".join(didactic_comment.split()) if didactic_comment.endswith("mehr lesenweniger lesen"): - # the button-description of the expandable info-box ends up in the string, therefore removing it: + # the button-description of the expandable info-box ends up in the string, + # therefore we are manually removing it: didactic_comment = didactic_comment.replace("mehr lesenweniger lesen", "") + # since there's currently no way to confirm how the string looks in the web-interface: # ToDo: make sure which string format looks best in edu-sharing (cleaned up <-> with escape chars) educational.add_value('description', didactic_comment) @@ -153,7 +170,8 @@ def parse(self, response, **kwargs): classification.add_value('purpose', 'competency') competency_description: list = response.xpath('//div[@class="b-cpsuiu-show-description"]/*[not(' '@class="cc-licence-info")]').getall() - # competency_description will grab the whole div-element, but EXCLUDE the "license"-container + # the xpath-expression for competency_description will grab the whole div-element, + # but EXCLUDE the "license"-container (if the license-description exists, it's always part of the same div) if len(competency_description) >= 1: # only if the list of strings is not empty, we'll try to type-convert it to a string (and clean its # formatting up) @@ -185,11 +203,12 @@ def parse(self, response, **kwargs): vs.add_value('containsAdvertisement', 'no') vs.add_value('conditionsOfAccess', 'no login') vs.add_value('intendedEndUserRole', 'teacher') + # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/ vs.add_value('accessibilitySummary', 
'Not tested') # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/ vs.add_value('dataProtectionConformity', 'Sensible data collection') # see: https://www.umwelt-im-unterricht.de/datenschutz/ - # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/ + disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall() if len(disciplines_raw) >= 1: disciplines = list() @@ -197,6 +216,8 @@ def parse(self, response, **kwargs): # self.debug_discipline_values.add(discipline_value) if discipline_value in self.DISCIPLINE_MAPPING.keys(): discipline_value = self.DISCIPLINE_MAPPING.get(discipline_value) + # since the mapping value can either be a single string OR a list of strings, we need to make sure that + # our 'disciplines'-list is a list of strings (not a list with nested lists): if type(discipline_value) is list: disciplines.extend(discipline_value) else: @@ -206,6 +227,7 @@ def parse(self, response, **kwargs): educational_context_raw = response.xpath('//div[@class="b-cpsuiu-show-targets"]/ul/li/a/text()').getall() if len(educational_context_raw) >= 1: + # the educationalContext-mapping is only done when there's at least one educational_context found educational_context = list() for educational_context_value in educational_context_raw: # self.debug_educational_context_values.add(educational_context_value) From 9319b5764629f7f65275dbcf0b46a287dbf4f93b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Mon, 11 Oct 2021 20:48:24 +0200 Subject: [PATCH 08/10] rework crawler navigation through overview_urls - no longer uses the "next page"-button, but instead builds the url-list by splitting the "last page"-button up into a URL and its page-parameter (int) -- this makes sure that we don't lose several pages and topics at once if scrapy gets only a timeout as a response from one overview subpage --- .../spiders/umwelt_im_unterricht_spider.py | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 015d859f..2810f78d 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -1,3 +1,5 @@ +import re + import scrapy import w3lib.html from scrapy.spiders import CrawlSpider @@ -32,7 +34,8 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): ] version = "0.0.2" # last update: 2021-10-08 topic_urls = set() # urls that need to be parsed will be added here - topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls + topic_urls_parsed = set() # this set is used for 'checking off' already parsed urls + overview_urls_already_parsed = set() # this set is used for 'checking off' already parsed overview_pages EDUCATIONAL_CONTEXT_MAPPING: dict = { 'Sekundarstufe': ['Sekundarstufe I', 'Sekundarstufe II'] @@ -52,9 +55,9 @@ def getHash(self, response=None) -> str: def parse_start_url(self, response, **kwargs): for url in self.start_urls: - yield scrapy.Request(url=url, callback=self.parse_category_overview_for_individual_topic_urls) + yield scrapy.Request(url=url, callback=self.parse_category_overview_for_topics_and_subpages) - def parse_category_overview_for_individual_topic_urls(self, response): + def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.Response): """ Scrapy Contracts: @@ -72,27 +75,30 @@ def 
parse_category_overview_for_individual_topic_urls(self, response): + def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.Response): """ Scrapy Contracts: @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons @returns requests 10 """ topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() @@ -72,27 +75,30 @@ def parse_category_overview_for_individual_topic_urls(self, response): # if there's a "Letzte"-Button in the overview, there's more topic_urls to be gathered than the initially # displayed 10 elements last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get() + page_number_regex = re.compile(r'(?P<url_with_parameters>.*&tx_solr%5Bpage%5D=)(?P<nr>\d+)') + overview_urls_parsed: set = set() + if last_page_button_url is not None: - last_page_button_url = response.urljoin(last_page_button_url) - # Using the "next page"-button to navigate through all individual topics until we reach the last page: - if last_page_button_url != response.url: - next_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last next"]/a/@href').get() - if next_page_button_url is not None: - # ToDo: optimize the page navigation by making it independent of the 'next'-button - # (by manually 'building' the url_strings from 1 to "last-page" with RegEx) - next_url_to_parse = response.urljoin(next_page_button_url) - yield scrapy.Request(url=next_url_to_parse, - callback=self.parse_category_overview_for_individual_topic_urls) + page_number_dict: dict = page_number_regex.search(last_page_button_url).groupdict() + url_without_page_parameter = response.urljoin(page_number_dict.get('url_with_parameters')) + last_page_number = int(page_number_dict.get('nr')) + for i in range(2, last_page_number + 1): + # since the initial url in start_urls already counts as page 1, + # we're iterating from page 2 to the last page + next_overview_subpage_to_crawl = str(url_without_page_parameter + str(i)) + if next_overview_subpage_to_crawl not in self.overview_urls_already_parsed: + yield scrapy.Request(url=next_overview_subpage_to_crawl, + callback=self.parse_category_overview_for_topics_and_subpages) + overview_urls_parsed.add(next_overview_subpage_to_crawl) + self.overview_urls_already_parsed.update(overview_urls_parsed) + + parsed_urls: set = set() for url in self.topic_urls: # making sure that we don't accidentally crawl individual pages more than once - if url not in self.topic_urls_already_parsed: + if url not in self.topic_urls_parsed: yield scrapy.Request(url=url, callback=self.parse) - self.topic_urls_already_parsed.add(url) - # logging.debug(f"topic_urls after yielding them: {len(self.topic_urls)} --- " - # f"topic_urls_already_parsed: {len(self.topic_urls_already_parsed)}") + parsed_urls.add(url) + self.topic_urls_parsed.update(parsed_urls) def parse(self, response, **kwargs): """ From 23817ecad2040037b7504fe961b4389d7b1be51b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Tue, 12 Oct 2021 11:40:34 +0200 Subject: [PATCH 09/10] minimal code cleanup, add documentation --- .../spiders/umwelt_im_unterricht_spider.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 2810f78d..c8b90789 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -34,7 +34,7 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): ] version = "0.0.2" # last update: 2021-10-08 topic_urls = set() # urls that need to be parsed will be added here - topic_urls_parsed = set() # this set is
used for 'checking off' already parsed urls + topic_urls_parsed = set() # this set is used for 'checking off' already parsed (individual) topic urls overview_urls_already_parsed = set() # this set is used for 'checking off' already parsed overview_pages EDUCATIONAL_CONTEXT_MAPPING: dict = { @@ -64,38 +64,37 @@ def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http. @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons @returns requests 10 """ - # logging.debug(f"INSIDE PARSE CATEGORY METHOD: {response.url}") topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() - # logging.debug(f"TOPIC URLS (RAW) ={topic_urls_raw}") for url_ending in topic_urls_raw: self.topic_urls.add(response.urljoin(url_ending)) - # logging.debug(f"TOPIC URLS ({len(self.topic_urls)}) = {self.topic_urls}") # if there's a "Letzte"-Button in the overview, there's more topic_urls to be gathered than the initially # displayed 10 elements last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get() + # the string last_page_button_url typically looks like this: + # "/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images&tx_solr%5Bpage%5D=8" page_number_regex = re.compile(r'(?P<url_with_parameters>.*&tx_solr%5Bpage%5D=)(?P<nr>\d+)') - overview_urls_parsed: set = set() + overview_urls_parsed: set = set() # temporary set used for checking off already visited URLs if last_page_button_url is not None: page_number_dict: dict = page_number_regex.search(last_page_button_url).groupdict() url_without_page_parameter = response.urljoin(page_number_dict.get('url_with_parameters')) last_page_number = int(page_number_dict.get('nr')) for i in range(2, last_page_number + 1): - # since the initial url in start_urls already counts as page 1, - # we're iterating from page 2 to the last page + # the initial url from start_urls already counts as page 1, therefore we're iterating + # from page 2 to the last page next_overview_subpage_to_crawl = str(url_without_page_parameter + str(i)) if next_overview_subpage_to_crawl not in self.overview_urls_already_parsed: yield scrapy.Request(url=next_overview_subpage_to_crawl, callback=self.parse_category_overview_for_topics_and_subpages) overview_urls_parsed.add(next_overview_subpage_to_crawl) - self.overview_urls_already_parsed.update(overview_urls_parsed) + self.overview_urls_already_parsed.update(overview_urls_parsed) # checking off the (10) URLs that we yielded - parsed_urls: set = set() + parsed_urls: set = set() # temporary set used for checking off already visited topics for url in self.topic_urls: - # making sure that we don't accidentally crawl individual pages more than once if url not in self.topic_urls_parsed: + # making sure that we don't accidentally crawl individual pages more than once yield scrapy.Request(url=url, callback=self.parse) parsed_urls.add(url) self.topic_urls_parsed.update(parsed_urls) @@ -252,6 +251,7 @@ def parse(self, response, **kwargs): license_url: str = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() if license_url is not None: if license_url.startswith("http://"): + # the license-mapper expects urls that are in https:// format, but UIU uses http:// links to CC-licenses license_url = license_url.replace("http://", "https://") lic.add_value('url', license_url) @@ -264,7 +264,7 @@ def parse(self, response, **kwargs): # together. just replacing \n with a whitespace is enough to keep the structure of the string intact.
From 32dc5645ea5f8c34ac894438e4561a60bf0c9fdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com>
Date: Tue, 12 Oct 2021 12:22:07 +0200
Subject: [PATCH 10/10] add type hinting, descriptions

---
 .../spiders/umwelt_im_unterricht_spider.py    | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py
index c8b90789..4dc5f0f8 100644
--- a/converter/spiders/umwelt_im_unterricht_spider.py
+++ b/converter/spiders/umwelt_im_unterricht_spider.py
@@ -59,6 +59,9 @@ def parse_start_url(self, response, **kwargs):

     def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.Response):
         """
+        Crawls an overview page of a "type"-category (e.g. "Hintergrund", "Bilderserie" etc.) for subpages and topics.
+        If the overview has subpages, it will recursively yield additional scrapy.Requests to the overview-subpages.
+        Afterwards it yields the (10) individual topic_urls (per overview page) to the parse()-method.

         Scrapy Contracts:
         @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons
         @returns requests 10
         """
@@ -71,7 +74,7 @@
         # if there's a "Letzte"-Button in the overview, there's more topic_urls to be gathered than the initially
         # displayed 10 elements
-        last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get()
+        last_page_button_url: str = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get()
         # the string last_page_button_url typically looks like this:
         # "/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images&tx_solr%5Bpage%5D=8"
         page_number_regex = re.compile(r'(?P<url_with_parameters>.*&tx_solr%5Bpage%5D=)(?P<nr>\d+)')
@@ -79,7 +82,7 @@ def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.
         overview_urls_parsed: set = set()  # temporary set used for checking off already visited URLs
         if last_page_button_url is not None:
             page_number_dict: dict = page_number_regex.search(last_page_button_url).groupdict()
-            url_without_page_parameter = response.urljoin(page_number_dict.get('url_with_parameters'))
+            url_without_page_parameter: str = response.urljoin(page_number_dict.get('url_with_parameters'))
             last_page_number = int(page_number_dict.get('nr'))
             for i in range(2, last_page_number + 1):
                 # the initial url from start_urls already counts as page 1, therefore we're iterating
                 # from page 2 to the last page
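Worth noting before the next hunk: the "Scrapy Contracts" blocks that these docstrings extend are executable checks, not just documentation. Running `scrapy check <spider_name>` downloads each @url and verifies the callback's output against the @returns line. A minimal toy example (spider name and URL are invented for this sketch):

    import scrapy

    class ContractsDemoSpider(scrapy.Spider):
        name = "contracts_demo"

        def parse(self, response, **kwargs):
            """
            Scrapy Contracts:

            @url https://example.org/
            @returns items 1 1
            """
            # yields exactly one item, satisfying "@returns items 1 1" (min 1, max 1)
            yield {"url": response.url}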
@@ -99,8 +102,9 @@
                 parsed_urls.add(url)
         self.topic_urls_parsed.update(parsed_urls)

-    def parse(self, response, **kwargs):
+    def parse(self, response: scrapy.http.Response, **kwargs):
         """
+        Parses an individual topic url for metadata and yields a BaseItem.

         Scrapy Contracts:
         @url https://www.umwelt-im-unterricht.de/hintergrund/generationengerechtigkeit-klimaschutz-und-eine-lebenswerte-zukunft/
         @returns items 1
         """
         base = BaseItemLoader()
         base.add_value('sourceId', response.url)
-        date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get()
-        date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw)
+        date_raw: str = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get()
+        date_cleaned_up: str = w3lib.html.strip_html5_whitespace(date_raw)
         hash_temp = str(date_cleaned_up + self.version)
         base.add_value('hash', hash_temp)
         base.add_value('lastModified', date_cleaned_up)
@@ -122,13 +126,13 @@ def parse(self, response, **kwargs):

         general = LomGeneralItemloader()
         general.add_value('identifier', response.url)
-        title = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get()
+        title: str = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get()
         general.add_value('title', title)
-        keywords = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
+        keywords: list = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
         if len(keywords) >= 1:  # only add keywords if the list isn't empty
             general.add_value('keyword', keywords)
-        description = response.xpath('/html/head/meta[@name="description"]/@content').get()
+        description: str = response.xpath('/html/head/meta[@name="description"]/@content').get()
         general.add_value('description', description)
         general.add_value('language', 'de')
@@ -214,7 +218,7 @@ def parse(self, response, **kwargs):
         vs.add_value('dataProtectionConformity', 'Sensible data collection')
         # see: https://www.umwelt-im-unterricht.de/datenschutz/

-        disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
+        disciplines_raw: list = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
         if len(disciplines_raw) >= 1:
             disciplines = list()
             for discipline_value in disciplines_raw:
@@ -255,7 +259,7 @@ def parse(self, response, **kwargs):
                 license_url = license_url.replace("http://", "https://")
             lic.add_value('url', license_url)

-        license_description_raw = response.xpath('//div[@class="cc-licence-info"]').get()
+        license_description_raw: str = response.xpath('//div[@class="cc-licence-info"]').get()
         if license_description_raw is not None:
             license_description_raw = w3lib.html.remove_tags(license_description_raw)
             license_description_raw = w3lib.html.replace_escape_chars(license_description_raw, which_ones="\n",
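A closing note on the hash visible in the last hunks: it concatenates the page's cleaned-up last-modified date with the spider version, so an item registers as changed whenever the source page is updated or the crawler version is bumped. A small sketch of that idea (hypothetical helper mirroring the inline code in parse(); the sample values are invented):

    import w3lib.html

    def build_item_hash(date_raw: str, spider_version: str) -> str:
        # strip leading/trailing HTML5 whitespace from the scraped date string
        date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw)
        return str(date_cleaned_up + spider_version)

    # build_item_hash("  09.09.2021 ", "0.0.2") returns "09.09.20210.0.2"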