From df170766130463e278d674522bec51eed6a33931 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Thu, 7 Oct 2021 15:06:47 +0200 Subject: [PATCH 01/10] add umwelt_im_unterricht_spider.py v0.0.1 (WIP!) - works in local 'json'-mode - rough first draft; still missing a bunch of metadata-fields (see ToDos) --- .../spiders/umwelt_im_unterricht_spider.py | 277 ++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 converter/spiders/umwelt_im_unterricht_spider.py diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py new file mode 100644 index 00000000..db625b09 --- /dev/null +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -0,0 +1,277 @@ +import logging + +import scrapy +import w3lib.html +from scrapy.spiders import CrawlSpider + +from converter.constants import Constants +from converter.items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \ + LomLifecycleItemloader, LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, ResponseItemLoader, \ + PermissionItemLoader +from converter.spiders.base_classes import LomBase + + +class UmweltImUnterrichtSpider(CrawlSpider, LomBase): + """ + Crawler for Umwelt-im-Unterricht.de + (Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit) + """ + name = "umwelt_im_unterricht_spider" + friendlyName = "Umwelt im Unterricht" + start_urls = [ + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", + # # Typ: Thema der Woche + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", + # # Typ: Unterrichtsvorschlag + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", + # # Typ: Hintergrund (Kontext) + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", + # # Typ: Arbeitsmaterial + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", + # Typ: Video + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", + # # Typ: Bilderserie + ] + version = "0.0.1" # last update: 2021-10-07 + topic_urls = set() # urls that need to be parsed will be added here + topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls + + EDUCATIONAL_CONTEXT_MAPPING: dict = { + # There's only 2 "Zielgruppen": 'Grundschule' and 'Sekundarstufe' + # ToDo: either map Sekundarstufe to both or neither + 'Sekundarstufe': ['Sekundarstufe I', 'Sekundarstufe II'] + } + DISCIPLINE_MAPPING: dict = { + 'Arbeit, Wirtschaft, Technik': 'Arbeitslehre', + 'Ethik, Philosophie, Religion': ['Ethik', 'Philosophie', 'Religion'], + # 'Fächerübergreifend', # ToDo: no mapping available + 'Politik, SoWi, Gesellschaft': ['Politik', 'Sozialkunde', 'Gesellschaftskunde'], + # 'Verbraucherbildung' # ToDo: no mapping available + } + + def getId(self, response=None) -> str: + return response.url + + def getHash(self, response=None) -> str: + date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get() + date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw) + hash_temp = str(date_cleaned_up + self.version) + return hash_temp + + def parse_start_url(self, response, **kwargs): + for url in self.start_urls: + yield scrapy.Request(url=url, callback=self.parse_category_overview_for_individual_topic_urls) + + def 
parse_category_overview_for_individual_topic_urls(self, response, **kwargs): + # logging.debug(f"INSIDE PARSE CATEGORY METHOD: {response.url}") + topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() + # logging.debug(f"TOPIC URLS (RAW) ={topic_urls_raw}") + + for url_ending in topic_urls_raw: + self.topic_urls.add(response.urljoin(url_ending)) + # logging.debug(f"TOPIC URLS ({len(self.topic_urls)}) = {self.topic_urls}") + + # if there's a "Letzte"-Button in the overview, there's more topic_urls to be gathered than the initially + # displayed 10 elements + last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get() + if last_page_button_url is not None: + last_page_button_url = response.urljoin(last_page_button_url) + # Using the "next page"-button until we reach the last page: + if last_page_button_url != response.url: + next_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last next"]/a/@href').get() + if next_page_button_url is not None: + next_url_to_parse = response.urljoin(next_page_button_url) + yield scrapy.Request(url=next_url_to_parse, + callback=self.parse_category_overview_for_individual_topic_urls) + # if last_page_button_url == response.url: + # logging.debug(f"Reached the last page: {response.url}") + # logging.debug(f"{len(self.topic_urls)} individual topic_urls were found: {self.topic_urls}") + for url in self.topic_urls: + # making sure that we don't accidentally crawl individual pages more than once + if url not in self.topic_urls_already_parsed: + yield scrapy.Request(url=url, callback=self.parse) + self.topic_urls_already_parsed.add(url) + # logging.debug(f"topic_urls after yielding them: {len(self.topic_urls)} --- " + # f"topic_urls_already_parsed: {len(self.topic_urls_already_parsed)}") + + def parse(self, response, **kwargs): + base = BaseItemLoader() + # ALL possible keys for the different Item and ItemLoader-classes can be found inside converter/items.py + + # TODO: fill "base"-keys with values for + # - thumbnail recommended (let splash handle it) + # - publisher optional + base.add_value('sourceId', response.url) + date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get() + date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw) + base.add_value('lastModified', date_cleaned_up) + base.add_value('type', Constants.TYPE_MATERIAL) + # base.add_value('thumbnail', thumbnail_url) + + lom = LomBaseItemloader() + + general = LomGeneralItemloader() + # TODO: fill "general"-keys with values for + # - coverage optional + # - structure optional + # - aggregationLevel optional + general.add_value('identifier', response.url) + title = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get() + general.add_value('title', title) + keywords = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall() + if len(keywords) >= 1: + general.add_value('keyword', keywords) + description = response.xpath('/html/head/meta[@name="description"]/@content').get() + general.add_value('description', description) + general.add_value('language', 'de') + + lom.add_value('general', general.load_item()) + + technical = LomTechnicalItemLoader() + # TODO: fill "technical"-keys with values for + # - size optional + # - requirement optional + # - installationRemarks optional + # - otherPlatformRequirements optional + technical.add_value('format', 'text/html') + technical.add_value('location', response.url) + lom.add_value('technical', technical.load_item()) + 
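+        # the 'lifecycle'-loader below holds publisher-related metadata: since all topics on Umwelt-im-Unterricht.de are published by the BMU, the 'organization'-value further down is hardcoded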
+ lifecycle = LomLifecycleItemloader() + # TODO: fill "lifecycle"-keys with values for + # - url recommended + # - email optional + # - uuid optional + lifecycle.add_value('role', 'publisher') + lifecycle.add_value('date', date_cleaned_up) + lifecycle.add_value('organization', 'Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit (BMU)') + lom.add_value('lifecycle', lifecycle.load_item()) + + educational = LomEducationalItemLoader() + # TODO: fill "educational"-keys with values for + # - description recommended (= "Comments on how this learning object is to be used") + # - interactivityType optional + # - interactivityLevel optional + # - semanticDensity optional + # - typicalAgeRange optional + # - difficulty optional + # - typicalLearningTime optional + educational.add_value('language', 'de') + lom.add_value('educational', educational.load_item()) + + # once you've filled "general", "technical", "lifecycle" and "educational" with values, + # the LomBaseItem is loaded into the "base"-BaseItemLoader + base.add_value('lom', lom.load_item()) + + vs = ValuespaceItemLoader() + # for possible values, either consult https://vocabs.openeduhub.de + # or take a look at https://github.com/openeduhub/oeh-metadata-vocabs + # TODO: fill "valuespaces"-keys with values for + # - discipline recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/discipline.ttl) + # - intendedEndUserRole recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/intendedEndUserRole.ttl) + # - learningResourceType recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/learningResourceType.ttl) + # - conditionsOfAccess recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/conditionsOfAccess.ttl) + # - containsAdvertisement recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/containsAdvertisement.ttl) + # - price recommended + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/price.ttl) + # - educationalContext optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/educationalContext.ttl) + # - sourceContentType optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/sourceContentType.ttl) + # - toolCategory optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/toolCategory.ttl) + # - accessibilitySummary optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/accessibilitySummary.ttl) + # - dataProtectionConformity optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/dataProtectionConformity.ttl) + # - fskRating optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/fskRating.ttl) + # - oer optional + # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/oer.ttl) + disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall() + if len(disciplines_raw) >= 1: + disciplines = list() + for discipline_value in disciplines_raw: + # self.debug_discipline_values.add(discipline_value) + if discipline_value in self.DISCIPLINE_MAPPING.keys(): + discipline_value = self.DISCIPLINE_MAPPING.get(discipline_value) + if type(discipline_value) is list: + disciplines.extend(discipline_value) + else: + disciplines.append(discipline_value) + if len(disciplines) >= 1: + vs.add_value('discipline', disciplines) + + educational_context_raw = 
response.xpath('//div[@class="b-cpsuiu-show-targets"]/ul/li/a/text()').getall() + if len(educational_context_raw) >= 1: + educational_context = list() + for educational_context_value in educational_context_raw: + # self.debug_educational_context_values.add(educational_context_value) + if educational_context_value in self.EDUCATIONAL_CONTEXT_MAPPING.keys(): + educational_context_value = self.EDUCATIONAL_CONTEXT_MAPPING.get(educational_context_value) + if type(educational_context_value) is list: + educational_context.extend(educational_context_value) + else: + educational_context.append(educational_context_value) + if len(educational_context) >= 1: + vs.add_value('educationalContext', educational_context) + + base.add_value('valuespaces', vs.load_item()) + + lic = LicenseItemLoader() + # TODO: fill "license"-keys with values for + # - oer recommended ('oer' is automatically set if the 'url'-field above + # is recognized in LICENSE_MAPPINGS: for possible url-mapping values, please take a look at + # LICENSE_MAPPINGS in converter/constants.py) + # - author recommended + # - internal optional + # - expirationDate optional (for content that expires, e.g. ÖR-Mediatheken) + license_url = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() + if license_url is not None: + lic.add_value('url', license_url) + + license_description_raw = response.xpath('//div[@class="cc-licence-info"]').get() + if license_description_raw is not None: + license_description_raw = w3lib.html.remove_tags(license_description_raw) + license_description_raw = w3lib.html.replace_escape_chars(license_description_raw, which_ones="\n", + replace_by=" ") + license_description_raw = w3lib.html.replace_escape_chars(license_description_raw) + license_description = " ".join(license_description_raw.split()) + lic.add_value('description', license_description) + base.add_value('license', lic.load_item()) + + # Either fill the PermissionItemLoader manually (not necessary most of the times) + permissions = PermissionItemLoader() + # or (preferably) call the inherited getPermissions(response)-method + # from converter/spiders/base_classes/lom_base.py by using super().: + # permissions = super().getPermissions(response) + # TODO: if necessary, add/replace values for the following "permissions"-keys + # - public optional + # - groups optional + # - mediacenters optional + # - autoCreateGroups optional + # - autoCreateMediacenters optional + base.add_value('permissions', permissions.load_item()) + + # Either fill the ResponseItemLoader manually (not necessary most of the time) + # response_loader = ResponseItemLoader() + # or (preferably) call the inherited mapResponse(response)-method + # from converter/spiders/base_classes/lom_base.py by using super().: + response_loader = super().mapResponse(response) + # TODO: if necessary, add/replace values for the following "response"-keys + # - url required + # - status optional + # - html optional + # - text optional + # - headers optional + # - cookies optional + # - har optional + base.add_value('response', response_loader.load_item()) + + # once all scrapy.Item are loaded into our "base", we yield the BaseItem by calling the .load_item() method + yield base.load_item() From ea688d56d7b78129c9e533655a940ea54bea26a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Thu, 7 Oct 2021 19:45:55 +0200 Subject: [PATCH 02/10] add LomClassificationItem to sample_spider_alternative.py - initially forgot to add 
'classification' to the spider blueprint since it was never used anywhere --- converter/spiders/sample_spider_alternative.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/converter/spiders/sample_spider_alternative.py b/converter/spiders/sample_spider_alternative.py index 8b1cd07f..5eee54ec 100644 --- a/converter/spiders/sample_spider_alternative.py +++ b/converter/spiders/sample_spider_alternative.py @@ -4,7 +4,7 @@ from converter.constants import Constants from converter.items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \ LomLifecycleItemloader, LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, ResponseItemLoader, \ - PermissionItemLoader + PermissionItemLoader, LomClassificationItemLoader from converter.spiders.base_classes import LomBase @@ -70,6 +70,7 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader: # - LomTechnicalItem required # - LomLifeCycleItem required (multiple possible) # - LomEducationalItem required + # - LomClassificationItem optional general = LomGeneralItemloader() # TODO: fill "general"-keys with values for @@ -132,6 +133,15 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader: # - typicalLearningTime optional lom.add_value('educational', educational.load_item()) + classification = LomClassificationItemLoader() + # TODO: fill "classification"-keys with values for + # - cost optional + # - purpose optional + # - taxonPath optional + # - description optional + # - keyword optional + lom.add_value('classification', classification.load_item()) + # once you've filled "general", "technical", "lifecycle" and "educational" with values, # the LomBaseItem is loaded into the "base"-BaseItemLoader base.add_value('lom', lom.load_item()) From 251dd53aa711ac3d9ca83d99d0c6cc5499a6ce1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Thu, 7 Oct 2021 21:03:55 +0200 Subject: [PATCH 03/10] umwelt_im_unterricht_spider.py (WIP!) 
- fill up most of the remaining metadata-fields -- ToDo: doublecheck the remaining questionable fields - add classification.description (for classification.purpose:'competency' and purpose:'educational objective') --- .../spiders/umwelt_im_unterricht_spider.py | 145 ++++++++---------- 1 file changed, 61 insertions(+), 84 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index db625b09..40ca2d15 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -1,13 +1,11 @@ -import logging - import scrapy import w3lib.html from scrapy.spiders import CrawlSpider from converter.constants import Constants from converter.items import BaseItemLoader, LomBaseItemloader, LomGeneralItemloader, LomTechnicalItemLoader, \ - LomLifecycleItemloader, LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, ResponseItemLoader, \ - PermissionItemLoader + LomLifecycleItemloader, LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, \ + LomClassificationItemLoader from converter.spiders.base_classes import LomBase @@ -21,24 +19,22 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): start_urls = [ # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", # # Typ: Thema der Woche - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", - # # Typ: Unterrichtsvorschlag + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", + # Typ: Unterrichtsvorschlag # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", # # Typ: Hintergrund (Kontext) # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", # # Typ: Arbeitsmaterial - "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", - # Typ: Video + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", + # # Typ: Video # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", # # Typ: Bilderserie ] - version = "0.0.1" # last update: 2021-10-07 + version = "0.0.1" # last update: 2021-10-07 topic_urls = set() # urls that need to be parsed will be added here - topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls + topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls EDUCATIONAL_CONTEXT_MAPPING: dict = { - # There's only 2 "Zielgruppen": 'Grundschule' and 'Sekundarstufe' - # ToDo: either map Sekundarstufe to both or neither 'Sekundarstufe': ['Sekundarstufe I', 'Sekundarstufe II'] } DISCIPLINE_MAPPING: dict = { @@ -50,19 +46,16 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): } def getId(self, response=None) -> str: - return response.url + pass def getHash(self, response=None) -> str: - date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get() - date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw) - hash_temp = str(date_cleaned_up + self.version) - return hash_temp + pass def parse_start_url(self, response, **kwargs): for url in self.start_urls: yield scrapy.Request(url=url, callback=self.parse_category_overview_for_individual_topic_urls) - def parse_category_overview_for_individual_topic_urls(self, response, **kwargs): + def parse_category_overview_for_individual_topic_urls(self, response): # 
logging.debug(f"INSIDE PARSE CATEGORY METHOD: {response.url}") topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() # logging.debug(f"TOPIC URLS (RAW) ={topic_urls_raw}") @@ -95,15 +88,14 @@ def parse_category_overview_for_individual_topic_urls(self, response, **kwargs): # f"topic_urls_already_parsed: {len(self.topic_urls_already_parsed)}") def parse(self, response, **kwargs): + current_url: str = response.url base = BaseItemLoader() - # ALL possible keys for the different Item and ItemLoader-classes can be found inside converter/items.py - # TODO: fill "base"-keys with values for - # - thumbnail recommended (let splash handle it) - # - publisher optional base.add_value('sourceId', response.url) date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get() date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw) + hash_temp = str(date_cleaned_up + self.version) + base.add_value('hash', hash_temp) base.add_value('lastModified', date_cleaned_up) base.add_value('type', Constants.TYPE_MATERIAL) # base.add_value('thumbnail', thumbnail_url) @@ -112,7 +104,6 @@ def parse(self, response, **kwargs): general = LomGeneralItemloader() # TODO: fill "general"-keys with values for - # - coverage optional # - structure optional # - aggregationLevel optional general.add_value('identifier', response.url) @@ -128,22 +119,14 @@ def parse(self, response, **kwargs): lom.add_value('general', general.load_item()) technical = LomTechnicalItemLoader() - # TODO: fill "technical"-keys with values for - # - size optional - # - requirement optional - # - installationRemarks optional - # - otherPlatformRequirements optional technical.add_value('format', 'text/html') technical.add_value('location', response.url) lom.add_value('technical', technical.load_item()) lifecycle = LomLifecycleItemloader() - # TODO: fill "lifecycle"-keys with values for - # - url recommended - # - email optional - # - uuid optional lifecycle.add_value('role', 'publisher') lifecycle.add_value('date', date_cleaned_up) + lifecycle.add_value('url', "https://www.umwelt-im-unterricht.de/impressum/") lifecycle.add_value('organization', 'Bundesministerium für Umwelt, Naturschutz und nukleare Sicherheit (BMU)') lom.add_value('lifecycle', lifecycle.load_item()) @@ -159,40 +142,56 @@ def parse(self, response, **kwargs): educational.add_value('language', 'de') lom.add_value('educational', educational.load_item()) - # once you've filled "general", "technical", "lifecycle" and "educational" with values, - # the LomBaseItem is loaded into the "base"-BaseItemLoader + # ToDo: didactic_comment / competencies + classification = LomClassificationItemLoader() + + if "/wochenthemen/" in current_url: + classification.add_value('purpose', 'educational objective') + # didactic comments are only part of "Thema der Woche" + didactic_comment = response.xpath('//div[@class="c-collapse-content js-collapse-content"]').get() + if didactic_comment is not None: + didactic_comment = w3lib.html.remove_tags(didactic_comment) + # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment, which_ones='\t', replace_by=" ") + # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment) + didactic_comment = " ".join(didactic_comment.split()) + if didactic_comment.endswith(".mehr lesenweniger lesen"): + didactic_comment = didactic_comment.replace("mehr lesenweniger lesen", "") + # ToDo: make sure which string format looks best in edu-sharing (cleaned up <-> with escape chars) + 
classification.add_value('description', didactic_comment) + + if "/unterrichtsvorschlaege/" in current_url: + classification.add_value('purpose', 'competency') + competency_description: list = response.xpath('//div[@class="b-cpsuiu-show-description"]/*[not(' + '@class="cc-licence-info")]').getall() + # competency_description will grab the whole div-element, but EXCLUDE the "license"-container + if len(competency_description) >= 1: + # only if the list of strings is not empty, we'll try to type-convert it to a string (and clean its + # formatting up) + competency_description: str = " ".join(competency_description) + competency_description = w3lib.html.remove_tags(competency_description) + classification.add_value('description', competency_description) + + lom.add_value('classification', classification.load_item()) + base.add_value('lom', lom.load_item()) vs = ValuespaceItemLoader() # for possible values, either consult https://vocabs.openeduhub.de # or take a look at https://github.com/openeduhub/oeh-metadata-vocabs # TODO: fill "valuespaces"-keys with values for - # - discipline recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/discipline.ttl) - # - intendedEndUserRole recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/intendedEndUserRole.ttl) # - learningResourceType recommended # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/learningResourceType.ttl) - # - conditionsOfAccess recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/conditionsOfAccess.ttl) - # - containsAdvertisement recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/containsAdvertisement.ttl) - # - price recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/price.ttl) - # - educationalContext optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/educationalContext.ttl) - # - sourceContentType optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/sourceContentType.ttl) - # - toolCategory optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/toolCategory.ttl) - # - accessibilitySummary optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/accessibilitySummary.ttl) - # - dataProtectionConformity optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/dataProtectionConformity.ttl) - # - fskRating optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/fskRating.ttl) - # - oer optional - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/oer.ttl) + vs.add_value('price', 'no') + vs.add_value('containsAdvertisement', 'no') + vs.add_value('conditionsOfAccess', 'no login') + vs.add_value('intendedEndUserRole', 'teacher') + vs.add_value('sourceContentType', 'Unterrichtsmaterial- und Aufgaben-Sammlung') + vs.add_value('accessibilitySummary', 'Not tested') # ToDo: check if the accessibility has changed + # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/ + vs.add_value('dataProtectionConformity', 'Sensible data collection') # ToDo: DSGVO-compliant? + # see: https://www.umwelt-im-unterricht.de/datenschutz/ + vs.add_value('oer', 'partly OER') # ToDo: everything OER? only partly? how should it be set?
+ # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/ disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall() if len(disciplines_raw) >= 1: disciplines = list() for discipline_value in disciplines_raw: # self.debug_discipline_values.add(discipline_value) if discipline_value in self.DISCIPLINE_MAPPING.keys(): discipline_value = self.DISCIPLINE_MAPPING.get(discipline_value) if type(discipline_value) is list: disciplines.extend(discipline_value) else: disciplines.append(discipline_value) if len(disciplines) >= 1: vs.add_value('discipline', disciplines) @@ -228,9 +227,6 @@ def parse(self, response, **kwargs): # - oer recommended ('oer' is automatically set if the 'url'-field above # is recognized in LICENSE_MAPPINGS: for possible url-mapping values, please take a look at # LICENSE_MAPPINGS in converter/constants.py) - # - author recommended - # - internal optional - # - expirationDate optional (for content that expires, e.g. ÖR-Mediatheken) license_url = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() if license_url is not None: lic.add_value('url', license_url) @@ -240,37 +236,18 @@ def parse(self, response, **kwargs): license_description_raw = w3lib.html.remove_tags(license_description_raw) license_description_raw = w3lib.html.replace_escape_chars(license_description_raw, which_ones="\n", replace_by=" ") + # if we would replace_escape_chars() straight away, there would be words stuck together that don't belong + # together. just replacing \n with a whitespace is enough to keep the structure of the string intact. license_description_raw = w3lib.html.replace_escape_chars(license_description_raw) license_description = " ".join(license_description_raw.split()) + # making sure that there's only 1 whitespace between words, not 4+ when the original string had several \t lic.add_value('description', license_description) base.add_value('license', lic.load_item()) - # Either fill the PermissionItemLoader manually (not necessary most of the times) - permissions = PermissionItemLoader() - # or (preferably) call the inherited getPermissions(response)-method - # from converter/spiders/base_classes/lom_base.py by using super().: - # permissions = super().getPermissions(response) - # TODO: if necessary, add/replace values for the following "permissions"-keys - # - public optional - # - groups optional - # - mediacenters optional - # - autoCreateGroups optional - # - autoCreateMediacenters optional + permissions = super().getPermissions(response) base.add_value('permissions', permissions.load_item()) - # Either fill the ResponseItemLoader manually (not necessary most of the time) - # response_loader = ResponseItemLoader() - # or (preferably) call the inherited mapResponse(response)-method - # from converter/spiders/base_classes/lom_base.py by using super().: response_loader = super().mapResponse(response) - # TODO: if necessary, add/replace values for the following "response"-keys - # - url required - # - status optional - # - html optional - # - text optional - # - headers optional - # - cookies optional - # - har optional base.add_value('response', response_loader.load_item()) # once all scrapy.Item are loaded into our "base", we yield the BaseItem by calling the .load_item() method From 798b80723e925cb5bec032a1be3eecf689b8068d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Fri, 8 Oct 2021 11:46:14 +0200 Subject: [PATCH 04/10] umwelt_im_unterricht_spider.py (WIP!)
- move 'didactic_comment' to educational.description - after feedback: clarified the remaining metadata questions from the ToDo-list - next ToDo: set learningResourceType depending on which material-type is currently getting crawled --- .../spiders/umwelt_im_unterricht_spider.py | 53 ++++++------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 40ca2d15..abdfeb9d 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -19,8 +19,8 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): start_urls = [ # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", # # Typ: Thema der Woche - "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", - # Typ: Unterrichtsvorschlag + # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", + # # Typ: Unterrichtsvorschlag # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", # # Typ: Hintergrund (Kontext) # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", @@ -40,9 +40,8 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): DISCIPLINE_MAPPING: dict = { 'Arbeit, Wirtschaft, Technik': 'Arbeitslehre', 'Ethik, Philosophie, Religion': ['Ethik', 'Philosophie', 'Religion'], - # 'Fächerübergreifend', # ToDo: no mapping available - 'Politik, SoWi, Gesellschaft': ['Politik', 'Sozialkunde', 'Gesellschaftskunde'], - # 'Verbraucherbildung' # ToDo: no mapping available + 'Fächerübergreifend': 'Allgemein', # ToDo: no mapping available + 'Politik, SoWi, Gesellschaft': ['Politik', 'Sozialkunde', 'Gesellschaftskunde'] } def getId(self, response=None) -> str: @@ -103,9 +102,6 @@ def parse(self, response, **kwargs): lom = LomBaseItemloader() general = LomGeneralItemloader() - # TODO: fill "general"-keys with values for - # - structure optional - # - aggregationLevel optional general.add_value('identifier', response.url) title = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get() general.add_value('title', title) @@ -131,22 +127,12 @@ def parse(self, response, **kwargs): lom.add_value('lifecycle', lifecycle.load_item()) educational = LomEducationalItemLoader() - # TODO: fill "educational"-keys with values for - # - description recommended (= "Comments on how this learning object is to be used") - # - interactivityType optional - # - interactivityLevel optional - # - semanticDensity optional - # - typicalAgeRange optional - # - difficulty optional - # - typicalLearningTime optional educational.add_value('language', 'de') - lom.add_value('educational', educational.load_item()) - - # ToDo: didactic_comment / competencies - classification = LomClassificationItemLoader() + # TODO: didactic comment could be either one of these: + # - educational.description + # - classification.description (with classification.purpose set to 'educational objective') if "/wochenthemen/" in current_url: - classification.add_value('purpose', 'educational objective') # didactic comments are only part of "Thema der Woche" didactic_comment = response.xpath('//div[@class="c-collapse-content js-collapse-content"]').get() if didactic_comment is not None: @@ -154,11 +140,15 @@ def parse(self, response, **kwargs): # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment, which_ones='\t', replace_by=" ") # 
didactic_comment = w3lib.html.replace_escape_chars(didactic_comment) didactic_comment = " ".join(didactic_comment.split()) - if didactic_comment.endswith(".mehr lesenweniger lesen"): + if didactic_comment.endswith("mehr lesenweniger lesen"): + # the button-description of the expandable info-box ends up in the string, therefore removing it: didactic_comment = didactic_comment.replace("mehr lesenweniger lesen", "") # ToDo: make sure which string format looks best in edu-sharing (cleaned up <-> with escape chars) - classification.add_value('description', didactic_comment) + educational.add_value('description', didactic_comment) + lom.add_value('educational', educational.load_item()) + + classification = LomClassificationItemLoader() if "/unterrichtsvorschlaege/" in current_url: classification.add_value('purpose', 'competency') competency_description: list = response.xpath('//div[@class="b-cpsuiu-show-description"]/*[not(' '@class="cc-licence-info")]').getall() @@ -172,25 +162,18 @@ def parse(self, response, **kwargs): classification.add_value('description', competency_description) lom.add_value('classification', classification.load_item()) - base.add_value('lom', lom.load_item()) vs = ValuespaceItemLoader() - # for possible values, either consult https://vocabs.openeduhub.de - # or take a look at https://github.com/openeduhub/oeh-metadata-vocabs - # TODO: fill "valuespaces"-keys with values for - # - learningResourceType recommended - # (see: https://github.com/openeduhub/oeh-metadata-vocabs/blob/master/learningResourceType.ttl) + # ToDo: Set 'learningResourceType' depending on the material that's being crawled, recognize it by url vs.add_value('price', 'no') vs.add_value('containsAdvertisement', 'no') vs.add_value('conditionsOfAccess', 'no login') vs.add_value('intendedEndUserRole', 'teacher') - vs.add_value('sourceContentType', 'Unterrichtsmaterial- und Aufgaben-Sammlung') - vs.add_value('accessibilitySummary', 'Not tested') # ToDo: check if the accessibility has changed + vs.add_value('accessibilitySummary', 'Not tested') # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/ - vs.add_value('dataProtectionConformity', 'Sensible data collection') # ToDo: DSGVO-compliant? + vs.add_value('dataProtectionConformity', 'Sensible data collection') # see: https://www.umwelt-im-unterricht.de/datenschutz/ - vs.add_value('oer', 'partly OER') # ToDo: everything OER? only partly? how should it be set? # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/ disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall() if len(disciplines_raw) >= 1: disciplines = list() @@ -223,9 +206,6 @@ def parse(self, response, **kwargs): base.add_value('valuespaces', vs.load_item()) lic = LicenseItemLoader() - # TODO: fill "license"-keys with values for - # - oer recommended ('oer' is automatically set if the 'url'-field above - # is recognized in LICENSE_MAPPINGS: for possible url-mapping values, please take a look at - # LICENSE_MAPPINGS in converter/constants.py) license_url = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() if license_url is not None: lic.add_value('url', license_url) From bf227ecb29d76c9d47ea763a023b8b5a5c141e5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Fri, 8 Oct 2021 12:41:26 +0200 Subject: [PATCH 05/10] umwelt_im_unterricht_spider.py (WIP!)
- set learningResourceType depending on the URL-structure --- .../spiders/umwelt_im_unterricht_spider.py | 43 +++++++++++++------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index abdfeb9d..5ec0cf85 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -17,18 +17,18 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): name = "umwelt_im_unterricht_spider" friendlyName = "Umwelt im Unterricht" start_urls = [ - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", - # # Typ: Thema der Woche - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", - # # Typ: Unterrichtsvorschlag - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", - # # Typ: Hintergrund (Kontext) - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", - # # Typ: Arbeitsmaterial - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", - # # Typ: Video - # "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", - # # Typ: Bilderserie + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Atopics", + # Typ: Thema der Woche + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons", + # Typ: Unterrichtsvorschlag + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Acontexts", + # Typ: Hintergrund (Kontext) + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials", + # Typ: Arbeitsmaterial + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_video", + # Typ: Video + "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", + # Typ: Bilderserie ] version = "0.0.1" # last update: 2021-10-07 topic_urls = set() # urls that need to be parsed will be added here @@ -129,7 +129,7 @@ def parse(self, response, **kwargs): educational = LomEducationalItemLoader() educational.add_value('language', 'de') - # TODO: didactic comment could be either one of these: + # TODO: a didactic comment could fit into either one of these: # - educational.description # - classification.description (with classification.purpose set to 'educational objective') if "/wochenthemen/" in current_url: @@ -165,7 +165,22 @@ def parse(self, response, **kwargs): base.add_value('lom', lom.load_item()) vs = ValuespaceItemLoader() - # ToDo: Set 'learningResourceType' depending on the material that's being crawled, recognize it by url + + # depending on the website-category, we need to set a specific learningResourceType + # because the value 'website' for all crawled items would not be helpful enough + if "/wochenthemen/" in current_url or "/unterrichtsvorschlaege/" in current_url: + vs.add_value('learningResourceType', 'lesson plan') + if "/hintergrund/" in current_url: + vs.add_value('learningResourceType', 'Text') + if "/medien/dateien/" in current_url: + # topics categorized as "Arbeitsmaterial" offer customizable worksheets to teachers + vs.add_value('learningResourceType', 'worksheet') + if "/medien/videos/" in current_url: + vs.add_value('learningResourceType', 'video') + if "/medien/bilder/" in current_url: + # topics categorized as "Bilderserie" hold several images in a 
gallery (with individual licenses) + vs.add_value('learningResourceType', 'image') + vs.add_value('price', 'no') vs.add_value('containsAdvertisement', 'no') vs.add_value('conditionsOfAccess', 'no login') From b57cb0640a89bef787c876dd16de8a8ab66ebabc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Fri, 8 Oct 2021 13:20:27 +0200 Subject: [PATCH 06/10] umwelt_im_unterricht_spider.py v0.0.2 - fix license_url (replace "http://" by "https://") --- converter/spiders/umwelt_im_unterricht_spider.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 5ec0cf85..976c83f3 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -30,7 +30,7 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): "https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images", # Typ: Bilderserie ] - version = "0.0.1" # last update: 2021-10-07 + version = "0.0.2" # last update: 2021-10-08 topic_urls = set() # urls that need to be parsed will be added here topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls @@ -221,8 +221,10 @@ def parse(self, response, **kwargs): base.add_value('valuespaces', vs.load_item()) lic = LicenseItemLoader() - license_url = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() + license_url: str = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() if license_url is not None: + if license_url.startswith("http://"): + license_url = license_url.replace("http://", "https://") lic.add_value('url', license_url) license_description_raw = response.xpath('//div[@class="cc-licence-info"]').get() From 5b3a69209d4eaa075445c5664d54fcd064a431d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Mon, 11 Oct 2021 13:23:24 +0200 Subject: [PATCH 07/10] add documentation / scrapy contracts --- .../spiders/umwelt_im_unterricht_spider.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 976c83f3..015d859f 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -40,7 +40,7 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): DISCIPLINE_MAPPING: dict = { 'Arbeit, Wirtschaft, Technik': 'Arbeitslehre', 'Ethik, Philosophie, Religion': ['Ethik', 'Philosophie', 'Religion'], - 'Fächerübergreifend': 'Allgemein', # ToDo: no mapping available + 'Fächerübergreifend': 'Allgemein', 'Politik, SoWi, Gesellschaft': ['Politik', 'Sozialkunde', 'Gesellschaftskunde'] } @@ -55,6 +55,12 @@ def parse_start_url(self, response, **kwargs): yield scrapy.Request(url=url, callback=self.parse_category_overview_for_individual_topic_urls) def parse_category_overview_for_individual_topic_urls(self, response): + """ + + Scrapy Contracts: + @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons + @returns requests 10 + """ # logging.debug(f"INSIDE PARSE CATEGORY METHOD: {response.url}") topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() # logging.debug(f"TOPIC URLS (RAW) ={topic_urls_raw}") @@ -68,10 +74,12 @@ def 
parse_category_overview_for_individual_topic_urls(self, response): last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get() if last_page_button_url is not None: last_page_button_url = response.urljoin(last_page_button_url) - # Using the "next page"-button until we reach the last page: + # Using the "next page"-button to navigate through all individual topics until we reach the last page: if last_page_button_url != response.url: next_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last next"]/a/@href').get() if next_page_button_url is not None: + # ToDo: optimize the page navigation by making it independent of the 'next'-button + # (by manually 'building' the url_strings from 1 to "last-page" with RegEx) next_url_to_parse = response.urljoin(next_page_button_url) yield scrapy.Request(url=next_url_to_parse, callback=self.parse_category_overview_for_individual_topic_urls) @@ -87,6 +95,12 @@ def parse_category_overview_for_individual_topic_urls(self, response): # f"topic_urls_already_parsed: {len(self.topic_urls_already_parsed)}") def parse(self, response, **kwargs): + """ + + Scrapy Contracts: + @url https://www.umwelt-im-unterricht.de/hintergrund/generationengerechtigkeit-klimaschutz-und-eine-lebenswerte-zukunft/ + @returns item 1 + """ current_url: str = response.url base = BaseItemLoader() @@ -107,6 +121,7 @@ def parse(self, response, **kwargs): general.add_value('title', title) keywords = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall() if len(keywords) >= 1: + # only add keywords if the list isn't empty general.add_value('keyword', keywords) description = response.xpath('/html/head/meta[@name="description"]/@content').get() general.add_value('description', description) @@ -141,8 +156,10 @@ def parse(self, response, **kwargs): # didactic_comment = w3lib.html.replace_escape_chars(didactic_comment) didactic_comment = " ".join(didactic_comment.split()) if didactic_comment.endswith("mehr lesenweniger lesen"): - # the button-description of the expandable info-box ends up in the string, therefore removing it: + # the button-description of the expandable info-box ends up in the string, + # therefore we are manually removing it: didactic_comment = didactic_comment.replace("mehr lesenweniger lesen", "") + # since there's currently no way to confirm how the string looks in the web-interface: # ToDo: make sure which string format looks best in edu-sharing (cleaned up <-> with escape chars) educational.add_value('description', didactic_comment) @@ -153,7 +170,8 @@ def parse(self, response, **kwargs): classification.add_value('purpose', 'competency') competency_description: list = response.xpath('//div[@class="b-cpsuiu-show-description"]/*[not(' '@class="cc-licence-info")]').getall() - # competency_description will grab the whole div-element, but EXCLUDE the "license"-container + # the xpath-expression for competency_description will grab the whole div-element, + # but EXCLUDE the "license"-container (if the license-description exists, it's always part of the same div) if len(competency_description) >= 1: # only if the list of strings is not empty, we'll try to type-convert it to a string (and clean its # formatting up) @@ -185,11 +203,12 @@ def parse(self, response, **kwargs): vs.add_value('containsAdvertisement', 'no') vs.add_value('conditionsOfAccess', 'no login') vs.add_value('intendedEndUserRole', 'teacher') + # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/ vs.add_value('accessibilitySummary', 
'Not tested') # see: https://www.umwelt-im-unterricht.de/erklaerung-zur-barrierefreiheit/ vs.add_value('dataProtectionConformity', 'Sensible data collection') # see: https://www.umwelt-im-unterricht.de/datenschutz/ - # see: https://www.umwelt-im-unterricht.de/ueber-umwelt-im-unterricht/ + disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall() if len(disciplines_raw) >= 1: disciplines = list() @@ -197,6 +216,8 @@ def parse(self, response, **kwargs): # self.debug_discipline_values.add(discipline_value) if discipline_value in self.DISCIPLINE_MAPPING.keys(): discipline_value = self.DISCIPLINE_MAPPING.get(discipline_value) + # since the mapping value can either be a single string OR a list of strings, we need to make sure that + # our 'disciplines'-list is a list of strings (not a list with nested lists): if type(discipline_value) is list: disciplines.extend(discipline_value) else: @@ -206,6 +227,7 @@ def parse(self, response, **kwargs): educational_context_raw = response.xpath('//div[@class="b-cpsuiu-show-targets"]/ul/li/a/text()').getall() if len(educational_context_raw) >= 1: + # the educationalContext-mapping is only done when there's at least one educational_context found educational_context = list() for educational_context_value in educational_context_raw: # self.debug_educational_context_values.add(educational_context_value) From 9319b5764629f7f65275dbcf0b46a287dbf4f93b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Mon, 11 Oct 2021 20:48:24 +0200 Subject: [PATCH 08/10] rework crawler navigation through overview_urls - no longer uses the "next page"-button, but instead builds the url-list by splitting the "last page"-button up into a URL and its page-parameter (int) -- this makes sure that we don't lose several pages and topics at once if scrapy gets only a timeout as a response from one overview subpage --- .../spiders/umwelt_im_unterricht_spider.py | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 015d859f..2810f78d 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -1,3 +1,5 @@ +import re + import scrapy import w3lib.html from scrapy.spiders import CrawlSpider @@ -32,7 +34,8 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): ] version = "0.0.2" # last update: 2021-10-08 topic_urls = set() # urls that need to be parsed will be added here - topic_urls_already_parsed = set() # this set is used for 'checking off' already parsed urls + topic_urls_parsed = set() # this set is used for 'checking off' already parsed urls + overview_urls_already_parsed = set() # this set is used for 'checking off' already parsed overview_pages EDUCATIONAL_CONTEXT_MAPPING: dict = { 'Sekundarstufe': ['Sekundarstufe I', 'Sekundarstufe II'] @@ -52,9 +55,9 @@ def getHash(self, response=None) -> str: def parse_start_url(self, response, **kwargs): for url in self.start_urls: - yield scrapy.Request(url=url, callback=self.parse_category_overview_for_individual_topic_urls) + yield scrapy.Request(url=url, callback=self.parse_category_overview_for_topics_and_subpages) - def parse_category_overview_for_individual_topic_urls(self, response): + def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.Response): """ Scrapy Contracts: @@ -72,27 +75,30 @@ def 
parse_category_overview_for_individual_topic_urls(self, response): + def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.Response): """ Scrapy Contracts: @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons @returns requests 10 """ topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() @@ -72,27 +75,30 @@ def parse_category_overview_for_individual_topic_urls(self, response): # if there's a "Letzte"-Button in the overview, there's more topic_urls to be gathered than the initially # displayed 10 elements last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get() + page_number_regex = re.compile(r'(?P<url_with_parameters>.*&tx_solr%5Bpage%5D=)(?P<nr>\d+)') + overview_urls_parsed: set = set() + if last_page_button_url is not None: - last_page_button_url = response.urljoin(last_page_button_url) - # Using the "next page"-button to navigate through all individual topics until we reach the last page: - if last_page_button_url != response.url: - next_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last next"]/a/@href').get() - if next_page_button_url is not None: - # ToDo: optimize the page navigation by making it independent of the 'next'-button - # (by manually 'building' the url_strings from 1 to "last-page" with RegEx) - next_url_to_parse = response.urljoin(next_page_button_url) - yield scrapy.Request(url=next_url_to_parse, - callback=self.parse_category_overview_for_individual_topic_urls) + page_number_dict: dict = page_number_regex.search(last_page_button_url).groupdict() + url_without_page_parameter = response.urljoin(page_number_dict.get('url_with_parameters')) + last_page_number = int(page_number_dict.get('nr')) + for i in range(2, last_page_number + 1): + # since the initial url in start_urls already counts as page 1, + # we're iterating from page 2 to the last page + next_overview_subpage_to_crawl = str(url_without_page_parameter + str(i)) + if next_overview_subpage_to_crawl not in self.overview_urls_already_parsed: + yield scrapy.Request(url=next_overview_subpage_to_crawl, + callback=self.parse_category_overview_for_topics_and_subpages) + overview_urls_parsed.add(next_overview_subpage_to_crawl) + self.overview_urls_already_parsed.update(overview_urls_parsed) + + parsed_urls: set = set() for url in self.topic_urls: # making sure that we don't accidentally crawl individual pages more than once - if url not in self.topic_urls_already_parsed: + if url not in self.topic_urls_parsed: yield scrapy.Request(url=url, callback=self.parse) - self.topic_urls_already_parsed.add(url) - # logging.debug(f"topic_urls after yielding them: {len(self.topic_urls)} --- " - # f"topic_urls_already_parsed: {len(self.topic_urls_already_parsed)}") + parsed_urls.add(url) + self.topic_urls_parsed.update(parsed_urls) def parse(self, response, **kwargs): """ From 23817ecad2040037b7504fe961b4389d7b1be51b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com> Date: Tue, 12 Oct 2021 11:40:34 +0200 Subject: [PATCH 09/10] minimal code cleanup, add documentation --- .../spiders/umwelt_im_unterricht_spider.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py index 2810f78d..c8b90789 100644 --- a/converter/spiders/umwelt_im_unterricht_spider.py +++ b/converter/spiders/umwelt_im_unterricht_spider.py @@ -34,7 +34,7 @@ class UmweltImUnterrichtSpider(CrawlSpider, LomBase): ] version = "0.0.2" # last update: 2021-10-08 topic_urls = set() # urls that need to be parsed will be added here - topic_urls_parsed = set() # this set is
used for 'checking off' already parsed urls + topic_urls_parsed = set() # this set is used for 'checking off' already parsed (individual) topic urls overview_urls_already_parsed = set() # this set is used for 'checking off' already parsed overview_pages EDUCATIONAL_CONTEXT_MAPPING: dict = { @@ -64,38 +64,37 @@ def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http. @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons @returns requests 10 """ - # logging.debug(f"INSIDE PARSE CATEGORY METHOD: {response.url}") topic_urls_raw: list = response.xpath('//a[@class="internal-link readmore"]/@href').getall() - # logging.debug(f"TOPIC URLS (RAW) ={topic_urls_raw}") for url_ending in topic_urls_raw: self.topic_urls.add(response.urljoin(url_ending)) - # logging.debug(f"TOPIC URLS ({len(self.topic_urls)}) = {self.topic_urls}") # if there's a "Letzte"-Button in the overview, there's more topic_urls to be gathered than the initially # displayed 10 elements last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get() + # the string last_page_button_url typically looks like this: + # "/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images&tx_solr%5Bpage%5D=8" page_number_regex = re.compile(r'(?P<url_with_parameters>.*&tx_solr%5Bpage%5D=)(?P<nr>\d+)') - overview_urls_parsed: set = set() + overview_urls_parsed: set = set() # temporary set used for checking off already visited URLs if last_page_button_url is not None: page_number_dict: dict = page_number_regex.search(last_page_button_url).groupdict() url_without_page_parameter = response.urljoin(page_number_dict.get('url_with_parameters')) last_page_number = int(page_number_dict.get('nr')) for i in range(2, last_page_number + 1): - # since the initial url in start_urls already counts as page 1, - # we're iterating from page 2 to the last page + # the initial url from start_urls already counts as page 1, therefore we're iterating + # from page 2 to the last page next_overview_subpage_to_crawl = str(url_without_page_parameter + str(i)) if next_overview_subpage_to_crawl not in self.overview_urls_already_parsed: yield scrapy.Request(url=next_overview_subpage_to_crawl, callback=self.parse_category_overview_for_topics_and_subpages) overview_urls_parsed.add(next_overview_subpage_to_crawl) - self.overview_urls_already_parsed.update(overview_urls_parsed) + self.overview_urls_already_parsed.update(overview_urls_parsed) # checking off the (10) URLs that we yielded - parsed_urls: set = set() + parsed_urls: set = set() # temporary set used for checking off already visited topics for url in self.topic_urls: - # making sure that we don't accidentally crawl individual pages more than once if url not in self.topic_urls_parsed: + # making sure that we don't accidentally crawl individual pages more than once yield scrapy.Request(url=url, callback=self.parse) parsed_urls.add(url) self.topic_urls_parsed.update(parsed_urls) @@ -252,6 +251,7 @@ def parse(self, response, **kwargs): license_url: str = response.xpath('//div[@class="cc-licence-info"]/p/a[@rel="license"]/@href').get() if license_url is not None: if license_url.startswith("http://"): + # the license-mapper expects urls that are in https:// format, but UIU uses http:// links to CC-licenses license_url = license_url.replace("http://", "https://") lic.add_value('url', license_url) @@ -264,7 +264,7 @@ def parse(self, response, **kwargs): # together. just replacing \n with a whitespace is enough to keep the structure of the string intact.
From 32dc5645ea5f8c34ac894438e4561a60bf0c9fdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Schn=C3=A4pp?= <981166+Criamos@users.noreply.github.com>
Date: Tue, 12 Oct 2021 12:22:07 +0200
Subject: [PATCH 10/10] add type hinting, descriptions

---
 .../spiders/umwelt_im_unterricht_spider.py    | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/converter/spiders/umwelt_im_unterricht_spider.py b/converter/spiders/umwelt_im_unterricht_spider.py
index c8b90789..4dc5f0f8 100644
--- a/converter/spiders/umwelt_im_unterricht_spider.py
+++ b/converter/spiders/umwelt_im_unterricht_spider.py
@@ -59,6 +59,9 @@ def parse_start_url(self, response, **kwargs):

     def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.Response):
         """
+        Crawls an overview page of a "type"-category (e.g. "Hintergrund", "Bilderserie" etc.) for subpages and topics.
+        If the overview has subpages, it will recursively yield additional scrapy.Requests to the overview-subpages.
+        Afterwards it yields the (10) individual topic_urls (per overview page) to the parse()-method.

         Scrapy Contracts:
         @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons
         @returns requests 10
         """
@@ -71,7 +74,7 @@
         # if there's a "Letzte"-Button in the overview, there's more topic_urls to be gathered than the initially
         # displayed 10 elements
-        last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get()
+        last_page_button_url: str = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get()
         # the string last_page_button_url typically looks like this:
         # "/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images&tx_solr%5Bpage%5D=8"
         page_number_regex = re.compile(r'(?P<url_with_parameters>.*&tx_solr%5Bpage%5D=)(?P<nr>\d+)')
@@ -79,7 +82,7 @@ def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.
         overview_urls_parsed: set = set()  # temporary set used for checking off already visited URLs
         if last_page_button_url is not None:
             page_number_dict: dict = page_number_regex.search(last_page_button_url).groupdict()
-            url_without_page_parameter = response.urljoin(page_number_dict.get('url_with_parameters'))
+            url_without_page_parameter: str = response.urljoin(page_number_dict.get('url_with_parameters'))
             last_page_number = int(page_number_dict.get('nr'))
             for i in range(2, last_page_number + 1):
                 # the initial url from start_urls already counts as page 1, therefore we're iterating
                 # from page 2 to the last page
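Worth noting before the next hunk: the "Scrapy Contracts" blocks that these docstrings extend are executable checks, not just documentation. Running `scrapy check <spider_name>` downloads each @url and verifies the callback's output against the @returns line. A minimal toy example (spider name and URL are invented for this sketch):

    import scrapy

    class ContractsDemoSpider(scrapy.Spider):
        name = "contracts_demo"

        def parse(self, response, **kwargs):
            """
            Scrapy Contracts:

            @url https://example.org/
            @returns items 1 1
            """
            # yields exactly one item, satisfying "@returns items 1 1" (min 1, max 1)
            yield {"url": response.url}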
@@ -99,8 +102,9 @@
                 parsed_urls.add(url)
         self.topic_urls_parsed.update(parsed_urls)

-    def parse(self, response, **kwargs):
+    def parse(self, response: scrapy.http.Response, **kwargs):
         """
+        Parses an individual topic url for metadata and yields a BaseItem.

         Scrapy Contracts:
         @url https://www.umwelt-im-unterricht.de/hintergrund/generationengerechtigkeit-klimaschutz-und-eine-lebenswerte-zukunft/
         @returns items 1
         """
         base = BaseItemLoader()
         base.add_value('sourceId', response.url)
-        date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get()
-        date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw)
+        date_raw: str = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get()
+        date_cleaned_up: str = w3lib.html.strip_html5_whitespace(date_raw)
         hash_temp = str(date_cleaned_up + self.version)
         base.add_value('hash', hash_temp)
         base.add_value('lastModified', date_cleaned_up)
@@ -122,13 +126,13 @@ def parse(self, response, **kwargs):

         general = LomGeneralItemloader()
         general.add_value('identifier', response.url)
-        title = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get()
+        title: str = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get()
         general.add_value('title', title)
-        keywords = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
+        keywords: list = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
         if len(keywords) >= 1:  # only add keywords if the list isn't empty
             general.add_value('keyword', keywords)
-        description = response.xpath('/html/head/meta[@name="description"]/@content').get()
+        description: str = response.xpath('/html/head/meta[@name="description"]/@content').get()
         general.add_value('description', description)
         general.add_value('language', 'de')
@@ -214,7 +218,7 @@ def parse(self, response, **kwargs):
         vs.add_value('dataProtectionConformity', 'Sensible data collection')
         # see: https://www.umwelt-im-unterricht.de/datenschutz/

-        disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
+        disciplines_raw: list = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
         if len(disciplines_raw) >= 1:
             disciplines = list()
             for discipline_value in disciplines_raw:
@@ -255,7 +259,7 @@ def parse(self, response, **kwargs):
                 license_url = license_url.replace("http://", "https://")
             lic.add_value('url', license_url)

-        license_description_raw = response.xpath('//div[@class="cc-licence-info"]').get()
+        license_description_raw: str = response.xpath('//div[@class="cc-licence-info"]').get()
         if license_description_raw is not None:
             license_description_raw = w3lib.html.remove_tags(license_description_raw)
             license_description_raw = w3lib.html.replace_escape_chars(license_description_raw, which_ones="\n",
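A closing note on the hash visible in the last hunks: it concatenates the page's cleaned-up last-modified date with the spider version, so an item registers as changed whenever the source page is updated or the crawler version is bumped. A small sketch of that idea (hypothetical helper mirroring the inline code in parse(); the sample values are invented):

    import w3lib.html

    def build_item_hash(date_raw: str, spider_version: str) -> str:
        # strip leading/trailing HTML5 whitespace from the scraped date string
        date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw)
        return str(date_cleaned_up + spider_version)

    # build_item_hash("  09.09.2021 ", "0.0.2") returns "09.09.20210.0.2"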