Skip to content

Commit

Permalink
add type hinting, descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
Criamos committed Oct 12, 2021
1 parent 23817ec commit 32dc564
Showing 1 changed file with 14 additions and 10 deletions.
24 changes: 14 additions & 10 deletions converter/spiders/umwelt_im_unterricht_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def parse_start_url(self, response, **kwargs):

def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.Response):
"""
Crawls an overview page of a "type"-category (e.g. "Hintergrund", "Bilderserie" etc.) for subpages and topics.
If the overview has subpages, it will recursively yield additional scrapy.Requests to the overview-subpages.
Afterwards it yields the (10) individual topic_urls (per overview page) to the parse()-method.
Scrapy Contracts:
@url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons
Expand All @@ -71,15 +74,15 @@ def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.

# if there's a "Letzte"-Button in the overview, there are more topic_urls to be gathered than the 10 elements
# that are displayed initially
last_page_button_url = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get()
last_page_button_url: str = response.xpath('//li[@class="tx-pagebrowse-last last"]/a/@href').get()
# the string last_page_button_url typically looks like this:
# "/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images&tx_solr%5Bpage%5D=8"
page_number_regex = re.compile(r'(?P<url_with_parameters>.*&tx_solr%5Bpage%5D=)(?P<nr>\d+)')

overview_urls_parsed: set = set() # temporary set used for checking off already visited URLs
if last_page_button_url is not None:
page_number_dict: dict = page_number_regex.search(last_page_button_url).groupdict()
url_without_page_parameter = response.urljoin(page_number_dict.get('url_with_parameters'))
url_without_page_parameter: str = response.urljoin(page_number_dict.get('url_with_parameters'))
last_page_number = int(page_number_dict.get('nr'))
for i in range(2, last_page_number + 1):
# the initial url from start_urls already counts as page 1, therefore we're iterating
Expand All @@ -99,8 +102,9 @@ def parse_category_overview_for_topics_and_subpages(self, response: scrapy.http.
parsed_urls.add(url)
self.topic_urls_parsed.update(parsed_urls)

def parse(self, response, **kwargs):
def parse(self, response: scrapy.http.Response, **kwargs):
"""
Parses an individual topic url for metadata and yields a BaseItem.
Scrapy Contracts:
@url https://www.umwelt-im-unterricht.de/hintergrund/generationengerechtigkeit-klimaschutz-und-eine-lebenswerte-zukunft/
Expand All @@ -110,8 +114,8 @@ def parse(self, response, **kwargs):
base = BaseItemLoader()

base.add_value('sourceId', response.url)
date_raw = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get()
date_cleaned_up = w3lib.html.strip_html5_whitespace(date_raw)
date_raw: str = response.xpath('//div[@class="b-cpsuiu-show-info"]/span/text()').get()
date_cleaned_up: str = w3lib.html.strip_html5_whitespace(date_raw)
hash_temp = str(date_cleaned_up + self.version)
base.add_value('hash', hash_temp)
base.add_value('lastModified', date_cleaned_up)
Expand All @@ -122,13 +126,13 @@ def parse(self, response, **kwargs):

general = LomGeneralItemloader()
general.add_value('identifier', response.url)
title = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get()
title: str = response.xpath('//div[@class="tx-cps-uiu"]/article/h1/text()').get()
general.add_value('title', title)
keywords = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
keywords: list = response.xpath('//div[@class="b-cpsuiu-show-keywords"]/ul/li/a/text()').getall()
if len(keywords) >= 1:
# only add keywords if the list isn't empty
general.add_value('keyword', keywords)
description = response.xpath('/html/head/meta[@name="description"]/@content').get()
description: str = response.xpath('/html/head/meta[@name="description"]/@content').get()
general.add_value('description', description)
general.add_value('language', 'de')

Expand Down Expand Up @@ -214,7 +218,7 @@ def parse(self, response, **kwargs):
vs.add_value('dataProtectionConformity', 'Sensible data collection')
# see: https://www.umwelt-im-unterricht.de/datenschutz/

disciplines_raw = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
disciplines_raw: list = response.xpath('//div[@class="b-cpsuiu-show-subjects"]/ul/li/a/text()').getall()
if len(disciplines_raw) >= 1:
disciplines = list()
for discipline_value in disciplines_raw:
Expand Down Expand Up @@ -255,7 +259,7 @@ def parse(self, response, **kwargs):
license_url = license_url.replace("http://", "https://")
lic.add_value('url', license_url)

license_description_raw = response.xpath('//div[@class="cc-licence-info"]').get()
license_description_raw: str = response.xpath('//div[@class="cc-licence-info"]').get()
if license_description_raw is not None:
license_description_raw = w3lib.html.remove_tags(license_description_raw)
license_description_raw = w3lib.html.replace_escape_chars(license_description_raw, which_ones="\n",
Expand Down

0 comments on commit 32dc564

Please sign in to comment.