Skip to content

Commit

Permalink
Merge pull request #15 from Criamos/vimeo_spider
Browse files Browse the repository at this point in the history
Vimeo spider
  • Loading branch information
torsten-simon authored May 6, 2021
2 parents b574d96 + edae519 commit c6833c1
Showing 1 changed file with 217 additions and 0 deletions.
217 changes: 217 additions & 0 deletions converter/spiders/schule_im_aufbruch_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
import html
import json
import logging

import scrapy

from converter.items import BaseItemLoader, LomGeneralItemloader, LomTechnicalItemLoader, \
LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, PermissionItemLoader, LomLifecycleItemloader
from converter.spiders.base_classes import LomBase


class SchuleImAufbruchSpider(scrapy.Spider, LomBase):
    """
    Crawls the vimeo channel configured in ``url`` and builds one item per video.

    The spider starts at the channel's overview page, follows the link of every video listed in the overview's
    ld+json script block, and moves on to the next overview page as long as a "next" pagination link exists.
    The metadata of each video is taken from the ld+json script block of the individual video page.
    """
    name = "schule_im_aufbruch_spider"
    friendlyName = "Schule im Aufbruch"
    url = "https://vimeo.com/user12637410/videos"
    version = "0.1.1"

    # XPath of the script element that is expected to hold the ld+json metadata.
    # The same location is used on overview pages and on video pages.
    _LD_JSON_XPATH = '/html/body/script[1]/text()'

    # this list will be filled with urls to crawl through (currently it's only used for debugging purposes)
    # NOTE: this is a class attribute, so the list is shared by every instance of this spider
    video_urls_to_crawl = []

    def __init__(self, **kwargs):
        LomBase.__init__(self, **kwargs)

    def getId(self, response: scrapy.http.Response = None) -> str:
        """Return the text of the page's <title> element, which currently serves as the ID."""
        return response.xpath('//title//text()').get()

    def getHash(self, response: scrapy.http.Response = None) -> str:
        """
        Not implemented: always returns None.

        The 'hash' field of the item is filled in getBase() instead.
        """
        return None

    def start_requests(self):
        """Issue the initial request for the first overview page of the channel."""
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response, **kwargs):
        """
        uses variable "url" (format: https://vimeo.com/userID/videos) as the starting point to crawl the overview-page
        for links to video-sub-pages and grabs the next overview-page afterwards
        (by default vimeo shows only 12 video-thumbnails per overview-page)
        """
        yield from self.get_video_urls_from_overview(response)
        # lazy %-formatting: the message is only built if debug logging is enabled
        logging.debug("urls_to_crawl is currently the size of %d", len(self.video_urls_to_crawl))

        # by default vimeo shows only 12 videos per overview-page,
        # we need to iterate through all pages on the vimeo-channel:
        yield from self.get_next_vimeo_overview_page(response)

    @classmethod
    def _get_ld_json_text(cls, response):
        """
        Return the stripped text of the ld+json script block, or None if the block is missing or empty.

        :param response: the response of an overview page or a video page
        :return: the raw (still serialized) JSON string or None
        """
        raw_text = response.xpath(cls._LD_JSON_XPATH).get()
        if raw_text is None:
            return None
        # an empty script element cannot be deserialized either, so treat it like a missing one
        return raw_text.strip() or None

    def get_video_urls_from_overview(self, response):
        """
        looks for the ld+json script block on the current overview page and grabs the video URLs.
        Afterwards tells the video parser to go through the video-sub-pages and yield the metadata.

        If the script block cannot be found, a warning is logged and nothing is yielded, so that the caller
        can still continue with the pagination.
        """
        # TODO: acquire thumbnail from the overview?
        #  thumbnails can be acquired using the 'srcset' attribute on each thumbnail, e.g.:
        #  response.xpath('//*[@id="clip_412230600"]/a/img/@srcset').get()

        # acquire current URLs from <script type="application/ld+json"> block
        ld_json_text = self._get_ld_json_text(response)
        if ld_json_text is None:
            logging.warning("Could not find ld+json script on overview page: %s", response.url)
            return
        current_page_json = json.loads(ld_json_text)

        # the urls we need are inside a nested dictionary; json.loads() already returned plain python
        # objects, so the nested list can be iterated directly
        for item in current_page_json[1]["itemListElement"]:
            video_full_url = response.urljoin(item["url"])

            # TODO: technically we don't need the video_urls_to_crawl list anymore
            self.video_urls_to_crawl.append(video_full_url)
            # following each video_url to the dedicated video-subpage to grab metadata
            yield response.follow(url=video_full_url, callback=self.parse_video_page)

    def parse_video_page(self, response: scrapy.http.Response = None):
        """
        parses a video-page (e.g. https://vimeo.com/videoID whereby videoID is a number) for metadata
        (condition: only if there is a "json+ld"-script found within the video-page).

        :return: the result of LomBase.parse() or None if the page was skipped
        """
        # XPath to description of a video looks like this:
        # //*[@id="main"]/div/main/div/div/div/div[2]/div[3]/div

        # if ld+json script-container doesn't exist, at least log the error
        if self._get_ld_json_text(response) is None:
            logging.debug("Could not find ld+json script, skipping entry: %s", response.url)
            return None

        # TODO: there's additional metadata inside a script block: window.vimeo.clip_page_config
        #  - longer description - maybe use this one?
        #  - duration (both in seconds and formatted)
        #  - ads
        #       - house_ads_enabled
        #       - third_party_ads_enabled
        #  response.xpath('//*[@id="wrap"]/div[2]/script[1]/text()').get()
        #  might have to access it and split it up with regEx
        return LomBase.parse(self, response)

    @staticmethod
    def get_ld_json(response: scrapy.http.Response) -> list:
        """
        acquires the ld+json script block from the current page and deserializes it into a json list

        :raises ValueError: if the script block is missing or empty
                            (json.JSONDecodeError, raised for malformed JSON, is a ValueError as well)
        """
        ld_json_string = SchuleImAufbruchSpider._get_ld_json_text(response)
        if ld_json_string is None:
            raise ValueError("Could not find ld+json script: " + str(response.url))
        return json.loads(ld_json_string)

    @staticmethod
    def get_license(response: scrapy.http.Response = None) -> str:
        """
        grabs the license information from the <link rel="license"> element inside the page's <head>

        :return: url of license as String, or "license information not found" if there is no such element
        """
        # select the element by its rel-attribute instead of its position, so that additional or
        # reordered <link> elements in the <head> don't hide the license
        license_url = response.xpath('/html/head/link[@rel="license"]/@href').get()
        if license_url is not None:
            return license_url
        return "license information not found"

    def getBase(self, response=None) -> BaseItemLoader:
        """Fill sourceId, hash, lastModified and thumbnail from the response url and the ld+json block."""
        base = LomBase.getBase(self, response)
        video_metadata = self.get_ld_json(response)[0]
        current_url = str(response.url)  # making double-sure that we're using a string for sourceID
        base.add_value('sourceId', current_url)
        # maybe add sourceID + dateModified as hash?
        base.add_value("hash", video_metadata["dateModified"])
        base.add_value("lastModified", video_metadata["dateModified"])
        base.add_value('thumbnail', video_metadata["thumbnailUrl"])
        return base

    def getLOMGeneral(self, response=None) -> LomGeneralItemloader:
        """Fill title and description (with HTML entities unescaped) from the ld+json block."""
        general = LomBase.getLOMGeneral(self, response)
        video_metadata = self.get_ld_json(response)[0]
        general.add_value('title', html.unescape(video_metadata["name"]))
        general.add_value('description', html.unescape(video_metadata["description"]))
        # TODO: set manually if there are no keywords given?
        # general.add_value('keyword', '')  # manual keywords?
        return general

    def getLOMTechnical(self, response=None) -> LomTechnicalItemLoader:
        """Fill format (from a <meta> element of the page) and location (from the ld+json block)."""
        # TODO: LomTechnicalItemLoader()
        technical = LomBase.getLOMTechnical(self, response)
        video_metadata = self.get_ld_json(response)[0]

        # TODO: Make sure that we're grabbing the right type for 'format'
        #  if we were to acquire the format by an API call
        #  (see https://developer.vimeo.com/api/reference/responses/video), vimeo would offer 3 options:
        #  'live' (for live events),
        #  'stock' (this video is a Vimeo Stock video)
        #  'video' (this video is a standard Vimeo video)

        # grabs the video type from the metadata header - most of the times it'll be video.other
        # NOTE(review): the positional index meta[18] breaks as soon as the order of the <meta> elements
        #  changes - confirm which attribute identifies this element and select by it instead
        technical.add_value('format', response.xpath('/html/head/meta[18]/@content').get())
        technical.add_value('location', video_metadata["url"])
        return technical

    def getValuespaces(self, response) -> ValuespaceItemLoader:
        """Fill the valuespaces with values that are hardcoded for this channel."""
        vs = LomBase.getValuespaces(self, response)
        # TODO: ValueSpaceItemLoader() missing keys? which ones are to be manually set?
        #  - dataProtectionConformity
        #  - fskRating
        #  - oer
        #  - educationalContext
        #  - educationalContentType
        vs.add_value('conditionsOfAccess', 'no_login')
        # vs.add_value('containsAdvertisement', 'no')  # do vimeo-advertisements for their own vimeo-plans count?
        vs.add_value('price', 'no')
        vs.add_value('intendedEndUserRole', 'teacher')
        vs.add_value('discipline', '720')  # is this the correct category? (allgemein)
        vs.add_value('learningResourceType', 'video')
        return vs

    def getLOMLifecycle(self, response=None) -> LomLifecycleItemloader:
        """Fill organization and url from the first entry of the item list inside the ld+json block."""
        lifecycle = LomBase.getLOMLifecycle(self, response)
        ld_json = self.get_ld_json(response)
        # author information is inside a dictionary with schema.org type Person
        # we could maybe grab the whole object instead?
        author_dict = ld_json[1]["itemListElement"][0]["item"]
        # TODO: LomLifeCycleItemLoader
        lifecycle.add_value('organization', author_dict["name"])
        lifecycle.add_value('url', author_dict["@id"])
        return lifecycle

    def getLOMEducational(self, response=None) -> LomEducationalItemLoader:
        """Fill the (hardcoded) language of the videos."""
        edu = LomBase.getLOMEducational(self, response)
        # TODO: which category does "schule im Aufbruch" fit into? double-check!
        # okay to hardcode this? (some videos are bilingual, but meta data from vimeo doesn't offer
        # language attributes)
        edu.add_value('language', 'de')
        return edu

    def getLicense(self, response=None) -> LicenseItemLoader:
        """Fill the license url with the value returned by get_license()."""
        lic = LomBase.getLicense(self, response)
        license_url = self.get_license(response)
        lic.add_value('url', license_url)
        return lic

    def getPermissions(self, response=None) -> PermissionItemLoader:
        """Fill the 'public' flag from the DEFAULT_PUBLIC_STATE setting."""
        permissions = LomBase.getPermissions(self, response)
        # TODO: PermissionItemLoader - which value should be set?
        permissions.add_value('public', self.settings.get("DEFAULT_PUBLIC_STATE"))  # is this necessary?
        return permissions

    def get_next_vimeo_overview_page(self, response: scrapy.http.Response):
        """
        if there is a "next"-button at the bottom of the vimeo-user's overview page:
        grabs the url from it and yields a request for it (handled by parse())
        """
        # next_vimeo_overview_page = response.xpath('//*[@id="pagination"]/ol/li[9]').get()
        next_vimeo_overview_page = response.css('#pagination > ol > li.pagination_next a::attr(href)').get()
        if next_vimeo_overview_page is not None:
            yield response.follow(next_vimeo_overview_page, self.parse)

0 comments on commit c6833c1

Please sign in to comment.