forked from openeduhub/oeh-search-etl
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15 from Criamos/vimeo_spider
Vimeo spider
- Loading branch information
Showing
1 changed file
with
217 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
import html | ||
import json | ||
import logging | ||
|
||
import scrapy | ||
|
||
from converter.items import BaseItemLoader, LomGeneralItemloader, LomTechnicalItemLoader, \ | ||
LomEducationalItemLoader, ValuespaceItemLoader, LicenseItemLoader, PermissionItemLoader, LomLifecycleItemloader | ||
from converter.spiders.base_classes import LomBase | ||
|
||
|
||
class SchuleImAufbruchSpider(scrapy.Spider, LomBase):
    """
    Spider for the vimeo channel configured in "url".

    Walks through the paginated video overview, follows every video link found in the overview's
    ld+json script block and hands each video page over to LomBase.parse, which in turn uses the
    get*-methods of this class to collect the metadata.
    """
    name = "schule_im_aufbruch_spider"
    friendlyName = "Schule im Aufbruch"
    url = "https://vimeo.com/user12637410/videos"
    version = "0.1.1"

    # XPath of the script block that is expected to hold the ld+json metadata
    # (used for both the overview pages and the individual video pages)
    _LD_JSON_XPATH = '/html/body/script[1]/text()'

    # this list will be filled with urls to crawl through, (currently it's only used for debugging purposes)
    # the class-level value is only a fallback, each instance gets its own list in __init__
    video_urls_to_crawl = []

    def __init__(self, **kwargs):
        LomBase.__init__(self, **kwargs)
        # per-instance list, so that several spider instances don't append to one shared list
        self.video_urls_to_crawl = []

    def getId(self, response: scrapy.http.Response = None) -> str:
        """
        :return: the text of the page's <title> element, which currently serves as the ID
        """
        # NOTE(review): getBase uses the page url as 'sourceId' while this method returns the title -
        # confirm which of the two is the intended identifier
        return response.xpath('//title//text()').get()

    def getHash(self, response: scrapy.http.Response = None) -> str:
        """
        Not implemented: always returns None (despite the annotated return type).
        The 'hash' field of an item is set within getBase from the "dateModified" value instead.
        """
        # NOTE(review): confirm whether LomBase relies on a real return value of this method
        return None

    def start_requests(self):
        """
        starts the crawl at the overview page given by "url"
        """
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response, **kwargs):
        """
        uses variable "url" (format: https://vimeo.com/userID/videos) as the starting point to crawl the overview-page
        for links to video-sub-pages and grabs the next overview-page afterwards
        (by default vimeo shows only 12 video-thumbnails per overview-page)
        """

        yield from self.get_video_urls_from_overview(response)
        logging.debug("urls_to_crawl is currently the size of %s", len(self.video_urls_to_crawl))

        # by default vimeo shows only 12 videos per overview-page,
        # we need to iterate through all pages on the vimeo-channel:
        yield from self.get_next_vimeo_overview_page(response)

    @staticmethod
    def _get_ld_json_text(response):
        """
        looks up the script block that is expected to contain the ld+json metadata

        :return: the stripped text of the script block or None if the block doesn't exist
        """
        script_text = response.xpath(SchuleImAufbruchSpider._LD_JSON_XPATH).get()
        if script_text is None:
            return None
        return script_text.strip()

    def get_video_urls_from_overview(self, response):
        """
        looks for the ld+json script block on the current overview page and grabs the URLs.
        Afterwards tells the video parser to go through the video-sub-pages and yield the metadata.
        If the script block is missing or empty, nothing is yielded for this overview page.
        """

        # TODO: acquire thumbnail from the overview?
        #  thumbnails can be acquired using the 'srcset' attribute on each thumbnail, e.g.:
        #  response.xpath('//*[@id="clip_412230600"]/a/img/@srcset').get()

        # acquire current URLs from <script type="application/ld+json"> block
        overview_ld_text = self._get_ld_json_text(response)
        if not overview_ld_text:
            logging.debug("Could not find ld+json script on overview page: %s", response.url)
            return
        overview_ld = json.loads(overview_ld_text)

        # the urls we need are inside a nested dictionary (second element -> "itemListElement")
        for item in overview_ld[1]["itemListElement"]:
            video_full_url = response.urljoin(item["url"])

            # TODO: technically we don't need the video_urls_to_crawl list anymore
            self.video_urls_to_crawl.append(video_full_url)
            # following each video_url to the dedicated video-subpage to grab metadata
            yield response.follow(url=video_full_url, callback=self.parse_video_page)

    def parse_video_page(self, response: scrapy.http.Response = None):
        """
        parses a video-page (e.g. https://vimeo.com/videoID whereby videoID is a number) for metadata
        (condition: only if there is a "json+ld"-script found within the video-page).

        :return: the result of LomBase.parse or None if the script block is missing or empty
        """
        # XPath to description of a video looks like this:
        # //*[@id="main"]/div/main/div/div/div/div[2]/div[3]/div

        # if ld+json script-container doesn't exist, at least log the error
        # (the lookup has to be checked for None BEFORE stripping, otherwise a missing block raises)
        if not self._get_ld_json_text(response):
            logging.debug("Could not find ld+json script, skipping entry: %s", response.url)
            return None

        # TODO: there's additional metadata inside a script block: window.vimeo.clip_page_config
        #  - longer description - maybe use this one?
        #  - duration (both in seconds and formatted)
        #  - ads
        #       - house_ads_enabled
        #       - third_party_ads_enabled
        #  response.xpath('//*[@id="wrap"]/div[2]/script[1]/text()').get()
        #  might have to access it and split it up with regEx

        return LomBase.parse(self, response)

    @staticmethod
    def get_ld_json(response: scrapy.http.Response) -> list:
        """
        acquires the ld+json script block from the current page and deserializes it into a json list

        :raises ValueError: if the script block doesn't exist on the page
        """
        ld_json_string = SchuleImAufbruchSpider._get_ld_json_text(response)
        if ld_json_string is None:
            raise ValueError("Could not find ld+json script: " + str(response.url))
        return json.loads(ld_json_string)

    @staticmethod
    def get_license(response: scrapy.http.Response = None) -> str:
        """
        grabs the license information from the <link rel="license"> element of the page header
        :return: url of license as String or "license information not found"
        """
        # select the element by its "rel" attribute instead of its position within the header,
        # since the position changes as soon as the page adds or removes another <link>
        license_url = response.xpath('/html/head/link[@rel="license"]/@href').get()
        if license_url is not None:
            return license_url
        return "license information not found"

    def getBase(self, response=None) -> BaseItemLoader:
        """
        extends the base item of LomBase with sourceId, hash, lastModified and thumbnail,
        the latter three being taken from the first element of the ld+json list
        """
        base = LomBase.getBase(self, response)
        video_meta = self.get_ld_json(response)[0]
        current_url = str(response.url)  # making double-sure that we're using a string for sourceID
        base.add_value('sourceId', current_url)
        # maybe add sourceID + dateModified as hash?
        base.add_value("hash", video_meta["dateModified"])
        base.add_value("lastModified", video_meta["dateModified"])
        base.add_value('thumbnail', video_meta["thumbnailUrl"])
        return base

    def getLOMGeneral(self, response=None) -> LomGeneralItemloader:
        """
        sets title and description (both html-unescaped) from the first element of the ld+json list;
        the description is skipped if the metadata doesn't provide one
        """
        general = LomBase.getLOMGeneral(self, response)
        video_meta = self.get_ld_json(response)[0]
        general.add_value('title', html.unescape(video_meta["name"]))
        description = video_meta.get("description")
        if description is not None:
            # html.unescape only works on strings, a missing / null description would raise
            general.add_value('description', html.unescape(description))
        # TODO: set manually if there are no keywords given?
        # general.add_value('keyword', '')  # manual keywords?
        return general

    def getLOMTechnical(self, response=None) -> LomTechnicalItemLoader:
        """
        sets format (taken from the meta tags of the page header) and location (url from the ld+json list)
        """
        # TODO: LomTechnicalItemLoader()
        technical = LomBase.getLOMTechnical(self, response)
        ld_json = self.get_ld_json(response)

        # TODO: Make sure that we're grabbing the right type for 'format'
        #  if we were to acquire the format by an API call
        #  (see https://developer.vimeo.com/api/reference/responses/video), vimeo would offer 3 options:
        #  'live' (for live events),
        #  'stock' (this video is a Vimeo Stock video)
        #  'video' (this video is a standard Vimeo video)

        # grabs the video type from the metadata header - most of the times it'll be video.other
        # NOTE(review): "video.other" is presumably the og:type meta tag - it's looked up by its property first,
        # the former positional lookup is kept as a fallback in case that assumption doesn't hold
        video_format = response.xpath('/html/head/meta[@property="og:type"]/@content').get()
        if video_format is None:
            video_format = response.xpath('/html/head/meta[18]/@content').get()
        technical.add_value('format', video_format)
        technical.add_value('location', ld_json[0]["url"])
        return technical

    def getValuespaces(self, response) -> ValuespaceItemLoader:
        """
        sets the (hard-coded) valuespace entries that are shared by all videos of this channel
        """
        vs = LomBase.getValuespaces(self, response)
        # TODO: ValueSpaceItemLoader() missing keys? which ones are to be manually set?
        #   - dataProtectionConformity
        #   - fskRating
        #   - oer
        #   - educationalContext
        #   - educationalContentType
        vs.add_value('conditionsOfAccess', 'no_login')
        # vs.add_value('containsAdvertisement', 'no')  # do vimeo-advertisements for their own vimeo-plans count?
        vs.add_value('price', 'no')
        vs.add_value('intendedEndUserRole', 'teacher')
        vs.add_value('discipline', '720')  # is this the correct category? (allgemein)
        vs.add_value('learningResourceType', 'video')
        return vs

    def getLOMLifecycle(self, response=None) -> LomLifecycleItemloader:
        """
        sets organization and url from the first "itemListElement" of the second ld+json list element
        """
        lifecycle = LomBase.getLOMLifecycle(self, response)
        ld_json = self.get_ld_json(response)
        # author information is inside a dictionary with schema.org type Person
        # we could maybe grab the whole object instead?
        author_dict = ld_json[1]["itemListElement"][0]["item"]
        # TODO: LomLifeCycleItemLoader
        lifecycle.add_value('organization', author_dict["name"])
        lifecycle.add_value('url', author_dict["@id"])
        return lifecycle

    def getLOMEducational(self, response=None) -> LomEducationalItemLoader:
        """
        sets the (hard-coded) language 'de'
        """
        edu = LomBase.getLOMEducational(self, response)
        # TODO: which category does "schule im Aufbruch" fit into? double-check!
        edu.add_value('language', 'de')  # okay to hardcode this? (some videos are bilingual, but meta
        # data from vimeo doesn't offer language attributes)
        return edu

    def getLicense(self, response=None) -> LicenseItemLoader:
        """
        sets the license url to whatever get_license returns for the current page
        """
        lic = LomBase.getLicense(self, response)
        # NOTE(review): if no license is found, the placeholder text of get_license ends up in the 'url' field -
        # confirm whether the field should rather stay empty in that case
        license_url = self.get_license(response)
        lic.add_value('url', license_url)
        return lic

    def getPermissions(self, response=None) -> PermissionItemLoader:
        """
        sets 'public' to the value of the "DEFAULT_PUBLIC_STATE" setting
        """
        permissions = LomBase.getPermissions(self, response)
        # TODO: PermissionItemLoader - which value should be set?
        permissions.add_value('public', self.settings.get("DEFAULT_PUBLIC_STATE"))  # is this necessary?
        return permissions

    def get_next_vimeo_overview_page(self, response: scrapy.http.Response):
        """
        if there is a "next"-button at the bottom of the vimeo-user's overview page:
        grabs the url from it and yields a request for it (handled by parse)
        """
        # next_vimeo_overview_page = response.xpath('//*[@id="pagination"]/ol/li[9]').get()
        next_vimeo_overview_page = response.css('#pagination > ol > li.pagination_next a::attr(href)').get()
        if next_vimeo_overview_page is not None:
            yield response.follow(next_vimeo_overview_page, self.parse)