diff --git a/iiify/app.py b/iiify/app.py index e78e60a..7064032 100755 --- a/iiify/app.py +++ b/iiify/app.py @@ -7,8 +7,9 @@ from flask_cors import CORS from flask_caching import Cache from iiif2 import iiif, web -from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \ - purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations +from .resolver import ia_resolver, create_manifest, create_manifest3, scrape, \ + collection, purify_domain, cantaloupe_resolver, create_collection3, IsCollection, \ + create_annotations from .configs import options, cors, approot, cache_root, media_root, \ cache_expr, version, image_server, cache_timeouts from urllib.parse import quote @@ -56,18 +57,27 @@ def mainentry(): @app.route('/iiif/') def index(): """Lists all available book and image items on Archive.org""" - cursor = request.args.get('cursor', '') q = request.args.get('q', '') - return jsonify(getids(q, cursor=cursor)) - + cursor = request.args.get('cursor', '') + fields = request.args.get('fields', '') + sorts = request.args.get('sorts', '') + r = scrape(q, cursor=cursor, fields=fields, sorts=sorts, restrict_to_iiif=True) + return jsonify(r) @app.route('/iiif/collection.json') def catalog(): - cursor = request.args.get('cursor', '') q = request.args.get('q', '') + cursor = request.args.get('cursor', '') + fields = request.args.get('fields', '') + sorts = request.args.get('sorts', '') domain = purify_domain(request.args.get('domain', request.url_root)) - return ldjsonify(collection(domain, getids(q, limit, cursor)['ids'])) + identifiers = [ + i.get('identifier') for i in scrape( + q, cursor=cursor, fields=fields, sorts=sorts, restrict_to_iiif=True + ).get('items') + ] + return ldjsonify(collection(domain, identifiers)) @app.route('/iiif/cache') @@ -99,16 +109,16 @@ def helper(identifier): return render_template('helpers/image.html', identifier=identifier, cantaloupe_id=cantaloupe_id, esc_cantaloupe_id=esc_cantaloupe_id) except ValueError: abort(404) - + elif mediatype == "audio" or mediatype == "etree": return render_template('helpers/audio.html', identifier=identifier) elif mediatype == "movies": return render_template('helpers/movies.html', identifier=identifier) elif mediatype == "texts": return render_template('helpers/texts.html', identifier=identifier) - else: + else: return render_template('helpers/unknown.html', identifier=identifier) - + @app.route('/iiif/') def view(identifier): @@ -129,7 +139,7 @@ def view(identifier): @app.route('/iiif/3//collection.json') @cache.cached(timeout=cache_timeouts["med"], forced_update=cache_bust) -def collection3(identifier): +def collection3JSON(identifier): domain = purify_domain(request.args.get('domain', request.url_root)) try: @@ -164,7 +174,7 @@ def collection3page(identifier, page): @app.route('/iiif//collection.json') @cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust) -def collection(identifier): +def collectionJSON(identifier): return redirect(f'/iiif/3/{identifier}/collection.json', code=302) @@ -239,7 +249,6 @@ def add_header(response): def ldjsonify(data): j = jsonify(data) - # j.headers.set('Access-Control-Allow-Origin', '*') j.mimetype = "application/ld+json" return j diff --git a/iiify/resolver.py b/iiify/resolver.py index f41f6ae..2ff8d6c 100644 --- a/iiify/resolver.py +++ b/iiify/resolver.py @@ -3,16 +3,16 @@ import os import requests from iiif2 import iiif, web - from .configs import options, cors, approot, cache_root, media_root, apiurl, LINKS from iiif_prezi3 import Manifest, config, Annotation, AnnotationPage,AnnotationPageRef, Canvas, Manifest, ResourceItem, ServiceItem, Choice, Collection, ManifestRef, CollectionRef - from urllib.parse import urlparse, parse_qs, quote import json -import math +import math import re import xml.etree.ElementTree as ET +SCRAPE_API = 'https://archive.org/services/search/v1/scrape' +ADVANCED_SEARCH = 'https://archive.org/advancedsearch.php' IMG_CTX = 'http://iiif.io/api/image/2/context.json' PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json' ARCHIVE = 'https://archive.org' @@ -22,6 +22,12 @@ bookreader = "http://%s/BookReader/BookReaderImages.php" URI_PRIFIX = "https://iiif.archive.org/iiif" +MAX_SCRAPE_LIMIT = 10_000 +MAX_API_LIMIT = 1_000 + +class MaxLimitException(Exception): + pass + valid_filetypes = ['jpg', 'jpeg', 'png', 'gif', 'tif', 'jp2', 'pdf', 'tiff'] class IsCollection(Exception): @@ -32,15 +38,60 @@ def purify_domain(domain): domain = re.sub('^http:\/\/', "https://", domain) return domain if domain.endswith('/iiif/') else domain + 'iiif/' -def getids(q, limit=1000, cursor=''): - r = requests.get('%s/iiif' % apiurl, params={ - 'q': q, - 'limit': limit, - 'cursor': cursor - }, allow_redirects=True, timeout=None) + +def scrape(query, fields="", sorts="", count=100, cursor="", restrict_to_iiif=False, security=True): + """ + params: + query: the query (using the same query Lucene-like queries supported by Internet Archive Advanced Search. + fields: Metadata fields to return, comma delimited + sorts: Fields to sort on, comma delimited (if identifier is specified, it must be last) + count: Number of results to return (minimum of 100) + cursor: A cursor, if any (otherwise, search starts at the beginning) + restrict_to_iiif: restrict query to supported IIIF collections? + security: enforce API page limit + """ + if restrict_to_iiif or not query: + _query = "(mediatype:(texts) OR mediatype:(image))" + query = f"{_query} AND {query}" if query else _query + + if int(count) > MAX_API_LIMIT and security: + raise MaxLimitException(f"Limit may not exceed {MAX_API_LIMIT}.") + + fields = fields or 'identifier,title' + + params = { + 'q': query + } + if sorts: + params['sorts'] = sorts + if fields: + params['fields'] = fields + if count: + params['count'] = count + if cursor: + params['cursor'] = cursor + + r = requests.get(SCRAPE_API, params=params) return r.json() -def checkMultiItem(metadata): +def search(query, page=1, limit=100, security=True, sort=None, fields=None): + if not query: + raise ValueError("GET query parameters 'q' required") + + if int(limit) > MAX_API_LIMIT and security: + raise MaxLimitException(f"Limit may not exceed {MAX_API_LIMIT}.") + + return requests.get( + ADVANCED_SEARCH, + params={'q': query, + 'sort[]': sort or ['date asc', 'createdate'], + 'rows': limit, + 'page': page, + 'fl[]': fields or 'identifier,title', + 'output': 'json', + }).json() + +def checkMultiItem(metadata): # Maybe add call to book stack to see if that works first # Count the number of each original file @@ -51,17 +102,16 @@ def checkMultiItem(metadata): file_types[file['format']] = 0 file_types[file['format']] += 1 - #print (file_types) + #print (file_types) # If there is multiple files of the same type then return the first format # Will have to see if there are objects with multiple images and formats for format in file_types: - if file_types[format] > 1 and format.lower() in valid_filetypes: + if file_types[format] > 1 and format.lower() in valid_filetypes: return (True, format) return (False, None) - def to_mimetype(format): formats = { "VBR MP3": "audio/mp3", @@ -86,7 +136,7 @@ def to_mimetype(format): "Cinepack": "video/x-msvideo", "AIFF": "audio/aiff", "Apple Lossless Audio": "audio/x-m4a", - "MPEG-4 Audio": "audio/mp4" + "MPEG-4 Audio": "audio/mp4" } return formats.get(format, "application/octet-stream") @@ -107,7 +157,7 @@ def collection(domain, identifiers, label='Custom Archive.org IIIF Collection'): }) return cs -def create_collection3(identifier, domain, page=1, rows=1000): +def create_collection3(identifier, domain, page=1, rows=MAX_API_LIMIT): # Get item metadata metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json() @@ -119,26 +169,26 @@ def create_collection3(identifier, domain, page=1, rows=1000): addMetadata(collection, identifier, metadata['metadata'], collection=True) - asURL = f'https://archive.org/advancedsearch.php?q=collection%3A{identifier}&fl[]=identifier&fl[]=mediatype&fl[]=title&fl[]=description&sort[]=&sort[]=&sort[]=&rows={rows}&page={page}&output=json&save=yes' + asURL = f'{ADVANCED_SEARCH}?q=collection%3A{identifier}&fl[]=identifier&fl[]=mediatype&fl[]=title&fl[]=description&sort[]=&sort[]=&sort[]=&rows={rows}&page={page}&output=json&save=yes' itemsSearch = requests.get(asURL).json() total = itemsSearch['response']['numFound'] # There is a max of 10,000 items that can be retrieved from the advanced search - if total > 10000: - total = 10000 + if total > MAX_SCRAPE_LIMIT: + total = MAX_SCRAPE_LIMIT if len(itemsSearch['response']['docs']) == 0: - return None + return None pages = math.ceil(total / rows) for item in itemsSearch['response']['docs']: child = None if item['mediatype'] == 'collection': child = CollectionRef(id=f"{domain}{item['identifier']}/collection.json", type="Collection", label=item['title']) - else: + else: child = ManifestRef(id=f"{domain}{item['identifier']}/manifest.json", type="Manifest", label=item['title']) - + if "description" in item: - child.summary = {"none": [item['description']]} + child.summary = {"none": [item['description']]} collection.add_item(child) page += 1 @@ -147,7 +197,7 @@ def create_collection3(identifier, domain, page=1, rows=1000): collection.add_item(child) return json.loads(collection.jsonld()) - + def manifest_page(identifier, label='', page='', width='', height='', metadata=None, canvasId=""): if not canvasId: canvasId = f"{identifier}/canvas" @@ -256,7 +306,7 @@ def create_manifest(identifier, domain=None, page=None): 'itemPath': subPrefix, 'itemId': identifier }) - if r.status_code != 200: + if r.status_code != 200: # If the bookdata failed then treat as a single image fileName = "" for f in resp['files']: @@ -264,7 +314,7 @@ def create_manifest(identifier, domain=None, page=None): and f['source'].lower() == 'original' \ and 'thumb' not in f['name']: fileName = f['name'] - break + break if not fileName: # Original wasn't an image @@ -326,7 +376,7 @@ def singleImage(metadata, identifier, manifest, uri): and f['source'].lower() == 'original' \ and 'thumb' not in f['name']: fileName = f['name'] - break + break if not fileName: # Original wasn't an image @@ -336,12 +386,12 @@ def singleImage(metadata, identifier, manifest, uri): imgId = f"{identifier}/{fileName}".replace('/','%2f') imgURL = f"{IMG_SRV}/3/{imgId}" - + manifest.make_canvas_from_iiif(url=imgURL, id=f"{URI_PRIFIX}/{identifier}/canvas", label="1", anno_page_id=f"{uri}/annotationPage/1", - anno_id=f"{uri}/annotation/1") + anno_id=f"{uri}/annotation/1") def addMetadata(item, identifier, metadata, collection=False): item.homepage = [{"id": f"https://archive.org/details/{identifier}", @@ -376,7 +426,7 @@ def addMetadata(item, identifier, metadata, collection=False): excluded_fields = [ 'avg_rating', 'backup_location', 'btih', 'description', 'downloads', - 'imagecount', 'indexflag', 'item_size', 'licenseurl', 'curation', + 'imagecount', 'indexflag', 'item_size', 'licenseurl', 'curation', 'noindex', 'num_reviews', 'oai_updatedate', 'publicdate', 'publisher', 'reviewdate', 'scanningcentre', 'stripped_tags', 'uploader' ] @@ -482,7 +532,7 @@ def create_manifest3(identifier, domain=None, page=None): for fileMd in metadata['files']: if fileMd['name'].endswith('_scandata.xml'): subprefix = fileMd['name'].replace('_scandata.xml', '') - if fileMd['format'] == 'Djvu XML': + if fileMd['format'] == 'Djvu XML': djvuFile = fileMd['name'] bookReaderURL = f"https://{metadata.get('server')}/BookReader/BookReaderJSIA.php?id={identifier}&itemPath={metadata.get('dir')}&server={metadata.get('server')}&format=jsonp&subPrefix={subprefix}" @@ -523,7 +573,7 @@ def create_manifest3(identifier, domain=None, page=None): # id=f"https://iiif.archivelab.org/iiif/{identifier}${pageCount}/canvas", # label=f"{page['leafNum']}") pageCount += 1 - + # Setting logic for paging behavior and starting canvases # Start with paged (default) or individual behaviors @@ -555,7 +605,7 @@ def create_manifest3(identifier, domain=None, page=None): annotations.append( AnnotationPageRef(id=f"{domain}3/annotations/{identifier}/{quote(djvuFile, safe='()')}/{count}.json", type="AnnotationPage") - ) + ) canvas.annotations = annotations count += 1 elif mediatype == 'image': @@ -569,7 +619,7 @@ def create_manifest3(identifier, domain=None, page=None): imgId = f"{identifier}/{file['name']}".replace('/','%2f') imgURL = f"{IMG_SRV}/3/{imgId}" pageCount += 1 - + try: manifest.make_canvas_from_iiif(url=imgURL, id=f"{URI_PRIFIX}/{identifier}${pageCount}/canvas", @@ -655,10 +705,10 @@ def create_manifest3(identifier, domain=None, page=None): # Example: cruz-test.en.vtt and 34C3_-_International_Image_Interoperability_Framework_IIIF_Kulturinstitutionen_schaffen_interop-SvH4fbjOT0A.autogenerated.vtt sourceFilename = re.sub('\.[a-zA-H-]*\.vtt', '', f['name']) if sourceFilename not in vttfiles: - vttfiles[sourceFilename] = [] - - vttfiles[sourceFilename].append(f) - + vttfiles[sourceFilename] = [] + + vttfiles[sourceFilename].append(f) + # create the canvases for each original for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]: normalised_id = file['name'].rsplit(".", 1)[0] @@ -672,9 +722,9 @@ def create_manifest3(identifier, domain=None, page=None): vttNo = 1 for vttFile in vttfiles[normalised_id]: - vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", - motivation="supplementing", - target=c.id, + vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", + motivation="supplementing", + target=c.id, anno_page_id=vttAPId, body={"id": f"{domain}resource/{identifier}/{vttFile['name']}", "type": "Text", @@ -706,9 +756,9 @@ def create_manifest3(identifier, domain=None, page=None): type='Video', format=to_mimetype(format), label={"none": [format]}, - duration=float(file['length']), + duration=float(file['length']), height=int(file['height']), - width=int(file['width']), + width=int(file['width']), ) body.items.append(r) elif file['format'] == format: @@ -877,7 +927,7 @@ def cantaloupe_resolver(identifier): # single image file - find the filename filename = None - for f in files: + for f in files: if valid_filetype(f['name']) \ and f['source'].lower() == 'original' \ and 'thumb' not in f['name']: @@ -900,7 +950,7 @@ def cantaloupe_resolver(identifier): filename = f['name'] fileIdentifier = filename[:-1 * len('_jp2.zip')] - # next look for any _jp2.zip that has a different name to the identifier + # next look for any _jp2.zip that has a different name to the identifier if not filename: for f in files: if f['name'].endswith('_jp2.zip'): @@ -921,13 +971,8 @@ def cantaloupe_resolver(identifier): fileIdentifier = filename[:-1 * len('_tif.zip')] extension = ".tif" - #filename = next(f for f in files if f['source'].lower() == 'derivative' \ - # and f['name'].endswith('_jp2.zip'))['name'] if filename: dirpath = filename[:-4] filepath = f"{fileIdentifier}_{leaf.zfill(4)}{extension}" return f"{identifier}%2f{filename}%2f{dirpath}%2f{filepath}" - # print (f'images not found for {identifier}') - # for f in files: - # print (f"source: {f['source'].lower()} name: {f['name']} and {f['source'].lower() == 'derivative'} {f['name'].endswith('_jp2.zip')}")