Adding extra video formats and refactoring Audio format support

internetarchive · Dec 5, 2024 · 3acd3fb · 3acd3fb
1 parent b6c6ef4
commit 3acd3fb
Showing 1 changed file with 40 additions and 38 deletions.
diff --git a/iiify/resolver.py b/iiify/resolver.py
@@ -22,14 +22,15 @@
 bookdata = 'https://%s/BookReader/BookReaderJSON.php'
 bookreader = "https://%s/BookReader/BookReaderImages.php"
 URI_PRIFIX = "https://iiif.archive.org/iiif"
-
 MAX_SCRAPE_LIMIT = 10_000
 MAX_API_LIMIT = 1_000
 
 class MaxLimitException(Exception):
     pass
 
 valid_filetypes = ['jpg', 'jpeg', 'png', 'gif', 'tif', 'jp2', 'pdf', 'tiff']
+AUDIO_FORMATS = ['VBR MP3', '32Kbps MP3', '56Kbps MP3', '64Kbps MP3', '96Kbps MP3', '128Kbps MP3', 'MPEG-4 Audio', 'Flac', 'AIFF', 'Apple Lossless Audio', 'Ogg Vorbis', 'WAVE', '24bit Flac', 'Shorten']
+VIDEO_FORMATS = ['MPEG4', 'h.264 HD', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack','QuickTime']
 
 class IsCollection(Exception):
     # Used for when we need to raise to the route handler from inside the manifest function
@@ -506,6 +507,35 @@ def addThumbnails(manifest, identifier, files):
     if thumbnails:
         manifest.thumbnail = thumbnails
 
+def sortDerivatives(metadata, includeVtt=False):
+    """
+        Sort metadata['files'] into originals and derivatives, splitting the derivatives into
+        buckets keyed by the name of their original: {original: {format: file}}.
+        Derivatives whose 'original' is a list are skipped (a list cannot be a dict key).
+        Returns (originals, derivatives), plus vttfiles when includeVtt is True: the
+        'Web Video Text Tracks' files grouped by name with the '.<label>.vtt' part removed.
+    """
+    originals = []
+    derivatives = {}
+    vttfiles = {}
+    for f in metadata['files']:
+        if f['source'] == 'derivative':
+            # Check the type before touching the dict: hashing a list raises TypeError
+            if not isinstance(f['original'], list):
+                derivatives.setdefault(f['original'], {})[f['format']] = f
+        elif f['source'] == 'original':
+            originals.append(f)
+
+        if includeVtt and f['format'] == 'Web Video Text Tracks':
+            # Example: cruz-test.en.vtt and 34C3_-_International_Image_Interoperability_Framework_IIIF_Kulturinstitutionen_schaffen_interop-SvH4fbjOT0A.autogenerated.vtt
+            # NOTE(review): 'A-H' in the character class looks like a typo for 'A-Z' - confirm
+            sourceFilename = re.sub(r'\.[a-zA-H-]*\.vtt', '', f['name'])
+            vttfiles.setdefault(sourceFilename, []).append(f)
+
+    if includeVtt:
+        return (originals, derivatives, vttfiles)
+    return (originals, derivatives)
+
 def create_manifest3(identifier, domain=None, page=None):
     # Get item metadata
     metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json()
@@ -611,7 +641,6 @@ def create_manifest3(identifier, domain=None, page=None):
                 count += 1
     elif mediatype == 'image':
         (multiFile, format) = checkMultiItem(metadata)
-        print (f"Checking multiFile {multiFile} {format}")
         if multiFile:
             # Create multi file manifest
             pageCount = 0
@@ -637,19 +666,11 @@ def create_manifest3(identifier, domain=None, page=None):
             singleImage(metadata, identifier, manifest, uri)
     elif mediatype == 'audio' or mediatype == 'etree':
         # sort the files into originals and derivatives, splitting the derivatives into buckets based on the original
-        originals = []
-        derivatives = {}
-        for f in metadata['files']:
-            if f['source'] == 'derivative' and not isinstance(f['original'], list):
-                if f['original'] in derivatives:
-                    derivatives[f['original']][f['format']] = f
-                else:
-                    derivatives[f['original']] = {f['format']: f}
-            elif f['source'] == 'original':
-                originals.append(f)
-
+        (originals, derivatives) = sortDerivatives(metadata)
+
         # create the canvases for each original
-        for file in [f for f in originals if f['format'] in ['VBR MP3', '32Kbps MP3', '56Kbps MP3', '64Kbps MP3', '96Kbps MP3', '128Kbps MP3', 'MPEG-4 Audio', 'Flac', 'AIFF', 'Apple Lossless Audio', 'Ogg Vorbis', 'WAVE', '24bit Flac', 'Shorten']]:
+
+        for file in [f for f in originals if f['format'] in AUDIO_FORMATS]:
             normalised_id = file['name'].rsplit(".", 1)[0]
             slugged_id = normalised_id.replace(" ", "-")
             c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
@@ -663,7 +684,7 @@ def create_manifest3(identifier, domain=None, page=None):
             if file['name'] in derivatives:
                 body = Choice(items=[])
                 # add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
-                for format in ['VBR MP3', '32Kbps MP3', '56Kbps MP3', '64Kbps MP3', '96Kbps MP3', '128Kbps MP3', 'MPEG-4 Audio', 'Flac', 'AIFF', 'Apple Lossless Audio', 'Ogg Vorbis', 'WAVE', '24bit Flac', 'Shorten']:
+                for format in AUDIO_FORMATS:
                     if format in derivatives[file['name']]:
                         r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
                                          type='Sound',
@@ -689,26 +710,7 @@ def create_manifest3(identifier, domain=None, page=None):
             manifest.add_item(c)
 
     elif mediatype == "movies":
-        # sort the files into originals and derivatives, splitting the derivatives into buckets based on the original
-        originals = []
-        derivatives = {}
-        vttfiles = {}
-        for f in metadata['files']:
-            if f['source'] == 'derivative':
-                if f['original'] in derivatives:
-                    derivatives[f['original']][f['format']] = f
-                else:
-                    derivatives[f['original']] = {f['format']: f}
-            elif f['source'] == 'original':
-                originals.append(f)
-
-            if f['format'] == 'Web Video Text Tracks':
-                # Example: cruz-test.en.vtt and 34C3_-_International_Image_Interoperability_Framework_IIIF_Kulturinstitutionen_schaffen_interop-SvH4fbjOT0A.autogenerated.vtt
-                sourceFilename = re.sub(r'\.[a-zA-H-]*\.vtt', '', f['name'])
-                if sourceFilename not in vttfiles:
-                    vttfiles[sourceFilename] = []    
-
-                vttfiles[sourceFilename].append(f)    
+        (originals, derivatives, vttfiles) = sortDerivatives(metadata, includeVtt=True)
 
         if 'access-restricted-item' in metadata['metadata'] and metadata['metadata']['access-restricted-item']:
             # this is a news item so has to be treated differently
@@ -723,7 +725,7 @@ def create_manifest3(identifier, domain=None, page=None):
                     filedata = file
 
             # create the canvases for each original
-            for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 HD', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack','QuickTime']]:
+            for file in [f for f in originals if f['format'] in VIDEO_FORMATS]:
                 normalised_id = file['name'].rsplit(".", 1)[0]
                 slugged_id = normalised_id.replace(" ", "-")
                 c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
@@ -768,7 +770,7 @@ def create_manifest3(identifier, domain=None, page=None):
                 manifest.add_item(c)
         else:
             # create the canvases for each original
-            for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
+            for file in [f for f in originals if f['format'] in VIDEO_FORMATS]:
                 normalised_id = file['name'].rsplit(".", 1)[0]
                 slugged_id = normalised_id.replace(" ", "-")
                 c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
@@ -808,7 +810,7 @@ def create_manifest3(identifier, domain=None, page=None):
                 if file['name'] in derivatives:
                     body = Choice(items=[])
                     # add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734
-                    for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']:
+                    for format in VIDEO_FORMATS:
                         if format in derivatives[file['name']]:
                             r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}",
                                             type='Video',