Skip to content

Commit

Permalink
work on video import
Browse files Browse the repository at this point in the history
  • Loading branch information
jbilcke-hf committed Jul 22, 2024
1 parent 7702570 commit bfb5428
Show file tree
Hide file tree
Showing 7 changed files with 291 additions and 233 deletions.
365 changes: 186 additions & 179 deletions package-lock.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@
"dependencies": {
"@aitube/broadway": "0.0.22",
"@aitube/clap": "0.0.30",
"@aitube/clapper-services": "0.0.28",
"@aitube/clapper-services": "0.0.29",
"@aitube/engine": "0.0.26",
"@aitube/timeline": "0.0.42",
"@aitube/timeline": "0.0.43",
"@fal-ai/serverless-client": "^0.13.0",
"@ffmpeg/ffmpeg": "^0.12.10",
"@ffmpeg/util": "^0.12.1",
Expand Down
2 changes: 1 addition & 1 deletion src/lib/core/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
export const HARD_LIMIT_NB_MAX_ASSETS_TO_GENERATE_IN_PARALLEL = 32

export const APP_NAME = 'Clapper.app'
export const APP_REVISION = 'r20240721-0835'
export const APP_REVISION = 'r20240722-0205'

export const APP_DOMAIN = 'Clapper.app'
export const APP_LINK = 'https://clapper.app'
8 changes: 4 additions & 4 deletions src/lib/utils/base64DataUriToFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ export function base64DataUriToFile(dataUrl: string, fileName: string) {
const bstr = atob(arr[arr.length - 1])
let n = bstr.length
const u8arr = new Uint8Array(n)
while(n--){
u8arr[n] = bstr.charCodeAt(n);
while (n--) {
u8arr[n] = bstr.charCodeAt(n)
}
return new File([u8arr], fileName, {type:mime});
}
return new File([u8arr], fileName, { type: mime })
}
13 changes: 8 additions & 5 deletions src/services/io/extractCaptionsFromFrames.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ import {

export async function extractCaptionsFromFrames(
images: string[] = [],
onProgress: (progress: number, storyboardIndex: number, nbStoryboards: number) => void
onProgress: (
progress: number,
storyboardIndex: number,
nbStoryboards: number
) => void
): Promise<string[]> {
if (!(navigator as any).gpu) {
throw new Error(`Please enable WebGPU to analyze video frames:
Expand Down Expand Up @@ -40,15 +44,15 @@ Linux experimental support also requires launching the browser with --enable-fea
}
)

onProgress(progress = 5, 0, images.length)
onProgress((progress = 5), 0, images.length)

const processor = await AutoProcessor.from_pretrained(model_id)

onProgress(progress = 10, 0, images.length)
onProgress((progress = 10), 0, images.length)

const tokenizer = await AutoTokenizer.from_pretrained(model_id)

onProgress(progress = 15, 0, images.length)
onProgress((progress = 15), 0, images.length)

// not all prompts will work properly, see the official examples:
// https://huggingface.co/microsoft/Florence-2-base-ft/blob/e7a5acc73559546de6e12ec0319cd7cc1fa2437c/processing_florence2.py#L115-L117
Expand All @@ -60,7 +64,6 @@ Linux experimental support also requires launching the browser with --enable-fea
let i = 1
const captions: string[] = []
for (const imageInBase64DataUri of images) {

console.log('analyzing image:', imageInBase64DataUri.slice(0, 64))
// Prepare vision inputs
const image = await RawImage.fromURL(imageInBase64DataUri)
Expand Down
89 changes: 56 additions & 33 deletions src/services/io/parseFileIntoSegments.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
ClapOutputType,
ClapSegmentCategory,
ClapSegmentStatus,
isValidNumber,
newSegment,
UUID,
} from '@aitube/clap'
Expand All @@ -14,7 +15,8 @@ import {
SegmentVisibility,
TimelineSegment,
useTimeline,
TimelineStore
TimelineStore,
DEFAULT_DURATION_IN_MS_PER_STEP,
} from '@aitube/timeline'

import { blobToBase64DataUri } from '@/lib/utils/blobToBase64DataUri'
Expand All @@ -24,13 +26,21 @@ import { ResourceCategory, ResourceType } from '@aitube/clapper-services'

export async function parseFileIntoSegments({
file,
track,
startTimeInMs: maybeStartTimeInMs,
endTimeInMs: maybeEndTimeInMs,
}: {
/**
* The file to import
*/
file: File

track?: number
startTimeInMs?: number
endTimeInMs?: number
}): Promise<TimelineSegment[]> {
const timeline: TimelineStore = useTimeline.getState()
const { cursorTimestampAtInMs } = timeline
// console.log(`parseFileIntoSegments(): filename = ${file.name}`)
// console.log(`parseFileIntoSegments(): file size = ${file.size} bytes`)
// console.log(`parseFileIntoSegments(): file type = ${file.type}`)
Expand All @@ -41,41 +51,43 @@ export async function parseFileIntoSegments({
'TODO: open a popup to ask if this is a voice character sample, dialogue, music etc'
)

let type: ResourceType = 'misc'
let resourceCategory: ResourceCategory = 'misc'

const newSegments: TimelineSegment[] = []

switch (file.type) {
case 'image/jpeg':
case 'image/png':
case 'image/avif':
case 'image/heic':
case 'image/webp':
type = 'image'
resourceCategory = 'control_image'
const startTimeInMs = cursorInSteps * DEFAULT_DURATION_IN_MS_PER_STEP
const durationInSteps = 4
const durationInMs = durationInSteps * DEFAULT_DURATION_IN_MS_PER_STEP
const endTimeInMs = startTimeInMs + durationInMs
case 'image/webp': {
const type: ResourceType = 'image'
const resourceCategory: ResourceCategory = 'control_image'

// ok let's stop for a minute there:
// if someone drops a .mp3, and assuming we don't yet have the UI to select the category,
// do you think it should be a SOUND, a VOICE or a MUSIC by default?
// I expect people will use AI service providers for sound and voice,
// maybe in some case music too, but there are also many people
// who will want to use their own track eg. to create a music video
const category = ClapSegmentCategory.MUSIC
const category = ClapSegmentCategory.STORYBOARD

const assetUrl = await blobToBase64DataUri(file)

const startTimeInMs = isValidNumber(maybeStartTimeInMs)
? maybeStartTimeInMs!
: cursorTimestampAtInMs
const durationInSteps = 4
const durationInMs = durationInSteps * DEFAULT_DURATION_IN_MS_PER_STEP
const endTimeInMs = isValidNumber(maybeEndTimeInMs)
? maybeEndTimeInMs!
: startTimeInMs + durationInMs

const newSegmentData: Partial<TimelineSegment> = {
prompt: 'audio track',
prompt: 'Storyboard', // note: this can be set later with an automatic captioning worker
startTimeInMs, // start time of the segment
endTimeInMs, // end time of the segment (startTimeInMs + durationInMs)
status: ClapSegmentStatus.COMPLETED,
// track: findFreeTrack({ segments, startTimeInMs, endTimeInMs }), // track row index
label: `${file.name} (${Math.round(durationInMs / 1000)}s @ ${Math.round(bpm * 100) / 100} BPM)`, // a short label to name the segment (optional, can be human or LLM-defined)
label: `${file.name}`, // a short label to name the segment (optional, can be human or LLM-defined)
category,
assetUrl,
assetDurationInMs: endTimeInMs,
Expand All @@ -86,9 +98,12 @@ export async function parseFileIntoSegments({
const timelineSegment = await clapSegmentToTimelineSegment(
newSegment(newSegmentData)
)
timelineSegment.outputType = ClapOutputType.AUDIO
timelineSegment.outputGain = 1.0
timelineSegment.audioBuffer = audioBuffer

if (isValidNumber(track)) {
timelineSegment.track = track
}

timelineSegment.outputType = ClapOutputType.IMAGE

// we assume we want it to be immediately visible
timelineSegment.visibility = SegmentVisibility.VISIBLE
Expand All @@ -98,9 +113,7 @@ export async function parseFileIntoSegments({
// poof! type disappears.. it's magic
newSegments.push(timelineSegment)
break

break

}

case 'audio/mpeg': // this is the "official" one
case 'audio/mp3': // this is just an alias
Expand All @@ -109,10 +122,10 @@ export async function parseFileIntoSegments({
case 'audio/x-mp4': // should be rare, normally it is audio/mp4
case 'audio/m4a': // shouldn't exist
case 'audio/x-m4a': // should be rare, normally it is audio/mp4
case 'audio/webm':
case 'audio/webm': {
// for background track, or as an inspiration track, or a voice etc
type = 'audio'
resourceCategory = 'background_music'
const type: ResourceType = 'audio'
const resourceCategory: ResourceCategory = 'background_music'

// TODO: add caption analysis
const { durationInMs, durationInSteps, bpm, audioBuffer } =
Expand All @@ -124,11 +137,12 @@ export async function parseFileIntoSegments({
})

// TODO: use the correct drop time
const startTimeInMs = 0
const startTimeInSteps = 1

const endTimeInSteps = durationInSteps
const endTimeInMs = startTimeInMs + durationInMs
const startTimeInMs = isValidNumber(maybeStartTimeInMs)
? maybeStartTimeInMs!
: 0
const endTimeInMs = isValidNumber(maybeEndTimeInMs)
? maybeEndTimeInMs!
: startTimeInMs + durationInMs

// ok let's stop for a minute there:
// if someone drops a .mp3, and assuming we don't yet have the UI to select the category,
Expand All @@ -145,6 +159,7 @@ export async function parseFileIntoSegments({
startTimeInMs, // start time of the segment
endTimeInMs, // end time of the segment (startTimeInMs + durationInMs)
status: ClapSegmentStatus.COMPLETED,
track,
// track: findFreeTrack({ segments, startTimeInMs, endTimeInMs }), // track row index
label: `${file.name} (${Math.round(durationInMs / 1000)}s @ ${Math.round(bpm * 100) / 100} BPM)`, // a short label to name the segment (optional, can be human or LLM-defined)
category,
Expand All @@ -157,6 +172,11 @@ export async function parseFileIntoSegments({
const timelineSegment = await clapSegmentToTimelineSegment(
newSegment(newSegmentData)
)

if (isValidNumber(track)) {
timelineSegment.track = track
}

timelineSegment.outputType = ClapOutputType.AUDIO
timelineSegment.outputGain = 1.0
timelineSegment.audioBuffer = audioBuffer
Expand All @@ -169,28 +189,31 @@ export async function parseFileIntoSegments({
// poof! type disappears.. it's magic
newSegments.push(timelineSegment)
break
}

case 'text/plain':
case 'text/plain': {
// for dialogue, prompts..
type = 'text'
resourceCategory = 'text_prompt'
const type: ResourceType = 'text'
const resourceCategory: ResourceCategory = 'text_prompt'
break
}

default:
default: {
console.log(`unrecognized file type "${file.type}"`)
break
}
}

// note: we always upload the files, because even if it is an unhandled format (eg. a PDF)
// this can still be part of the project as a resource for humans (inspiration, guidelines etc)

/*
const id = UUID()
const fileName = `${id}.${extension}`
const storage = `resources`
const filePath = `${type}/${fileName}`
/*
const { data, error } = await supabase
.storage
.from('avatars')
Expand Down
43 changes: 34 additions & 9 deletions src/services/io/useIO.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import {
TimelineSegment,
removeFinalVideosAndConvertToTimelineSegments,
getFinalVideo,
DEFAULT_DURATION_IN_MS_PER_STEP,
} from '@aitube/timeline'
import { ParseScriptProgressUpdate, parseScriptToClap } from '@aitube/broadway'
import { IOStore, TaskCategory, TaskVisibility } from '@aitube/clapper-services'
Expand Down Expand Up @@ -129,14 +130,32 @@ export const useIO = create<IOStore>((set, get) => ({
},
})

// optional: reset the project
await timeline.setClap(newClap())

const track = 1
let i = 0
let startTimeInMs = 0
const durationInSteps = 4
const durationInMs = durationInSteps * DEFAULT_DURATION_IN_MS_PER_STEP
let endTimeInMs = startTimeInMs + durationInMs

for (const frame of frames) {
const frameFile = base64DataUriToFile(frame, `storyboard_${i++}.png`)
const newSegments = await parseFileIntoSegments({ file: frameFile })
const newSegments = await parseFileIntoSegments({
file: frameFile,
startTimeInMs,
endTimeInMs,
track,
})

startTimeInMs += durationInMs
endTimeInMs += durationInMs

console.log('calling timeline.addSegments with:', newSegments)
await timeline.addSegments({
segments: newSegments,
track,
})
}

Expand All @@ -157,18 +176,24 @@ export const useIO = create<IOStore>((set, get) => ({
})

console.log('calling extractCaptionsFromFrames() with:', frames)
const captions = await extractCaptionsFromFrames(frames, (progress: number, storyboardIndex: number, nbStoryboards: number) => {
captioningTask.setProgress({
message: `Analyzing storyboards (${progress}%)`,
value: progress,
})
})
const captions = await extractCaptionsFromFrames(
frames,
(
progress: number,
storyboardIndex: number,
nbStoryboards: number
) => {
captioningTask.setProgress({
message: `Analyzing storyboards (${progress}%)`,
value: progress,
})
}
)
console.log('captions:', captions)
// TODO: add
// TODO: add

captioningTask.success()
}

}
}
},
Expand Down

0 comments on commit bfb5428

Please sign in to comment.