Skip to content

Commit

Permalink
work on video import
Browse files Browse the repository at this point in the history
  • Loading branch information
jbilcke-hf committed Jul 22, 2024
1 parent 7702570 commit bfb5428
Show file tree
Hide file tree
Showing 7 changed files with 291 additions and 233 deletions.
365 changes: 186 additions & 179 deletions package-lock.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@
"dependencies": {
"@aitube/broadway": "0.0.22",
"@aitube/clap": "0.0.30",
"@aitube/clapper-services": "0.0.28",
"@aitube/clapper-services": "0.0.29",
"@aitube/engine": "0.0.26",
"@aitube/timeline": "0.0.42",
"@aitube/timeline": "0.0.43",
"@fal-ai/serverless-client": "^0.13.0",
"@ffmpeg/ffmpeg": "^0.12.10",
"@ffmpeg/util": "^0.12.1",
Expand Down
2 changes: 1 addition & 1 deletion src/lib/core/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
export const HARD_LIMIT_NB_MAX_ASSETS_TO_GENERATE_IN_PARALLEL = 32

export const APP_NAME = 'Clapper.app'
export const APP_REVISION = 'r20240721-0835'
export const APP_REVISION = 'r20240722-0205'

export const APP_DOMAIN = 'Clapper.app'
export const APP_LINK = 'https://clapper.app'
8 changes: 4 additions & 4 deletions src/lib/utils/base64DataUriToFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ export function base64DataUriToFile(dataUrl: string, fileName: string) {
const bstr = atob(arr[arr.length - 1])
let n = bstr.length
const u8arr = new Uint8Array(n)
while(n--){
u8arr[n] = bstr.charCodeAt(n);
while (n--) {
u8arr[n] = bstr.charCodeAt(n)
}
return new File([u8arr], fileName, {type:mime});
}
return new File([u8arr], fileName, { type: mime })
}
13 changes: 8 additions & 5 deletions src/services/io/extractCaptionsFromFrames.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ import {

export async function extractCaptionsFromFrames(
images: string[] = [],
onProgress: (progress: number, storyboardIndex: number, nbStoryboards: number) => void
onProgress: (
progress: number,
storyboardIndex: number,
nbStoryboards: number
) => void
): Promise<string[]> {
if (!(navigator as any).gpu) {
throw new Error(`Please enable WebGPU to analyze video frames:
Expand Down Expand Up @@ -40,15 +44,15 @@ Linux experimental support also requires launching the browser with --enable-fea
}
)

onProgress(progress = 5, 0, images.length)
onProgress((progress = 5), 0, images.length)

const processor = await AutoProcessor.from_pretrained(model_id)

onProgress(progress = 10, 0, images.length)
onProgress((progress = 10), 0, images.length)

const tokenizer = await AutoTokenizer.from_pretrained(model_id)

onProgress(progress = 15, 0, images.length)
onProgress((progress = 15), 0, images.length)

// not all prompts will work properly, see the official examples:
// https://huggingface.co/microsoft/Florence-2-base-ft/blob/e7a5acc73559546de6e12ec0319cd7cc1fa2437c/processing_florence2.py#L115-L117
Expand All @@ -60,7 +64,6 @@ Linux experimental support also requires launching the browser with --enable-fea
let i = 1
const captions: string[] = []
for (const imageInBase64DataUri of images) {

console.log('analyzing image:', imageInBase64DataUri.slice(0, 64))
// Prepare vision inputs
const image = await RawImage.fromURL(imageInBase64DataUri)
Expand Down
89 changes: 56 additions & 33 deletions src/services/io/parseFileIntoSegments.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
ClapOutputType,
ClapSegmentCategory,
ClapSegmentStatus,
isValidNumber,
newSegment,
UUID,
} from '@aitube/clap'
Expand All @@ -14,7 +15,8 @@ import {
SegmentVisibility,
TimelineSegment,
useTimeline,
TimelineStore
TimelineStore,
DEFAULT_DURATION_IN_MS_PER_STEP,
} from '@aitube/timeline'

import { blobToBase64DataUri } from '@/lib/utils/blobToBase64DataUri'
Expand All @@ -24,13 +26,21 @@ import { ResourceCategory, ResourceType } from '@aitube/clapper-services'

export async function parseFileIntoSegments({
file,
track,
startTimeInMs: maybeStartTimeInMs,
endTimeInMs: maybeEndTimeInMs,
}: {
/**
* The file to import
*/
file: File

track?: number
startTimeInMs?: number
endTimeInMs?: number
}): Promise<TimelineSegment[]> {
const timeline: TimelineStore = useTimeline.getState()
const { cursorTimestampAtInMs } = timeline
// console.log(`parseFileIntoSegments(): filename = ${file.name}`)
// console.log(`parseFileIntoSegments(): file size = ${file.size} bytes`)
// console.log(`parseFileIntoSegments(): file type = ${file.type}`)
Expand All @@ -41,41 +51,43 @@ export async function parseFileIntoSegments({
'TODO: open a popup to ask if this is a voice character sample, dialogue, music etc'
)

let type: ResourceType = 'misc'
let resourceCategory: ResourceCategory = 'misc'

const newSegments: TimelineSegment[] = []

switch (file.type) {
case 'image/jpeg':
case 'image/png':
case 'image/avif':
case 'image/heic':
case 'image/webp':
type = 'image'
resourceCategory = 'control_image'
const startTimeInMs = cursorInSteps * DEFAULT_DURATION_IN_MS_PER_STEP
const durationInSteps = 4
const durationInMs = durationInSteps * DEFAULT_DURATION_IN_MS_PER_STEP
const endTimeInMs = startTimeInMs + durationInMs
case 'image/webp': {
const type: ResourceType = 'image'
const resourceCategory: ResourceCategory = 'control_image'

// ok let's stop for a minute there:
// if someone drops a .mp3, and assuming we don't yet have the UI to select the category,
// do you think it should be a SOUND, a VOICE or a MUSIC by default?
// I expect people will use AI service providers for sound and voice,
// maybe in some case music too, but there are also many people
// who will want to use their own track eg. to create a music video
const category = ClapSegmentCategory.MUSIC
const category = ClapSegmentCategory.STORYBOARD

const assetUrl = await blobToBase64DataUri(file)

const startTimeInMs = isValidNumber(maybeStartTimeInMs)
? maybeStartTimeInMs!
: cursorTimestampAtInMs
const durationInSteps = 4
const durationInMs = durationInSteps * DEFAULT_DURATION_IN_MS_PER_STEP
const endTimeInMs = isValidNumber(maybeEndTimeInMs)
? maybeEndTimeInMs!
: startTimeInMs + durationInMs

const newSegmentData: Partial<TimelineSegment> = {
prompt: 'audio track',
prompt: 'Storyboard', // note: this can be set later with an automatic captioning worker
startTimeInMs, // start time of the segment
endTimeInMs, // end time of the segment (startTimeInMs + durationInMs)
status: ClapSegmentStatus.COMPLETED,
// track: findFreeTrack({ segments, startTimeInMs, endTimeInMs }), // track row index
label: `${file.name} (${Math.round(durationInMs / 1000)}s @ ${Math.round(bpm * 100) / 100} BPM)`, // a short label to name the segment (optional, can be human or LLM-defined)
label: `${file.name}`, // a short label to name the segment (optional, can be human or LLM-defined)
category,
assetUrl,
assetDurationInMs: endTimeInMs,
Expand All @@ -86,9 +98,12 @@ export async function parseFileIntoSegments({
const timelineSegment = await clapSegmentToTimelineSegment(
newSegment(newSegmentData)
)
timelineSegment.outputType = ClapOutputType.AUDIO
timelineSegment.outputGain = 1.0
timelineSegment.audioBuffer = audioBuffer

if (isValidNumber(track)) {
timelineSegment.track = track
}

timelineSegment.outputType = ClapOutputType.IMAGE

// we assume we want it to be immediately visible
timelineSegment.visibility = SegmentVisibility.VISIBLE
Expand All @@ -98,9 +113,7 @@ export async function parseFileIntoSegments({
// poof! type disappears.. it's magic
newSegments.push(timelineSegment)
break

break

}

case 'audio/mpeg': // this is the "official" one
case 'audio/mp3': // this is just an alias
Expand All @@ -109,10 +122,10 @@ export async function parseFileIntoSegments({
case 'audio/x-mp4': // should be rare, normally it is audio/mp4
case 'audio/m4a': // shouldn't exist
case 'audio/x-m4a': // should be rare, normally it is audio/mp4
case 'audio/webm':
case 'audio/webm': {
// for background track, or as an inspiration track, or a voice etc
type = 'audio'
resourceCategory = 'background_music'
const type: ResourceType = 'audio'
const resourceCategory: ResourceCategory = 'background_music'

// TODO: add caption analysis
const { durationInMs, durationInSteps, bpm, audioBuffer } =
Expand All @@ -124,11 +137,12 @@ export async function parseFileIntoSegments({
})

// TODO: use the correct drop time
const startTimeInMs = 0
const startTimeInSteps = 1

const endTimeInSteps = durationInSteps
const endTimeInMs = startTimeInMs + durationInMs
const startTimeInMs = isValidNumber(maybeStartTimeInMs)
? maybeStartTimeInMs!
: 0
const endTimeInMs = isValidNumber(maybeEndTimeInMs)
? maybeEndTimeInMs!
: startTimeInMs + durationInMs

// ok let's stop for a minute there:
// if someone drops a .mp3, and assuming we don't yet have the UI to select the category,
Expand All @@ -145,6 +159,7 @@ export async function parseFileIntoSegments({
startTimeInMs, // start time of the segment
endTimeInMs, // end time of the segment (startTimeInMs + durationInMs)
status: ClapSegmentStatus.COMPLETED,
track,
// track: findFreeTrack({ segments, startTimeInMs, endTimeInMs }), // track row index
label: `${file.name} (${Math.round(durationInMs / 1000)}s @ ${Math.round(bpm * 100) / 100} BPM)`, // a short label to name the segment (optional, can be human or LLM-defined)
category,
Expand All @@ -157,6 +172,11 @@ export async function parseFileIntoSegments({
const timelineSegment = await clapSegmentToTimelineSegment(
newSegment(newSegmentData)
)

if (isValidNumber(track)) {
timelineSegment.track = track
}

timelineSegment.outputType = ClapOutputType.AUDIO
timelineSegment.outputGain = 1.0
timelineSegment.audioBuffer = audioBuffer
Expand All @@ -169,28 +189,31 @@ export async function parseFileIntoSegments({
// poof! type disappears.. it's magic
newSegments.push(timelineSegment)
break
}

case 'text/plain':
case 'text/plain': {
// for dialogue, prompts..
type = 'text'
resourceCategory = 'text_prompt'
const type: ResourceType = 'text'
const resourceCategory: ResourceCategory = 'text_prompt'
break
}

default:
default: {
console.log(`unrecognized file type "${file.type}"`)
break
}
}

// note: we always upload the files, because even if it is an unhandled format (eg. a PDF)
// this can still be part of the project as a resource for humans (inspiration, guidelines etc)

/*
const id = UUID()
const fileName = `${id}.${extension}`
const storage = `resources`
const filePath = `${type}/${fileName}`
/*
const { data, error } = await supabase
.storage
.from('avatars')
Expand Down
43 changes: 34 additions & 9 deletions src/services/io/useIO.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import {
TimelineSegment,
removeFinalVideosAndConvertToTimelineSegments,
getFinalVideo,
DEFAULT_DURATION_IN_MS_PER_STEP,
} from '@aitube/timeline'
import { ParseScriptProgressUpdate, parseScriptToClap } from '@aitube/broadway'
import { IOStore, TaskCategory, TaskVisibility } from '@aitube/clapper-services'
Expand Down Expand Up @@ -129,14 +130,32 @@ export const useIO = create<IOStore>((set, get) => ({
},
})

// optional: reset the project
await timeline.setClap(newClap())

const track = 1
let i = 0
let startTimeInMs = 0
const durationInSteps = 4
const durationInMs = durationInSteps * DEFAULT_DURATION_IN_MS_PER_STEP
let endTimeInMs = startTimeInMs + durationInMs

for (const frame of frames) {
const frameFile = base64DataUriToFile(frame, `storyboard_${i++}.png`)
const newSegments = await parseFileIntoSegments({ file: frameFile })
const newSegments = await parseFileIntoSegments({
file: frameFile,
startTimeInMs,
endTimeInMs,
track,
})

startTimeInMs += durationInMs
endTimeInMs += durationInMs

console.log('calling timeline.addSegments with:', newSegments)
await timeline.addSegments({
segments: newSegments,
track,
})
}

Expand All @@ -157,18 +176,24 @@ export const useIO = create<IOStore>((set, get) => ({
})

console.log('calling extractCaptionsFromFrames() with:', frames)
const captions = await extractCaptionsFromFrames(frames, (progress: number, storyboardIndex: number, nbStoryboards: number) => {
captioningTask.setProgress({
message: `Analyzing storyboards (${progress}%)`,
value: progress,
})
})
const captions = await extractCaptionsFromFrames(
frames,
(
progress: number,
storyboardIndex: number,
nbStoryboards: number
) => {
captioningTask.setProgress({
message: `Analyzing storyboards (${progress}%)`,
value: progress,
})
}
)
console.log('captions:', captions)
// TODO: add
// TODO: add

captioningTask.success()
}

}
}
},
Expand Down

0 comments on commit bfb5428

Please sign in to comment.