
Commit 9a0fef2
fixing bad bug
jbilcke-hf committed Aug 6, 2024
1 parent: 5e9ff60
Showing 4 changed files with 53 additions and 15 deletions.
src/app/api/assistant/templates.ts (3 additions, 1 deletion)
@@ -58,8 +58,10 @@ Each item describes a different property (or facet) of the scene, based on its c
 - Remember, if the director is asking to edit the video project data structure, you MUST only return the item object, in JSON format.
 - If you don't understand how to modify, it's okay to say you don't understand and politely ask for clarification.
 - When you edit a JSON list, make sure to recopy the id for each field exactly like it is in the original, otherwise it breaks everything.
-- The director might give a query in English, French, Spanish.. but the movie scene is in English.
+- The director might give a query in French, English, Spanish.. but the movie scene is in English.
 - ALWAYS write the output in English: if the query is in another language, translate it to English.
+- When updating a scene (with UPDATE_STORY_AND_SCENE), never forget to update the updatedSceneSegments array!
+- Also, when updating scene segments, NEVER, EVER FORGET ABOUT THE CAMERA SEGMENTS! Because this is how we actually split our scene into separate shots!
 - Important: if the director is asking a QUESTION ("who is.. what is.. please analyze etc..") then DO NOT return JSON, but raw text instead`

export const examples = `
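To make the "recopy the id" and camera-segment rules above concrete, here is a hypothetical updatedSceneSegments fragment; the segment shape and field values are invented for illustration and are not taken from the repository's actual schema:

// Hypothetical sketch: the segment shape is assumed, not the repo's real schema.
// The invariant the rules describe: every edited segment keeps its original id verbatim.
const updatedSceneSegments = [
  // camera segments are what split the scene into separate shots
  { id: 'seg-001', category: 'camera', prompt: 'medium shot, slow push-in' },
  { id: 'seg-002', category: 'dialogue', prompt: 'JANE: We have to go. Now.' },
]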
src/services/assistant/useAssistant.ts (2 additions, 2 deletions)
@@ -37,8 +37,6 @@ import { parseRawInputToAction } from './parseRawInputToAction'
import { useAudio } from '../audio/useAudio'
import { updateStoryAndScene } from './updateStoryAndScene'

-const enableTextToSpeech = false
-
export const useAssistant = create<AssistantStore>((set, get) => ({
...getDefaultAssistantState(),

@@ -182,6 +180,8 @@ export const useAssistant = create<AssistantStore>((set, get) => ({
return
}

+console.log('processUserMessage():', input)
+
const { addEventToHistory, processActionOrMessage } = get()
const {
bufferedSegments: { activeSegments },
src/services/assistant/useVoiceAssistant.ts (2 additions, 4 deletions)
@@ -14,11 +14,9 @@ export function useVoiceAssistant() {
const stop = useMic((s) => s.stop)
const clear = useMic((s) => s.clear)

-const debouncedTranscript = useDebounce(transcript, 1200)
-
useEffect(() => {
-  processUserMessage(debouncedTranscript)
-}, [debouncedTranscript, processUserMessage])
+  processUserMessage(transcript)
+}, [transcript, processUserMessage])

return {
isSupported,
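The deleted useDebounce call presumably wrapped the standard timer pattern; a minimal sketch under that assumption (not necessarily the repo's removed implementation):

import { useEffect, useState } from 'react'

// Standard debounce hook: returns `value` only after it has stopped changing
// for `delayInMs`; each change resets the timer.
function useDebounce<T>(value: T, delayInMs: number): T {
  const [debounced, setDebounced] = useState(value)
  useEffect(() => {
    const timer = setTimeout(() => setDebounced(value), delayInMs)
    return () => clearTimeout(timer)
  }, [value, delayInMs])
  return debounced
}

With that helper gone, the hook forwards every transcript update as-is; the 1200 ms settling window now lives inside useMic (next file).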
src/services/mic/useMic.ts (46 additions, 8 deletions)
@@ -5,6 +5,8 @@ import { MicStore } from '@aitube/clapper-services'

import { getDefaultMicState } from './getDefaultMicState'

+const cutoffTimeInMs = 1200
+
export const useMic = create<MicStore>((set, get) => ({
...getDefaultMicState(),

@@ -28,20 +30,50 @@
})
}

-recognition.interimResults = interimResults
+recognition.interimResults = true
recognition.lang = lang
-recognition.continuous = continuous
+recognition.continuous = true

const speechRecognitionList = new window.webkitSpeechGrammarList()
speechRecognitionList.addFromString(grammar, grammarWeight)
recognition.grammars = speechRecognitionList

+let debounceTimer: NodeJS.Timeout | null = null
+let lastCompleteTranscript = ''
+let currentTranscript = ''
+let lastSpeechTime = Date.now()

const handleResult = (event: SpeechRecognitionEvent) => {
-let transcript = ''
-for (let i = 0; i < event.results.length; i++) {
-  transcript += event.results?.[i]?.[0]?.transcript || ''
+const currentTime = Date.now()
+
+// Check if it's been more than $cutoffTimeInMs since the last speech
+if (currentTime - lastSpeechTime > cutoffTimeInMs) {
+  lastCompleteTranscript = ''
+  currentTranscript = ''
+}
+
+lastSpeechTime = currentTime
+
+// Get the most recent result
+const latestResult = event.results[event.results.length - 1]
+currentTranscript = latestResult[0].transcript.trim()
+
+// If it's a final result, update lastCompleteTranscript
+if (latestResult.isFinal) {
+  lastCompleteTranscript = currentTranscript
+}
-set({ transcript })

+const fullTranscript = lastCompleteTranscript +
+  (currentTranscript !== lastCompleteTranscript ? ' ' + currentTranscript : '')
+
+if (debounceTimer) {
+  clearTimeout(debounceTimer)
+}
+
+debounceTimer = setTimeout(() => {
+  set({ transcript: fullTranscript.trim() })
+  debounceTimer = null
+}, cutoffTimeInMs)
}

const handleError = (event: SpeechRecognitionErrorEvent) => {
@@ -54,7 +86,13 @@
}

const handleEnd = () => {
-set({ isListening: false, transcript: '' })
+if (debounceTimer) {
+  clearTimeout(debounceTimer)
+  const fullTranscript = lastCompleteTranscript +
+    (currentTranscript !== lastCompleteTranscript ? ' ' + currentTranscript : '')
+  set({ transcript: fullTranscript.trim() })
+}
+set({ isListening: false })
}

recognition.addEventListener('result', handleResult)
@@ -86,4 +124,4 @@

if (typeof window !== 'undefined') {
useMic.getState().init()
-}
+}
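Net effect of the useMic changes: the store publishes transcript only once speech has settled for cutoffTimeInMs, and a pause longer than the cutoff starts a fresh transcript instead of appending forever. A minimal consumer sketch (the component name is invented for illustration):

import { useMic } from './useMic'

// Illustrative consumer: re-renders only when the debounced transcript
// settles, i.e. about 1200 ms after the user stops speaking.
export function TranscriptPreview() {
  const transcript = useMic((s) => s.transcript)
  return <p>{transcript}</p>
}

Centralizing the debounce in the store, rather than in each React hook via useDebounce, means every subscriber sees the same settled transcript and the merge of final and interim results happens in one place.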
