From 3027524a0b890aab006cba5bee79950bea1ec466 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Fri, 15 Nov 2024 14:54:11 +0000 Subject: [PATCH] Updated whisper packages (#985) - Updated whisper packages - Cleaned up SpeechChat.cs, some reformatting and lots of helper code removed in favour of using existing helpers (UserSettings, AnsiConsole) - Refactored UserSettings to reduce duplication in file names --- LLama.Examples/Examples/SpeechChat.cs | 301 ++++++++++++-------------- LLama.Examples/LLama.Examples.csproj | 6 +- LLama.Examples/UserSettings.cs | 89 ++++---- 3 files changed, 186 insertions(+), 210 deletions(-) diff --git a/LLama.Examples/Examples/SpeechChat.cs b/LLama.Examples/Examples/SpeechChat.cs index bd637d60d..e12aabe94 100644 --- a/LLama.Examples/Examples/SpeechChat.cs +++ b/LLama.Examples/Examples/SpeechChat.cs @@ -1,99 +1,110 @@ using LLama.Common; using NAudio.Wave; +using Spectre.Console; using Whisper.net; namespace LLama.Examples.Examples { - public class SpeechChat + public static class SpeechChat { public static async Task Run() { - ConsoleStyleHelpers.WriteLine( -""" -This example demonstrates the basics of audio transcriptions, speech recognition, and speech commands, - as well as how to recognize a user's voice in real time and then get a response from LLM. -It uses whisper.net and models could be found in: https://huggingface.co/ggerganov/whisper.cpp/tree/main. -To use it, you need a working microphone and enough RAM to host both audio + language models. -Once you've selected the models, just speak to your microphone and watch the LLM continue your text. -While it's going, you can say something like 'Okay, stop', or 'Stop now', to interrupt the LLM's inference. - -NOTE: You may need to poke around with the voice detection threshold, based on your mic's sensitivity. ------------------------------------------------------------------------------------------------------------ -""", ConsoleColor.Yellow); - - if (ConsoleStyleHelpers.SelectAudioModel() is not string model) { return; } - - bool loadFinished = false; - var loading = ConsoleStyleHelpers.LoadPrint("Loading transcription model...", () => loadFinished); - - using var speechRecognitionServer = new SpeechRecognitionServer(model); - loadFinished = true; loading.Wait(); - - Console.WriteLine("Audio model loaded. Insert path for language model."); - using var _ = new LlamaSession_SpeechListener(speechRecognitionServer); - - await ConsoleStyleHelpers.WaitUntilExit(); + AnsiConsole.MarkupLine(""" + [yellow on black] + This example demonstrates the basics of audio transcriptions, speech recognition, and speech commands, + as well as how to recognize a user's voice in real time and then get a response from LLM. + It uses whisper.net and models could be found in: https://huggingface.co/ggerganov/whisper.cpp/tree/main. + To use it, you need a working microphone and enough RAM to host both audio + language models. + Once you've selected the models, just speak to your microphone and watch the LLM continue your text. + While it's going, you can say something like 'Okay, stop', or 'Stop now', to interrupt the LLM's inference. + + NOTE: You may need to poke around with the voice detection threshold, based on your mic's sensitivity. + [/] + """); + + AnsiConsole.MarkupLine("[white on black]You can find the official ggml models in whisper.cpp's huggingface repository: https://huggingface.co/ggerganov/whisper.cpp/tree/main [/]"); + + var whisperModel = UserSettings.GetWhisperPath(); + var languageModel = UserSettings.GetModelPath(); + + using var speechRecognitionServer = new SpeechRecognitionServer(whisperModel); + + using var _ = new LlamaSessionSpeechListener(speechRecognitionServer, languageModel); + + AnsiConsole.MarkupLine("[green]Voice active. Begin talking to transcribe. Press any key at any time to exit.[/]"); + await Task.Delay(1000); + Console.ReadKey(); } - class LlamaSession_SpeechListener : ISpeechListener, IDisposable + private class LlamaSessionSpeechListener + : ISpeechListener, IDisposable { - bool isModelResponding; - SpeechRecognitionServer audioServer; + private bool _isModelResponding; + private readonly SpeechRecognitionServer _audioServer; - LLamaWeights model; - LLamaContext context; - InteractiveExecutor executor; + private readonly LLamaWeights _model; + private readonly LLamaContext _context; + private readonly InteractiveExecutor _executor; - string fullPrompt = ""; - bool canceled; + private string _fullPrompt = ""; + private bool _canceled; - public LlamaSession_SpeechListener(SpeechRecognitionServer server) + public LlamaSessionSpeechListener(SpeechRecognitionServer server, string languageModelPath) { - var parameters = new ModelParams(UserSettings.GetModelPath()) + var parameters = new ModelParams(languageModelPath) { GpuLayerCount = 99 }; - model = LLamaWeights.LoadFromFile(parameters); - context = model.CreateContext(parameters); - executor = new InteractiveExecutor(context); - (audioServer = server).ServiceUsers.Add(this); + _model = LLamaWeights.LoadFromFile(parameters); + _context = _model.CreateContext(parameters); + _executor = new InteractiveExecutor(_context); + + _audioServer = server; + _audioServer.ServiceUsers.Add(this); } // Whisper is struggling with single words and very short phrases without context, so it's actually better to say something like "Ok, Stop!" to have it work better. - bool ISpeechListener.IsInterested(string audioTranscription) => !isModelResponding || audioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase); + bool ISpeechListener.IsInterested(string audioTranscription) => !_isModelResponding || audioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase); void ISpeechListener.HandleSpeech(string audioTranscription) { - if (isModelResponding && audioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase)) { canceled = true; } - else if (!isModelResponding) { _ = SendMessage(audioTranscription); } + if (_isModelResponding && audioTranscription.Contains("stop", StringComparison.CurrentCultureIgnoreCase)) + _canceled = true; + else if (!_isModelResponding) + _ = SendMessage(audioTranscription); } - async Task SendMessage(string newMessage) + private async Task SendMessage(string newMessage) { // While a response is queried, we want to detect short phrases/commands like 'stop', - audioServer.detectionSettings = (1, 2); // ..so we lower the min Speech Detection time. + _audioServer.DetectionSettings = (1, 2); // ..so we lower the min Speech Detection time. - isModelResponding = true; + _isModelResponding = true; AddToPrompt($"\n{newMessage}\n", ConsoleColor.Blue); - await foreach (var token in executor.InferAsync(fullPrompt)) + await foreach (var token in _executor.InferAsync(_fullPrompt)) { - AddToPrompt(token, ConsoleColor.Yellow); - if (canceled) { AddToPrompt("[...stopped]", ConsoleColor.Red); break; } + AddToPrompt(token); + if (_canceled) + { + AddToPrompt("[...stopped]", ConsoleColor.Red); + break; + } } - audioServer.detectionSettings = (2, 3); // Reset back to default detection settings to avoid false positives. - (isModelResponding, canceled) = (false, false); // Reset the state variables to their default. + _audioServer.DetectionSettings = (2, 3); // Reset back to default detection settings to avoid false positives. + (_isModelResponding, _canceled) = (false, false); // Reset the state variables to their default. } - void AddToPrompt(string msg, ConsoleColor color = ConsoleColor.Yellow) + private void AddToPrompt(string msg, ConsoleColor color = ConsoleColor.Yellow) { - fullPrompt += msg; - ConsoleStyleHelpers.Write(msg, color); + _fullPrompt += msg; + + AnsiConsole.Markup($"[{color}]{Markup.Escape(msg)}[/]"); } void IDisposable.Dispose() { - model.Dispose(); - context.Dispose(); + _model.Dispose(); + _context.Dispose(); } } @@ -103,19 +114,21 @@ public interface ISpeechListener void HandleSpeech(string audioTranscription); } - public class SpeechRecognitionServer : IDisposable + public sealed class SpeechRecognitionServer + : IDisposable { - const int clipLength = 250; // ms - const float voiceDetectionThreshold = 0.01f; // Adjust as needed - readonly string[] knownFalsePositives = ["[BLANK_AUDIO]", "Thank you", "[silence]"]; + private static readonly TimeSpan ClipLength = TimeSpan.FromMilliseconds(250); - WaveInEvent waveIn; - WaveFormat waveFormat = new(16000, 16, 1); // 16KHz, 16 bits, Mono Channel - List recordedBytes = []; + private const float VoiceDetectionThreshold = 0.01f; // Adjust as needed + private readonly string[] _knownFalsePositives = ["[BLANK_AUDIO]", "Thank you", "[silence]"]; - WhisperFactory? whisperFactory; - WhisperProcessor? processor; - string whisperPrompt = + private readonly WaveInEvent _waveIn; + private readonly WaveFormat _waveFormat = new(16000, 16, 1); // 16KHz, 16 bits, Mono Channel + private readonly List _recordedBytes = []; + + private readonly WhisperProcessor? _processor; + + private readonly string _whisperPrompt = """ The short audio comes from a user that is speaking to an AI Language Model in real time. Pay extra attentions for commands like 'ok stop' or just 'stop'. @@ -123,133 +136,103 @@ The short audio comes from a user that is speaking to an AI Language Model in re """.Trim(); // Tracked stats for Speech Recognition, Parsing, and Serving. - int currentBlankClips; // Ideally would work with milliseconds, - int totalNonBlankClips; // ..but for example's sake they work on a - int nonIdleTime; // ..clip-based quant-length (1 = clipLength). + private int _currentBlankClips; // Ideally would work with milliseconds, + private int _totalNonBlankClips; // ..but for example's sake they work on a + + private int _nonIdleTime; // ..clip-based quant-length (1 = clipLength). // Default detection settings: A speech of 750ms, followed by pause of 500ms. (2x250ms) - public (int minBlanksPerSeparation, int minNonBlanksForValidMessages) detectionSettings = (2, 3); + public (int minBlanksPerSeparation, int minNonBlanksForValidMessages) DetectionSettings = (2, 3); - public HashSet ServiceUsers = []; + public readonly HashSet ServiceUsers = []; public SpeechRecognitionServer(string modelPath) { - // Adjust the path based on your GPU's type. On your build you ideally want just the correct runtime build for your project, but here we're having all references, so it's getting confused. - var libPath = @$"{Environment.GetFolderPath(Environment.SpecialFolder.UserProfile)}\.nuget\packages\whisper.net.runtime.cublas\1.5.0\build\win-x64\whisper.dll"; // Defaulting to cuBlas. - if (!File.Exists(libPath)) { ConsoleStyleHelpers.WriteLine($"Could not find dll file at {libPath}.\nWhisper will load with the default runtime (possibly CPU).\nIf you own a non-Nvidia GPU, you need to adjust the library path based on your GPU's type.", ConsoleColor.Red); libPath = null; } - whisperFactory = WhisperFactory.FromPath(modelPath, libraryPath: libPath); - - var builder = whisperFactory.CreateBuilder().WithThreads(16).WithPrompt(whisperPrompt).WithSingleSegment().WithLanguage("en"); - (builder.WithBeamSearchSamplingStrategy() as BeamSearchSamplingStrategyBuilder)!.WithPatience(0.2f).WithBeamSize(5); - processor = builder.Build(); - - waveIn = new WaveInEvent() { BufferMilliseconds = clipLength, WaveFormat = waveFormat }; - waveIn.DataAvailable += OnAudioDataAvailable; - waveIn.StartRecording(); + var whisperFactory = WhisperFactory.FromPath(modelPath); + + var builder = whisperFactory + .CreateBuilder() + .WithThreads(16) + .WithPrompt(_whisperPrompt) + .WithSingleSegment() + .WithLanguage("en"); + ((BeamSearchSamplingStrategyBuilder)builder.WithBeamSearchSamplingStrategy()).WithPatience(0.2f).WithBeamSize(5); + _processor = builder.Build(); + + _waveIn = new WaveInEvent { BufferMilliseconds = (int)ClipLength.TotalMilliseconds, WaveFormat = _waveFormat }; + _waveIn.DataAvailable += OnAudioDataAvailable; + _waveIn.StartRecording(); } - void OnAudioDataAvailable(object? sender, WaveInEventArgs e) + private void OnAudioDataAvailable(object? sender, WaveInEventArgs e) { // Cache the recorded bytes - recordedBytes.AddRange(e.Buffer[..e.BytesRecorded]); - if (recordedBytes.Count > 110000000) { recordedBytes.RemoveRange(0, 50000000); } + _recordedBytes.AddRange(e.Buffer[..e.BytesRecorded]); + if (_recordedBytes.Count > 110000000) { _recordedBytes.RemoveRange(0, 50000000); } // Get the max volume contained inside the clip. Since the clip is recorded as bytes, we need to translate them to samples before getting their volume. var maxVolume = 0f; // This byte->sample algorithm is from: https://github.com/naudio/NAudio/blob/master/Docs/RecordingLevelMeter.md#calculating-peak-values - for (int i = 0; i < e.BytesRecorded; i += 2) { maxVolume = Math.Max(maxVolume, Math.Abs((short) ((e.Buffer[i + 1] << 8) | e.Buffer[i + 0]) / 32768f)); } + for (var i = 0; i < e.BytesRecorded; i += 2) + { + maxVolume = Math.Max(maxVolume, Math.Abs((short)((e.Buffer[i + 1] << 8) | e.Buffer[i + 0]) / 32768f)); + } // Compare the volume with the threshold and act accordingly. Once an interesting and 'full' set of clips pops up, serve it. - if (maxVolume >= voiceDetectionThreshold) { currentBlankClips = 0; totalNonBlankClips++; nonIdleTime++; } - else if (++currentBlankClips < detectionSettings.minBlanksPerSeparation) { nonIdleTime++; } + if (maxVolume >= VoiceDetectionThreshold) + { + _currentBlankClips = 0; + _totalNonBlankClips++; + _nonIdleTime++; + } + else if (++_currentBlankClips < DetectionSettings.minBlanksPerSeparation) + { + _nonIdleTime++; + } else { - if (totalNonBlankClips >= detectionSettings.minNonBlanksForValidMessages) { SendTranscription(); } - else if (totalNonBlankClips > 0) { } // This might be case of a false-positive -- knock, noise, cough, anything. - (currentBlankClips, totalNonBlankClips, nonIdleTime) = (0, 0, 0); + if (_totalNonBlankClips >= DetectionSettings.minNonBlanksForValidMessages) + SendTranscription(); + else if (_totalNonBlankClips > 0) { } // This might be case of a false-positive -- knock, noise, cough, anything. + (_currentBlankClips, _totalNonBlankClips, _nonIdleTime) = (0, 0, 0); } async void SendTranscription() { - var bytesPerClip = waveFormat.BitsPerSample * clipLength * 2; - var capturedClipBytes = recordedBytes.TakeLast(bytesPerClip * (nonIdleTime + 2)).ToArray(); - var transcribedText = await ProcessAudio(capturedClipBytes, "Assets\\temp.wav"); // Save to temporary file. - if (knownFalsePositives.Contains(transcribedText)) { return; } // False positive.. yikes! - foreach (var user in ServiceUsers.Where(x => x.IsInterested(transcribedText))) { user.HandleSpeech(transcribedText); } + var bytesPerClip = _waveFormat.BitsPerSample * (int)ClipLength.TotalMilliseconds * 2; + var capturedClipBytes = _recordedBytes.TakeLast(bytesPerClip * (_nonIdleTime + 2)).ToArray(); + + // Save to temporary file. + var transcribedText = await ProcessAudio(capturedClipBytes, "Assets\\temp.wav"); + + // False positive.. yikes! + if (_knownFalsePositives.Contains(transcribedText)) + return; + + foreach (var user in ServiceUsers.Where(x => x.IsInterested(transcribedText))) + user.HandleSpeech(transcribedText); } } /// Requests a transcription and responds with the text. - async Task ProcessAudio(byte[] bytes, string tempWavFilePath) + private async Task ProcessAudio(byte[] bytes, string tempWavFilePath) { await using var wavStream = new MemoryStream(); - using (var writer = new WaveFileWriter(tempWavFilePath, waveFormat)) { writer.Write(bytes, 0, bytes.Length); } - using (var fileStream = File.OpenRead(tempWavFilePath)) { await fileStream.CopyToAsync(wavStream); } + + await using (var writer = new WaveFileWriter(tempWavFilePath, _waveFormat)) + writer.Write(bytes, 0, bytes.Length); + await using (var fileStream = File.OpenRead(tempWavFilePath)) + await fileStream.CopyToAsync(wavStream); + wavStream.Seek(0, SeekOrigin.Begin); Console.Beep(); - return string.Join(' ', await processor!.ProcessAsync(wavStream).Select(x => x.Text).ToListAsync()).Trim(); + return string.Join(' ', await _processor!.ProcessAsync(wavStream).Select(x => x.Text).ToListAsync()).Trim(); } void IDisposable.Dispose() { - waveIn.Dispose(); - processor?.Dispose(); - } - } - - public static class ConsoleStyleHelpers - { - public static string? SelectAudioModel() - { - var models = Directory.GetFiles("Assets", "*bin"); - if (models.Length == 1) { return models[0]; } - else if (models.Length != 0) - { - WriteLine("Available Models:", ConsoleColor.Green); - for (int i = 0; i < models.Length; i++) - { - Write($"{i + 1}. ", ConsoleColor.Blue); - WriteLine(models[i]["Assets\\".Length..], ConsoleColor.Yellow); - } - while (true) - { - Write($"Please choose a model (1-{models.Length}): ", ConsoleColor.DarkCyan); - if (!int.TryParse(Console.ReadKey().KeyChar.ToString(), out var i) || i > models.Length || i <= 0) { Console.WriteLine(); continue; } - Console.WriteLine(); - return models[i - 1]; - } - } - else - { - WriteLine($"Download a non-quantized model and place it in the executing directory:", ConsoleColor.Red); - WriteLine($"\t{Environment.CurrentDirectory}\\Assets", ConsoleColor.Yellow); - WriteLine("You can find the official ggml models in whisper.cpp's huggingface repository: ", ConsoleColor.Red); - WriteLine("\thttps://huggingface.co/ggerganov/whisper.cpp/tree/main", ConsoleColor.Blue); - return null; - } - } - public static async Task LoadPrint(string initialText, Func ShouldContinue) - { - var startTime = DateTime.Now; - Console.WriteLine(initialText); - while (!ShouldContinue()) { Console.Write("."); await Task.Delay(100); } - Console.WriteLine($" Completed in {(DateTime.Now - startTime).TotalSeconds:f2}s."); - } - - public async static Task WaitUntilExit() - { - WriteLine("Voice active. Begin talking to transcribe. Press any key at any time to exit.", ConsoleColor.Green); - await Task.Delay(1000); - Console.ReadKey(); - } - - public static void Write(string text, ConsoleColor consoleColor) => ColorAction(consoleColor, () => Console.Write(text)); - public static void WriteLine(string text, ConsoleColor consoleColor) => ColorAction(consoleColor, () => Console.WriteLine(text)); - public static void ColorAction(ConsoleColor consoleColor, Action action) - { - Console.ForegroundColor = consoleColor; - action?.Invoke(); - Console.ForegroundColor = ConsoleColor.White; + _waveIn.Dispose(); + _processor?.Dispose(); } } } diff --git a/LLama.Examples/LLama.Examples.csproj b/LLama.Examples/LLama.Examples.csproj index 55be83b1e..708e9ea15 100644 --- a/LLama.Examples/LLama.Examples.csproj +++ b/LLama.Examples/LLama.Examples.csproj @@ -21,10 +21,10 @@ - - + + - + diff --git a/LLama.Examples/UserSettings.cs b/LLama.Examples/UserSettings.cs index 9d8836c33..e7e355d66 100644 --- a/LLama.Examples/UserSettings.cs +++ b/LLama.Examples/UserSettings.cs @@ -7,6 +7,7 @@ internal static class UserSettings private static readonly string SettingsModelPath = Path.Join(AppContext.BaseDirectory, "DefaultModel.env"); private static readonly string SettingsMMprojPath = Path.Join(AppContext.BaseDirectory, "DefaultMMProj.env"); private static readonly string SettingsImagePath = Path.Join(AppContext.BaseDirectory, "DefaultImage.env"); + private static readonly string WhisperModelPath = Path.Join(AppContext.BaseDirectory, "DefaultWhisper.env"); private static string? ReadDefaultPath(string file) { @@ -20,68 +21,60 @@ internal static class UserSettings return path; } - private static void WriteDefaultPath(string settings, string path) + public static string GetModelPath(bool alwaysPrompt = false) { - File.WriteAllText(settings, path); + return PromptPath("model.gguf", SettingsModelPath, alwaysPrompt); } - public static string GetModelPath(bool alwaysPrompt = false) + public static string GetMMProjPath(bool alwaysPrompt = false) { - var defaultPath = ReadDefaultPath(SettingsModelPath); - var path = defaultPath is null || alwaysPrompt - ? PromptUserForPath() - : PromptUserForPathWithDefault(defaultPath); - - if (File.Exists(path)) - WriteDefaultPath(SettingsModelPath, path); + return PromptPath("mmproj", SettingsMMprojPath, alwaysPrompt); + } - return path; - } - - // TODO: Refactorize - public static string GetMMProjPath(bool alwaysPrompt = false) + public static string GetImagePath(bool alwaysPrompt = false) { - var defaultPath = ReadDefaultPath(SettingsMMprojPath); - var path = defaultPath is null || alwaysPrompt - ? PromptUserForPath("MMProj") - : PromptUserForPathWithDefault(defaultPath, "MMProj"); + return PromptPath("image", SettingsImagePath, alwaysPrompt); + } - if (File.Exists(path)) - WriteDefaultPath(SettingsMMprojPath, path); + public static string GetWhisperPath(bool alwaysPrompt = false) + { + return PromptPath("whisper model.bin", WhisperModelPath, alwaysPrompt); + } - return path; - } - - // TODO: Refactorize - public static string GetImagePath(bool alwaysPrompt = false) + private static string PromptPath(string label, string saveFile, bool alwaysPrompt) { - var defaultPath = ReadDefaultPath(SettingsImagePath); + var defaultPath = ReadDefaultPath(saveFile); var path = defaultPath is null || alwaysPrompt - ? PromptUserForPath("image") - : PromptUserForPathWithDefault(defaultPath, "image"); + ? PromptUserForPath(label) + : PromptUserForPathWithDefault(defaultPath, label); if (File.Exists(path)) - WriteDefaultPath(SettingsImagePath, path); + WriteDefaultPath(saveFile, path); return path; - } - private static string PromptUserForPath(string text = "model") - { - return AnsiConsole.Prompt( - new TextPrompt(string.Format("Please input your {0} path:", text) ) - .PromptStyle("white") - .Validate(File.Exists, string.Format("[red]ERROR: invalid {0} file path - file does not exist[/]", text) ) - ); - } + static void WriteDefaultPath(string settings, string path) + { + File.WriteAllText(settings, path); + } - private static string PromptUserForPathWithDefault(string defaultPath, string text = "model") - { - return AnsiConsole.Prompt( - new TextPrompt(string.Format("Please input your {0} path (or ENTER for default):", text) ) - .DefaultValue(defaultPath) - .PromptStyle("white") - .Validate(File.Exists, string.Format("[red]ERROR: invalid {0} file path - file does not exist[/]", text)) - ); - } + static string PromptUserForPath(string text = "model") + { + return AnsiConsole.Prompt( + new TextPrompt($"Please input your {text} path:") + .PromptStyle("white") + .Validate(File.Exists, $"[red]ERROR: invalid {text} file path - file does not exist[/]") + ); + } + + static string PromptUserForPathWithDefault(string defaultPath, string text = "model") + { + return AnsiConsole.Prompt( + new TextPrompt($"Please input your {text} path (or ENTER for default):") + .DefaultValue(defaultPath) + .PromptStyle("white") + .Validate(File.Exists, $"[red]ERROR: invalid {text} file path - file does not exist[/]") + ); + } +} }