From e3d656c366db8e7db716cd99d88091df335361f7 Mon Sep 17 00:00:00 2001 From: Mike Bailey Date: Sun, 15 Dec 2024 17:54:27 +1100 Subject: [PATCH] feat: Support custom Whisper API endpoints for voice transcription This change adds the ability to use alternative Whisper API providers for voice transcription, while maintaining backwards compatibility with the default OpenAI endpoint. - Added support for custom Whisper API endpoints via `WHISPER_API_BASE` and `WHISPER_API_KEY` environment variables - Updated the voice transcription logic to handle both custom and OpenAI endpoints - Added validation to require a specific API key when using custom endpoints - Updated documentation to explain the new configuration options - Added new test cases to verify API endpoint and key handling - When using a custom Whisper endpoint (`WHISPER_API_BASE`): - Must provide a corresponding `WHISPER_API_KEY` - OpenAI API key will not be used - When using default OpenAI endpoint: - Will use `WHISPER_API_KEY` if provided - Falls back to `OPENAI_API_KEY` if no Whisper-specific key is set - Added configuration examples in the optional installation docs - Updated voice coding documentation to reference the new features - Added new environment variables to the sample.env file This change enables users to: - Use alternative Whisper API providers - Run Whisper locally or on their own infrastructure - Better control costs and data privacy for voice transcription --- aider/commands.py | 15 ++++++++--- aider/voice.py | 31 ++++++++++++++++++--- aider/website/assets/sample.env | 6 +++++ aider/website/docs/install/optional.md | 37 ++++++++++++++++++++++++++ aider/website/docs/usage/voice.md | 4 +-- tests/basic/test_voice.py | 31 +++++++++++++++++++++ 6 files changed, 115 insertions(+), 9 deletions(-) diff --git a/aider/commands.py b/aider/commands.py index 8c02fe0bb53..2955cc0d4f3 100644 --- a/aider/commands.py +++ b/aider/commands.py @@ -1112,12 +1112,21 @@ def cmd_voice(self, args): "Record and transcribe voice input" if not self.voice: - if "OPENAI_API_KEY" not in os.environ: - self.io.tool_error("To use /voice you must provide an OpenAI API key.") + if "WHISPER_API_BASE" in os.environ and "WHISPER_API_KEY" not in os.environ: + self.io.tool_error( + "To use /voice with a custom Whisper API you must provide a custom Whisper API key" + ) + return + elif "OPENAI_API_KEY" not in os.environ: + self.io.tool_error( + "To use /voice you must provide an OpenAI API key (or custom Whisper API and key)." + ) return + try: self.voice = voice.Voice( - audio_format=self.args.voice_format, device_name=self.args.voice_input_device + audio_format=self.args.voice_format, + device_name=self.args.voice_input_device, ) except voice.SoundDeviceError: self.io.tool_error( diff --git a/aider/voice.py b/aider/voice.py index 47eddc42b75..3edb5a89e4f 100644 --- a/aider/voice.py +++ b/aider/voice.py @@ -150,8 +150,34 @@ def raw_record_and_transcribe(self, history, language): with open(filename, "rb") as fh: try: + # Get API configuration from environment + api_base = os.getenv("WHISPER_API_BASE", None ) # None is the default OpenAI endpoint + api_key = os.getenv("WHISPER_API_KEY", None ) # None causes OPENAI_API_KEY to be used + + # If a custom base is specified, require a specific whisper key + if api_base and not api_key: + raise Exception( + "When using a custom WHISPER_API_BASE, you must provide a WHISPER_API_KEY" + " via --api whisper=" + ) + + # Only use OpenAI key as fallback if using default OpenAI endpoint + if not api_key: + if not api_base or api_base == "https://api.openai.com/v1": + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise Exception( + "No API key found. Please set either WHISPER_API_KEY or OPENAI_API_KEY" + " environment variables, or use --api whisper=" + ) + transcript = litellm.transcription( - model="whisper-1", file=fh, prompt=history, language=language + model="whisper-1", + file=fh, + prompt=history, + language=language, + api_base=api_base, + api_key=api_key, ) except Exception as err: print(f"Unable to transcribe {filename}: {err}") @@ -165,7 +191,4 @@ def raw_record_and_transcribe(self, history, language): if __name__ == "__main__": - api_key = os.getenv("OPENAI_API_KEY") - if not api_key: - raise ValueError("Please set the OPENAI_API_KEY environment variable.") print(Voice().record_and_transcribe()) diff --git a/aider/website/assets/sample.env b/aider/website/assets/sample.env index 8cf37e66869..587b9fb8cc9 100644 --- a/aider/website/assets/sample.env +++ b/aider/website/assets/sample.env @@ -69,6 +69,12 @@ ## Specify the api base url #AIDER_OPENAI_API_BASE= +## Specify an alternate api base url for Whisper transcriptions (optional) +#AIDER_WHISPER_API_BASE= + +## Specify an alternate api key for Whisper transcriptions (optional) +#AIDER_WHISPER_API_KEY= + ## (deprecated, use --set-env OPENAI_API_TYPE=) #AIDER_OPENAI_API_TYPE= diff --git a/aider/website/docs/install/optional.md b/aider/website/docs/install/optional.md index 8e4285873d1..dddbebef940 100644 --- a/aider/website/docs/install/optional.md +++ b/aider/website/docs/install/optional.md @@ -46,6 +46,9 @@ for additional information. Aider supports [coding with your voice](https://aider.chat/docs/usage/voice.html) using the in-chat `/voice` command. + +### Audio capture setup + Aider uses the [PortAudio](http://www.portaudio.com) library to capture audio. Installing PortAudio is completely optional, but can usually be accomplished like this: @@ -55,6 +58,40 @@ Installing PortAudio is completely optional, but can usually be accomplished lik - For Linux, do `sudo apt-get install libportaudio2` - Some linux environments may also need `sudo apt install libasound2-plugins` +### Whisper API configuration + +By default, aider uses OpenAI's Whisper API for voice transcription. You can configure an alternate Whisper API provider: + +```bash +# Via command line arguments +aider --set-env WHISPER_API_BASE=https://api.example.com --api-key whisper=your-api-key + +# Via environment variables or .env file +WHISPER_API_BASE=https://api.example.com +WHISPER_API_KEY=your-api-key + +# Via config file (.aider.conf.yml) +api-base: + - whisper=https://api.example.com +api-key: + - whisper=your-api-key +``` + +When using an alternate Whisper API endpoint: +- You must provide both the API base URL and API key +- Your OpenAI API key will not be sent to the alternate endpoint +- This is useful for: + - Using a different Whisper API provider + - Running Whisper locally or on your own infrastructure + - Controlling costs or data privacy + +API key behavior: +- If using a custom WHISPER_API_BASE, you must provide a WHISPER_API_KEY +- If using the default OpenAI endpoint (or no endpoint specified): + - Will use WHISPER_API_KEY if provided + - Will fall back to OPENAI_API_KEY if no WHISPER_API_KEY is set +- If neither key is available, an error will be raised + ## Add aider to your editor Other projects have integrated aider into some IDE/editors. diff --git a/aider/website/docs/usage/voice.md b/aider/website/docs/usage/voice.md index 9422d3f1223..978a7e31fc7 100644 --- a/aider/website/docs/usage/voice.md +++ b/aider/website/docs/usage/voice.md @@ -17,11 +17,11 @@ when you ask aider to edit your code. Use the in-chat `/voice` command to start recording, and press `ENTER` when you're done speaking. Your voice coding instructions will be transcribed, -as if you had typed them into +as if you had typed them into the aider chat session. See the [installation instructions](https://aider.chat/docs/install/optional.html#enable-voice-coding) for -information on how to enable the `/voice` command. +information on how to enable the `/voice` command and configure alternate Whisper API endpoints.
diff --git a/tests/basic/test_voice.py b/tests/basic/test_voice.py index 00ec1d79d01..97ae0455903 100644 --- a/tests/basic/test_voice.py +++ b/tests/basic/test_voice.py @@ -101,3 +101,34 @@ def test_record_and_transcribe_device_error(): ): result = voice.record_and_transcribe() assert result is None + +def test_record_and_transcribe_no_api_key(): + with patch("aider.voice.sf", MagicMock()): + voice = Voice() + with patch.dict(os.environ, {}, clear=True): # Clear environment variables + result = voice.record_and_transcribe() + assert result is None + +def test_record_and_transcribe_custom_base_no_key(): + with patch("aider.voice.sf", MagicMock()): + voice = Voice() + with patch.dict(os.environ, {"WHISPER_API_BASE": "http://custom.api"}, clear=True): + with pytest.raises(Exception) as exc: + voice.record_and_transcribe() + assert "When using a custom WHISPER_API_BASE" in str(exc.value) + assert "via --api whisper=" in str(exc.value) + +def test_record_and_transcribe_custom_base_with_key(): + with patch("aider.voice.sf", MagicMock()): + voice = Voice() + with patch.dict( + os.environ, + { + "WHISPER_API_BASE": "http://custom.api", + "WHISPER_API_KEY": "test-key" + }, + clear=True + ): + with patch.object(voice, "raw_record_and_transcribe") as mock_record: + voice.record_and_transcribe() + assert mock_record.called