diff --git a/aider/commands.py b/aider/commands.py index 8c02fe0bb53..2955cc0d4f3 100644 --- a/aider/commands.py +++ b/aider/commands.py @@ -1112,12 +1112,21 @@ def cmd_voice(self, args): "Record and transcribe voice input" if not self.voice: - if "OPENAI_API_KEY" not in os.environ: - self.io.tool_error("To use /voice you must provide an OpenAI API key.") + if "WHISPER_API_BASE" in os.environ and "WHISPER_API_KEY" not in os.environ: + self.io.tool_error( + "To use /voice with a custom Whisper API you must provide a custom Whisper API key" + ) + return + elif "OPENAI_API_KEY" not in os.environ: + self.io.tool_error( + "To use /voice you must provide an OpenAI API key (or custom Whisper API and key)." + ) return + try: self.voice = voice.Voice( - audio_format=self.args.voice_format, device_name=self.args.voice_input_device + audio_format=self.args.voice_format, + device_name=self.args.voice_input_device, ) except voice.SoundDeviceError: self.io.tool_error( diff --git a/aider/voice.py b/aider/voice.py index 47eddc42b75..3edb5a89e4f 100644 --- a/aider/voice.py +++ b/aider/voice.py @@ -150,8 +150,34 @@ def raw_record_and_transcribe(self, history, language): with open(filename, "rb") as fh: try: + # Get API configuration from environment + api_base = os.getenv("WHISPER_API_BASE", None) # None is the default OpenAI endpoint + api_key = os.getenv("WHISPER_API_KEY", None) # None causes OPENAI_API_KEY to be used + + # If a custom base is specified, require a specific whisper key + if api_base and not api_key: + raise Exception( + "When using a custom WHISPER_API_BASE, you must provide a WHISPER_API_KEY" + " via --api whisper=" + ) + + # Only use OpenAI key as fallback if using default OpenAI endpoint + if not api_key: + if not api_base or api_base == "https://api.openai.com/v1": + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise Exception( + "No API key found. Please set either WHISPER_API_KEY or OPENAI_API_KEY" + " environment variables, or use --api whisper=" + ) + transcript = litellm.transcription( - model="whisper-1", file=fh, prompt=history, language=language + model="whisper-1", + file=fh, + prompt=history, + language=language, + api_base=api_base, + api_key=api_key, ) except Exception as err: print(f"Unable to transcribe (unknown): {err}") @@ -165,7 +191,4 @@ def raw_record_and_transcribe(self, history, language): if __name__ == "__main__": - api_key = os.getenv("OPENAI_API_KEY") - if not api_key: - raise ValueError("Please set the OPENAI_API_KEY environment variable.") print(Voice().record_and_transcribe()) diff --git a/aider/website/assets/sample.env b/aider/website/assets/sample.env index 8cf37e66869..587b9fb8cc9 100644 --- a/aider/website/assets/sample.env +++ b/aider/website/assets/sample.env @@ -69,6 +69,12 @@ ## Specify the api base url #AIDER_OPENAI_API_BASE= +## Specify an alternate api base url for Whisper transcriptions (optional) +#AIDER_WHISPER_API_BASE= + +## Specify an alternate api key for Whisper transcriptions (optional) +#AIDER_WHISPER_API_KEY= + ## (deprecated, use --set-env OPENAI_API_TYPE=) #AIDER_OPENAI_API_TYPE= diff --git a/aider/website/docs/install/optional.md b/aider/website/docs/install/optional.md index 8e4285873d1..dddbebef940 100644 --- a/aider/website/docs/install/optional.md +++ b/aider/website/docs/install/optional.md @@ -46,6 +46,9 @@ for additional information. Aider supports [coding with your voice](https://aider.chat/docs/usage/voice.html) using the in-chat `/voice` command. + +### Audio capture setup + Aider uses the [PortAudio](http://www.portaudio.com) library to capture audio. 
Installing PortAudio is completely optional, but can usually be accomplished like this: @@ -55,6 +58,40 @@ Installing PortAudio is completely optional, but can usually be accomplished lik - For Linux, do `sudo apt-get install libportaudio2` - Some linux environments may also need `sudo apt install libasound2-plugins` +### Whisper API configuration + +By default, aider uses OpenAI's Whisper API for voice transcription. You can configure an alternate Whisper API provider: + +```bash +# Via command line arguments +aider --set-env WHISPER_API_BASE=https://api.example.com --api-key whisper=your-api-key + +# Via environment variables or .env file +WHISPER_API_BASE=https://api.example.com +WHISPER_API_KEY=your-api-key + +# Via config file (.aider.conf.yml) +api-base: + - whisper=https://api.example.com +api-key: + - whisper=your-api-key +``` + +When using an alternate Whisper API endpoint: +- You must provide both the API base URL and API key +- Your OpenAI API key will not be sent to the alternate endpoint +- This is useful for: + - Using a different Whisper API provider + - Running Whisper locally or on your own infrastructure + - Controlling costs or data privacy + +API key behavior: +- If using a custom WHISPER_API_BASE, you must provide a WHISPER_API_KEY +- If using the default OpenAI endpoint (or no endpoint specified): + - Will use WHISPER_API_KEY if provided + - Will fall back to OPENAI_API_KEY if no WHISPER_API_KEY is set +- If neither key is available, an error will be raised + ## Add aider to your editor Other projects have integrated aider into some IDE/editors. diff --git a/aider/website/docs/usage/voice.md b/aider/website/docs/usage/voice.md index 9422d3f1223..978a7e31fc7 100644 --- a/aider/website/docs/usage/voice.md +++ b/aider/website/docs/usage/voice.md @@ -17,11 +17,11 @@ when you ask aider to edit your code. Use the in-chat `/voice` command to start recording, and press `ENTER` when you're done speaking. 
Your voice coding instructions will be transcribed, -as if you had typed them into +as if you had typed them into the aider chat session. See the [installation instructions](https://aider.chat/docs/install/optional.html#enable-voice-coding) for -information on how to enable the `/voice` command. +information on how to enable the `/voice` command and configure alternate Whisper API endpoints.
diff --git a/tests/basic/test_voice.py b/tests/basic/test_voice.py index 00ec1d79d01..97ae0455903 100644 --- a/tests/basic/test_voice.py +++ b/tests/basic/test_voice.py @@ -101,3 +101,34 @@ def test_record_and_transcribe_device_error(): ): result = voice.record_and_transcribe() assert result is None + +def test_record_and_transcribe_no_api_key(): + with patch("aider.voice.sf", MagicMock()): + voice = Voice() + with patch.dict(os.environ, {}, clear=True): # Clear environment variables + result = voice.record_and_transcribe() + assert result is None + +def test_record_and_transcribe_custom_base_no_key(): + with patch("aider.voice.sf", MagicMock()): + voice = Voice() + with patch.dict(os.environ, {"WHISPER_API_BASE": "http://custom.api"}, clear=True): + with pytest.raises(Exception) as exc: + voice.record_and_transcribe() + assert "When using a custom WHISPER_API_BASE" in str(exc.value) + assert "via --api whisper=" in str(exc.value) + +def test_record_and_transcribe_custom_base_with_key(): + with patch("aider.voice.sf", MagicMock()): + voice = Voice() + with patch.dict( + os.environ, + { + "WHISPER_API_BASE": "http://custom.api", + "WHISPER_API_KEY": "test-key" + }, + clear=True + ): + with patch.object(voice, "raw_record_and_transcribe") as mock_record: + voice.record_and_transcribe() + assert mock_record.called