feat: Support custom Whisper API endpoints for voice transcription

This change adds the ability to use alternative Whisper API providers for voice transcription, while maintaining backwards compatibility with the default OpenAI endpoint.

- Added support for custom Whisper API endpoints via `WHISPER_API_BASE` and `WHISPER_API_KEY` environment variables
- Updated the voice transcription logic to handle both custom and OpenAI endpoints
- Added validation to require a specific API key when using custom endpoints
- Updated documentation to explain the new configuration options
- Added new test cases to verify API endpoint and key handling

- When using a custom Whisper endpoint (`WHISPER_API_BASE`):
  - Must provide a corresponding `WHISPER_API_KEY`
  - OpenAI API key will not be used
- When using the default OpenAI endpoint:
  - Will use `WHISPER_API_KEY` if provided
  - Falls back to `OPENAI_API_KEY` if no Whisper-specific key is set

- Added configuration examples in the optional installation docs
- Updated voice coding documentation to reference the new features
- Added new environment variables to the sample.env file

This change enables users to:
- Use alternative Whisper API providers
- Run Whisper locally or on their own infrastructure
- Better control costs and data privacy for voice transcription
Aider-AI · Dec 17, 2024 · e3d656c · e3d656c
1 parent 1042e2f
commit e3d656c
Show file tree

Hide file tree

Showing 6 changed files with 115 additions and 9 deletions.
diff --git a/aider/commands.py b/aider/commands.py
@@ -1112,12 +1112,21 @@ def cmd_voice(self, args):
         "Record and transcribe voice input"
 
         if not self.voice:
-            if "OPENAI_API_KEY" not in os.environ:
-                self.io.tool_error("To use /voice you must provide an OpenAI API key.")
+            if "WHISPER_API_BASE" in os.environ and "WHISPER_API_KEY" not in os.environ:
+                self.io.tool_error(
+                    "To use /voice with a custom Whisper API you must provide a custom Whisper API key"
+                )
+                return
+            elif "OPENAI_API_KEY" not in os.environ:
+                self.io.tool_error(
+                    "To use /voice you must provide an OpenAI API key (or custom Whisper API and key)."
+                )
                 return
+
             try:
                 self.voice = voice.Voice(
-                    audio_format=self.args.voice_format, device_name=self.args.voice_input_device
+                    audio_format=self.args.voice_format,
+                    device_name=self.args.voice_input_device,
                 )
             except voice.SoundDeviceError:
                 self.io.tool_error(

diff --git a/aider/voice.py b/aider/voice.py
@@ -150,8 +150,34 @@ def raw_record_and_transcribe(self, history, language):
 
         with open(filename, "rb") as fh:
             try:
+                # Get API configuration from environment
+                api_base = os.getenv("WHISPER_API_BASE", None ) # None is the default OpenAI endpoint
+                api_key = os.getenv("WHISPER_API_KEY", None )   # None causes OPENAI_API_KEY to be used
+
+                # If a custom base is specified, require a specific whisper key
+                if api_base and not api_key:
+                    raise Exception(
+                        "When using a custom WHISPER_API_BASE, you must provide a WHISPER_API_KEY"
+                        " via --api whisper=<key>"
+                    )
+
+                # Only use OpenAI key as fallback if using default OpenAI endpoint
+                if not api_key:
+                    if not api_base or api_base == "https://api.openai.com/v1":
+                        api_key = os.getenv("OPENAI_API_KEY")
+                        if not api_key:
+                            raise Exception(
+                                "No API key found. Please set either WHISPER_API_KEY or OPENAI_API_KEY"
+                                " environment variables, or use --api whisper=<key>"
+                            )
+
                 transcript = litellm.transcription(
-                    model="whisper-1", file=fh, prompt=history, language=language
+                    model="whisper-1",
+                    file=fh,
+                    prompt=history,
+                    language=language,
+                    api_base=api_base,
+                    api_key=api_key,
                 )
             except Exception as err:
                 print(f"Unable to transcribe {filename}: {err}")
@@ -165,7 +191,4 @@ def raw_record_and_transcribe(self, history, language):
 
 
 if __name__ == "__main__":
-    api_key = os.getenv("OPENAI_API_KEY")
-    if not api_key:
-        raise ValueError("Please set the OPENAI_API_KEY environment variable.")
     print(Voice().record_and_transcribe())
diff --git a/aider/website/assets/sample.env b/aider/website/assets/sample.env
@@ -69,6 +69,12 @@
 ## Specify the api base url
 #AIDER_OPENAI_API_BASE=
 
+## Specify an alternate API base URL for Whisper transcriptions (optional)
+#WHISPER_API_BASE=
+
+## Specify the API key for Whisper transcriptions (required when WHISPER_API_BASE is set)
+#WHISPER_API_KEY=
+
 ## (deprecated, use --set-env OPENAI_API_TYPE=<value>)
 #AIDER_OPENAI_API_TYPE=
 

diff --git a/aider/website/docs/install/optional.md b/aider/website/docs/install/optional.md
@@ -46,6 +46,9 @@ for additional information.
 Aider supports 
 [coding with your voice](https://aider.chat/docs/usage/voice.html)
 using the in-chat `/voice` command.
+
+### Audio capture setup
+
 Aider uses the [PortAudio](http://www.portaudio.com) library to
 capture audio.
 Installing PortAudio is completely optional, but can usually be accomplished like this:
@@ -55,6 +58,40 @@ Installing PortAudio is completely optional, but can usually be accomplished lik
 - For Linux, do `sudo apt-get install libportaudio2`
   - Some linux environments may also need `sudo apt install libasound2-plugins`
 
+### Whisper API configuration
+
+By default, aider uses OpenAI's Whisper API for voice transcription. You can configure an alternate Whisper API provider:
+
+```bash
+# Via command line arguments
+aider --set-env WHISPER_API_BASE=https://api.example.com --api-key whisper=your-api-key
+
+# Via environment variables or .env file
+WHISPER_API_BASE=https://api.example.com
+WHISPER_API_KEY=your-api-key
+
+# Via config file (.aider.conf.yml)
+set-env:
+  - WHISPER_API_BASE=https://api.example.com
+api-key:
+  - whisper=your-api-key
+```
+
+When using an alternate Whisper API endpoint:
+- You must provide both the API base URL and API key
+- Your OpenAI API key will not be sent to the alternate endpoint
+- This is useful for:
+  - Using a different Whisper API provider
+  - Running Whisper locally or on your own infrastructure
+  - Controlling costs or data privacy
+
+API key behavior:
+- If using a custom WHISPER_API_BASE, you must provide a WHISPER_API_KEY
+- If using the default OpenAI endpoint (or no endpoint specified):
+  - Will use WHISPER_API_KEY if provided
+  - Will fall back to OPENAI_API_KEY if no WHISPER_API_KEY is set
+- If neither key is available, an error will be raised
+
 ## Add aider to your editor 
 
 Other projects have integrated aider into some IDE/editors.

diff --git a/aider/website/docs/usage/voice.md b/aider/website/docs/usage/voice.md
@@ -17,11 +17,11 @@ when you ask aider to edit your code.
 Use the in-chat `/voice` command to start recording,
 and press `ENTER` when you're done speaking.
 Your voice coding instructions will be transcribed, 
-as if you had  typed them into
+as if you had typed them into
 the aider chat session.
 
 See the [installation instructions](https://aider.chat/docs/install/optional.html#enable-voice-coding) for
-information on how to enable the `/voice` command.
+information on how to enable the `/voice` command and configure alternate Whisper API endpoints.
 
 <br/>
 <div class="chat-transcript" markdown="1">

diff --git a/tests/basic/test_voice.py b/tests/basic/test_voice.py
@@ -101,3 +101,34 @@ def test_record_and_transcribe_device_error():
         ):
             result = voice.record_and_transcribe()
             assert result is None
+
+def test_record_and_transcribe_no_api_key():
+    with patch("aider.voice.sf", MagicMock()):
+        voice = Voice()
+        with patch.dict(os.environ, {}, clear=True):  # Clear environment variables
+            result = voice.record_and_transcribe()
+            assert result is None
+
+def test_record_and_transcribe_custom_base_no_key():
+    with patch("aider.voice.sf", MagicMock()):
+        voice = Voice()
+        with patch.dict(os.environ, {"WHISPER_API_BASE": "http://custom.api"}, clear=True):
+            with pytest.raises(Exception) as exc:
+                voice.record_and_transcribe()
+            assert "When using a custom WHISPER_API_BASE" in str(exc.value)
+            assert "via --api whisper=<key>" in str(exc.value)
+
+def test_record_and_transcribe_custom_base_with_key():
+    with patch("aider.voice.sf", MagicMock()):
+        voice = Voice()
+        with patch.dict(
+            os.environ,
+            {
+                "WHISPER_API_BASE": "http://custom.api",
+                "WHISPER_API_KEY": "test-key"
+            },
+            clear=True
+        ):
+            with patch.object(voice, "raw_record_and_transcribe") as mock_record:
+                voice.record_and_transcribe()
+                assert mock_record.called