Skip to content

Commit

Permalink
feat: Support custom Whisper API endpoints for voice transcription
Browse files Browse the repository at this point in the history
This change adds the ability to use alternative Whisper API providers
for voice transcription, while maintaining backwards compatibility
with the default OpenAI endpoint.

- Added support for custom Whisper API endpoints via `WHISPER_API_BASE` and `WHISPER_API_KEY` environment variables
- Updated the voice transcription logic to handle both custom and OpenAI endpoints
- Added validation to require a specific API key when using custom endpoints
- Updated documentation to explain the new configuration options
- Added new test cases to verify API endpoint and key handling

- When using a custom Whisper endpoint (`WHISPER_API_BASE`):
  - Must provide a corresponding `WHISPER_API_KEY`
  - OpenAI API key will not be used
- When using default OpenAI endpoint:
  - Will use `WHISPER_API_KEY` if provided
  - Falls back to `OPENAI_API_KEY` if no Whisper-specific key is set

- Added configuration examples in the optional installation docs
- Updated voice coding documentation to reference the new features
- Added new environment variables to the sample.env file

This change enables users to:

- Use alternative Whisper API providers
- Run Whisper locally or on their own infrastructure
- Better control costs and data privacy for voice transcription
  • Loading branch information
mbailey committed Dec 17, 2024
1 parent 1042e2f commit e3d656c
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 9 deletions.
15 changes: 12 additions & 3 deletions aider/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -1112,12 +1112,21 @@ def cmd_voice(self, args):
"Record and transcribe voice input"

if not self.voice:
if "OPENAI_API_KEY" not in os.environ:
self.io.tool_error("To use /voice you must provide an OpenAI API key.")
if "WHISPER_API_BASE" in os.environ and "WHISPER_API_KEY" not in os.environ:
self.io.tool_error(
"To use /voice with a custom Whisper API you must provide a custom Whisper API key"
)
return
elif "OPENAI_API_KEY" not in os.environ:
self.io.tool_error(
"To use /voice you must provide an OpenAI API key (or custom Whisper API and key)."
)
return

try:
self.voice = voice.Voice(
audio_format=self.args.voice_format, device_name=self.args.voice_input_device
audio_format=self.args.voice_format,
device_name=self.args.voice_input_device,
)
except voice.SoundDeviceError:
self.io.tool_error(
Expand Down
31 changes: 27 additions & 4 deletions aider/voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,34 @@ def raw_record_and_transcribe(self, history, language):

with open(filename, "rb") as fh:
try:
# Get API configuration from environment
api_base = os.getenv("WHISPER_API_BASE", None ) # None is the default OpenAI endpoint
api_key = os.getenv("WHISPER_API_KEY", None ) # None causes OPENAI_API_KEY to be used

# If a custom base is specified, require a specific whisper key
if api_base and not api_key:
raise Exception(
"When using a custom WHISPER_API_BASE, you must provide a WHISPER_API_KEY"
" via --api whisper=<key>"
)

# Only use OpenAI key as fallback if using default OpenAI endpoint
if not api_key:
if not api_base or api_base == "https://api.openai.com/v1":
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise Exception(
"No API key found. Please set either WHISPER_API_KEY or OPENAI_API_KEY"
" environment variables, or use --api whisper=<key>"
)

transcript = litellm.transcription(
model="whisper-1", file=fh, prompt=history, language=language
model="whisper-1",
file=fh,
prompt=history,
language=language,
api_base=api_base,
api_key=api_key,
)
except Exception as err:
print(f"Unable to transcribe {filename}: {err}")
Expand All @@ -165,7 +191,4 @@ def raw_record_and_transcribe(self, history, language):


if __name__ == "__main__":
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("Please set the OPENAI_API_KEY environment variable.")
print(Voice().record_and_transcribe())
6 changes: 6 additions & 0 deletions aider/website/assets/sample.env
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@
## Specify the api base url
#AIDER_OPENAI_API_BASE=

## Specify an alternate api base url for Whisper transcriptions (optional)
#WHISPER_API_BASE=

## Specify an alternate api key for Whisper transcriptions (optional; required when WHISPER_API_BASE is set)
#WHISPER_API_KEY=

## (deprecated, use --set-env OPENAI_API_TYPE=<value>)
#AIDER_OPENAI_API_TYPE=

Expand Down
37 changes: 37 additions & 0 deletions aider/website/docs/install/optional.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ for additional information.
Aider supports
[coding with your voice](https://aider.chat/docs/usage/voice.html)
using the in-chat `/voice` command.

### Audio capture setup

Aider uses the [PortAudio](http://www.portaudio.com) library to
capture audio.
Installing PortAudio is completely optional, but can usually be accomplished like this:
Expand All @@ -55,6 +58,40 @@ Installing PortAudio is completely optional, but can usually be accomplished lik
- For Linux, do `sudo apt-get install libportaudio2`
- Some linux environments may also need `sudo apt install libasound2-plugins`

### Whisper API configuration

By default, aider uses OpenAI's Whisper API for voice transcription. You can configure an alternate Whisper API provider:

```bash
# Via command line arguments
aider --set-env WHISPER_API_BASE=https://api.example.com --api-key whisper=your-api-key

# Via environment variables or .env file
WHISPER_API_BASE=https://api.example.com
WHISPER_API_KEY=your-api-key

# Via config file (.aider.conf.yml)
set-env:
- WHISPER_API_BASE=https://api.example.com
api-key:
- whisper=your-api-key
```

When using an alternate Whisper API endpoint:
- You must provide both the API base URL and API key
- Your OpenAI API key will not be sent to the alternate endpoint
- This is useful for:
  - Using a different Whisper API provider
  - Running Whisper locally or on your own infrastructure
  - Controlling costs or data privacy

API key behavior:
- If using a custom `WHISPER_API_BASE`, you must provide a `WHISPER_API_KEY`
- If using the default OpenAI endpoint (or no endpoint specified):
  - Will use `WHISPER_API_KEY` if provided
  - Will fall back to `OPENAI_API_KEY` if no `WHISPER_API_KEY` is set
  - If neither key is available, an error will be raised

## Add aider to your editor

Other projects have integrated aider into some IDE/editors.
Expand Down
4 changes: 2 additions & 2 deletions aider/website/docs/usage/voice.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ when you ask aider to edit your code.
Use the in-chat `/voice` command to start recording,
and press `ENTER` when you're done speaking.
Your voice coding instructions will be transcribed,
as if you had typed them into
as if you had typed them into
the aider chat session.

See the [installation instructions](https://aider.chat/docs/install/optional.html#enable-voice-coding) for
information on how to enable the `/voice` command.
information on how to enable the `/voice` command and configure alternate Whisper API endpoints.

<br/>
<div class="chat-transcript" markdown="1">
Expand Down
31 changes: 31 additions & 0 deletions tests/basic/test_voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,34 @@ def test_record_and_transcribe_device_error():
):
result = voice.record_and_transcribe()
assert result is None

def test_record_and_transcribe_no_api_key():
    """Expect ``record_and_transcribe`` to return None with an empty environment."""
    # Replace the soundfile module so constructing Voice does not need it.
    with patch("aider.voice.sf", MagicMock()):
        voice = Voice()
        # clear=True removes every variable, including any real API keys,
        # for the duration of this block only.
        with patch.dict(os.environ, {}, clear=True):
            result = voice.record_and_transcribe()
            assert result is None

def test_record_and_transcribe_custom_base_no_key():
    """Expect an error when WHISPER_API_BASE is set without WHISPER_API_KEY."""
    # Replace the soundfile module so constructing Voice does not need it.
    with patch("aider.voice.sf", MagicMock()):
        voice = Voice()
        # Only the custom base is present; clear=True drops every other variable.
        with patch.dict(os.environ, {"WHISPER_API_BASE": "http://custom.api"}, clear=True):
            # NOTE(review): the raise in raw_record_and_transcribe sits inside a
            # `try` whose handler catches Exception and prints "Unable to
            # transcribe ..." -- confirm the error actually reaches this test.
            with pytest.raises(Exception) as exc:
                voice.record_and_transcribe()
            # Checked after the raises-block exits; statements placed after the
            # raising call inside the block would never run.
            assert "When using a custom WHISPER_API_BASE" in str(exc.value)
            assert "via --api whisper=<key>" in str(exc.value)

def test_record_and_transcribe_custom_base_with_key():
    """Expect recording to be attempted when both custom base and key are set."""
    # Replace the soundfile module so constructing Voice does not need it.
    with patch("aider.voice.sf", MagicMock()):
        voice = Voice()
        with patch.dict(
            os.environ,
            {
                "WHISPER_API_BASE": "http://custom.api",
                "WHISPER_API_KEY": "test-key",
            },
            clear=True,
        ):
            # Stub the low-level method so no audio capture or network call
            # happens; the test only checks that it gets invoked.
            with patch.object(voice, "raw_record_and_transcribe") as mock_record:
                voice.record_and_transcribe()
                assert mock_record.called

0 comments on commit e3d656c

Please sign in to comment.