Skip to content

Commit

Permalink
feat: improve POST /tts documentation (#145)
Browse files (browse the repository at this point in the history)
* ckpt

* feat: small additions

* revert: formatting

---------

Co-authored-by: Sid Sharma <[email protected]>
  • Loading branch information
sidroopdaska and Sid Sharma authored May 7, 2024
1 parent c951ccf commit 12df077
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 7 deletions.
5 changes: 3 additions & 2 deletions fam/llm/decoders.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import datetime
import os
import pathlib
import uuid
Expand Down Expand Up @@ -90,12 +91,12 @@ def decode(
raise Exception("wav predicted is shorter than 400ms!")

try:
wav_file_name = self.output_dir / f"synth_{text.replace(' ', '_')[:25]}_{uuid.uuid4()}"
wav_file_name = self.output_dir / f"synth_{datetime.now().strftime('%y-%m-%d--%H-%M-%S')}_{text.replace(' ', '_')[:25]}_{uuid.uuid4()}"
self._save_audio(wav_file_name, wav)
return wav_file_name
except Exception as e:
print(f"Failed to save audio! Reason: {e}")

wav_file_name = self.output_dir / f"synth_{uuid.uuid4()}"
wav_file_name = self.output_dir / f"synth_{datetime.now().strftime('%y-%m-%d--%H-%M-%S')}_{uuid.uuid4()}"
self._save_audio(wav_file_name, wav)
return wav_file_name
1 change: 1 addition & 0 deletions fam/llm/fast_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def synthesise(self, text: str, spk_ref_path: str, top_p=0.95, guidance_scale=3.
TelemetryEvent(
name="user_ran_tts",
properties={
"model_name": self._model_name,
"text": text,
"temperature": temperature,
"guidance_scale": guidance_scale,
Expand Down
12 changes: 7 additions & 5 deletions serving.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# curl -X POST http://127.0.0.1:58003/tts -F "text=Testing this inference server." -F "speaker_ref_path=https://cdn.themetavoice.xyz/speakers/bria.mp3" -F "guidance=3.0" -F "top_p=0.95" --output out.wav

import logging
import shlex
import subprocess
Expand Down Expand Up @@ -56,11 +58,11 @@ async def health_check():

@app.post("/tts", response_class=Response)
async def text_to_speech(
text: str = Form(...),
speaker_ref_path: Optional[str] = Form(None),
guidance: float = Form(3.0),
top_p: float = Form(0.95),
audiodata: Optional[UploadFile] = File(None),
text: str = Form(..., description="Text to convert to speech."),
speaker_ref_path: Optional[str] = Form(None, description="Optional URL to an audio file of a reference speaker. Provide either this URL or audio data through 'audiodata'."),
audiodata: Optional[UploadFile] = File(None, description="Optional audio data of a reference speaker. Provide either this file or a URL through 'speaker_ref_path'."),
guidance: float = Form(3.0, description="Control speaker similarity - how closely to match speaker identity and speech style, range: 0.0 to 5.0.", ge=0.0, le=5.0),
top_p: float = Form(0.95, description="Controls speech stability - improves text following for a challenging speaker, range: 0.0 to 1.0.", ge=0.0, le=1.0),
):
# Ensure at least one of speaker_ref_path or audiodata is provided
if not audiodata and not speaker_ref_path:
Expand Down

0 comments on commit 12df077

Please sign in to comment.