feat: improve POST /tts documentation (#145)

* ckpt
* feat: small additions
* revert: formatting

---------

Co-authored-by: Sid Sharma <[email protected]>
metavoiceio · May 7, 2024 · 12df077 · 12df077
1 parent c951ccf
commit 12df077
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 7 deletions.
diff --git a/fam/llm/decoders.py b/fam/llm/decoders.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 import os
 import pathlib
 import uuid
@@ -90,12 +91,12 @@ def decode(
             raise Exception("wav predicted is shorter than 400ms!")
 
         try:
-            wav_file_name = self.output_dir / f"synth_{text.replace(' ', '_')[:25]}_{uuid.uuid4()}"
+            wav_file_name = self.output_dir / f"synth_{datetime.now().strftime('%y-%m-%d--%H-%M-%S')}_{text.replace(' ', '_')[:25]}_{uuid.uuid4()}"
             self._save_audio(wav_file_name, wav)
             return wav_file_name
         except Exception as e:
             print(f"Failed to save audio! Reason: {e}")
 
-            wav_file_name = self.output_dir / f"synth_{uuid.uuid4()}"
+            wav_file_name = self.output_dir / f"synth_{datetime.now().strftime('%y-%m-%d--%H-%M-%S')}_{uuid.uuid4()}"
             self._save_audio(wav_file_name, wav)
             return wav_file_name
diff --git a/fam/llm/fast_inference.py b/fam/llm/fast_inference.py
@@ -174,6 +174,7 @@ def synthesise(self, text: str, spk_ref_path: str, top_p=0.95, guidance_scale=3.
             TelemetryEvent(
                 name="user_ran_tts",
                 properties={
+                    "model_name": self._model_name,
                     "text": text,
                     "temperature": temperature,
                     "guidance_scale": guidance_scale,

diff --git a/serving.py b/serving.py
@@ -1,3 +1,5 @@
+# curl -X POST http://127.0.0.1:58003/tts -F "text=Testing this inference server." -F "speaker_ref_path=https://cdn.themetavoice.xyz/speakers/bria.mp3" -F "guidance=3.0" -F "top_p=0.95" --output out.wav
+
 import logging
 import shlex
 import subprocess
@@ -56,11 +58,11 @@ async def health_check():
 
 @app.post("/tts", response_class=Response)
 async def text_to_speech(
-    text: str = Form(...),
-    speaker_ref_path: Optional[str] = Form(None),
-    guidance: float = Form(3.0),
-    top_p: float = Form(0.95),
-    audiodata: Optional[UploadFile] = File(None),
+    text: str = Form(..., description="Text to convert to speech."),
+    speaker_ref_path: Optional[str] = Form(None, description="Optional URL to an audio file of a reference speaker. Provide either this URL or audio data through 'audiodata'."),
+    audiodata: Optional[UploadFile] = File(None, description="Optional audio data of a reference speaker. Provide either this file or a URL through 'speaker_ref_path'."),
+    guidance: float = Form(3.0, description="Control speaker similarity - how closely to match speaker identity and speech style, range: 0.0 to 5.0.", ge=0.0, le=5.0),
+    top_p: float = Form(0.95, description="Controls speech stability - improves text following for a challenging speaker, range: 0.0 to 1.0.", ge=0.0, le=1.0),
 ):
     # Ensure at least one of speaker_ref_path or audiodata is provided
     if not audiodata and not speaker_ref_path: