adding updated version of xtts v2 (#375)

basetenlabs · Nov 5, 2024 · ce272e9
1 parent 7786b53
commit ce272e9
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 52 deletions.
diff --git a/xtts-v2-truss/config.yaml b/xtts-v2-truss/config.yaml
@@ -1,17 +1,17 @@
 environment_variables:
-  COQUI_TOS_AGREED: '1'
+  COQUI_TOS_AGREED: "1"
 external_package_dirs: []
 model_metadata:
   example_model_input:
     language: en
-    speaker_voice: <BASE64-STRING>
-    text: I love robots. Robots are cool!
+    speaker_voice: Claribel Dervla
+    text: Kurt watched the incoming Pelicans. The blocky jet-powered craft were so distant they were only specks against the setting sun. He hit the magnification on his faceplate and saw lines of fire tracing their reentry vectors. They would touch down in three minutes.
   tags:
   - text-to-speech
 model_name: XTTS V2
 python_version: py310
 requirements:
-- git+https://github.com/coqui-ai/TTS.git@v0.21.3
+  - git+https://github.com/htrivedi99/TTS.git
 resources:
   accelerator: T4
   cpu: '3'

diff --git a/xtts-v2-truss/model/model.py b/xtts-v2-truss/model/model.py
@@ -1,9 +1,11 @@
 import base64
-import os
+from tempfile import NamedTemporaryFile
 from typing import Dict
 
 from TTS.api import TTS
 
+DEFAULT_SPEAKER_NAME = "Claribel Dervla"
+
 
 class Model:
     def __init__(self, **kwargs):
@@ -24,51 +26,18 @@ def wav_to_base64(self, file_path):
             base64_string = base64_data.decode("utf-8")
             return base64_string
 
-    def preprocess(self, request: Dict) -> Dict:
-        text = request.get("text")
-        speaker_voice = request.get("speaker_voice")
-        language = request.get("language")
-        supported_languages = {
-            "en",
-            "es",
-            "fr",
-            "de",
-            "it",
-            "pt",
-            "pl",
-            "tr",
-            "ru",
-            "nl",
-            "cs",
-            "ar",
-            "zh-cn",
-        }
-
-        if language not in supported_languages:
-            return {
-                "output": f"The language you chose is not supported. Please select from the following choices: {supported_languages}"
-            }
-
-        self.base64_to_wav(speaker_voice, "speaker_voice.wav")
-        return {
-            "text": text,
-            "speaker_voice": "speaker_voice.wav",
-            "language": language,
-        }
-
     def predict(self, request: Dict) -> Dict:
-        text = request.pop("text")
-        speaker_voice = request.pop("speaker_voice")
-        language = request.pop("language")
-        self.model.tts_to_file(
-            text=text,
-            file_path="output.wav",
-            speaker_wav=speaker_voice,
-            language=language,
-        )
-
-        base64_string = self.wav_to_base64("output.wav")
-
-        os.remove("speaker_voice.wav")
-        os.remove("output.wav")
-        return {"output": base64_string}
+        text = request.get("text")
+        speaker_voice = request.get("speaker_voice", DEFAULT_SPEAKER_NAME)
+        language = request.get("language", "en")
+
+        with NamedTemporaryFile(delete=True) as fp:
+            self.model.tts_to_file(
+                text=text,
+                file_path=fp.name,
+                speaker=speaker_voice,
+                language=language,
+            )
+
+            base64_string = self.wav_to_base64(fp.name)
+            return {"output": base64_string}