diff --git a/.github/workflows/run-java-test.yaml b/.github/workflows/run-java-test.yaml index 0976f89e2b..4ede6514a5 100644 --- a/.github/workflows/run-java-test.yaml +++ b/.github/workflows/run-java-test.yaml @@ -234,8 +234,12 @@ jobs: run: | cd ./java-api-examples + ./run-non-streaming-tts-kokoro-en.sh ./run-non-streaming-tts-matcha-zh.sh ./run-non-streaming-tts-matcha-en.sh + ls -lh + + rm -rf kokoro-en-* rm -rf matcha-icefall-* rm hifigan_v2.onnx diff --git a/CHANGELOG.md b/CHANGELOG.md index b7a29ecb12..dfd4ef92bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,23 @@ +## 1.10.40 + +* Fix building wheels (#1703) +* Export kokoro to sherpa-onnx (#1713) +* Add C++ and Python API for Kokoro TTS models. (#1715) +* Add C API for Kokoro TTS models (#1717) +* Fix style issues (#1718) +* Add C# API for Kokoro TTS models (#1720) +* Add Swift API for Kokoro TTS models (#1721) +* Add Go API for Kokoro TTS models (#1722) +* Add Dart API for Kokoro TTS models (#1723) +* Add Pascal API for Kokoro TTS models (#1724) +* Add JavaScript API (node-addon) for Kokoro TTS models (#1725) +* Add JavaScript (WebAssembly) API for Kokoro TTS models. (#1726) +* Add Kotlin and Java API for Kokoro TTS models (#1728) +* Update README.md for KWS to not use git lfs. 
(#1729) + + + + ## 1.10.39 * Fix building without TTS (#1691) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9f4276667..9955bc9f7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(sherpa-onnx) # Remember to update # ./CHANGELOG.md # ./new-release.sh -set(SHERPA_ONNX_VERSION "1.10.39") +set(SHERPA_ONNX_VERSION "1.10.40") # Disable warning about # diff --git a/android/SherpaOnnxAar/README.md b/android/SherpaOnnxAar/README.md index ec50ebf402..8f4932072d 100644 --- a/android/SherpaOnnxAar/README.md +++ b/android/SherpaOnnxAar/README.md @@ -4,8 +4,8 @@ git clone https://github.com/k2-fsa/sherpa-onnx cd sherpa-onnx -wget https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.10.39/sherpa-onnx-v1.10.39-android.tar.bz2 -tar xvf sherpa-onnx-v1.10.39-android.tar.bz2 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.10.40/sherpa-onnx-v1.10.40-android.tar.bz2 +tar xvf sherpa-onnx-v1.10.40-android.tar.bz2 cp -v jniLibs/arm64-v8a/* android/SherpaOnnxAar/sherpa_onnx/src/main/jniLibs/arm64-v8a/ cp -v jniLibs/armeabi-v7a/* android/SherpaOnnxAar/sherpa_onnx/src/main/jniLibs/armeabi-v7a/ @@ -16,5 +16,5 @@ cd android/SherpaOnnxAar ./gradlew :sherpa_onnx:assembleRelease ls -lh ./sherpa_onnx/build/outputs/aar/sherpa_onnx-release.aar -cp ./sherpa_onnx/build/outputs/aar/sherpa_onnx-release.aar ../../sherpa-onnx-1.10.39.aar +cp ./sherpa_onnx/build/outputs/aar/sherpa_onnx-release.aar ../../sherpa-onnx-1.10.40.aar ``` diff --git a/android/SherpaOnnxJavaDemo/app/build.gradle b/android/SherpaOnnxJavaDemo/app/build.gradle index 8e66f4355d..38405d9165 100644 --- a/android/SherpaOnnxJavaDemo/app/build.gradle +++ b/android/SherpaOnnxJavaDemo/app/build.gradle @@ -34,5 +34,5 @@ dependencies { implementation 'pub.devrel:easypermissions:3.0.0' implementation 'androidx.core:core-ktx:1.7.0' // implementation files('/Users/fangjun/open-source/sherpa-onnx/android/SherpaOnnxAar/sherpa_onnx/build/outputs/aar/sherpa_onnx-release.aar') - implementation 
'com.github.k2-fsa:sherpa-onnx:v1.10.39' + implementation 'com.github.k2-fsa:sherpa-onnx:v1.10.40' } diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt index 5119a50b28..5aa5b9ad8c 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt @@ -185,6 +185,7 @@ class MainActivity : AppCompatActivity() { var modelName: String? var acousticModelName: String? var vocoder: String? + var voices: String? var ruleFsts: String? var ruleFars: String? var lexicon: String? @@ -205,6 +206,10 @@ class MainActivity : AppCompatActivity() { vocoder = null // Matcha -- end + // For Kokoro -- begin + voices = null + // For Kokoro -- end + modelDir = null ruleFsts = null @@ -269,6 +274,13 @@ class MainActivity : AppCompatActivity() { // vocoder = "hifigan_v2.onnx" // dataDir = "matcha-icefall-en_US-ljspeech/espeak-ng-data" + // Example 9 + // kokoro-en-v0_19 + // modelDir = "kokoro-en-v0_19" + // modelName = "model.onnx" + // voices = "voices.bin" + // dataDir = "kokoro-en-v0_19/espeak-ng-data" + if (dataDir != null) { val newDir = copyDataDir(dataDir!!) 
dataDir = "$newDir/$dataDir" @@ -285,6 +297,7 @@ class MainActivity : AppCompatActivity() { modelName = modelName ?: "", acousticModelName = acousticModelName ?: "", vocoder = vocoder ?: "", + voices = voices ?: "", lexicon = lexicon ?: "", dataDir = dataDir ?: "", dictDir = dictDir ?: "", diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/GetSampleText.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/GetSampleText.kt index a01e0a7b6d..e372be4329 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/GetSampleText.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/GetSampleText.kt @@ -47,7 +47,7 @@ fun getSampleText(lang: String): String { } "eng" -> { - text = "This is a text-to-speech engine using next generation Kaldi" + text = "How are you doing today? This is a text-to-speech engine using next generation Kaldi" } "est" -> { diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt index c5fb8d9c6b..7d60454772 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt @@ -3,6 +3,10 @@ package com.k2fsa.sherpa.onnx.tts.engine import PreferenceHelper +import android.media.AudioAttributes +import android.media.AudioFormat +import android.media.AudioManager +import android.media.AudioTrack import android.media.MediaPlayer import android.net.Uri import android.os.Bundle @@ -37,9 +41,15 @@ import androidx.compose.ui.Modifier import androidx.compose.ui.text.input.KeyboardType import androidx.compose.ui.unit.dp import com.k2fsa.sherpa.onnx.tts.engine.ui.theme.SherpaOnnxTtsEngineTheme +import kotlinx.coroutines.CoroutineScope 
+import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.channels.Channel +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext import java.io.File import kotlin.math.roundToInt import kotlinx.coroutines.delay +import kotlin.time.TimeSource const val TAG = "sherpa-onnx-tts-engine" @@ -48,9 +58,26 @@ class MainActivity : ComponentActivity() { private val ttsViewModel: TtsViewModel by viewModels() private var mediaPlayer: MediaPlayer? = null + + // see + // https://developer.android.com/reference/kotlin/android/media/AudioTrack + private lateinit var track: AudioTrack + + private var stopped: Boolean = false + + private var samplesChannel = Channel<FloatArray>() + override fun onCreate(savedInstanceState: Bundle?) { super.onCreate(savedInstanceState) + + Log.i(TAG, "Start to initialize TTS") TtsEngine.createTts(this) + Log.i(TAG, "Finish initializing TTS") + + Log.i(TAG, "Start to initialize AudioTrack") + initAudioTrack() + Log.i(TAG, "Finish initializing AudioTrack") + val preferenceHelper = PreferenceHelper(this) TtsEngine.cacheSize = preferenceHelper.getCacheSizeInMB() @@ -108,6 +135,11 @@ class MainActivity : ComponentActivity() { val testTextContent = getSampleText(TtsEngine.lang ?: "") var testText by remember { mutableStateOf(testTextContent) } + var startEnabled by remember { mutableStateOf(true) } + var playEnabled by remember { mutableStateOf(false) } + var rtfText by remember { + mutableStateOf("") + } val numSpeakers = TtsEngine.tts!!.numSpeakers() if (numSpeakers > 1) { @@ -150,52 +182,117 @@ class MainActivity : ComponentActivity() { Row { Button( - modifier = Modifier.padding(20.dp), + enabled = startEnabled, + modifier = Modifier.padding(5.dp), onClick = { Log.i(TAG, "Clicked, text: $testText") if (testText.isBlank() || testText.isEmpty()) { Toast.makeText( applicationContext, - "Please input a test sentence", + "Please input some text to generate", Toast.LENGTH_SHORT ).show() } else { - val audio = TtsEngine.tts!!.generate( - text = 
testText, - sid = TtsEngine.speakerId, - speed = TtsEngine.speed, - ) - - val filename = - application.filesDir.absolutePath + "/generated.wav" - val ok = - audio.samples.isNotEmpty() && audio.save( - filename - ) + startEnabled = false + playEnabled = false + stopped = false - if (ok) { - stopMediaPlayer() - mediaPlayer = MediaPlayer.create( - applicationContext, - Uri.fromFile(File(filename)) - ) - mediaPlayer?.start() - } else { - Log.i(TAG, "Failed to generate or save audio") + track.pause() + track.flush() + track.play() + rtfText = "" + Log.i(TAG, "Started with text $testText") + + samplesChannel = Channel<FloatArray>() + + CoroutineScope(Dispatchers.IO).launch { + for (samples in samplesChannel) { + track.write( + samples, + 0, + samples.size, + AudioTrack.WRITE_BLOCKING + ) + if (stopped) { + break + } + } } + + CoroutineScope(Dispatchers.Default).launch { + val timeSource = TimeSource.Monotonic + val startTime = timeSource.markNow() + + val audio = + TtsEngine.tts!!.generateWithCallback( + text = testText, + sid = TtsEngine.speakerId, + speed = TtsEngine.speed, + callback = ::callback, + ) + + val elapsed = + startTime.elapsedNow().inWholeMilliseconds.toFloat() / 1000; + val audioDuration = + audio.samples.size / TtsEngine.tts!!.sampleRate() + .toFloat() + val RTF = String.format( + "Number of threads: %d\nElapsed: %.3f s\nAudio duration: %.3f s\nRTF: %.3f/%.3f = %.3f", + TtsEngine.tts!!.config.model.numThreads, + elapsed, + audioDuration, + elapsed, + audioDuration, + elapsed / audioDuration + ) + samplesChannel.close() + + val filename = + application.filesDir.absolutePath + "/generated.wav" + + + val ok = + audio.samples.isNotEmpty() && audio.save( + filename + ) + + if (ok) { + withContext(Dispatchers.Main) { + startEnabled = true + playEnabled = true + rtfText = RTF + } + } + }.start() } }) { - Text("Test") + Text("Start") } Button( - modifier = Modifier.padding(20.dp), + modifier = Modifier.padding(5.dp), + enabled = playEnabled, onClick = { - 
TtsEngine.speakerId = 0 - TtsEngine.speed = 1.0f - testText = "" + stopped = true + track.pause() + track.flush() + onClickPlay() }) { - Text("Reset") + Text("Play") + } + + Button( + modifier = Modifier.padding(5.dp), + onClick = { + onClickStop() + startEnabled = true + }) { + Text("Stop") + } + } + if (rtfText.isNotEmpty()) { + Row { + Text(rtfText) } Button( @@ -238,4 +335,63 @@ class MainActivity : ComponentActivity() { mediaPlayer?.release() mediaPlayer = null } -} \ No newline at end of file + + private fun onClickPlay() { + val filename = application.filesDir.absolutePath + "/generated.wav" + stopMediaPlayer() + mediaPlayer = MediaPlayer.create( + applicationContext, + Uri.fromFile(File(filename)) + ) + mediaPlayer?.start() + } + + private fun onClickStop() { + stopped = true + track.pause() + track.flush() + + stopMediaPlayer() + } + + // this function is called from C++ + private fun callback(samples: FloatArray): Int { + if (!stopped) { + val samplesCopy = samples.copyOf() + CoroutineScope(Dispatchers.IO).launch { + samplesChannel.send(samplesCopy) + } + return 1 + } else { + track.stop() + Log.i(TAG, " return 0") + return 0 + } + } + + private fun initAudioTrack() { + val sampleRate = TtsEngine.tts!!.sampleRate() + val bufLength = AudioTrack.getMinBufferSize( + sampleRate, + AudioFormat.CHANNEL_OUT_MONO, + AudioFormat.ENCODING_PCM_FLOAT + ) + Log.i(TAG, "sampleRate: $sampleRate, buffLength: $bufLength") + + val attr = AudioAttributes.Builder().setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) + .setUsage(AudioAttributes.USAGE_MEDIA) + .build() + + val format = AudioFormat.Builder() + .setEncoding(AudioFormat.ENCODING_PCM_FLOAT) + .setChannelMask(AudioFormat.CHANNEL_OUT_MONO) + .setSampleRate(sampleRate) + .build() + + track = AudioTrack( + attr, format, bufLength, AudioTrack.MODE_STREAM, + AudioManager.AUDIO_SESSION_ID_GENERATE + ) + track.play() + } +} diff --git 
a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt index 65eeea04b2..91f6b2d127 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt @@ -48,8 +48,9 @@ object TtsEngine { private var modelDir: String? = null private var modelName: String? = null - private var acousticModelName: String? = null - private var vocoder: String? = null + private var acousticModelName: String? = null // for matcha tts + private var vocoder: String? = null // for matcha tts + private var voices: String? = null // for kokoro private var ruleFsts: String? = null private var ruleFars: String? = null private var lexicon: String? = null @@ -71,6 +72,10 @@ object TtsEngine { vocoder = null // For Matcha -- end + // For Kokoro -- begin + voices = null + // For Kokoro -- end + modelDir = null ruleFsts = null ruleFars = null @@ -146,6 +151,14 @@ object TtsEngine { // vocoder = "hifigan_v2.onnx" // dataDir = "matcha-icefall-en_US-ljspeech/espeak-ng-data" // lang = "eng" + + // Example 9 + // kokoro-en-v0_19 + // modelDir = "kokoro-en-v0_19" + // modelName = "model.onnx" + // voices = "voices.bin" + // dataDir = "kokoro-en-v0_19/espeak-ng-data" + // lang = "eng" } fun createTts(context: Context) { @@ -174,6 +187,7 @@ object TtsEngine { modelName = modelName ?: "", acousticModelName = acousticModelName ?: "", vocoder = vocoder ?: "", + voices = voices ?: "", lexicon = lexicon ?: "", dataDir = dataDir ?: "", dictDir = dictDir ?: "", diff --git a/android/SherpaOnnxTtsEngine/app/src/main/res/values/strings.xml b/android/SherpaOnnxTtsEngine/app/src/main/res/values/strings.xml index ac28473148..67518e0a38 100755 --- a/android/SherpaOnnxTtsEngine/app/src/main/res/values/strings.xml +++ 
b/android/SherpaOnnxTtsEngine/app/src/main/res/values/strings.xml @@ -1,3 +1,3 @@ <resources> - <string name="app_name">TTS Engine</string> + <string name="app_name">TTS Engine: Next-gen Kaldi</string> </resources> \ No newline at end of file diff --git a/build-ios-shared.sh b/build-ios-shared.sh index 39e5c0f2f1..54a374352b 100755 --- a/build-ios-shared.sh +++ b/build-ios-shared.sh @@ -242,7 +242,7 @@ for d in ios-arm64_x86_64-simulator ios-arm64; do <key>CFBundlePackageType</key> <string>FMWK</string> <key>CFBundleShortVersionString</key> - <string>1.10.39</string> + <string>1.10.40</string> <key>CFBundleSupportedPlatforms</key> <array> <string>iPhoneOS</string> diff --git a/dart-api-examples/add-punctuations/pubspec.yaml b/dart-api-examples/add-punctuations/pubspec.yaml index ca01d32df6..4ac2868b5a 100644 --- a/dart-api-examples/add-punctuations/pubspec.yaml +++ b/dart-api-examples/add-punctuations/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ">=3.0.0 <4.0.0" dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/audio-tagging/pubspec.yaml b/dart-api-examples/audio-tagging/pubspec.yaml index d47d93bfb8..15ecf82867 100644 --- a/dart-api-examples/audio-tagging/pubspec.yaml +++ b/dart-api-examples/audio-tagging/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ">=3.0.0 <4.0.0" dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/keyword-spotter/pubspec.yaml b/dart-api-examples/keyword-spotter/pubspec.yaml index 4210cbfb44..02a8c1eb9e 100644 --- a/dart-api-examples/keyword-spotter/pubspec.yaml +++ b/dart-api-examples/keyword-spotter/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ">=3.0.0 <4.0.0" dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 # sherpa_onnx: # path: ../../flutter/sherpa_onnx path: ^1.9.0 diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml index 93a0ad3a2f..02f5b79274 100644 --- a/dart-api-examples/non-streaming-asr/pubspec.yaml +++ b/dart-api-examples/non-streaming-asr/pubspec.yaml @@ -10,7 +10,7 @@ environment: # Add regular dependencies here. 
dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/speaker-diarization/pubspec.yaml b/dart-api-examples/speaker-diarization/pubspec.yaml index df44568d3e..ab74d425f7 100644 --- a/dart-api-examples/speaker-diarization/pubspec.yaml +++ b/dart-api-examples/speaker-diarization/pubspec.yaml @@ -8,7 +8,7 @@ environment: sdk: ">=3.0.0 <4.0.0" dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 # sherpa_onnx: # path: ../../flutter/sherpa_onnx path: ^1.9.0 diff --git a/dart-api-examples/speaker-identification/pubspec.yaml b/dart-api-examples/speaker-identification/pubspec.yaml index bc70976d92..5725fb4340 100644 --- a/dart-api-examples/speaker-identification/pubspec.yaml +++ b/dart-api-examples/speaker-identification/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ">=3.0.0 <4.0.0" dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml index ee79c40dd0..092506fcc1 100644 --- a/dart-api-examples/streaming-asr/pubspec.yaml +++ b/dart-api-examples/streaming-asr/pubspec.yaml @@ -11,7 +11,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml index 718c4201a7..34309474ab 100644 --- a/dart-api-examples/tts/pubspec.yaml +++ b/dart-api-examples/tts/pubspec.yaml @@ -8,7 +8,7 @@ environment: # Add regular dependencies here. 
dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/vad-with-non-streaming-asr/pubspec.yaml b/dart-api-examples/vad-with-non-streaming-asr/pubspec.yaml index 250c45d9a1..eff21233b3 100644 --- a/dart-api-examples/vad-with-non-streaming-asr/pubspec.yaml +++ b/dart-api-examples/vad-with-non-streaming-asr/pubspec.yaml @@ -10,7 +10,7 @@ environment: sdk: ">=3.0.0 <4.0.0" dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml index 66ad030ace..a6b1b626d6 100644 --- a/dart-api-examples/vad/pubspec.yaml +++ b/dart-api-examples/vad/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ">=3.0.0 <4.0.0" dependencies: - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 path: ^1.9.0 args: ^2.5.0 diff --git a/flutter-examples/streaming_asr/pubspec.yaml b/flutter-examples/streaming_asr/pubspec.yaml index 12116dd6e1..0f67343753 100644 --- a/flutter-examples/streaming_asr/pubspec.yaml +++ b/flutter-examples/streaming_asr/pubspec.yaml @@ -5,7 +5,7 @@ description: > publish_to: 'none' -version: 1.10.39 +version: 1.10.40 topics: - speech-recognition @@ -31,7 +31,7 @@ dependencies: record: ^5.1.0 url_launcher: ^6.2.6 - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 # sherpa_onnx: # path: ../../flutter/sherpa_onnx diff --git a/flutter-examples/tts/pubspec.yaml b/flutter-examples/tts/pubspec.yaml index 976a03e5d7..d0a2e3f971 100644 --- a/flutter-examples/tts/pubspec.yaml +++ b/flutter-examples/tts/pubspec.yaml @@ -5,7 +5,7 @@ description: > publish_to: 'none' # Remove this line if you wish to publish to pub.dev -version: 1.10.39 +version: 1.10.40 environment: sdk: ">=2.17.0 <4.0.0" @@ -18,7 +18,7 @@ dependencies: cupertino_icons: ^1.0.6 path_provider: ^2.1.3 path: ^1.9.0 - sherpa_onnx: ^1.10.39 + sherpa_onnx: ^1.10.40 # sherpa_onnx: # path: ../../flutter/sherpa_onnx url_launcher: 6.2.6 diff --git 
a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml index 8dc0f43b48..b0ed0e5566 100644 --- a/flutter/sherpa_onnx/pubspec.yaml +++ b/flutter/sherpa_onnx/pubspec.yaml @@ -17,7 +17,7 @@ topics: - voice-activity-detection # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec -version: 1.10.39 +version: 1.10.40 homepage: https://github.com/k2-fsa/sherpa-onnx @@ -30,23 +30,23 @@ dependencies: flutter: sdk: flutter - sherpa_onnx_android: ^1.10.39 + sherpa_onnx_android: ^1.10.40 # sherpa_onnx_android: # path: ../sherpa_onnx_android - sherpa_onnx_macos: ^1.10.39 + sherpa_onnx_macos: ^1.10.40 # sherpa_onnx_macos: # path: ../sherpa_onnx_macos - sherpa_onnx_linux: ^1.10.39 + sherpa_onnx_linux: ^1.10.40 # sherpa_onnx_linux: # path: ../sherpa_onnx_linux - sherpa_onnx_windows: ^1.10.39 + sherpa_onnx_windows: ^1.10.40 # sherpa_onnx_windows: # path: ../sherpa_onnx_windows - sherpa_onnx_ios: ^1.10.39 + sherpa_onnx_ios: ^1.10.40 # sherpa_onnx_ios: # path: ../sherpa_onnx_ios diff --git a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec index 02f2be1e6b..8b2106c43f 100644 --- a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec +++ b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec @@ -7,7 +7,7 @@ # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c Pod::Spec.new do |s| s.name = 'sherpa_onnx_ios' - s.version = '1.10.39' + s.version = '1.10.40' s.summary = 'A new Flutter FFI plugin project.' s.description = <<-DESC A new Flutter FFI plugin project. 
diff --git a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec index 89d9f7bab4..0090f84e06 100644 --- a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec +++ b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec @@ -4,7 +4,7 @@ # Pod::Spec.new do |s| s.name = 'sherpa_onnx_macos' - s.version = '1.10.39' + s.version = '1.10.40' s.summary = 'sherpa-onnx Flutter FFI plugin project.' s.description = <<-DESC sherpa-onnx Flutter FFI plugin project. diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/README.md b/harmony-os/SherpaOnnxHar/sherpa_onnx/README.md index 917401b430..358a2d107c 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/README.md +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/README.md @@ -23,7 +23,7 @@ or update your `oh-package.json5` to include the following: ``` "dependencies": { - "sherpa_onnx": "1.10.39", + "sherpa_onnx": "1.10.40", }, ``` diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/oh-package.json5 b/harmony-os/SherpaOnnxHar/sherpa_onnx/oh-package.json5 index 34af61da28..676e140924 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/oh-package.json5 +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/oh-package.json5 @@ -1,6 +1,6 @@ { "name": "sherpa_onnx", - "version": "1.10.39", + "version": "1.10.40", "description": "On-device speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without Internet connection", "main": "Index.ets", "author": "The next-gen Kaldi team", diff --git a/harmony-os/SherpaOnnxSpeakerDiarization/entry/oh-package.json5 b/harmony-os/SherpaOnnxSpeakerDiarization/entry/oh-package.json5 index 5449d3e814..faf4a788ae 100644 --- a/harmony-os/SherpaOnnxSpeakerDiarization/entry/oh-package.json5 +++ b/harmony-os/SherpaOnnxSpeakerDiarization/entry/oh-package.json5 @@ -6,7 +6,7 @@ "author": "", "license": "", "dependencies": { - "sherpa_onnx": "1.10.39" + "sherpa_onnx": "1.10.40" } } diff --git 
a/harmony-os/SherpaOnnxSpeakerIdentification/entry/oh-package.json5 b/harmony-os/SherpaOnnxSpeakerIdentification/entry/oh-package.json5 index 5f2f6b5ff5..e50200119f 100644 --- a/harmony-os/SherpaOnnxSpeakerIdentification/entry/oh-package.json5 +++ b/harmony-os/SherpaOnnxSpeakerIdentification/entry/oh-package.json5 @@ -6,7 +6,7 @@ "author": "", "license": "", "dependencies": { - "sherpa_onnx": "1.10.39", + "sherpa_onnx": "1.10.40", } } diff --git a/harmony-os/SherpaOnnxStreamingAsr/entry/oh-package.json5 b/harmony-os/SherpaOnnxStreamingAsr/entry/oh-package.json5 index 5f2f6b5ff5..e50200119f 100644 --- a/harmony-os/SherpaOnnxStreamingAsr/entry/oh-package.json5 +++ b/harmony-os/SherpaOnnxStreamingAsr/entry/oh-package.json5 @@ -6,7 +6,7 @@ "author": "", "license": "", "dependencies": { - "sherpa_onnx": "1.10.39", + "sherpa_onnx": "1.10.40", } } diff --git a/harmony-os/SherpaOnnxTts/entry/oh-package.json5 b/harmony-os/SherpaOnnxTts/entry/oh-package.json5 index 5f2f6b5ff5..e50200119f 100644 --- a/harmony-os/SherpaOnnxTts/entry/oh-package.json5 +++ b/harmony-os/SherpaOnnxTts/entry/oh-package.json5 @@ -6,7 +6,7 @@ "author": "", "license": "", "dependencies": { - "sherpa_onnx": "1.10.39", + "sherpa_onnx": "1.10.40", } } diff --git a/harmony-os/SherpaOnnxVadAsr/entry/README.md b/harmony-os/SherpaOnnxVadAsr/entry/README.md index 467c213913..948dc7ffb0 100644 --- a/harmony-os/SherpaOnnxVadAsr/entry/README.md +++ b/harmony-os/SherpaOnnxVadAsr/entry/README.md @@ -1,6 +1,6 @@ # Introduction -Please download ./sherpa_onnx-v1.10.39.har +Please download ./sherpa_onnx-v1.10.40.har from Hint: For users who have no access to huggingface, please use diff --git a/harmony-os/SherpaOnnxVadAsr/entry/oh-package.json5 b/harmony-os/SherpaOnnxVadAsr/entry/oh-package.json5 index fae5caf53c..ca1e2eaf12 100644 --- a/harmony-os/SherpaOnnxVadAsr/entry/oh-package.json5 +++ b/harmony-os/SherpaOnnxVadAsr/entry/oh-package.json5 @@ -7,7 +7,7 @@ "license": "", "dependencies": { // please see 
https://ohpm.openharmony.cn/#/cn/detail/sherpa_onnx - "sherpa_onnx": "1.10.39", + "sherpa_onnx": "1.10.40", } } diff --git a/java-api-examples/NonStreamingTtsKokoroEn.java b/java-api-examples/NonStreamingTtsKokoroEn.java new file mode 100644 index 0000000000..e36ba663c5 --- /dev/null +++ b/java-api-examples/NonStreamingTtsKokoroEn.java @@ -0,0 +1,60 @@ +// Copyright 2025 Xiaomi Corporation + +// This file shows how to use a Kokoro English model +// to convert text to speech +import com.k2fsa.sherpa.onnx.*; + +public class NonStreamingTtsKokoroEn { + public static void main(String[] args) { + // please visit + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html + // to download model files + String model = "./kokoro-en-v0_19/model.onnx"; + String voices = "./kokoro-en-v0_19/voices.bin"; + String tokens = "./kokoro-en-v0_19/tokens.txt"; + String dataDir = "./kokoro-en-v0_19/espeak-ng-data"; + String text = + "Today as always, men fall into two groups: slaves and free men. 
Whoever does not have" + + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a" + + " businessman, an official, or a scholar."; + + OfflineTtsKokoroModelConfig kokoroModelConfig = + OfflineTtsKokoroModelConfig.builder() + .setModel(model) + .setVoices(voices) + .setTokens(tokens) + .setDataDir(dataDir) + .build(); + + OfflineTtsModelConfig modelConfig = + OfflineTtsModelConfig.builder() + .setKokoro(kokoroModelConfig) + .setNumThreads(2) + .setDebug(true) + .build(); + + OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build(); + OfflineTts tts = new OfflineTts(config); + + int sid = 0; + float speed = 1.0f; + long start = System.currentTimeMillis(); + GeneratedAudio audio = tts.generate(text, sid, speed); + long stop = System.currentTimeMillis(); + + float timeElapsedSeconds = (stop - start) / 1000.0f; + + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate(); + float real_time_factor = timeElapsedSeconds / audioDuration; + + String waveFilename = "tts-kokoro-en.wav"; + audio.save(waveFilename); + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); + System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); + System.out.printf("-- text: %s\n", text); + System.out.printf("-- Saved to %s\n", waveFilename); + + tts.release(); + } +} diff --git a/java-api-examples/run-non-streaming-tts-kokoro-en.sh b/java-api-examples/run-non-streaming-tts-kokoro-en.sh new file mode 100755 index 0000000000..cf8101cd88 --- /dev/null +++ b/java-api-examples/run-non-streaming-tts-kokoro-en.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! 
-f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html +# to download more models +if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 + tar xf kokoro-en-v0_19.tar.bz2 + rm kokoro-en-v0_19.tar.bz2 +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + NonStreamingTtsKokoroEn.java diff --git a/jitpack.yml b/jitpack.yml index 7f283a09fc..ba9c64759d 100644 --- a/jitpack.yml +++ b/jitpack.yml @@ -2,8 +2,8 @@ jdk: - openjdk17 before_install: - - wget https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.10.39/sherpa-onnx-1.10.39.aar + - wget https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.10.40/sherpa-onnx-1.10.40.aar install: - - FILE="-Dfile=sherpa-onnx-1.10.39.aar" - - mvn install:install-file $FILE -DgroupId=com.k2fsa.sherpa.onnx -DartifactId=sherpa-onnx -Dversion=1.10.39 -Dpackaging=aar -DgeneratePom=true + - FILE="-Dfile=sherpa-onnx-1.10.40.aar" + - mvn install:install-file $FILE -DgroupId=com.k2fsa.sherpa.onnx -DartifactId=sherpa-onnx -Dversion=1.10.40 -Dpackaging=aar -DgeneratePom=true diff --git a/kotlin-api-examples/run.sh b/kotlin-api-examples/run.sh index 63ea224d10..02339f952d 100755 --- a/kotlin-api-examples/run.sh +++ b/kotlin-api-examples/run.sh @@ -115,6 +115,12 @@ function testTts() { curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx fi + if [ ! 
-f ./kokoro-en-v0_19/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 + tar xf kokoro-en-v0_19.tar.bz2 + rm kokoro-en-v0_19.tar.bz2 + fi + out_filename=test_tts.jar kotlinc-jvm -include-runtime -d $out_filename \ test_tts.kt \ diff --git a/kotlin-api-examples/test_tts.kt b/kotlin-api-examples/test_tts.kt index 3865c33e31..d9637873d5 100644 --- a/kotlin-api-examples/test_tts.kt +++ b/kotlin-api-examples/test_tts.kt @@ -3,6 +3,28 @@ package com.k2fsa.sherpa.onnx fun main() { testVits() testMatcha() + testKokoro() +} + +fun testKokoro() { + // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models + var config = OfflineTtsConfig( + model=OfflineTtsModelConfig( + kokoro=OfflineTtsKokoroModelConfig( + model="./kokoro-en-v0_19/model.onnx", + voices="./kokoro-en-v0_19/voices.bin", + tokens="./kokoro-en-v0_19/tokens.txt", + dataDir="./kokoro-en-v0_19/espeak-ng-data", + ), + numThreads=2, + debug=true, + ), + ) + val tts = OfflineTts(config=config) + val audio = tts.generateWithCallback(text="How are you doing today?", callback=::callback) + audio.save(filename="test-kokoro-en.wav") + tts.release() + println("Saved to test-kokoro-en.wav") } fun testMatcha() { @@ -24,9 +46,9 @@ fun testMatcha() { ) val tts = OfflineTts(config=config) val audio = tts.generateWithCallback(text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。", callback=::callback) - audio.save(filename="test-zh.wav") + audio.save(filename="test-matcha-zh.wav") tts.release() - println("Saved to test-zh.wav") + println("Saved to test-matcha-zh.wav") } fun testVits() { diff --git a/new-release.sh b/new-release.sh index e52bf65b6f..dfddf6b921 100755 --- a/new-release.sh +++ b/new-release.sh @@ -2,18 +2,18 @@ set -ex -sed -i.bak 's/1\.10\.38/1\.10\.39/g' ./build-ios-shared.sh -sed -i.bak 's/1\.10\.38/1\.10\.39/g' ./pom.xml -sed -i.bak 's/1\.10\.38/1\.10\.39/g' ./jitpack.yml -sed -i.bak 
's/1\.10\.38/1\.10\.39/g' ./android/SherpaOnnxAar/README.md +sed -i.bak 's/1\.10\.39/1\.10\.40/g' ./build-ios-shared.sh +sed -i.bak 's/1\.10\.39/1\.10\.40/g' ./pom.xml +sed -i.bak 's/1\.10\.39/1\.10\.40/g' ./jitpack.yml +sed -i.bak 's/1\.10\.39/1\.10\.40/g' ./android/SherpaOnnxAar/README.md -find android -name build.gradle -type f -exec sed -i.bak 's/sherpa-onnx:v1\.10\.38/sherpa-onnx:v1\.10\.39/g' {} \; +find android -name build.gradle -type f -exec sed -i.bak 's/sherpa-onnx:v1\.10\.39/sherpa-onnx:v1\.10\.40/g' {} \; -find flutter -name *.yaml -type f -exec sed -i.bak 's/1\.10\.38/1\.10\.39/g' {} \; -find dart-api-examples -name *.yaml -type f -exec sed -i.bak 's/1\.10\.38/1\.10\.39/g' {} \; -find flutter-examples -name *.yaml -type f -exec sed -i.bak 's/1\.10\.38/1\.10\.39/g' {} \; -find flutter -name *.podspec -type f -exec sed -i.bak 's/1\.10\.38/1\.10\.39/g' {} \; -find nodejs-addon-examples -name package.json -type f -exec sed -i.bak 's/1\.10\.38/1\.10\.39/g' {} \; +find flutter -name *.yaml -type f -exec sed -i.bak 's/1\.10\.39/1\.10\.40/g' {} \; +find dart-api-examples -name *.yaml -type f -exec sed -i.bak 's/1\.10\.39/1\.10\.40/g' {} \; +find flutter-examples -name *.yaml -type f -exec sed -i.bak 's/1\.10\.39/1\.10\.40/g' {} \; +find flutter -name *.podspec -type f -exec sed -i.bak 's/1\.10\.39/1\.10\.40/g' {} \; +find nodejs-addon-examples -name package.json -type f -exec sed -i.bak 's/1\.10\.39/1\.10\.40/g' {} \; -find harmony-os -name "README.md" -type f -exec sed -i.bak 's/1\.10\.38/1\.10\.39/g' {} \; -find harmony-os -name oh-package.json5 -type f -exec sed -i.bak 's/1\.10\.38/1\.10\.39/g' {} \; +find harmony-os -name "README.md" -type f -exec sed -i.bak 's/1\.10\.39/1\.10\.40/g' {} \; +find harmony-os -name oh-package.json5 -type f -exec sed -i.bak 's/1\.10\.39/1\.10\.40/g' {} \; diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json index debe3ac96e..6a77791651 100644 --- a/nodejs-addon-examples/package.json +++ 
b/nodejs-addon-examples/package.json @@ -1,5 +1,5 @@ { "dependencies": { - "sherpa-onnx-node": "^1.10.39" + "sherpa-onnx-node": "^1.10.40" } } diff --git a/pom.xml b/pom.xml index 7f9c217b68..1063994c31 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ 4.0.0 com.k2fsa.sherpa.onnx sherpa-onnx-android - 1.10.39 + 1.10.40 https://github.com/k2-fsa/sherpa-onnx pom First Android Library diff --git a/scripts/apk/build-apk-tts-engine.sh.in b/scripts/apk/build-apk-tts-engine.sh.in index 69933d2fc1..dd00b72e99 100644 --- a/scripts/apk/build-apk-tts-engine.sh.in +++ b/scripts/apk/build-apk-tts-engine.sh.in @@ -39,6 +39,7 @@ model_dir={{ tts_model.model_dir }} model_name={{ tts_model.model_name }} acoustic_model_name={{ tts_model.acoustic_model_name }} vocoder={{ tts_model.vocoder }} +voices={{ tts_model.voices }} lang={{ tts_model.lang }} lang_iso_639_3={{ tts_model.lang_iso_639_3 }} @@ -70,6 +71,10 @@ sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt sed -i.bak s/"vocoder = null"/"vocoder = \"$vocoder\""/ ./TtsEngine.kt {% endif %} +{% if tts_model.voices %} + sed -i.bak s/"voices = null"/"voices = \"$voices\""/ ./TtsEngine.kt +{% endif %} + {% if tts_model.rule_fsts %} rule_fsts={{ tts_model.rule_fsts }} sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt diff --git a/scripts/apk/build-apk-tts.sh.in b/scripts/apk/build-apk-tts.sh.in index 34135f1a19..f972007e2f 100644 --- a/scripts/apk/build-apk-tts.sh.in +++ b/scripts/apk/build-apk-tts.sh.in @@ -39,6 +39,7 @@ model_dir={{ tts_model.model_dir }} model_name={{ tts_model.model_name }} acoustic_model_name={{ tts_model.acoustic_model_name }} vocoder={{ tts_model.vocoder }} +voices={{ tts_model.voices }} lang={{ tts_model.lang }} wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$model_dir.tar.bz2 @@ -69,6 +70,9 @@ sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt sed -i.bak s/"vocoder = null"/"vocoder = \"$vocoder\""/ 
./MainActivity.kt {% endif %} +{% if tts_model.voices %} + sed -i.bak s/"voices = null"/"voices = \"$voices\""/ ./MainActivity.kt +{% endif %} {% if tts_model.rule_fsts %} rule_fsts={{ tts_model.rule_fsts }} diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index 1d804ecf94..f43dc644b8 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -33,6 +33,7 @@ class TtsModel: model_name: str = "" # for vits acoustic_model_name: str = "" # for matcha vocoder: str = "" # for matcha + voices: str = "" # for kokoro lang: str = "" # en, zh, fr, de, etc. rule_fsts: Optional[List[str]] = None rule_fars: Optional[List[str]] = None @@ -409,6 +410,21 @@ def get_matcha_models() -> List[TtsModel]: return chinese_models + english_models +def get_kokoro_models() -> List[TtsModel]: + english_models = [ + TtsModel( + model_dir="kokoro-en-v0_19", + model_name="model.onnx", + lang="en", + ) + ] + for m in english_models: + m.data_dir = f"{m.model_dir}/espeak-ng-data" + m.voices = "voices.bin" + + return english_models + + def main(): args = get_args() index = args.index @@ -421,6 +437,7 @@ def main(): all_model_list += get_mimic3_models() all_model_list += get_coqui_models() all_model_list += get_matcha_models() + all_model_list += get_kokoro_models() convert_lang_to_iso_639_3(all_model_list) print(all_model_list) diff --git a/sherpa-onnx/java-api/Makefile b/sherpa-onnx/java-api/Makefile index 816d2139f2..0721daf168 100644 --- a/sherpa-onnx/java-api/Makefile +++ b/sherpa-onnx/java-api/Makefile @@ -35,6 +35,7 @@ java_files += OfflineRecognizerResult.java java_files += OfflineStream.java java_files += OfflineRecognizer.java +java_files += OfflineTtsKokoroModelConfig.java java_files += OfflineTtsMatchaModelConfig.java java_files += OfflineTtsVitsModelConfig.java java_files += OfflineTtsModelConfig.java diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsKokoroModelConfig.java 
b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsKokoroModelConfig.java new file mode 100644 index 0000000000..4088acfd38 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsKokoroModelConfig.java @@ -0,0 +1,80 @@ +// Copyright 2025 Xiaomi Corporation +package com.k2fsa.sherpa.onnx; + +public class OfflineTtsKokoroModelConfig { + private final String model; + private final String voices; + private final String tokens; + private final String dataDir; + private final float lengthScale; + + private OfflineTtsKokoroModelConfig(Builder builder) { + this.model = builder.model; + this.voices = builder.voices; + this.tokens = builder.tokens; + this.dataDir = builder.dataDir; + this.lengthScale = builder.lengthScale; + } + + public static Builder builder() { + return new Builder(); + } + + public String getModel() { + return model; + } + + public String getVoices() { + return voices; + } + + public String getTokens() { + return tokens; + } + + public String getDataDir() { + return dataDir; + } + + public float getLengthScale() { + return lengthScale; + } + + + public static class Builder { + private String model = ""; + private String voices = ""; + private String tokens = ""; + private String dataDir = ""; + private float lengthScale = 1.0f; + + public OfflineTtsKokoroModelConfig build() { + return new OfflineTtsKokoroModelConfig(this); + } + + public Builder setModel(String model) { + this.model = model; + return this; + } + + public Builder setVoices(String voices) { + this.voices = voices; + return this; + } + + public Builder setTokens(String tokens) { + this.tokens = tokens; + return this; + } + + public Builder setDataDir(String dataDir) { + this.dataDir = dataDir; + return this; + } + + public Builder setLengthScale(float lengthScale) { + this.lengthScale = lengthScale; + return this; + } + } +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsModelConfig.java 
b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsModelConfig.java index ff3589b13c..24df8a5d3a 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsModelConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsModelConfig.java @@ -5,6 +5,7 @@ public class OfflineTtsModelConfig { private final OfflineTtsVitsModelConfig vits; private final OfflineTtsMatchaModelConfig matcha; + private final OfflineTtsKokoroModelConfig kokoro; private final int numThreads; private final boolean debug; private final String provider; @@ -12,6 +13,7 @@ public class OfflineTtsModelConfig { private OfflineTtsModelConfig(Builder builder) { this.vits = builder.vits; this.matcha = builder.matcha; + this.kokoro = builder.kokoro; this.numThreads = builder.numThreads; this.debug = builder.debug; this.provider = builder.provider; @@ -29,9 +31,14 @@ public OfflineTtsMatchaModelConfig getMatcha() { return matcha; } + public OfflineTtsKokoroModelConfig getKokoro() { + return kokoro; + } + public static class Builder { private OfflineTtsVitsModelConfig vits = OfflineTtsVitsModelConfig.builder().build(); private OfflineTtsMatchaModelConfig matcha = OfflineTtsMatchaModelConfig.builder().build(); + private OfflineTtsKokoroModelConfig kokoro = OfflineTtsKokoroModelConfig.builder().build(); private int numThreads = 1; private boolean debug = true; private String provider = "cpu"; @@ -50,6 +57,11 @@ public Builder setMatcha(OfflineTtsMatchaModelConfig matcha) { return this; } + public Builder setKokoro(OfflineTtsKokoroModelConfig kokoro) { + this.kokoro = kokoro; + return this; + } + public Builder setNumThreads(int numThreads) { this.numThreads = numThreads; return this; diff --git a/sherpa-onnx/jni/offline-tts.cc b/sherpa-onnx/jni/offline-tts.cc index f57a766d2a..23a2c7ae79 100644 --- a/sherpa-onnx/jni/offline-tts.cc +++ b/sherpa-onnx/jni/offline-tts.cc @@ -113,6 +113,39 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { fid = 
env->GetFieldID(matcha_cls, "lengthScale", "F"); ans.model.matcha.length_scale = env->GetFloatField(matcha, fid); + // kokoro + fid = env->GetFieldID(model_config_cls, "kokoro", + "Lcom/k2fsa/sherpa/onnx/OfflineTtsKokoroModelConfig;"); + jobject kokoro = env->GetObjectField(model, fid); + jclass kokoro_cls = env->GetObjectClass(kokoro); + + fid = env->GetFieldID(kokoro_cls, "model", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(kokoro, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.model.kokoro.model = p; + env->ReleaseStringUTFChars(s, p); + + fid = env->GetFieldID(kokoro_cls, "voices", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(kokoro, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.model.kokoro.voices = p; + env->ReleaseStringUTFChars(s, p); + + fid = env->GetFieldID(kokoro_cls, "tokens", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(kokoro, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.model.kokoro.tokens = p; + env->ReleaseStringUTFChars(s, p); + + fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(kokoro, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.model.kokoro.data_dir = p; + env->ReleaseStringUTFChars(s, p); + + fid = env->GetFieldID(kokoro_cls, "lengthScale", "F"); + ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid); + fid = env->GetFieldID(model_config_cls, "numThreads", "I"); ans.model.num_threads = env->GetIntField(model, fid); @@ -320,8 +353,8 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl( return env->CallIntMethod(should_continue, int_value_mid); }; - auto audio = reinterpret_cast(ptr)->Generate( - p_text, sid, speed, callback_wrapper); + auto tts = reinterpret_cast(ptr); + auto audio = tts->Generate(p_text, sid, speed, callback_wrapper); jfloatArray samples_arr = env->NewFloatArray(audio.samples.size()); env->SetFloatArrayRegion(samples_arr, 0, audio.samples.size(), diff --git 
a/sherpa-onnx/kotlin-api/Tts.kt b/sherpa-onnx/kotlin-api/Tts.kt index 96f7f8080b..9461b1f372 100644 --- a/sherpa-onnx/kotlin-api/Tts.kt +++ b/sherpa-onnx/kotlin-api/Tts.kt @@ -25,9 +25,18 @@ data class OfflineTtsMatchaModelConfig( var lengthScale: Float = 1.0f, ) +data class OfflineTtsKokoroModelConfig( + var model: String = "", + var voices: String = "", + var tokens: String = "", + var dataDir: String = "", + var lengthScale: Float = 1.0f, +) + data class OfflineTtsModelConfig( var vits: OfflineTtsVitsModelConfig = OfflineTtsVitsModelConfig(), var matcha: OfflineTtsMatchaModelConfig = OfflineTtsMatchaModelConfig(), + var kokoro: OfflineTtsKokoroModelConfig = OfflineTtsKokoroModelConfig(), var numThreads: Int = 1, var debug: Boolean = false, var provider: String = "cpu", @@ -198,12 +207,32 @@ fun getOfflineTtsConfig( modelName: String, // for VITS acousticModelName: String, // for Matcha vocoder: String, // for Matcha + voices: String, // for Kokoro lexicon: String, dataDir: String, dictDir: String, ruleFsts: String, - ruleFars: String + ruleFars: String, + numThreads: Int? 
= null ): OfflineTtsConfig { + // For Matcha TTS, please set + // acousticModelName, vocoder + + // For Kokoro TTS, please set + // modelName, voices + + // For VITS, please set + // modelName + + val numberOfThreads = if (numThreads != null) { + numThreads + } else if (voices.isNotEmpty()) { + // for Kokoro TTS models, we use more threads + 4 + } else { + 2 + } + if (modelName.isEmpty() && acousticModelName.isEmpty()) { throw IllegalArgumentException("Please specify a TTS model") } @@ -215,7 +244,8 @@ fun getOfflineTtsConfig( if (acousticModelName.isNotEmpty() && vocoder.isEmpty()) { throw IllegalArgumentException("Please provide vocoder for Matcha TTS") } - val vits = if (modelName.isNotEmpty()) { + + val vits = if (modelName.isNotEmpty() && voices.isEmpty()) { OfflineTtsVitsModelConfig( model = "$modelDir/$modelName", lexicon = "$modelDir/$lexicon", @@ -240,11 +270,23 @@ fun getOfflineTtsConfig( OfflineTtsMatchaModelConfig() } + val kokoro = if (voices.isNotEmpty()) { + OfflineTtsKokoroModelConfig( + model = "$modelDir/$modelName", + voices = "$modelDir/$voices", + tokens = "$modelDir/tokens.txt", + dataDir = dataDir, + ) + } else { + OfflineTtsKokoroModelConfig() + } + return OfflineTtsConfig( model = OfflineTtsModelConfig( vits = vits, matcha = matcha, - numThreads = 2, + kokoro = kokoro, + numThreads = numberOfThreads, debug = true, provider = "cpu", ), diff --git a/wasm/kws/assets/README.md b/wasm/kws/assets/README.md index ac67fb5a04..18f792b49c 100644 --- a/wasm/kws/assets/README.md +++ b/wasm/kws/assets/README.md @@ -7,21 +7,34 @@ to download a model. 
# Kws The following is an example: -``` -cd sherpa-onnx/wasm/kws -git clone https://www.modelscope.cn/pkufool/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.git assets +```bash +cd sherpa-onnx/wasm/kws/assets +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 +tar xvf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 +rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 + +mv sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx ./ +mv sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx ./ +mv sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx ./ +mv sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt ./ +rm -rf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 ``` You should have the following files in `assets` before you can run `build-wasm-simd-kws.sh` -``` -├── decoder-epoch-12-avg-2-chunk-16-left-64.onnx -├── encoder-epoch-12-avg-2-chunk-16-left-64.onnx -├── joiner-epoch-12-avg-2-chunk-16-left-64.onnx -├── keywords_raw.txt -├── keywords.txt -├── README.md -└── tokens.txt +```bash +fangjuns-MacBook-Pro:assets fangjun$ pwd +/Users/fangjun/open-source/sherpa-onnx/wasm/kws/assets +fangjuns-MacBook-Pro:assets fangjun$ ls -lh +total 25616 +-rw-r--r-- 1 fangjun staff 692B Oct 29 16:53 README.md +-rw-r--r-- 1 fangjun staff 660K Aug 14 15:21 decoder-epoch-12-avg-2-chunk-16-left-64.onnx +-rw-r--r-- 1 fangjun staff 12M Aug 14 15:21 encoder-epoch-12-avg-2-chunk-16-left-64.onnx +-rw-r--r-- 1 fangjun staff 247K Aug 14 15:21 joiner-epoch-12-avg-2-chunk-16-left-64.onnx +-rw-r--r-- 1 fangjun staff 1.6K Aug 14 15:08 tokens.txt ``` + +**Hint**: Remember to remove extra files from ``assets``. For instance, please remember to remove +the file `sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2`.