diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md
index ce682495e97..e471ec95032 100644
--- a/examples/csmsc/tts0/README.md
+++ b/examples/csmsc/tts0/README.md
@@ -5,6 +5,17 @@ This example contains code used to train a [Tacotron2](https://arxiv.org/abs/171
 ### Download and Extract
 Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
+The structure of the folder is listed below.
+
+```text
+└─ Wave
+    └─ .wav files (audio speech)
+└─ PhoneLabeling
+    └─ .interval files (alignment between phoneme and duration)
+└─ ProsodyLabeling
+    └─ 000001-010000.txt (text with prosodic annotations, in pinyin)
+```
+
 ### Get MFA Result and Extract
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2; the durations produced by MFA are not needed here.
 You can download it here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py
index 5d1a2484536..86407e7786e 100644
--- a/paddlespeech/t2s/modules/tacotron2/attentions.py
+++ b/paddlespeech/t2s/modules/tacotron2/attentions.py
@@ -171,7 +171,8 @@ def forward(
         if paddle.sum(att_prev) == 0:
             # if no bias, 0 0-pad goes 0
             att_prev = 1.0 - make_pad_mask(enc_hs_len)
-            att_prev = att_prev / enc_hs_len.unsqueeze(-1)
+            att_prev = att_prev / enc_hs_len.unsqueeze(-1).astype(
+                att_prev.dtype)

         # att_prev: (utt, frame) -> (utt, 1, 1, frame)
         # -> (utt, att_conv_chans, 1, frame)
diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py
index 224c82400d2..ac942be0f13 100644
--- a/paddlespeech/t2s/modules/tacotron2/encoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/encoder.py
@@ -162,6 +162,9 @@ def forward(self, xs, ilens=None):
             return xs.transpose([0, 2, 1])
         if not isinstance(ilens, paddle.Tensor):
             ilens = paddle.to_tensor(ilens)
+        # if ilens is a 0-dim tensor, add a batch dimension
+        if ilens.ndim == 0:
+            ilens = ilens.unsqueeze(0)
         xs = xs.transpose([0, 2, 1])
         # for dygraph to static graph
         # self.blstm.flatten_parameters()
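
A few reviewer notes on the hunks above, each with a small standalone sketch (none of this code is part of the patch).

First, the README hunk documents the expected dataset layout. A quick illustrative check that the extraction produced that layout, using the `~/datasets/BZNSYP` path from the README (the script itself is an assumption, not part of the repo):

```python
from pathlib import Path

# Sanity-check that the extracted CSMSC dataset matches the layout
# documented in the README hunk above.
root = Path.home() / "datasets" / "BZNSYP"
for sub in ("Wave", "PhoneLabeling", "ProsodyLabeling"):
    assert (root / sub).is_dir(), f"missing {root / sub}"
print("dataset layout looks OK")
```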
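Second, the `attentions.py` hunk: `enc_hs_len` holds integer per-utterance encoder lengths while `att_prev` is float32, and dividing mixed dtypes can raise a dtype-mismatch error (or promote unexpectedly) in some Paddle versions. A minimal sketch of the cast, with toy shapes and values assumed for illustration:

```python
import paddle

# Toy stand-ins for the values in the attention forward pass: integer
# lengths per utterance and a uniform float32 initial attention.
enc_hs_len = paddle.to_tensor([4, 2], dtype="int64")  # (utt,)
att_prev = paddle.ones([2, 4], dtype="float32")       # (utt, frame)

# Dividing float32 by int64 directly can fail in some Paddle versions,
# so cast the lengths to the attention dtype before broadcasting.
att_prev = att_prev / enc_hs_len.unsqueeze(-1).astype(att_prev.dtype)

print(att_prev)  # every entry of row i is 1 / enc_hs_len[i]
```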
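Third, the `encoder.py` hunk: in recent Paddle versions, `paddle.to_tensor` on a plain Python scalar yields a 0-dim tensor, while the rest of `forward()` treats `ilens` as a 1-D batch of lengths. A minimal sketch of the guard, assuming a single scalar length as input:

```python
import paddle

# A single utterance length passed as a plain int becomes a 0-dim
# tensor in recent Paddle versions (shape []).
ilens = paddle.to_tensor(37)

# The guard added in encoder.py: promote a 0-dim length to a 1-D batch
# so downstream ops see one length per utterance.
if ilens.ndim == 0:
    ilens = ilens.unsqueeze(0)  # shape [1]

print(ilens.shape)  # [1]
```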