diff --git a/README.md b/README.md
index 390d68a4..4c5ac1b6 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Parallel WaveGAN (+ MelGAN & Multi-band MelGAN) implementation with Pytorch
-![](https://github.com/kan-bayashi/ParallelWaveGAN/workflows/CI/badge.svg) [![](https://img.shields.io/pypi/v/parallel-wavegan)](https://pypi.org/project/parallel-wavegan/) ![](https://img.shields.io/pypi/pyversions/parallel-wavegan) ![](https://img.shields.io/pypi/l/parallel-wavegan) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb)
+![](https://github.com/kan-bayashi/ParallelWaveGAN/workflows/CI/badge.svg) [![](https://img.shields.io/pypi/v/parallel-wavegan)](https://pypi.org/project/parallel-wavegan/) ![](https://img.shields.io/pypi/pyversions/parallel-wavegan) ![](https://img.shields.io/pypi/l/parallel-wavegan) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb)
 
 This repository provides **UNOFFICIAL** [PWG](https://arxiv.org/abs/1910.11480), [MelGAN](https://arxiv.org/abs/1910.06711), and [MB-MelGAN](https://arxiv.org/abs/2005.05106) implementations with Pytorch.
 
 You can combine these state-of-the-art non-autoregressive models to build your own great vocoder!
@@ -15,12 +15,13 @@ The goal of this repository is to provide real-time neural vocoder, which is com
 Also, this repository can be combined with [NVIDIA/tacotron2](https://github.com/NVIDIA/tacotron2)-based implementation (See [this comment](https://github.com/kan-bayashi/ParallelWaveGAN/issues/169#issuecomment-649320778)).
 
 You can try the real-time end-to-end text-to-speech demonstration in Google Colab!
-
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb)
+- Real-time demonstration with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb)
+- Real-time demonstration with ESPnet1 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb)
 
 ## What's new
 
-- 2020/05/29 **(New!)** [VCTK, JSUT, and CSMSC multi-band MelGAN pretrained model](#Results) is available!
+- 2020/08/19 **(New!)** [Real-time demo with ESPnet2](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb) is available!
+- 2020/05/29 [VCTK, JSUT, and CSMSC multi-band MelGAN pretrained model](#Results) is available!
 - 2020/05/27 [New LJSpeech multi-band MelGAN pretrained model](#Results) is available!
 - 2020/05/24 [LJSpeech full-band MelGAN pretrained model](#Results) is available!
 - 2020/05/22 [LJSpeech multi-band MelGAN pretrained model](#Results) is available!
@@ -47,7 +48,9 @@ This repository is tested on Ubuntu 16.04 with a GPU Titan V.
 - sox (you can install via `sudo apt install sox` in ubuntu)
 
 Different cuda version should be working but not explicitly tested.
-All of the codes are tested on Pytorch 1.0.1, 1.1, 1.2, 1.3.1, 1.4, and 1.5.
+All of the codes are tested on Pytorch 1.0.1, 1.1, 1.2, 1.3.1, 1.4, and 1.5.1.
+
+Pytorch 1.6 works but there are some issues in cpu mode (See #198).
 
 ## Setup
 
@@ -183,14 +186,14 @@ You can listen to the samples and download pretrained models from the link to ou
 | [ljspeech_multi_band_melgan.v1](https://drive.google.com/open?id=1ls_YxCccQD-v6ADbG6qXlZ8f30KrrhLT) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/ljspeech/voc1/conf/multi_band_melgan.v1.yaml) | EN | 22.05k | 80-7600 | 1024 / 256 / None | 1000k |
 | [ljspeech_multi_band_melgan.v2](https://drive.google.com/open?id=1wevYP2HQ7ec2fSixTpZIX0sNBtYZJz_I) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/ljspeech/voc1/conf/multi_band_melgan.v2.yaml) | EN | 22.05k | 80-7600 | 1024 / 256 / None | 1000k |
 | [jsut_parallel_wavegan.v1](https://drive.google.com/open?id=1UDRL0JAovZ8XZhoH0wi9jj_zeCKb-AIA) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/jsut/voc1/conf/parallel_wavegan.v1.yaml) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | 400k |
-| [jsut_multi_band_melgan.v2 (**New!**)](https://drive.google.com/open?id=1E4fe0c5gMLtmSS0Hrzj-9nUbMwzke4PS) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/jsut/voc1/conf/multi_band_melgan.v2.yaml) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | 1000k |
+| [jsut_multi_band_melgan.v2](https://drive.google.com/open?id=1E4fe0c5gMLtmSS0Hrzj-9nUbMwzke4PS) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/jsut/voc1/conf/multi_band_melgan.v2.yaml) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | 1000k |
 | [csmsc_parallel_wavegan.v1](https://drive.google.com/open?id=1C2nu9nOFdKcEd-D9xGquQ0bCia0B2v_4) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/csmsc/voc1/conf/parallel_wavegan.v1.yaml) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | 400k |
-| [csmsc_multi_band_melgan.v2 (**New!**)](https://drive.google.com/open?id=1F7FwxGbvSo1Rnb5kp0dhGwimRJstzCrz) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/csmsc/voc1/conf/multi_band_melgan.v2.yaml) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | 1000k |
+| [csmsc_multi_band_melgan.v2](https://drive.google.com/open?id=1F7FwxGbvSo1Rnb5kp0dhGwimRJstzCrz) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/csmsc/voc1/conf/multi_band_melgan.v2.yaml) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | 1000k |
 | [arctic_slt_parallel_wavegan.v1](https://drive.google.com/open?id=1xG9CmSED2TzFdklD6fVxzf7kFV2kPQAJ) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/arctic/voc1/conf/parallel_wavegan.v1.yaml) | EN | 16k | 80-7600 | 1024 / 256 / None | 400k |
 | [jnas_parallel_wavegan.v1](https://drive.google.com/open?id=1n_hkxPxryVXbp6oHM1NFm08q0TcoDXz1) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/jnas/voc1/conf/parallel_wavegan.v1.yaml) | JP | 16k | 80-7600 | 1024 / 256 / None | 400k |
 | [vctk_parallel_wavegan.v1](https://drive.google.com/open?id=1dGTu-B7an2P5sEOepLPjpOaasgaSnLpi) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/vctk/voc1/conf/parallel_wavegan.v1.yaml) | EN | 24k | 80-7600 | 2048 / 300 / 1200 | 400k |
 | [vctk_parallel_wavegan.v1.long](https://drive.google.com/open?id=1qoocM-VQZpjbv5B-zVJpdraazGcPL0So) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/vctk/voc1/conf/parallel_wavegan.v1.long.yaml) | EN | 24k | 80-7600 | 2048 / 300 / 1200 | 1000k |
-| [vctk_multi_band_melgan.v2 (**New!**)](https://drive.google.com/open?id=17EkB4hSKUEDTYEne-dNHtJT724hdivn4) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/vctk/voc1/conf/multi_band_melgan.v2.yaml) | EN | 24k | 80-7600 | 2048 / 300 / 1200 | 1000k |
+| [vctk_multi_band_melgan.v2](https://drive.google.com/open?id=17EkB4hSKUEDTYEne-dNHtJT724hdivn4) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/vctk/voc1/conf/multi_band_melgan.v2.yaml) | EN | 24k | 80-7600 | 2048 / 300 / 1200 | 1000k |
 | [libritts_parallel_wavegan.v1](https://drive.google.com/open?id=1pb18Nd2FCYWnXfStszBAEEIMe_EZUJV0) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/libritts/voc1/conf/parallel_wavegan.v1.yaml) | EN | 24k | 80-7600 | 2048 / 300 / 1200 | 400k |
 | [libritts_parallel_wavegan.v1.long](https://drive.google.com/open?id=15ibzv-uTeprVpwT946Hl1XUYDmg5Afwz) | [link](https://github.com/kan-bayashi/ParallelWaveGAN/blob/master/egs/libritts/voc1/conf/parallel_wavegan.v1.long.yaml) | EN | 24k | 80-7600 | 2048 / 300 / 1200 | 1000k |
@@ -207,37 +210,46 @@ Here the minimal code is shown to perform analysis-synthesis using the pretraine
 # If not, please install via pip
 $ pip install parallel_wavegan
 
-# Please download pretrained models and put them in `pretrain_model` directory
-$ ls pretrain_model
+# You can download the pretrained model from the terminal
+$ python << EOF
+from parallel_wavegan.utils import download_pretrained_model
+download_pretrained_model("<pretrained_model_tag>", "pretrained_model")
+EOF
+
+# You can get all of the available pretrained models as follows:
+$ python << EOF
+from parallel_wavegan.utils import PRETRAINED_MODEL_LIST
+print(PRETRAINED_MODEL_LIST.keys())
+EOF
+
+# Now you can find the downloaded pretrained model in `pretrained_model/<pretrained_model_tag>/`
+$ ls pretrained_model/<pretrained_model_tag>
   checkpoint-400000steps.pkl  config.yml  stats.h5
+# These files can also be downloaded manually from the above results
+
 # Please put an audio file in `sample` directory to perform analysis-synthesis
 $ ls sample/
   sample.wav
 
 # Then perform feature extraction -> feature normalization -> synthesis
 $ parallel-wavegan-preprocess \
-    --config pretrain_model/config.yml \
+    --config pretrained_model/<pretrained_model_tag>/config.yml \
     --rootdir sample \
     --dumpdir dump/sample/raw
 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 914.19it/s]
-[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
-[Parallel(n_jobs=16)]: Done 1 out of 1 | elapsed: 1.2s finished
 
 $ parallel-wavegan-normalize \
-    --config pretrain_model/config.yml \
+    --config pretrained_model/<pretrained_model_tag>/config.yml \
     --rootdir dump/sample/raw \
     --dumpdir dump/sample/norm \
-    --stats pretrain_model/stats.h5
+    --stats pretrained_model/<pretrained_model_tag>/stats.h5
 2019-11-13 13:44:29,574 (normalize:87) INFO: the number of files = 1.
 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 513.13it/s]
-[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
-[Parallel(n_jobs=16)]: Done 1 out of 1 | elapsed: 0.6s finished
 
 $ parallel-wavegan-decode \
-    --checkpoint pretrain_model/checkpoint-400000steps.pkl \
+    --checkpoint pretrained_model/<pretrained_model_tag>/checkpoint-400000steps.pkl \
     --dumpdir dump/sample/norm \
     --outdir sample
 2019-11-13 13:44:31,229 (decode:91) INFO: the number of features to be decoded = 1.
-2019-11-13 13:44:37,074 (decode:105) INFO: loaded model parameters from pretrain_model/checkpoint-400000steps.pkl.
 [decode]: 100%|███████████████████| 1/1 [00:00<00:00, 18.33it/s, RTF=0.0146]
 2019-11-13 13:44:37,132 (decode:129) INFO: finished generation of 1 utterances (RTF = 0.015).
@@ -254,15 +266,35 @@ Here, I show the procedure to generate waveforms with features generated by [ESP
 # Make sure you already finished running the recipe of ESPnet-TTS.
 # You must use the same feature settings for both Text2Mel and Mel2Wav models.
 
 # Let us move on "ESPnet" recipe directory
+$ cd /path/to/espnet/egs/<recipe_name>/tts1
 $ pwd
 /path/to/espnet/egs/<recipe_name>/tts1
 
+# If you use ESPnet2, move to `egs2/`
+$ cd /path/to/espnet/egs2/<recipe_name>/tts1
+$ pwd
+/path/to/espnet/egs2/<recipe_name>/tts1
+
 # Please install this repository in ESPnet conda (or virtualenv) environment
 $ . ./path.sh && pip install -U parallel_wavegan
 
-# Please download pretrained models and put them in `pretrain_model` directory
-$ ls pretrain_model
+# You can download the pretrained model from the terminal
+$ python << EOF
+from parallel_wavegan.utils import download_pretrained_model
+download_pretrained_model("<pretrained_model_tag>", "pretrained_model")
+EOF
+
+# You can get all of the available pretrained models as follows:
+$ python << EOF
+from parallel_wavegan.utils import PRETRAINED_MODEL_LIST
+print(PRETRAINED_MODEL_LIST.keys())
+EOF
+
+# You can find the downloaded pretrained model in `pretrained_model/<pretrained_model_tag>/`
+$ ls pretrained_model/<pretrained_model_tag>
   checkpoint-400000steps.pkl  config.yml  stats.h5
+
+# These files can also be downloaded manually from the above results
 ```
 
 **Case 1**: If you use the same dataset for both Text2Mel and Mel2Wav
@@ -270,12 +302,19 @@ $ ls pretrain_model
 ```bash
 # In this case, you can directly use generated features for decoding.
 # Please specify `feats.scp` path for `--feats-scp`, which is located in
-# exp/<model_dir>/outputs_*_decode/<set_name>/feats.scp.
+# exp/<model_dir>/outputs_*_decode/<set_name>/feats.scp.
 # Note that you should not use outputs_*_decode_denorm/<set_name>/feats.scp since
 # it is de-normalized features (the input for PWG is normalized features).
 $ parallel-wavegan-decode \
-    --checkpoint pretrain_model/checkpoint-400000steps.pkl \
-    --feats-scp exp/<model_dir>/outputs_*_decode/<set_name>/feats.scp \
+    --checkpoint pretrained_model/<pretrained_model_tag>/checkpoint-400000steps.pkl \
+    --feats-scp exp/<model_dir>/outputs_*_decode/<set_name>/feats.scp \
+    --outdir <outdir>
+
+# In the case of ESPnet2, the generated feature can be found in
+# exp/<model_dir>/decode_*/<set_name>/norm/feats.scp.
+$ parallel-wavegan-decode \
+    --checkpoint pretrained_model/<pretrained_model_tag>/checkpoint-400000steps.pkl \
+    --feats-scp exp/<model_dir>/decode_*/<set_name>/norm/feats.scp \
     --outdir <outdir>
 
 # You can find the generated waveforms in <outdir>/.
@@ -288,12 +327,21 @@ $ ls
 ```bash
 # In this case, you must perform normalization first.
 # Please specify `feats.scp` path for `--feats-scp`, which is located in
-# exp/<model_dir>/outputs_*_decode_denorm/<set_name>/feats.scp.
+# exp/<model_dir>/outputs_*_decode_denorm/<set_name>/feats.scp.
 $ parallel-wavegan-normalize \
     --skip-wav-copy \
-    --config pretrain_model/config.yml \
-    --stats pretrain_model/stats.h5 \
-    --feats-scp exp/<model_dir>/outputs_*_decode_denorm/<set_name>/feats.scp \
+    --config pretrained_model/<pretrained_model_tag>/config.yml \
+    --stats pretrained_model/<pretrained_model_tag>/stats.h5 \
+    --feats-scp exp/<model_dir>/outputs_*_decode_denorm/<set_name>/feats.scp \
+    --dumpdir <dumpdir>
+
+# In the case of ESPnet2, the denormalized generated feature can be found in
+# exp/<model_dir>/decode_*/<set_name>/denorm/feats.scp.
+$ parallel-wavegan-normalize \
+    --skip-wav-copy \
+    --config pretrained_model/<pretrained_model_tag>/config.yml \
+    --stats pretrained_model/<pretrained_model_tag>/stats.h5 \
+    --feats-scp exp/<model_dir>/decode_*/<set_name>/denorm/feats.scp \
     --dumpdir <dumpdir>
 
 # Normalized features dumped in <dumpdir>/.
@@ -302,7 +350,7 @@ $ ls
 
 # Then, decode normalized features with the pretrained model.
 $ parallel-wavegan-decode \
-    --checkpoint pretrain_model/checkpoint-400000steps.pkl \
+    --checkpoint pretrained_model/<pretrained_model_tag>/checkpoint-400000steps.pkl \
     --dumpdir <dumpdir> \
     --outdir <outdir>
 
@@ -312,8 +360,8 @@ $ ls
 ```
 
 If you want to combine these models in python, you can try the real-time demonstration in Google Colab!
-
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb)
+- Real-time demonstration with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb)
+- Real-time demonstration with ESPnet1 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb)
 
 ## References
 
@@ -322,7 +370,7 @@ If you want to combine these models in python, you can try the real-time demonst
 - [LiyuanLucasLiu/RAdam](https://github.com/LiyuanLucasLiu/RAdam)
 - [MelGAN](https://arxiv.org/abs/1910.06711)
 - [descriptinc/melgan-neurips](https://github.com/descriptinc/melgan-neurips)
-- [Multi-band MelGAN](arxiv.org/abs/2005.05106)
+- [Multi-band MelGAN](https://arxiv.org/abs/2005.05106)
 
 ## Acknowledgement
 
diff --git a/parallel_wavegan/datasets/__init__.py b/parallel_wavegan/datasets/__init__.py
index f9a52532..e3f7a99a 100644
--- a/parallel_wavegan/datasets/__init__.py
+++ b/parallel_wavegan/datasets/__init__.py
@@ -1,2 +1,2 @@
-from parallel_wavegan.datasets.audio_mel_dataset import *  # NOQA
-from parallel_wavegan.datasets.scp_dataset import *  # NOQA
+from .audio_mel_dataset import *  # NOQA
+from .scp_dataset import *  # NOQA
diff --git a/parallel_wavegan/layers/__init__.py b/parallel_wavegan/layers/__init__.py
index 74369ba6..d6974e70 100644
--- a/parallel_wavegan/layers/__init__.py
+++ b/parallel_wavegan/layers/__init__.py
@@ -1,5 +1,5 @@
-from parallel_wavegan.layers.causal_conv import *  # NOQA
-from parallel_wavegan.layers.pqmf import *  # NOQA
-from parallel_wavegan.layers.residual_block import *  # NOQA
-from parallel_wavegan.layers.residual_stack import *  # NOQA
-from parallel_wavegan.layers.upsample import *  # NOQA
+from .causal_conv import *  # NOQA
+from .pqmf import *  # NOQA
+from .residual_block import *  # NOQA
+from .residual_stack import *  # NOQA
+from .upsample import *  # NOQA
diff --git a/parallel_wavegan/losses/__init__.py b/parallel_wavegan/losses/__init__.py
index 86819d20..b03080a9 100644
--- a/parallel_wavegan/losses/__init__.py
+++ b/parallel_wavegan/losses/__init__.py
@@ -1 +1 @@
-from parallel_wavegan.losses.stft_loss import *  # NOQA
+from .stft_loss import *  # NOQA
diff --git a/parallel_wavegan/models/__init__.py b/parallel_wavegan/models/__init__.py
index 73ddb3d5..4803ba6b 100644
--- a/parallel_wavegan/models/__init__.py
+++ b/parallel_wavegan/models/__init__.py
@@ -1,2 +1,2 @@
-from parallel_wavegan.models.melgan import *  # NOQA
-from parallel_wavegan.models.parallel_wavegan import *  # NOQA
+from .melgan import *  # NOQA
+from .parallel_wavegan import *  # NOQA
diff --git a/parallel_wavegan/optimizers/__init__.py b/parallel_wavegan/optimizers/__init__.py
index 3014d860..db777e82 100644
--- a/parallel_wavegan/optimizers/__init__.py
+++ b/parallel_wavegan/optimizers/__init__.py
@@ -1,2 +1,3 @@
 from torch.optim import *  # NOQA
-from parallel_wavegan.optimizers.radam import *  # NOQA
+
+from .radam import *  # NOQA
diff --git a/parallel_wavegan/utils/__init__.py b/parallel_wavegan/utils/__init__.py
index 824ee7a7..e8fa95a0 100644
--- a/parallel_wavegan/utils/__init__.py
+++ b/parallel_wavegan/utils/__init__.py
@@ -1 +1 @@
-from parallel_wavegan.utils.utils import *  # NOQA
+from .utils import *  # NOQA
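The `utils.py` changes that follow replace the module-level `import parallel_wavegan.models` and `from parallel_wavegan.layers import PQMF` with imports performed inside `load_model` (the `# lazy load for circular error` comments). Deferring an import to call time is the standard way to break an import cycle between two modules. A minimal two-module sketch of that pattern, with made-up module names (`pkg.models` / `pkg.utils` are illustrative, not the real package layout):

```python
# pkg/models.py
from pkg.utils import read_config  # models depends on utils at import time


class Generator:
    def __init__(self, config_path):
        self.config = read_config(config_path)


# pkg/utils.py
import yaml


def read_config(path):
    with open(path) as f:
        return yaml.safe_load(f)


def load_model(config_path):
    # A top-level `from pkg.models import Generator` would create the cycle
    # pkg.models -> pkg.utils -> pkg.models and fail while pkg.models is
    # still half-initialized; importing at call time avoids it.
    from pkg.models import Generator

    return Generator(config_path)
```

The `import gdown` inside `download_pretrained_model` is deferred for a similar reason: it is only needed, and therefore only imported, when a download actually happens.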
diff --git a/parallel_wavegan/utils/utils.py b/parallel_wavegan/utils/utils.py
index 65314679..1db8daa5 100644
--- a/parallel_wavegan/utils/utils.py
+++ b/parallel_wavegan/utils/utils.py
@@ -18,11 +18,6 @@
 import torch
 import yaml
 
-import parallel_wavegan.models
-
-from parallel_wavegan.layers import PQMF
-
-
 PRETRAINED_MODEL_LIST = {
     "ljspeech_parallel_wavegan.v1": "1PdZv37JhAQH6AwNh31QlqruqrvjTBq7U",
     "ljspeech_parallel_wavegan.v1.long": "1A9TsrD9fHxFviJVFjCk5W6lkzWXwhftv",
@@ -291,6 +286,9 @@ def load_model(checkpoint, config=None):
         with open(config) as f:
             config = yaml.load(f, Loader=yaml.Loader)
 
+    # lazy load for circular error
+    import parallel_wavegan.models
+
     # get model and load parameters
     model_class = getattr(
         parallel_wavegan.models,
@@ -303,6 +301,9 @@ def load_model(checkpoint, config=None):
 
     # add pqmf if needed
     if config["generator_params"]["out_channels"] > 1:
+        # lazy load for circular error
+        from parallel_wavegan.layers import PQMF
+
         pqmf_params = {}
         if LooseVersion(config.get("version", "0.1.0")) <= LooseVersion("0.4.2"):
             # For compatibility, here we set default values in version <= 0.4.2
@@ -333,10 +334,15 @@ def download_pretrained_model(tag, download_dir=None):
     output_path = f"{download_dir}/{tag}.tar.gz"
     os.makedirs(f"{download_dir}", exist_ok=True)
     if not os.path.exists(output_path):
+        # lazy load for compatibility
         import gdown
+
         gdown.download(f"https://drive.google.com/uc?id={id_}", output_path, quiet=False)
     with tarfile.open(output_path, 'r:*') as tar:
-        tar.extractall(f"{download_dir}/{tag}")
+        for member in tar.getmembers():
+            if member.isreg():
+                member.name = os.path.basename(member.name)
+            tar.extract(member, f"{download_dir}/{tag}")
 
     checkpoint_path = find_files(f"{download_dir}/{tag}", "checkpoint*.pkl")
     return checkpoint_path[0]
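Putting the pieces of this diff together: `download_pretrained_model(tag, download_dir)` fetches the archive, flattens its members into `download_dir/tag/`, and returns the checkpoint path, while `load_model(checkpoint)` rebuilds the generator from the `config.yml` next to the checkpoint (attaching a PQMF module for multi-band models). The following is a hedged end-to-end sketch; the `remove_weight_norm()` and `inference()` calls follow the project's Colab demos rather than anything shown in this diff, so treat them as assumptions:

```python
import torch

from parallel_wavegan.utils import download_pretrained_model, load_model

# Any key of PRETRAINED_MODEL_LIST works as the tag, e.g. the LJSpeech PWG model.
checkpoint = download_pretrained_model(
    "ljspeech_parallel_wavegan.v1", "pretrained_model"
)

# load_model reads the config.yml stored next to the checkpoint and restores
# the generator weights; for out_channels > 1 it also registers a PQMF module.
model = load_model(checkpoint)
model.remove_weight_norm()  # assumed helper, as used in the Colab demos
model = model.eval()

# Dummy normalized log-mel input of shape (num_frames, num_mels).
mel = torch.randn(100, 80)
with torch.no_grad():
    wav = model.inference(mel)  # assumed API, per the demos
print(wav.shape)
```

In a real run the mel features would come from `parallel-wavegan-preprocess`/`-normalize` or from an ESPnet TTS model, not from random noise.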