Init ppg extractor and ppg2mel (babysor#375)
* Init ppg extractor and ppg2mel

* add preprocess and training

* Fix known issues

* Update __init__.py

Allow generating audio

* Fix length issue

* Fix bug in preparing fid

* Fix sample issues

* Add UI usage of PPG-vc
babysor authored Mar 3, 2022
1 parent ad22997 commit b617a87
Showing 57 changed files with 6,385 additions and 80 deletions.
11 changes: 5 additions & 6 deletions .gitignore
@@ -15,9 +15,8 @@
*.toc
*.wav
*.sh
synthesizer/saved_models/*
vocoder/saved_models/*
encoder/saved_models/*
cp_hifigan/*
!vocoder/saved_models/pretrained/*
!encoder/saved_models/pretrained.pt
*/saved_models
!vocoder/saved_models/pretrained/**
!encoder/saved_models/pretrained.pt
wavs
log
18 changes: 18 additions & 0 deletions .vscode/launch.json
@@ -35,6 +35,14 @@
"console": "integratedTerminal",
"args": ["-d","..\\audiodata"]
},
{
"name": "Python: Demo Box VC",
"type": "python",
"request": "launch",
"program": "demo_toolbox.py",
"console": "integratedTerminal",
"args": ["-d","..\\audiodata","-vc"]
},
{
"name": "Python: Synth Train",
"type": "python",
@@ -43,5 +51,15 @@
"console": "integratedTerminal",
"args": ["my_run", "..\\"]
},
{
"name": "Python: PPG Convert",
"type": "python",
"request": "launch",
"program": "run.py",
"console": "integratedTerminal",
"args": ["-c", ".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2.yaml",
"-m", ".\\ppg2mel\\saved_models\\best_loss_step_304000.pth", "--wav_dir", ".\\wavs\\input", "--ref_wav_path", ".\\wavs\\pkq.mp3", "-o", ".\\wavs\\output\\"
]
},
]
}
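Outside the debugger, this launch entry presumably corresponds to running run.py from the repository root with the same -c / -m / --wav_dir / --ref_wav_path / -o arguments.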
6 changes: 6 additions & 0 deletions demo_toolbox.py
@@ -15,12 +15,18 @@
parser.add_argument("-d", "--datasets_root", type=Path, help= \
"Path to the directory containing your datasets. See toolbox/__init__.py for a list of "
"supported datasets.", default=None)
parser.add_argument("-vc", "--vc_mode", action="store_true",
help="Voice Conversion Mode(PPG based)")
parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
help="Directory containing saved encoder models")
parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
help="Directory containing saved synthesizer models")
parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
help="Directory containing saved vocoder models")
parser.add_argument("-ex", "--extractor_models_dir", type=Path, default="ppg_extractor/saved_models",
help="Directory containing saved extrator models")
parser.add_argument("-cv", "--convertor_models_dir", type=Path, default="ppg2mel/saved_models",
help="Directory containing saved convert models")
parser.add_argument("--cpu", action="store_true", help=\
"If True, processing is done on CPU, even when a GPU is available.")
parser.add_argument("--seed", type=int, default=None, help=\
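For reference, a minimal sketch of how the new VC-related flags parse, with the flag names and defaults copied from the diff above (the standalone parser and the sample invocation are illustrative, not part of the commit):

import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("-vc", "--vc_mode", action="store_true",
                    help="Voice conversion mode (PPG-based)")
parser.add_argument("-ex", "--extractor_models_dir", type=Path,
                    default="ppg_extractor/saved_models",
                    help="Directory containing saved extractor models")
parser.add_argument("-cv", "--convertor_models_dir", type=Path,
                    default="ppg2mel/saved_models",
                    help="Directory containing saved converter models")

args = parser.parse_args(["-vc"])
print(args.vc_mode)               # True: toolbox starts in voice conversion mode
print(args.extractor_models_dir)  # ppg_extractor/saved_models
print(args.convertor_models_dir)  # ppg2mel/saved_models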
27 changes: 22 additions & 5 deletions encoder/inference.py
@@ -34,8 +34,16 @@ def load_model(weights_fpath: Path, device=None):
_model.load_state_dict(checkpoint["model_state"])
_model.eval()
print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
return _model


def set_model(model, device=None):
    global _model, _device
    _model = model
    if device is None:
        # Default to GPU when available, otherwise CPU
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    _device = device
    _model.to(device)

def is_loaded():
return _model is not None

@@ -57,7 +65,7 @@ def embed_frames_batch(frames_batch):


def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
min_pad_coverage=0.75, overlap=0.5):
min_pad_coverage=0.75, overlap=0.5, rate=None):
"""
Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
@@ -85,9 +93,18 @@ def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
assert 0 <= overlap < 1
assert 0 < min_pad_coverage <= 1

samples_per_frame = int((sampling_rate * mel_window_step / 1000))
n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
samples_per_frame = int((sampling_rate * mel_window_step / 1000))
n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
if rate is not None:
    # Derive the step between partial windows from the requested rate
    frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
else:
    frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)

assert 0 < frame_step, "The rate is too high"
assert frame_step <= partials_n_frames, "The rate is too low; it should be at least %f" % \
    (sampling_rate / (samples_per_frame * partials_n_frames))

# Compute the slices
wav_slices, mel_slices = [], []
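For intuition, a worked example of the new rate-based step, assuming the encoder's usual 16 kHz sampling rate and 10 ms mel window step (neither value is shown in this hunk) and a hypothetical rate of 1.3 partial utterances per second:

import numpy as np

sampling_rate = 16000  # Hz, assumed
mel_window_step = 10   # ms, assumed
rate = 1.3             # partials per second, hypothetical

samples_per_frame = int(sampling_rate * mel_window_step / 1000)         # 160 samples
frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))  # 77 mel frames

The two asserts then bound frame_step: the rate must be low enough that each window advances by at least one frame, and high enough that consecutive windows of partials_n_frames frames still cover the signal without gaps.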
206 changes: 206 additions & 0 deletions ppg2mel/__init__.py
@@ -0,0 +1,206 @@
#!/usr/bin/env python3

# Copyright 2020 Songxiang Liu
# Apache 2.0

from typing import List

import torch
import torch.nn.functional as F

import numpy as np

from .utils.abs_model import AbsMelDecoder
from .rnn_decoder_mol import Decoder
from .utils.cnn_postnet import Postnet
from .utils.vc_utils import get_mask_from_lengths

from utils.load_yaml import HpsYaml

class MelDecoderMOLv2(AbsMelDecoder):
"""Use an encoder to preprocess ppg."""
def __init__(
self,
num_speakers: int,
spk_embed_dim: int,
bottle_neck_feature_dim: int,
encoder_dim: int = 256,
encoder_downsample_rates: List = [2, 2],
attention_rnn_dim: int = 512,
decoder_rnn_dim: int = 512,
num_decoder_rnn_layer: int = 1,
concat_context_to_last: bool = True,
prenet_dims: List = [256, 128],
num_mixtures: int = 5,
frames_per_step: int = 2,
mask_padding: bool = True,
):
super().__init__()

self.mask_padding = mask_padding
self.bottle_neck_feature_dim = bottle_neck_feature_dim
self.num_mels = 80
self.encoder_down_factor = np.cumprod(encoder_downsample_rates)[-1]
self.frames_per_step = frames_per_step
self.use_spk_dvec = True

input_dim = bottle_neck_feature_dim

# Downsampling convolution
self.bnf_prenet = torch.nn.Sequential(
torch.nn.Conv1d(input_dim, encoder_dim, kernel_size=1, bias=False),
torch.nn.LeakyReLU(0.1),

torch.nn.InstanceNorm1d(encoder_dim, affine=False),
torch.nn.Conv1d(
encoder_dim, encoder_dim,
kernel_size=2*encoder_downsample_rates[0],
stride=encoder_downsample_rates[0],
padding=encoder_downsample_rates[0]//2,
),
torch.nn.LeakyReLU(0.1),

torch.nn.InstanceNorm1d(encoder_dim, affine=False),
torch.nn.Conv1d(
encoder_dim, encoder_dim,
kernel_size=2*encoder_downsample_rates[1],
stride=encoder_downsample_rates[1],
padding=encoder_downsample_rates[1]//2,
),
torch.nn.LeakyReLU(0.1),

torch.nn.InstanceNorm1d(encoder_dim, affine=False),
)
decoder_enc_dim = encoder_dim
self.pitch_convs = torch.nn.Sequential(
torch.nn.Conv1d(2, encoder_dim, kernel_size=1, bias=False),
torch.nn.LeakyReLU(0.1),

torch.nn.InstanceNorm1d(encoder_dim, affine=False),
torch.nn.Conv1d(
encoder_dim, encoder_dim,
kernel_size=2*encoder_downsample_rates[0],
stride=encoder_downsample_rates[0],
padding=encoder_downsample_rates[0]//2,
),
torch.nn.LeakyReLU(0.1),

torch.nn.InstanceNorm1d(encoder_dim, affine=False),
torch.nn.Conv1d(
encoder_dim, encoder_dim,
kernel_size=2*encoder_downsample_rates[1],
stride=encoder_downsample_rates[1],
padding=encoder_downsample_rates[1]//2,
),
torch.nn.LeakyReLU(0.1),

torch.nn.InstanceNorm1d(encoder_dim, affine=False),
)

self.reduce_proj = torch.nn.Linear(encoder_dim + spk_embed_dim, encoder_dim)

# Decoder
self.decoder = Decoder(
enc_dim=decoder_enc_dim,
num_mels=self.num_mels,
frames_per_step=frames_per_step,
attention_rnn_dim=attention_rnn_dim,
decoder_rnn_dim=decoder_rnn_dim,
num_decoder_rnn_layer=num_decoder_rnn_layer,
prenet_dims=prenet_dims,
num_mixtures=num_mixtures,
use_stop_tokens=True,
concat_context_to_last=concat_context_to_last,
encoder_down_factor=self.encoder_down_factor,
)

# Mel-Spec Postnet: some residual CNN layers
self.postnet = Postnet()

def parse_output(self, outputs, output_lengths=None):
if self.mask_padding and output_lengths is not None:
mask = ~get_mask_from_lengths(output_lengths, outputs[0].size(1))
mask = mask.unsqueeze(2).expand(mask.size(0), mask.size(1), self.num_mels)
outputs[0].data.masked_fill_(mask, 0.0)
outputs[1].data.masked_fill_(mask, 0.0)
return outputs

def forward(
self,
bottle_neck_features: torch.Tensor,
feature_lengths: torch.Tensor,
speech: torch.Tensor,
speech_lengths: torch.Tensor,
logf0_uv: torch.Tensor = None,
spembs: torch.Tensor = None,
output_att_ws: bool = False,
):
decoder_inputs = self.bnf_prenet(
bottle_neck_features.transpose(1, 2)
).transpose(1, 2)
logf0_uv = self.pitch_convs(logf0_uv.transpose(1, 2)).transpose(1, 2)
decoder_inputs = decoder_inputs + logf0_uv

assert spembs is not None
spk_embeds = F.normalize(
spembs).unsqueeze(1).expand(-1, decoder_inputs.size(1), -1)
decoder_inputs = torch.cat([decoder_inputs, spk_embeds], dim=-1)
decoder_inputs = self.reduce_proj(decoder_inputs)

# (B, num_mels, T_dec)
T_dec = torch.div(feature_lengths, int(self.encoder_down_factor), rounding_mode='floor')
mel_outputs, predicted_stop, alignments = self.decoder(
decoder_inputs, speech, T_dec)
## Post-processing
mel_outputs_postnet = self.postnet(mel_outputs.transpose(1, 2)).transpose(1, 2)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
if output_att_ws:
return self.parse_output(
[mel_outputs, mel_outputs_postnet, predicted_stop, alignments], speech_lengths)
else:
return self.parse_output(
[mel_outputs, mel_outputs_postnet, predicted_stop], speech_lengths)

# return mel_outputs, mel_outputs_postnet

def inference(
self,
bottle_neck_features: torch.Tensor,
logf0_uv: torch.Tensor = None,
spembs: torch.Tensor = None,
):
decoder_inputs = self.bnf_prenet(bottle_neck_features.transpose(1, 2)).transpose(1, 2)
logf0_uv = self.pitch_convs(logf0_uv.transpose(1, 2)).transpose(1, 2)
decoder_inputs = decoder_inputs + logf0_uv

assert spembs is not None
spk_embeds = F.normalize(
spembs).unsqueeze(1).expand(-1, decoder_inputs.size(1), -1)
bottle_neck_features = torch.cat([decoder_inputs, spk_embeds], dim=-1)
bottle_neck_features = self.reduce_proj(bottle_neck_features)

## Decoder
if bottle_neck_features.size(0) > 1:
mel_outputs, alignments = self.decoder.inference_batched(bottle_neck_features)
else:
mel_outputs, alignments = self.decoder.inference(bottle_neck_features)
## Post-processing
mel_outputs_postnet = self.postnet(mel_outputs.transpose(1, 2)).transpose(1, 2)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
# outputs = mel_outputs_postnet[0]

return mel_outputs[0], mel_outputs_postnet[0], alignments[0]

def load_model(train_config, model_file, device=None):

if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_config = HpsYaml(train_config)
ppg2mel_model = MelDecoderMOLv2(
**model_config["model"]
).to(device)
ckpt = torch.load(model_file, map_location=device)
ppg2mel_model.load_state_dict(ckpt["model"])
ppg2mel_model.eval()
return ppg2mel_model
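
For orientation, a minimal usage sketch of the loader plus inference(). The yaml/checkpoint paths are the ones referenced by the new launch.json entry; the tensor shapes, the 200-frame length, and the 256-dim speaker embedding are assumptions inferred from forward()/inference(), not values fixed by this file:

import torch
from ppg2mel import load_model

model = load_model(
    "ppg2mel/saved_models/seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2.yaml",
    "ppg2mel/saved_models/best_loss_step_304000.pth",
)

bnf = torch.randn(1, 200, model.bottle_neck_feature_dim)  # (B, T, ppg_dim) PPG features
logf0_uv = torch.randn(1, 200, 2)                         # per-frame log-F0 + voicing flag
spk_embed = torch.randn(1, 256)                           # assumes spk_embed_dim == 256

with torch.no_grad():
    mel, mel_post, align = model.inference(bnf, logf0_uv=logf0_uv, spembs=spk_embed)
print(mel_post.shape)  # (T_dec, 80): mel frames after the residual postnet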