forked from babysor/MockingBird
Init ppg extractor and ppg2mel (babysor#375)
* Init ppg extractor and ppg2mel
* Add preprocess and training
* Fix known issues
* Update __init__.py to allow generating audio
* Fix length issue
* Fix bug of preparing fid
* Fix sample issues
* Add UI usage of PPG-vc
Showing 57 changed files with 6,385 additions and 80 deletions.
#!/usr/bin/env python3

# Copyright 2020 Songxiang Liu
# Apache 2.0

from typing import List

import torch
import torch.nn.functional as F

import numpy as np

from .utils.abs_model import AbsMelDecoder
from .rnn_decoder_mol import Decoder
from .utils.cnn_postnet import Postnet
from .utils.vc_utils import get_mask_from_lengths

from utils.load_yaml import HpsYaml


class MelDecoderMOLv2(AbsMelDecoder):
    """Use an encoder to preprocess ppg."""

    def __init__(
        self,
        num_speakers: int,
        spk_embed_dim: int,
        bottle_neck_feature_dim: int,
        encoder_dim: int = 256,
        encoder_downsample_rates: List = [2, 2],
        attention_rnn_dim: int = 512,
        decoder_rnn_dim: int = 512,
        num_decoder_rnn_layer: int = 1,
        concat_context_to_last: bool = True,
        prenet_dims: List = [256, 128],
        num_mixtures: int = 5,
        frames_per_step: int = 2,
        mask_padding: bool = True,
    ):
        super().__init__()

        self.mask_padding = mask_padding
        self.bottle_neck_feature_dim = bottle_neck_feature_dim
        self.num_mels = 80
        self.encoder_down_factor = np.cumprod(encoder_downsample_rates)[-1]
        self.frames_per_step = frames_per_step
        self.use_spk_dvec = True

        input_dim = bottle_neck_feature_dim

        # Downsampling convolution: a 1x1 projection followed by two strided
        # Conv1d blocks, each reducing the time axis by its downsample rate.
        self.bnf_prenet = torch.nn.Sequential(
            torch.nn.Conv1d(input_dim, encoder_dim, kernel_size=1, bias=False),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(encoder_dim, affine=False),
            torch.nn.Conv1d(
                encoder_dim, encoder_dim,
                kernel_size=2 * encoder_downsample_rates[0],
                stride=encoder_downsample_rates[0],
                padding=encoder_downsample_rates[0] // 2,
            ),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(encoder_dim, affine=False),
            torch.nn.Conv1d(
                encoder_dim, encoder_dim,
                kernel_size=2 * encoder_downsample_rates[1],
                stride=encoder_downsample_rates[1],
                padding=encoder_downsample_rates[1] // 2,
            ),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(encoder_dim, affine=False),
        )
        decoder_enc_dim = encoder_dim

        # Pitch branch: the same downsampling structure applied to (log-F0, uv).
        self.pitch_convs = torch.nn.Sequential(
            torch.nn.Conv1d(2, encoder_dim, kernel_size=1, bias=False),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(encoder_dim, affine=False),
            torch.nn.Conv1d(
                encoder_dim, encoder_dim,
                kernel_size=2 * encoder_downsample_rates[0],
                stride=encoder_downsample_rates[0],
                padding=encoder_downsample_rates[0] // 2,
            ),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(encoder_dim, affine=False),
            torch.nn.Conv1d(
                encoder_dim, encoder_dim,
                kernel_size=2 * encoder_downsample_rates[1],
                stride=encoder_downsample_rates[1],
                padding=encoder_downsample_rates[1] // 2,
            ),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(encoder_dim, affine=False),
        )

        # Project [encoder output; speaker embedding] back down to encoder_dim.
        self.reduce_proj = torch.nn.Linear(encoder_dim + spk_embed_dim, encoder_dim)

        # Decoder
        self.decoder = Decoder(
            enc_dim=decoder_enc_dim,
            num_mels=self.num_mels,
            frames_per_step=frames_per_step,
            attention_rnn_dim=attention_rnn_dim,
            decoder_rnn_dim=decoder_rnn_dim,
            num_decoder_rnn_layer=num_decoder_rnn_layer,
            prenet_dims=prenet_dims,
            num_mixtures=num_mixtures,
            use_stop_tokens=True,
            concat_context_to_last=concat_context_to_last,
            encoder_down_factor=self.encoder_down_factor,
        )

        # Mel-Spec Postnet: some residual CNN layers
        self.postnet = Postnet()
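With the default encoder_downsample_rates = [2, 2], each strided block uses kernel_size=4, stride=2, padding=1, which halves the time axis, so np.cumprod yields an overall encoder_down_factor of 4. A minimal sketch to sanity-check the length arithmetic (the 144-frame input is an arbitrary assumption):

```python
import torch

# One strided block from the prenet (same hyperparameters as above).
conv = torch.nn.Conv1d(256, 256, kernel_size=4, stride=2, padding=1)
x = torch.randn(1, 256, 144)  # (B, C, T); T = 144 chosen arbitrarily
y = conv(x)
print(y.shape)  # torch.Size([1, 256, 72]): floor((144 + 2*1 - 4)/2) + 1 = 72
# Two such blocks halve T twice, matching np.cumprod([2, 2])[-1] == 4.
```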
    def parse_output(self, outputs, output_lengths=None):
        if self.mask_padding and output_lengths is not None:
            mask = ~get_mask_from_lengths(output_lengths, outputs[0].size(1))
            mask = mask.unsqueeze(2).expand(mask.size(0), mask.size(1), self.num_mels)
            outputs[0].data.masked_fill_(mask, 0.0)
            outputs[1].data.masked_fill_(mask, 0.0)
        return outputs
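parse_output zeroes the padded frames of both mel outputs. get_mask_from_lengths is imported from the repo's utils; judging from the ~mask usage above, it presumably returns True on valid frames. A hypothetical stand-in under that assumption:

```python
import torch

def get_mask_from_lengths_sketch(lengths: torch.Tensor, max_len: int) -> torch.Tensor:
    """Hypothetical stand-in: True marks valid frames, False marks padding."""
    steps = torch.arange(max_len, device=lengths.device)
    return steps.unsqueeze(0) < lengths.unsqueeze(1)  # (B, max_len)

mask = get_mask_from_lengths_sketch(torch.tensor([3, 5]), 5)
# ~mask flags exactly the padded positions that masked_fill_ sets to 0.0 above.
```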
    def forward(
        self,
        bottle_neck_features: torch.Tensor,
        feature_lengths: torch.Tensor,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        logf0_uv: torch.Tensor = None,
        spembs: torch.Tensor = None,
        output_att_ws: bool = False,
    ):
        decoder_inputs = self.bnf_prenet(
            bottle_neck_features.transpose(1, 2)
        ).transpose(1, 2)
        logf0_uv = self.pitch_convs(logf0_uv.transpose(1, 2)).transpose(1, 2)
        decoder_inputs = decoder_inputs + logf0_uv

        assert spembs is not None
        spk_embeds = F.normalize(
            spembs).unsqueeze(1).expand(-1, decoder_inputs.size(1), -1)
        decoder_inputs = torch.cat([decoder_inputs, spk_embeds], dim=-1)
        decoder_inputs = self.reduce_proj(decoder_inputs)

        # (B, num_mels, T_dec)
        T_dec = torch.div(feature_lengths, int(self.encoder_down_factor), rounding_mode='floor')
        mel_outputs, predicted_stop, alignments = self.decoder(
            decoder_inputs, speech, T_dec)

        # Post-processing
        mel_outputs_postnet = self.postnet(mel_outputs.transpose(1, 2)).transpose(1, 2)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        if output_att_ws:
            return self.parse_output(
                [mel_outputs, mel_outputs_postnet, predicted_stop, alignments], speech_lengths)
        else:
            return self.parse_output(
                [mel_outputs, mel_outputs_postnet, predicted_stop], speech_lengths)
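To make the tensor flow in forward concrete, here is a hedged sketch of just the encoder-side preprocessing, run inside this repo so the Decoder/Postnet imports resolve. Every dimension below (400-dim PPGs, 256-dim d-vectors, 128 frames) is an assumption for illustration, not a value fixed by this file:

```python
import torch
import torch.nn.functional as F

model = MelDecoderMOLv2(num_speakers=1, spk_embed_dim=256,
                        bottle_neck_feature_dim=400)  # assumed dims
ppg = torch.randn(2, 128, 400)     # (B, T, bottleneck_dim)
logf0_uv = torch.randn(2, 128, 2)  # (B, T, 2): log-F0 plus voicing flag
spembs = torch.randn(2, 256)       # (B, spk_embed_dim) speaker d-vectors

x = model.bnf_prenet(ppg.transpose(1, 2)).transpose(1, 2)            # (2, 32, 256)
x = x + model.pitch_convs(logf0_uv.transpose(1, 2)).transpose(1, 2)  # same shape
spk = F.normalize(spembs).unsqueeze(1).expand(-1, x.size(1), -1)
x = model.reduce_proj(torch.cat([x, spk], dim=-1))                   # (2, 32, 256)
```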
    def inference(
        self,
        bottle_neck_features: torch.Tensor,
        logf0_uv: torch.Tensor = None,
        spembs: torch.Tensor = None,
    ):
        decoder_inputs = self.bnf_prenet(bottle_neck_features.transpose(1, 2)).transpose(1, 2)
        logf0_uv = self.pitch_convs(logf0_uv.transpose(1, 2)).transpose(1, 2)
        decoder_inputs = decoder_inputs + logf0_uv

        assert spembs is not None
        spk_embeds = F.normalize(
            spembs).unsqueeze(1).expand(-1, decoder_inputs.size(1), -1)
        bottle_neck_features = torch.cat([decoder_inputs, spk_embeds], dim=-1)
        bottle_neck_features = self.reduce_proj(bottle_neck_features)

        # Decoder
        if bottle_neck_features.size(0) > 1:
            mel_outputs, alignments = self.decoder.inference_batched(bottle_neck_features)
        else:
            mel_outputs, alignments = self.decoder.inference(bottle_neck_features)

        # Post-processing
        mel_outputs_postnet = self.postnet(mel_outputs.transpose(1, 2)).transpose(1, 2)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return mel_outputs[0], mel_outputs_postnet[0], alignments[0]
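A hedged synthesis-style call, reusing the assumed dimensions from the sketch above; the exact output lengths depend on the Decoder's stop-token behavior, which lives outside this file:

```python
import torch

model.eval()
with torch.no_grad():
    ppg = torch.randn(1, 200, 400)   # one utterance: (1, T, bottleneck_dim)
    logf0_uv = torch.randn(1, 200, 2)
    spembs = torch.randn(1, 256)
    mel, mel_post, att = model.inference(ppg, logf0_uv=logf0_uv, spembs=spembs)
    # mel_post is the postnet-refined mel for the single batch item.
```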
def load_model(train_config, model_file, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_config = HpsYaml(train_config)
    ppg2mel_model = MelDecoderMOLv2(
        **model_config["model"]
    ).to(device)
    ckpt = torch.load(model_file, map_location=device)
    ppg2mel_model.load_state_dict(ckpt["model"])
    ppg2mel_model.eval()
    return ppg2mel_model
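load_model pairs a training YAML with a checkpoint; HpsYaml must expose a "model" section whose keys match the MelDecoderMOLv2 constructor. A usage sketch with hypothetical paths:

```python
import torch

# Both paths are hypothetical placeholders; point them at the real training
# config and checkpoint. The YAML's "model" section is expected to hold
# MelDecoderMOLv2 kwargs such as num_speakers, spk_embed_dim, and
# bottle_neck_feature_dim.
ppg2mel = load_model("path/to/train.yaml", "path/to/checkpoint.pth",
                     device=torch.device("cpu"))
```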