Commit: autoencoder shape mismatch

NilsDem committed Dec 4, 2024
1 parent 9d81e53 commit 80dc395
Showing 8 changed files with 237 additions and 223 deletions.
Paper.pdf (binary file removed)
README.md (14 changes: 10 additions & 4 deletions)
@@ -2,6 +2,11 @@

Official repository for _Combining audio control and style transfer using latent diffusion_ by Nils Demerlé, Philippe Esling, Guillaume Doras, and David Genova, accepted at ISMIR 2024 ([paper link](https://arxiv.org/pdf/2408.00196)).

This diffusion-based generative model creates new audio by blending two inputs: one audio sample that sets the style or timbre, and another input (either audio or MIDI) that defines the structure over time. In this repository, you will find instructions to train your own model as well as model checkpoints trained on the two datasets presented in the paper.

We are currently working on a real-time implementation of this model called AFTER. You can already experiment with a real-time version of the model in MaxMSP on the official AFTER [repository](https://github.com/acids-ircam/AFTER).



## Model training
Training the model requires three steps: processing the dataset, training an autoencoder, and then training the diffusion model.
@@ -15,7 +20,7 @@ python dataset/split_to_lmdb.py --input_path /path/to/audio_dataset --output_pat
Or, to use Slakh with MIDI processing (after downloading Slakh2100 [here](http://www.slakh.com/)):

```bash
python dataset/split_to_lmdb_midi.py --input_path /path/to/slakh --output_path /path/to/slakh/out_lmdb_midi --slakh True
python dataset/split_to_lmdb.py --input_path /path/to/slakh --output_path /path/to/slakh/out_lmdb_midi --slakh True
```

### Autoencoder training
@@ -36,16 +41,17 @@ The model training is configured with gin config files.

To train the audio-to-audio model:
```bash
python train_diffusion.py --db_path /path/to/lmdb --config midi --dataset_type midi --gpu #
python train_diffusion.py --name my_audio_model --db_path /path/to/lmdb --config main --dataset_type waveform --gpu #
```

To train the MIDI-to-audio model:
```bash
python train_diffusion.py --db_path /path/to/lmdb --config main --dataset_type waveform --gpu #
python train_diffusion.py --name my_midi_audio_model --db_path /path/to/lmdb_midi --config midi --dataset_type midi --gpu #
```

## Inference and pretrained models

Three pretraiend models are now available :
Three pretrained models are currently available :
1. Audio to audio transfer model trained on [Slakh](http://www.slakh.com/)
2. Audio to audio transfer model trained on multiple datasets (Maestro, URMP, Filobass, GuitarSet...)
3. MIDI-to-audio model trained on [Slakh](http://www.slakh.com/)
autoencoder/networks/SimpleNets.py (78 changes: 40 additions & 38 deletions)
@@ -18,7 +18,6 @@

import cached_conv as cc


#from .utils import closest_power_2, default, exists, groupby, prefix_dict, prod, to_list
"""
Convolutional Modules
@@ -49,7 +48,8 @@ def Downsample1d(in_channels: int,
out_channels=out_channels,
kernel_size=factor * kernel_multiplier,
stride=factor,
padding=cc.get_padding(2 * factor + 1, factor), #math.ceil(factor / 2),
padding=cc.get_padding(2 * factor,
factor), #math.ceil(factor / 2),
))
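This padding change in Downsample1d appears to be the shape fix referenced in the commit message. A minimal sketch of why the argument matters, assuming kernel_multiplier = 2 and that cc.get_padding produces "same"-style padding totalling kernel_size - stride (both are assumptions, not verified against cached_conv):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def downsampled_length(n_in: int, factor: int, pad_kernel: int) -> int:
    """Length after a strided Conv1d whose padding was computed for `pad_kernel`."""
    kernel, stride = 2 * factor, factor      # kernel_size = factor * kernel_multiplier (multiplier assumed 2)
    total_pad = pad_kernel - stride          # assumed "same"-style total padding
    left, right = total_pad - total_pad // 2, total_pad // 2
    conv = nn.Conv1d(1, 1, kernel_size=kernel, stride=stride)
    x = torch.randn(1, 1, n_in)
    return conv(F.pad(x, (left, right))).shape[-1]

# Padding computed for 2 * factor + 1 (old) vs. the actual kernel 2 * factor (new):
# for some input lengths the old value yields one extra output frame.
for n in (63, 64, 65):
    print(n, downsampled_length(n, 2, pad_kernel=5), downsampled_length(n, 2, pad_kernel=4))
```

The exact lengths depend on how cc.get_padding splits its padding, so treat the printed numbers as illustrative only.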


@@ -308,12 +308,13 @@ def __init__(self,
self.to_out = nn.Sequential(
activation(dim=channels * multipliers[-1]),
normalization(
cc.Conv1d(in_channels=channels * multipliers[-1],
out_channels=out_channels,
kernel_size=3,
padding=cc.get_padding(kernel_size=3, dilation=1),
#padding="same"
)))
cc.Conv1d(
in_channels=channels * multipliers[-1],
out_channels=out_channels,
kernel_size=3,
padding=cc.get_padding(kernel_size=3, dilation=1),
#padding="same"
)))

def forward(self, x: Tensor, with_info: bool = False) -> Tensor:
x = self.to_in(x)
@@ -361,7 +362,7 @@ def __init__(
kernel_size=kernel_size,
padding=cc.get_padding(kernel_size, dilation=1)
#padding="same"
))
))

self.upsamples = nn.ModuleList([
UpsampleBlock1d(in_channels=channels * multipliers[i],
@@ -382,10 +383,14 @@ def __init__(
activation=activation,
use_norm=use_norm,
kernel_size=kernel_size)

if self.use_noise:
self.noise_module = NoiseGeneratorV2(in_size = channels * multipliers[-1], hidden_size = 128, data_size = out_channels, ratios = [2,2,2], noise_bands = 5)

self.noise_module = NoiseGeneratorV2(in_size=channels *
multipliers[-1],
hidden_size=128,
data_size=out_channels,
ratios=[2, 2, 2],
noise_bands=5)

self.recurrent_layer = recurent_layer(in_size=in_channels,
out_size=in_channels)
@@ -397,7 +402,6 @@ def forward(self, x: Tensor, with_info: bool = False) -> Tensor:
for upsample in self.upsamples:
x = upsample(x)


if self.use_noise:
noise = self.noise_module(x)
else:
@@ -408,14 +412,13 @@ def forward(self, x: Tensor, with_info: bool = False) -> Tensor:
if self.use_loudness:
x, amplitude = x.split(x.shape[1] // 2, 1)
x = x * torch.sigmoid(amplitude)

x = torch.tanh(x)

if self.use_noise:
x = x + noise

return x

return x


def amp_to_impulse_response(amp, target_size):
@@ -441,6 +444,7 @@ def amp_to_impulse_response(amp, target_size):

return amp


def fft_convolve(signal, kernel):
"""
convolves signal by kernel on the last dimension
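A minimal sketch of FFT-based convolution over the last dimension (hypothetical standalone helper assuming same-length inputs; the repository's fft_convolve may additionally pad and roll its inputs to turn the circular convolution into a linear one):

```python
import torch

def fft_convolve_sketch(signal: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
    # Pointwise product in the frequency domain equals circular convolution in time.
    n = signal.shape[-1]
    return torch.fft.irfft(torch.fft.rfft(signal, n) * torch.fft.rfft(kernel, n), n)
```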
@@ -457,14 +461,14 @@ def fft_convolve(signal, kernel):
class NoiseGeneratorV2(nn.Module):

def __init__(
self,
in_size: int,
hidden_size: int,
data_size: int,
ratios: int,
noise_bands: int,
n_channels: int = 1,
activation = lambda dim: nn.LeakyReLU(.2),
self,
in_size: int,
hidden_size: int,
data_size: int,
ratios: int,
noise_bands: int,
n_channels: int = 1,
activation=lambda dim: nn.LeakyReLU(.2),
):
super().__init__()
net = []
@@ -524,9 +528,10 @@ def __init__(self,
self.enabled = True

self.to_out = normalization(
cc.Conv1d(hidden_size, out_size, kernel_size=3, padding=cc.get_padding(3, dilation=1))) #padding same


cc.Conv1d(hidden_size,
out_size,
kernel_size=3,
padding=cc.get_padding(3, dilation=1))) #padding same

def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x.permute(0, 2, 1)
@@ -557,7 +562,6 @@ def __init__(
pqmf_bands=0,
use_loudness: bool = False,
use_noise: bool = False,

):
super().__init__()
out_channels = in_channels
@@ -599,19 +603,19 @@ def __init__(
recurent_layer=recurrent_layer,
activation=activation,
use_norm=use_norm,
use_loudness=use_loudness,
use_loudness=use_loudness,
use_noise=use_noise)

def forward(self,
x: Tensor,
with_z: bool = False,
with_multi: bool = False) -> Tensor:

if self.pqmf_bands > 1:
x = self.pqmf(x)
z = self.encoder(x)

z = self.encoder(x)

x = self.decoder(z)

if self.pqmf_bands > 1:
@@ -629,8 +633,7 @@ def encode(
x_multiband = self.pqmf(x)
else:
x_multiband = x



z = self.encoder(x_multiband)

if with_multi:
@@ -648,6 +651,5 @@ def decode(self, z: Tensor, with_multi: bool = False) -> Tensor:

if with_multi:
return x, x_multiband

return x

return x
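To see the shape contract that a fix like this restores, here is a toy round-trip check with plain PyTorch layers standing in for the encoder and decoder (placeholder architecture, not the classes from this file):

```python
import torch
import torch.nn as nn

ratio = 4  # toy downsampling ratio
enc = nn.Conv1d(1, 16, kernel_size=2 * ratio, stride=ratio, padding=ratio // 2)
dec = nn.ConvTranspose1d(16, 1, kernel_size=2 * ratio, stride=ratio, padding=ratio // 2)

x = torch.randn(1, 1, 2**16)
z = enc(x)   # (1, 16, 2**16 // ratio) when padding matches kernel and stride
y = dec(z)   # back to (1, 1, 2**16)
assert y.shape == x.shape, (tuple(x.shape), tuple(y.shape))
```

In the full model the same check runs across the whole stack of downsampling and upsampling blocks, which is where a one-sample padding error becomes visible.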
(Diffs for the remaining 5 changed files are not shown here.)
