Commit: autoencoder shape mismatch

NilsDem committed Dec 4, 2024
1 parent 9d81e53 commit 80dc395
Showing 8 changed files with 237 additions and 223 deletions.
Paper.pdf (binary file removed)
README.md (14 changes: 10 additions & 4 deletions)
@@ -2,6 +2,11 @@

Official repository for _Combining audio control and style transfer using latent diffusion_ by Nils Demerlé, Philippe Esling, Guillaume Doras, and David Genova, accepted at ISMIR 2024 ([paper link](https://arxiv.org/pdf/2408.00196)).

This diffusion-based generative model creates new audio by blending two inputs: one audio sample that sets the style or timbre, and another input (either audio or MIDI) that defines the structure over time. In this repository, you will find instructions to train your own model as well as model checkpoints trained on the two datasets presented in the paper.

We are currently working on a real-time implementation of this model called AFTER. You can already experiment with a real-time version of the model in MaxMSP on the official AFTER [repository](https://github.com/acids-ircam/AFTER).



## Model training
Training the model requires three steps: processing the dataset, training an autoencoder, and then training the diffusion model.
@@ -15,7 +20,7 @@ python dataset/split_to_lmdb.py --input_path /path/to/audio_dataset --output_pat
Or, to use Slakh with MIDI processing (after downloading Slakh2100 [here](http://www.slakh.com/)):

```bash
python dataset/split_to_lmdb_midi.py --input_path /path/to/slakh --output_path /path/to/slakh/out_lmdb_midi --slakh True
python dataset/split_to_lmdb.py --input_path /path/to/slakh --output_path /path/to/slakh/out_lmdb_midi --slakh True
```

### Autoencoder training
@@ -36,16 +41,17 @@ The model training is configured with gin config files.

To train the audio-to-audio model:
```bash
python train_diffusion.py --db_path /path/to/lmdb --config midi --dataset_type midi --gpu #
python train_diffusion.py --name my_audio_model --db_path /path/to/lmdb --config main --dataset_type waveform --gpu #
```

To train the MIDI-to-audio model:
```bash
python train_diffusion.py --db_path /path/to/lmdb --config main --dataset_type waveform --gpu #
python train_diffusion.py --name my_midi_audio_model --db_path /path/to/lmdb_midi --config midi --dataset_type midi --gpu #
```

## Inference and pretrained models

Three pretraiend models are now available :
Three pretrained models are currently available :
1. Audio to audio transfer model trained on [Slakh](http://www.slakh.com/)
2. Audio to audio transfer model trained on multiple datasets (Maestro, URMP, Filobass, GuitarSet...)
3. MIDI-to-audio model trained on [Slakh](http://www.slakh.com/)
autoencoder/networks/SimpleNets.py (78 changes: 40 additions & 38 deletions)
@@ -18,7 +18,6 @@

import cached_conv as cc


#from .utils import closest_power_2, default, exists, groupby, prefix_dict, prod, to_list
"""
Convolutional Modules
@@ -49,7 +48,8 @@ def Downsample1d(in_channels: int,
out_channels=out_channels,
kernel_size=factor * kernel_multiplier,
stride=factor,
padding=cc.get_padding(2 * factor + 1, factor), #math.ceil(factor / 2),
padding=cc.get_padding(2 * factor,
factor), #math.ceil(factor / 2),
))
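This padding change in Downsample1d appears to be the shape fix referenced in the commit message. A minimal sketch of why the argument matters, assuming kernel_multiplier = 2 and that cc.get_padding produces "same"-style padding totalling kernel_size - stride (both are assumptions, not verified against cached_conv):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def downsampled_length(n_in: int, factor: int, pad_kernel: int) -> int:
    """Length after a strided Conv1d whose padding was computed for `pad_kernel`."""
    kernel, stride = 2 * factor, factor      # kernel_size = factor * kernel_multiplier (multiplier assumed 2)
    total_pad = pad_kernel - stride          # assumed "same"-style total padding
    left, right = total_pad - total_pad // 2, total_pad // 2
    conv = nn.Conv1d(1, 1, kernel_size=kernel, stride=stride)
    x = torch.randn(1, 1, n_in)
    return conv(F.pad(x, (left, right))).shape[-1]

# Padding computed for 2 * factor + 1 (old) vs. the actual kernel 2 * factor (new):
# for some input lengths the old value yields one extra output frame.
for n in (63, 64, 65):
    print(n, downsampled_length(n, 2, pad_kernel=5), downsampled_length(n, 2, pad_kernel=4))
```

The exact lengths depend on how cc.get_padding splits its padding, so treat the printed numbers as illustrative only.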


@@ -308,12 +308,13 @@ def __init__(self,
self.to_out = nn.Sequential(
activation(dim=channels * multipliers[-1]),
normalization(
cc.Conv1d(in_channels=channels * multipliers[-1],
out_channels=out_channels,
kernel_size=3,
padding=cc.get_padding(kernel_size=3, dilation=1),
#padding="same"
)))
cc.Conv1d(
in_channels=channels * multipliers[-1],
out_channels=out_channels,
kernel_size=3,
padding=cc.get_padding(kernel_size=3, dilation=1),
#padding="same"
)))

def forward(self, x: Tensor, with_info: bool = False) -> Tensor:
x = self.to_in(x)
@@ -361,7 +362,7 @@ def __init__(
kernel_size=kernel_size,
padding=cc.get_padding(kernel_size, dilation=1)
#padding="same"
))
))

self.upsamples = nn.ModuleList([
UpsampleBlock1d(in_channels=channels * multipliers[i],
@@ -382,10 +383,14 @@ def __init__(
activation=activation,
use_norm=use_norm,
kernel_size=kernel_size)

if self.use_noise:
self.noise_module = NoiseGeneratorV2(in_size = channels * multipliers[-1], hidden_size = 128, data_size = out_channels, ratios = [2,2,2], noise_bands = 5)

self.noise_module = NoiseGeneratorV2(in_size=channels *
multipliers[-1],
hidden_size=128,
data_size=out_channels,
ratios=[2, 2, 2],
noise_bands=5)

self.recurrent_layer = recurent_layer(in_size=in_channels,
out_size=in_channels)
@@ -397,7 +402,6 @@ def forward(self, x: Tensor, with_info: bool = False) -> Tensor:
for upsample in self.upsamples:
x = upsample(x)


if self.use_noise:
noise = self.noise_module(x)
else:
@@ -408,14 +412,13 @@ def forward(self, x: Tensor, with_info: bool = False) -> Tensor:
if self.use_loudness:
x, amplitude = x.split(x.shape[1] // 2, 1)
x = x * torch.sigmoid(amplitude)

x = torch.tanh(x)

if self.use_noise:
x = x + noise

return x

return x


def amp_to_impulse_response(amp, target_size):
@@ -441,6 +444,7 @@ def amp_to_impulse_response(amp, target_size):

return amp


def fft_convolve(signal, kernel):
"""
convolves signal by kernel on the last dimension
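A minimal sketch of FFT-based convolution over the last dimension (hypothetical standalone helper assuming same-length inputs; the repository's fft_convolve may additionally pad and roll its inputs to turn the circular convolution into a linear one):

```python
import torch

def fft_convolve_sketch(signal: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
    # Pointwise product in the frequency domain equals circular convolution in time.
    n = signal.shape[-1]
    return torch.fft.irfft(torch.fft.rfft(signal, n) * torch.fft.rfft(kernel, n), n)
```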
@@ -457,14 +461,14 @@ def fft_convolve(signal, kernel):
class NoiseGeneratorV2(nn.Module):

def __init__(
self,
in_size: int,
hidden_size: int,
data_size: int,
ratios: int,
noise_bands: int,
n_channels: int = 1,
activation = lambda dim: nn.LeakyReLU(.2),
self,
in_size: int,
hidden_size: int,
data_size: int,
ratios: int,
noise_bands: int,
n_channels: int = 1,
activation=lambda dim: nn.LeakyReLU(.2),
):
super().__init__()
net = []
@@ -524,9 +528,10 @@ def __init__(self,
self.enabled = True

self.to_out = normalization(
cc.Conv1d(hidden_size, out_size, kernel_size=3, padding=cc.get_padding(3, dilation=1))) #padding same


cc.Conv1d(hidden_size,
out_size,
kernel_size=3,
padding=cc.get_padding(3, dilation=1))) #padding same

def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x.permute(0, 2, 1)
@@ -557,7 +562,6 @@ def __init__(
pqmf_bands=0,
use_loudness: bool = False,
use_noise: bool = False,

):
super().__init__()
out_channels = in_channels
@@ -599,19 +603,19 @@ def __init__(
recurent_layer=recurrent_layer,
activation=activation,
use_norm=use_norm,
use_loudness=use_loudness,
use_loudness=use_loudness,
use_noise=use_noise)

def forward(self,
x: Tensor,
with_z: bool = False,
with_multi: bool = False) -> Tensor:

if self.pqmf_bands > 1:
x = self.pqmf(x)
z = self.encoder(x)

z = self.encoder(x)

x = self.decoder(z)

if self.pqmf_bands > 1:
@@ -629,8 +633,7 @@ def encode(
x_multiband = self.pqmf(x)
else:
x_multiband = x



z = self.encoder(x_multiband)

if with_multi:
@@ -648,6 +651,5 @@ def decode(self, z: Tensor, with_multi: bool = False) -> Tensor:

if with_multi:
return x, x_multiband

return x

return x
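To see the shape contract that a fix like this restores, here is a toy round-trip check with plain PyTorch layers standing in for the encoder and decoder (placeholder architecture, not the classes from this file):

```python
import torch
import torch.nn as nn

ratio = 4  # toy downsampling ratio
enc = nn.Conv1d(1, 16, kernel_size=2 * ratio, stride=ratio, padding=ratio // 2)
dec = nn.ConvTranspose1d(16, 1, kernel_size=2 * ratio, stride=ratio, padding=ratio // 2)

x = torch.randn(1, 1, 2**16)
z = enc(x)   # (1, 16, 2**16 // ratio) when padding matches kernel and stride
y = dec(z)   # back to (1, 1, 2**16)
assert y.shape == x.shape, (tuple(x.shape), tuple(y.shape))
```

In the full model the same check runs across the whole stack of downsampling and upsampling blocks, which is where a one-sample padding error becomes visible.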
(Diffs for the remaining 5 changed files are not shown here.)
