Added Momentum and Nesterov modifications (#148)
* Added Momentum and Nesterov modifications

* Resolved dummy argument error

* Changes in update formulas

* Corrected formulae, velocity allocation changes

* Added concrete implementation of RMSProp

* Report RMSE every 10% of num_epochs; Fix xtest calculation

* Initialize networks with same weights; larger batch; larger test array

* Start putting RMS and velocity structures in place; yet to be allocated and initialized

* WIP: SGD and RMSprop optimizers plumbing at the network % update level

* Added get_gradients() method (draft)

* Clean up formatting and docstrings

* Flush gradients to zero; code compiles but segfaults

* Set default value for batch_size; tests pass in debug mode but segfault in optimized mode

* Update learning rates in simple and sine examples because the default changed

* Added draft test suite for optimizers

* Store the optimizer as a member of the network type

* Don't print to stdout; indentation

* Added convergence tests

* Resolved comments

* Clean up

* Import RMSProp

* Remove old code

* Add optimizer support notes

---------

Co-authored-by: milancurcic <[email protected]>
Spnetic-5 and milancurcic authored Jul 14, 2023
1 parent 6bbc28d commit e9bfbd6
Showing 16 changed files with 470 additions and 151 deletions.
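
The diffs below introduce an optimizer argument to `network % update`. Before the per-file changes, here is a minimal usage sketch assembled from the constructor calls that appear in the changed examples; the layer sizes and hyperparameter values are illustrative only, not taken from the repository.

```fortran
program update_api_sketch
  ! Minimal usage sketch of the optimizer argument to network % update;
  ! layer sizes, learning rate, and momentum values are illustrative only.
  use nf, only: dense, input, network, sgd
  implicit none
  type(network) :: net
  real :: x(1), y(1)

  net = network([input(1), dense(3), dense(1)])
  x = [0.5]
  y = [0.6]

  call net % forward(x)
  call net % backward(y)

  ! Classic SGD is sgd(learning_rate=...); momentum and Nesterov are optional.
  call net % update(optimizer=sgd(learning_rate=0.01, momentum=0.9, nesterov=.true.))

end program update_api_sketch
```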
10 changes: 7 additions & 3 deletions README.md
@@ -17,22 +17,26 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).

* Training and inference of dense (fully connected) and convolutional neural
networks
* Stochastic gradient descent optimizers: Classic, momentum, Nesterov momentum,
and RMSProp
* More than a dozen activation functions and their derivatives
* Loading dense and convolutional models from Keras HDF5 (.h5) files
* Stochastic and mini-batch gradient descent for back-propagation
* Data-based parallelism
* Several activation functions and their derivatives

### Available layers

| Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass |
|------------|------------------|------------------------|----------------------|--------------|---------------|
| Input | `input` | n/a | 1, 3 | n/a | n/a |
| Dense (fully-connected) | `dense` | `input1d`, `flatten` | 1 | ✅ | ✅ |
| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 || |
| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 || |
| Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ |
| Flatten | `flatten` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ |
| Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |

**Note:** The training of convolutional layers has been discovered to be broken
as of release 0.13.0. This will be fixed in a future (hopefully next) release.

## Getting started

Get the code:
126 changes: 89 additions & 37 deletions example/quadratic.f90
@@ -7,28 +7,27 @@ program quadratic_fit
use nf_optimizers, only: sgd

implicit none
type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop
type(network) :: net(6)

! Training parameters
integer, parameter :: num_epochs = 1000
integer, parameter :: train_size = 1000
integer, parameter :: test_size = 30
integer, parameter :: batch_size = 10
integer, parameter :: test_size = 100
integer, parameter :: batch_size = 100
real, parameter :: learning_rate = 0.01
real, parameter :: decay_rate = 0.9

! Input and output data
real, allocatable :: x(:), y(:) ! training data
real, allocatable :: xtest(:), ytest(:) ! testing data
real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:), ypred_rms_prop(:)

integer :: i, n

print '("Fitting quadratic function")'
print '(60("="))'

allocate(xtest(test_size), ytest(test_size))
xtest = [((i - 1) * 2 / test_size, i = 1, test_size)]
xtest = [(real(i - 1) * 2 / test_size, i = 1, test_size)]
ytest = quadratic(xtest)

! x and y as 1-D arrays
@@ -41,38 +41,30 @@ program quadratic_fit
end do
y = quadratic(x)

! Instantiate a separate network for each optimization method.
net_sgd = network([input(1), dense(3), dense(1)])
net_batch_sgd = network([input(1), dense(3), dense(1)])
net_minibatch_sgd = network([input(1), dense(3), dense(1)])
net_rms_prop = network([input(1), dense(3), dense(1)])
! Instantiate a network and copy an instance to the rest of the array
net(1) = network([input(1), dense(3), dense(1)])
net(2:) = net(1)

! Print network info to stdout; this will be the same for all networks.
call net_sgd % print_info()
call net(1) % print_info()

! SGD optimizer
call sgd_optimizer(net_sgd, x, y, learning_rate, num_epochs)
! SGD, no momentum
call sgd_optimizer(net(1), x, y, xtest, ytest, learning_rate, num_epochs)

! SGD, momentum
call sgd_optimizer(net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9)

! SGD, momentum with Nesterov
call sgd_optimizer(net(3), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9, nesterov=.true.)

! Batch SGD optimizer
call batch_gd_optimizer(net_batch_sgd, x, y, learning_rate, num_epochs)
call batch_gd_optimizer(net(4), x, y, xtest, ytest, learning_rate, num_epochs)

! Mini-batch SGD optimizer
call minibatch_gd_optimizer(net_minibatch_sgd, x, y, learning_rate, num_epochs, batch_size)
call minibatch_gd_optimizer(net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size)

! RMSProp optimizer
call rmsprop_optimizer(net_rms_prop, x, y, learning_rate, num_epochs, decay_rate)

! Calculate predictions on the test set
ypred_sgd = [(net_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_batch_sgd = [(net_batch_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_minibatch_sgd = [(net_minibatch_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_rms_prop = [(net_rms_prop % predict([xtest(i)]), i = 1, test_size)]

! Print the mean squared error
print '("Stochastic gradient descent MSE:", f9.6)', sum((ypred_sgd - ytest)**2) / size(ytest)
print '(" Batch gradient descent MSE: ", f9.6)', sum((ypred_batch_sgd - ytest)**2) / size(ytest)
print '(" Minibatch gradient descent MSE: ", f9.6)', sum((ypred_minibatch_sgd - ytest)**2) / size(ytest)
print '(" RMSProp MSE: ", f9.6)', sum((ypred_rms_prop - ytest)**2) / size(ytest)
call rmsprop_optimizer(net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)

contains

@@ -82,65 +82,107 @@ real elemental function quadratic(x) result(y)
y = (x**2 / 2 + x / 2 + 1) / 2
end function quadratic

subroutine sgd_optimizer(net, x, y, learning_rate, num_epochs)
subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov)
! In the stochastic gradient descent (SGD) optimizer, we run the forward
! and backward passes and update the weights for each training sample,
! one at a time.
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate
integer, intent(in) :: num_epochs
real, intent(in), optional :: momentum
logical, intent(in), optional :: nesterov
real, allocatable :: ypred(:)
real :: momentum_value
logical :: nesterov_value
integer :: i, n

print *, "Running SGD optimizer..."
print '(a)', 'Stochastic gradient descent'
print '(34("-"))'

! Set default values for momentum and nesterov
if (.not. present(momentum)) then
momentum_value = 0.0
else
momentum_value = momentum
end if

if (.not. present(nesterov)) then
nesterov_value = .false.
else
nesterov_value = nesterov
end if

do n = 1, num_epochs
do i = 1, size(x)
call net % forward([x(i)])
call net % backward([y(i)])
call net % update(sgd(learning_rate=learning_rate))
call net % update(sgd(learning_rate=learning_rate, momentum=momentum_value, nesterov=nesterov_value))
end do

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
end if

end do

print *, ''

end subroutine sgd_optimizer
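
For reference, a sketch of the textbook momentum and Nesterov update rules that the `momentum` and `nesterov` arguments to `sgd` above correspond to. The exact form implemented inside `nf_optimizers` is not shown in this diff, so treat this as an illustration on a toy scalar problem, not the library's code.

```fortran
program momentum_sketch
  ! Textbook classic and Nesterov momentum updates for a single scalar weight
  ! on a toy loss L(w) = w**2. Variable names are illustrative and do not
  ! mirror the internals of nf_optimizers.
  implicit none
  real :: w = 0.5, velocity = 0.
  real, parameter :: learning_rate = 0.01, momentum = 0.9
  logical, parameter :: nesterov = .true.
  real :: gradient
  integer :: n

  do n = 1, 100
    gradient = 2 * w
    velocity = momentum * velocity - learning_rate * gradient
    if (nesterov) then
      ! Nesterov momentum applies a lookahead correction to the weight update
      w = w + momentum * velocity - learning_rate * gradient
    else
      ! Classic momentum applies the accumulated velocity directly
      w = w + velocity
    end if
  end do

  print '("final w after 100 steps: ", f9.6)', w

end program momentum_sketch
```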

subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs)
subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs)
! Like the stochastic gradient descent (SGD) optimizer, except that here we
! accumulate the weight gradients for all training samples and update the
! weights once per epoch.
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate
integer, intent(in) :: num_epochs
real, allocatable :: ypred(:)
integer :: i, n

print *, "Running batch GD optimizer..."
print '(a)', 'Batch gradient descent'
print '(34("-"))'

do n = 1, num_epochs
do i = 1, size(x)
call net % forward([x(i)])
call net % backward([y(i)])
end do
call net % update(sgd(learning_rate=learning_rate / size(x)))

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
end if

end do

print *, ''

end subroutine batch_gd_optimizer

subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_size)
subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size)
! Like the batch SGD optimizer, except that here we accumulate the weight
! gradients over a number of mini batches and update the weights once per
! mini batch.
!
! Note: -O3 on GFortran must be accompanied with -fno-frontend-optimize for
! this subroutine to converge to a solution.
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate
integer, intent(in) :: num_epochs, batch_size
integer :: i, j, n, num_samples, num_batches, start_index, end_index
real, allocatable :: batch_x(:), batch_y(:)
integer, allocatable :: batch_indices(:)
real, allocatable :: ypred(:)

print *, "Running mini-batch GD optimizer..."
print '(a)', 'Minibatch gradient descent'
print '(34("-"))'

num_samples = size(x)
num_batches = num_samples / batch_size
@@ -167,17 +200,28 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si

call net % update(sgd(learning_rate=learning_rate / batch_size))
end do

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
end if

end do

print *, ''

end subroutine minibatch_gd_optimizer
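
The collapsed lines above handle the shuffling and batch slicing. A self-contained sketch of that bookkeeping, with toy sizes and illustrative names that are not taken from the diff:

```fortran
program minibatch_sketch
  ! Sketch of shuffled mini-batch bookkeeping: slice a permuted index array
  ! into contiguous batches, assuming batch_size divides num_samples evenly.
  implicit none
  integer, parameter :: num_samples = 12, batch_size = 4
  integer :: indices(num_samples)
  real :: x(num_samples)
  integer :: i, j, start_index, end_index

  x = [(real(i), i = 1, num_samples)]
  indices = [(i, i = 1, num_samples)]  ! a real run would shuffle these first

  do j = 1, num_samples / batch_size
    start_index = (j - 1) * batch_size + 1
    end_index = j * batch_size
    ! Forward/backward passes would run on x(indices(start_index:end_index)),
    ! followed by one weight update per mini batch.
    print '("batch ", i0, ":", *(1x, f4.1))', j, x(indices(start_index:end_index))
  end do

end program minibatch_sketch
```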

subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)
subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)
! RMSprop optimizer for updating weights using a running root mean square of the gradients
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate, decay_rate
integer, intent(in) :: num_epochs
integer :: i, j, n
real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero
real, allocatable :: ypred(:)

! Define a dedicated type to store the RMSprop gradients.
! This is needed because array sizes vary between layers and we need to
@@ -191,7 +235,8 @@ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)

type(rms_gradient_dense), allocatable :: rms(:)

print *, "Running RMSprop optimizer..."
print '(a)', 'RMSProp optimizer'
print '(34("-"))'

! Here we allocate the array of RMS gradient derived types.
! We need one for each dense layer, however we will allocate it to the
@@ -237,8 +282,15 @@ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)
end select
end do

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
end if

end do

print *, ''

end subroutine rmsprop_optimizer
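
A sketch of the textbook RMSprop update that the subroutine above applies per dense layer; the toy loss and names are illustrative, not taken from the diff.

```fortran
program rmsprop_sketch
  ! Textbook RMSprop update for a single scalar weight on a toy loss
  ! L(w) = w**2: keep a decaying mean of squared gradients and divide the
  ! step by its square root (plus epsilon to avoid division by zero).
  implicit none
  real :: w = 0.5, rms = 0., gradient
  real, parameter :: learning_rate = 0.01, decay_rate = 0.9, epsilon = 1e-8
  integer :: n

  do n = 1, 100
    gradient = 2 * w
    rms = decay_rate * rms + (1 - decay_rate) * gradient**2
    w = w - learning_rate * gradient / sqrt(rms + epsilon)
  end do

  print '("final w after 100 steps: ", f9.6)', w

end program rmsprop_sketch
```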

subroutine shuffle(arr)
4 changes: 2 additions & 2 deletions example/simple.f90
@@ -1,5 +1,5 @@
program simple
use nf, only: dense, input, network
use nf, only: dense, input, network, sgd
implicit none
type(network) :: net
real, allocatable :: x(:), y(:)
@@ -24,7 +24,7 @@ program simple

call net % forward(x)
call net % backward(y)
call net % update()
call net % update(optimizer=sgd(learning_rate=1.))

if (mod(n, 50) == 0) &
print '(i4,2(3x,f8.6))', n, net % predict(x)
4 changes: 2 additions & 2 deletions example/sine.f90
@@ -1,5 +1,5 @@
program sine
use nf, only: dense, input, network
use nf, only: dense, input, network, sgd
implicit none
type(network) :: net
real :: x(1), y(1)
@@ -31,7 +31,7 @@ program sine

call net % forward(x)
call net % backward(y)
call net % update()
call net % update(optimizer=sgd(learning_rate=1.))

if (mod(n, 10000) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, test_size)]
2 changes: 1 addition & 1 deletion src/nf.f90
@@ -5,7 +5,7 @@ module nf
use nf_layer_constructors, only: &
conv2d, dense, flatten, input, maxpool2d, reshape
use nf_network, only: network
use nf_optimizers, only: sgd
use nf_optimizers, only: sgd, rmsprop
use nf_activation, only: activation_function, elu, exponential, &
gaussian, linear, relu, leaky_relu, &
sigmoid, softmax, softplus, step, tanhf, &
15 changes: 13 additions & 2 deletions src/nf/nf_conv2d_layer.f90
@@ -30,11 +30,12 @@ module nf_conv2d_layer

contains

procedure :: init
procedure :: forward
procedure :: backward
procedure :: get_gradients
procedure :: get_num_params
procedure :: get_params
procedure :: init
procedure :: set_params

end type conv2d_layer
@@ -89,13 +90,23 @@ pure module function get_num_params(self) result(num_params)
end function get_num_params

pure module function get_params(self) result(params)
!! Get the parameters of the layer.
!! Return the parameters (weights and biases) of this layer.
!! The parameters are ordered as weights first, biases second.
class(conv2d_layer), intent(in) :: self
!! A `conv2d_layer` instance
real, allocatable :: params(:)
!! Parameters to get
end function get_params

pure module function get_gradients(self) result(gradients)
!! Return the gradients of this layer.
!! The gradients are ordered as weights first, biases second.
class(conv2d_layer), intent(in) :: self
!! A `conv2d_layer` instance
real, allocatable :: gradients(:)
!! Gradients to get
end function get_gradients

module subroutine set_params(self, params)
!! Set the parameters of the layer.
class(conv2d_layer), intent(in out) :: self
12 changes: 12 additions & 0 deletions src/nf/nf_conv2d_layer_submodule.f90
@@ -202,6 +202,18 @@ pure module function get_params(self) result(params)
end function get_params


pure module function get_gradients(self) result(gradients)
class(conv2d_layer), intent(in) :: self
real, allocatable :: gradients(:)

gradients = [ &
pack(self % dw, .true.), &
pack(self % db, .true.) &
]

end function get_gradients


module subroutine set_params(self, params)
class(conv2d_layer), intent(in out) :: self
real, intent(in) :: params(:)
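
The `get_gradients` implementation above flattens `dw` and `db` into a single rank-1 array, mirroring the weights-first, biases-second ordering documented for `get_params`. A toy sketch of that flatten/unflatten round trip; the array shapes and names are arbitrary, not taken from the layer types.

```fortran
program pack_sketch
  ! Sketch of the flatten/unflatten convention: weights first, biases second.
  implicit none
  real :: dw(2, 3), db(3)
  real :: dw_back(2, 3), db_back(3)
  real, allocatable :: gradients(:)

  dw = 1.
  db = 2.

  ! Flatten both arrays into a single rank-1 vector
  gradients = [pack(dw, .true.), pack(db, .true.)]

  ! Recover the original shapes by slicing and reshaping
  dw_back = reshape(gradients(1:size(dw)), shape(dw))
  db_back = gradients(size(dw) + 1:)

  print '("flattened size: ", i0)', size(gradients)

end program pack_sketch
```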
