Added Momentum and Nesterov modifications (#148)
* Added Momentum and Nesterov modifications

* Resolved dummy argument error

* Changes in update formulas

* Corrected formulae, velocity allocation changes

* Added concrete implementation of RMSProp

* Report RMSE every 10% of num_epochs; Fix xtest calculation

* Initialize networks with same weights; larger batch; larger test array

* Start putting RMS and velocity structures in place; yet to be allocated and initialized

* WIP: SGD and RMSprop optimizers plumbing at the network % update level

* Added get_gradients() method (draft)

* Clean up formatting and docstrings

* Flush gradients to zero; code compiles but segfaults

* Set default value for batch_size; tests pass in debug mode but segfault in optimized mode

* Update learning rates in simple and sine examples because the default changed

* Added draft test suite for optimizers

* Store the optimizer as a member of the network type

* Don't print to stdout; indentation

* Added convergence tests

* Resolved comments

* Clean up

* Import RMSProp

* Remove old code

* Add optimizer support notes

---------

Co-authored-by: milancurcic <[email protected]>
Spnetic-5 and milancurcic authored Jul 14, 2023
1 parent 6bbc28d commit e9bfbd6
Showing 16 changed files with 470 additions and 151 deletions.
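
The diffs below introduce an optimizer argument to `network % update`. Before the per-file changes, here is a minimal usage sketch assembled from the constructor calls that appear in the changed examples; the layer sizes and hyperparameter values are illustrative only, not taken from the repository.

```fortran
program update_api_sketch
  ! Minimal usage sketch of the optimizer argument to network % update;
  ! layer sizes, learning rate, and momentum values are illustrative only.
  use nf, only: dense, input, network, sgd
  implicit none
  type(network) :: net
  real :: x(1), y(1)

  net = network([input(1), dense(3), dense(1)])
  x = [0.5]
  y = [0.6]

  call net % forward(x)
  call net % backward(y)

  ! Classic SGD is sgd(learning_rate=...); momentum and Nesterov are optional.
  call net % update(optimizer=sgd(learning_rate=0.01, momentum=0.9, nesterov=.true.))

end program update_api_sketch
```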
10 changes: 7 additions & 3 deletions README.md
@@ -17,22 +17,26 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).

* Training and inference of dense (fully connected) and convolutional neural
networks
* Stochastic gradient descent optimizers: Classic, momentum, Nesterov momentum,
and RMSProp
* More than a dozen activation functions and their derivatives
* Loading dense and convolutional models from Keras HDF5 (.h5) files
* Stochastic and mini-batch gradient descent for back-propagation
* Data-based parallelism
* Several activation functions and their derivatives

### Available layers

| Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass |
|------------|------------------|------------------------|----------------------|--------------|---------------|
| Input | `input` | n/a | 1, 3 | n/a | n/a |
| Dense (fully-connected) | `dense` | `input1d`, `flatten` | 1 | ✅ | ✅ |
| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 || |
| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 || |
| Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ |
| Flatten | `flatten` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ |
| Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |

**Note:** The training of convolutional layers has been discovered to be broken
as of release 0.13.0. This will be fixed in a future (hopefully next) release.

## Getting started

Get the code:
126 changes: 89 additions & 37 deletions example/quadratic.f90
@@ -7,28 +7,27 @@ program quadratic_fit
use nf_optimizers, only: sgd

implicit none
type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop
type(network) :: net(6)

! Training parameters
integer, parameter :: num_epochs = 1000
integer, parameter :: train_size = 1000
integer, parameter :: test_size = 30
integer, parameter :: batch_size = 10
integer, parameter :: test_size = 100
integer, parameter :: batch_size = 100
real, parameter :: learning_rate = 0.01
real, parameter :: decay_rate = 0.9

! Input and output data
real, allocatable :: x(:), y(:) ! training data
real, allocatable :: xtest(:), ytest(:) ! testing data
real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:), ypred_rms_prop(:)

integer :: i, n

print '("Fitting quadratic function")'
print '(60("="))'

allocate(xtest(test_size), ytest(test_size))
xtest = [((i - 1) * 2 / test_size, i = 1, test_size)]
xtest = [(real(i - 1) * 2 / test_size, i = 1, test_size)]
ytest = quadratic(xtest)

! x and y as 1-D arrays
@@ -41,38 +41,30 @@ program quadratic_fit
end do
y = quadratic(x)

! Instantiate a separate network for each optimization method.
net_sgd = network([input(1), dense(3), dense(1)])
net_batch_sgd = network([input(1), dense(3), dense(1)])
net_minibatch_sgd = network([input(1), dense(3), dense(1)])
net_rms_prop = network([input(1), dense(3), dense(1)])
! Instantiate a network and copy an instance to the rest of the array
net(1) = network([input(1), dense(3), dense(1)])
net(2:) = net(1)

! Print network info to stdout; this will be the same for all networks.
call net_sgd % print_info()
call net(1) % print_info()

! SGD optimizer
call sgd_optimizer(net_sgd, x, y, learning_rate, num_epochs)
! SGD, no momentum
call sgd_optimizer(net(1), x, y, xtest, ytest, learning_rate, num_epochs)

! SGD, momentum
call sgd_optimizer(net(2), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9)

! SGD, momentum with Nesterov
call sgd_optimizer(net(3), x, y, xtest, ytest, learning_rate, num_epochs, momentum=0.9, nesterov=.true.)

! Batch SGD optimizer
call batch_gd_optimizer(net_batch_sgd, x, y, learning_rate, num_epochs)
call batch_gd_optimizer(net(4), x, y, xtest, ytest, learning_rate, num_epochs)

! Mini-batch SGD optimizer
call minibatch_gd_optimizer(net_minibatch_sgd, x, y, learning_rate, num_epochs, batch_size)
call minibatch_gd_optimizer(net(5), x, y, xtest, ytest, learning_rate, num_epochs, batch_size)

! RMSProp optimizer
call rmsprop_optimizer(net_rms_prop, x, y, learning_rate, num_epochs, decay_rate)

! Calculate predictions on the test set
ypred_sgd = [(net_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_batch_sgd = [(net_batch_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_minibatch_sgd = [(net_minibatch_sgd % predict([xtest(i)]), i = 1, test_size)]
ypred_rms_prop = [(net_rms_prop % predict([xtest(i)]), i = 1, test_size)]

! Print the mean squared error
print '("Stochastic gradient descent MSE:", f9.6)', sum((ypred_sgd - ytest)**2) / size(ytest)
print '(" Batch gradient descent MSE: ", f9.6)', sum((ypred_batch_sgd - ytest)**2) / size(ytest)
print '(" Minibatch gradient descent MSE: ", f9.6)', sum((ypred_minibatch_sgd - ytest)**2) / size(ytest)
print '(" RMSProp MSE: ", f9.6)', sum((ypred_rms_prop - ytest)**2) / size(ytest)
call rmsprop_optimizer(net(6), x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)

contains

@@ -82,65 +82,107 @@ real elemental function quadratic(x) result(y)
y = (x**2 / 2 + x / 2 + 1) / 2
end function quadratic

subroutine sgd_optimizer(net, x, y, learning_rate, num_epochs)
subroutine sgd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, momentum, nesterov)
! In the stochastic gradient descent (SGD) optimizer, we run the forward
! and backward passes and update the weights for each training sample,
! one at a time.
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate
integer, intent(in) :: num_epochs
real, intent(in), optional :: momentum
logical, intent(in), optional :: nesterov
real, allocatable :: ypred(:)
real :: momentum_value
logical :: nesterov_value
integer :: i, n

print *, "Running SGD optimizer..."
print '(a)', 'Stochastic gradient descent'
print '(34("-"))'

! Set default values for momentum and nesterov
if (.not. present(momentum)) then
momentum_value = 0.0
else
momentum_value = momentum
end if

if (.not. present(nesterov)) then
nesterov_value = .false.
else
nesterov_value = nesterov
end if

do n = 1, num_epochs
do i = 1, size(x)
call net % forward([x(i)])
call net % backward([y(i)])
call net % update(sgd(learning_rate=learning_rate))
call net % update(sgd(learning_rate=learning_rate, momentum=momentum_value, nesterov=nesterov_value))
end do

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
end if

end do

print *, ''

end subroutine sgd_optimizer
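
For reference, a sketch of the textbook momentum and Nesterov update rules that the `momentum` and `nesterov` arguments to `sgd` above correspond to. The exact form implemented inside `nf_optimizers` is not shown in this diff, so treat this as an illustration on a toy scalar problem, not the library's code.

```fortran
program momentum_sketch
  ! Textbook classic and Nesterov momentum updates for a single scalar weight
  ! on a toy loss L(w) = w**2. Variable names are illustrative and do not
  ! mirror the internals of nf_optimizers.
  implicit none
  real :: w = 0.5, velocity = 0.
  real, parameter :: learning_rate = 0.01, momentum = 0.9
  logical, parameter :: nesterov = .true.
  real :: gradient
  integer :: n

  do n = 1, 100
    gradient = 2 * w
    velocity = momentum * velocity - learning_rate * gradient
    if (nesterov) then
      ! Nesterov momentum applies a lookahead correction to the weight update
      w = w + momentum * velocity - learning_rate * gradient
    else
      ! Classic momentum applies the accumulated velocity directly
      w = w + velocity
    end if
  end do

  print '("final w after 100 steps: ", f9.6)', w

end program momentum_sketch
```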

subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs)
subroutine batch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs)
! Like the stochastic gradient descent (SGD) optimizer, except that here we
! accumulate the weight gradients for all training samples and update the
! weights once per epoch.
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate
integer, intent(in) :: num_epochs
real, allocatable :: ypred(:)
integer :: i, n

print *, "Running batch GD optimizer..."
print '(a)', 'Batch gradient descent'
print '(34("-"))'

do n = 1, num_epochs
do i = 1, size(x)
call net % forward([x(i)])
call net % backward([y(i)])
end do
call net % update(sgd(learning_rate=learning_rate / size(x)))

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
end if

end do

print *, ''

end subroutine batch_gd_optimizer

subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_size)
subroutine minibatch_gd_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, batch_size)
! Like the batch SGD optimizer, except that here we accumulate the weight
! gradients over a number of mini batches and update the weights once per
! mini batch.
!
! Note: -O3 on GFortran must be accompanied with -fno-frontend-optimize for
! this subroutine to converge to a solution.
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate
integer, intent(in) :: num_epochs, batch_size
integer :: i, j, n, num_samples, num_batches, start_index, end_index
real, allocatable :: batch_x(:), batch_y(:)
integer, allocatable :: batch_indices(:)
real, allocatable :: ypred(:)

print *, "Running mini-batch GD optimizer..."
print '(a)', 'Minibatch gradient descent'
print '(34("-"))'

num_samples = size(x)
num_batches = num_samples / batch_size
@@ -167,17 +200,28 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si

call net % update(sgd(learning_rate=learning_rate / batch_size))
end do

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
end if

end do

print *, ''

end subroutine minibatch_gd_optimizer
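
The collapsed lines above handle the shuffling and batch slicing. A self-contained sketch of that bookkeeping, with toy sizes and illustrative names that are not taken from the diff:

```fortran
program minibatch_sketch
  ! Sketch of shuffled mini-batch bookkeeping: slice a permuted index array
  ! into contiguous batches, assuming batch_size divides num_samples evenly.
  implicit none
  integer, parameter :: num_samples = 12, batch_size = 4
  integer :: indices(num_samples)
  real :: x(num_samples)
  integer :: i, j, start_index, end_index

  x = [(real(i), i = 1, num_samples)]
  indices = [(i, i = 1, num_samples)]  ! a real run would shuffle these first

  do j = 1, num_samples / batch_size
    start_index = (j - 1) * batch_size + 1
    end_index = j * batch_size
    ! Forward/backward passes would run on x(indices(start_index:end_index)),
    ! followed by one weight update per mini batch.
    print '("batch ", i0, ":", *(1x, f4.1))', j, x(indices(start_index:end_index))
  end do

end program minibatch_sketch
```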

subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)
subroutine rmsprop_optimizer(net, x, y, xtest, ytest, learning_rate, num_epochs, decay_rate)
! RMSprop optimizer for updating weights using a running root mean square of the gradients
type(network), intent(inout) :: net
real, intent(in) :: x(:), y(:)
real, intent(in) :: xtest(:), ytest(:)
real, intent(in) :: learning_rate, decay_rate
integer, intent(in) :: num_epochs
integer :: i, j, n
real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero
real, allocatable :: ypred(:)

! Define a dedicated type to store the RMSprop gradients.
! This is needed because array sizes vary between layers and we need to
@@ -191,7 +235,8 @@ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)

type(rms_gradient_dense), allocatable :: rms(:)

print *, "Running RMSprop optimizer..."
print '(a)', 'RMSProp optimizer'
print '(34("-"))'

! Here we allocate the array of RMS gradient derived types.
! We need one for each dense layer, however we will allocate it to the
@@ -237,8 +282,15 @@ subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)
end select
end do

if (mod(n, num_epochs / 10) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, size(xtest))]
print '("Epoch: ", i4,"/",i4,", RMSE = ", f9.6)', n, num_epochs, sum((ypred - ytest)**2) / size(ytest)
end if

end do

print *, ''

end subroutine rmsprop_optimizer
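
A sketch of the textbook RMSprop update that the subroutine above applies per dense layer; the toy loss and names are illustrative, not taken from the diff.

```fortran
program rmsprop_sketch
  ! Textbook RMSprop update for a single scalar weight on a toy loss
  ! L(w) = w**2: keep a decaying mean of squared gradients and divide the
  ! step by its square root (plus epsilon to avoid division by zero).
  implicit none
  real :: w = 0.5, rms = 0., gradient
  real, parameter :: learning_rate = 0.01, decay_rate = 0.9, epsilon = 1e-8
  integer :: n

  do n = 1, 100
    gradient = 2 * w
    rms = decay_rate * rms + (1 - decay_rate) * gradient**2
    w = w - learning_rate * gradient / sqrt(rms + epsilon)
  end do

  print '("final w after 100 steps: ", f9.6)', w

end program rmsprop_sketch
```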

subroutine shuffle(arr)
4 changes: 2 additions & 2 deletions example/simple.f90
@@ -1,5 +1,5 @@
program simple
use nf, only: dense, input, network
use nf, only: dense, input, network, sgd
implicit none
type(network) :: net
real, allocatable :: x(:), y(:)
@@ -24,7 +24,7 @@ program simple

call net % forward(x)
call net % backward(y)
call net % update()
call net % update(optimizer=sgd(learning_rate=1.))

if (mod(n, 50) == 0) &
print '(i4,2(3x,f8.6))', n, net % predict(x)
4 changes: 2 additions & 2 deletions example/sine.f90
@@ -1,5 +1,5 @@
program sine
use nf, only: dense, input, network
use nf, only: dense, input, network, sgd
implicit none
type(network) :: net
real :: x(1), y(1)
@@ -31,7 +31,7 @@ program sine

call net % forward(x)
call net % backward(y)
call net % update()
call net % update(optimizer=sgd(learning_rate=1.))

if (mod(n, 10000) == 0) then
ypred = [(net % predict([xtest(i)]), i = 1, test_size)]
2 changes: 1 addition & 1 deletion src/nf.f90
@@ -5,7 +5,7 @@ module nf
use nf_layer_constructors, only: &
conv2d, dense, flatten, input, maxpool2d, reshape
use nf_network, only: network
use nf_optimizers, only: sgd
use nf_optimizers, only: sgd, rmsprop
use nf_activation, only: activation_function, elu, exponential, &
gaussian, linear, relu, leaky_relu, &
sigmoid, softmax, softplus, step, tanhf, &
15 changes: 13 additions & 2 deletions src/nf/nf_conv2d_layer.f90
@@ -30,11 +30,12 @@ module nf_conv2d_layer

contains

procedure :: init
procedure :: forward
procedure :: backward
procedure :: get_gradients
procedure :: get_num_params
procedure :: get_params
procedure :: init
procedure :: set_params

end type conv2d_layer
@@ -89,13 +90,23 @@ pure module function get_num_params(self) result(num_params)
end function get_num_params

pure module function get_params(self) result(params)
!! Get the parameters of the layer.
!! Return the parameters (weights and biases) of this layer.
!! The parameters are ordered as weights first, biases second.
class(conv2d_layer), intent(in) :: self
!! A `conv2d_layer` instance
real, allocatable :: params(:)
!! Parameters to get
end function get_params

pure module function get_gradients(self) result(gradients)
!! Return the gradients of this layer.
!! The gradients are ordered as weights first, biases second.
class(conv2d_layer), intent(in) :: self
!! A `conv2d_layer` instance
real, allocatable :: gradients(:)
!! Gradients to get
end function get_gradients

module subroutine set_params(self, params)
!! Set the parameters of the layer.
class(conv2d_layer), intent(in out) :: self
12 changes: 12 additions & 0 deletions src/nf/nf_conv2d_layer_submodule.f90
@@ -202,6 +202,18 @@ pure module function get_params(self) result(params)
end function get_params


pure module function get_gradients(self) result(gradients)
class(conv2d_layer), intent(in) :: self
real, allocatable :: gradients(:)

gradients = [ &
pack(self % dw, .true.), &
pack(self % db, .true.) &
]

end function get_gradients


module subroutine set_params(self, params)
class(conv2d_layer), intent(in out) :: self
real, intent(in) :: params(:)
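
The `get_gradients` implementation above flattens `dw` and `db` into a single rank-1 array, mirroring the weights-first, biases-second ordering documented for `get_params`. A toy sketch of that flatten/unflatten round trip; the array shapes and names are arbitrary, not taken from the layer types.

```fortran
program pack_sketch
  ! Sketch of the flatten/unflatten convention: weights first, biases second.
  implicit none
  real :: dw(2, 3), db(3)
  real :: dw_back(2, 3), db_back(3)
  real, allocatable :: gradients(:)

  dw = 1.
  db = 2.

  ! Flatten both arrays into a single rank-1 vector
  gradients = [pack(dw, .true.), pack(db, .true.)]

  ! Recover the original shapes by slicing and reshaping
  dw_back = reshape(gradients(1:size(dw)), shape(dw))
  db_back = gradients(size(dw) + 1:)

  print '("flattened size: ", i0)', size(gradients)

end program pack_sketch
```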
