diff --git a/pyro/optim/clipped_adam.py b/pyro/optim/clipped_adam.py
index 12779d4f8b..14ac268129 100644
--- a/pyro/optim/clipped_adam.py
+++ b/pyro/optim/clipped_adam.py
@@ -23,17 +23,17 @@ class ClippedAdam(Optimizer):
 
     Small modification to the Adam algorithm implemented in torch.optim.Adam
     to include gradient clipping and learning rate decay and an option to use
-    the centered variance.
+    the centered variance (see equation 2 in [2]).
 
-    References
+    **References**
 
-    `A Method for Stochastic Optimization`, Diederik P. Kingma, Jimmy Ba
-    https://arxiv.org/abs/1412.6980
+    [1] `A Method for Stochastic Optimization`, Diederik P. Kingma, Jimmy Ba
+        https://arxiv.org/abs/1412.6980
 
-    `A Two-Step Machine Learning Method for Predicting the Formation Energy of Ternary Compounds`,
-    Varadarajan Rengaraj, Sebastian Jost, Franz Bethke, Christian Plessl,
-    Hossein Mirhosseini, Andrea Walther, Thomas D. Kühne
-    https://doi.org/10.3390/computation11050095
+    [2] `A Two-Step Machine Learning Method for Predicting the Formation Energy of Ternary Compounds`,
+        Varadarajan Rengaraj, Sebastian Jost, Franz Bethke, Christian Plessl,
+        Hossein Mirhosseini, Andrea Walther, Thomas D. Kühne
+        https://doi.org/10.3390/computation11050095
     """
 
     def __init__(
diff --git a/tests/optim/test_optim.py b/tests/optim/test_optim.py
index 91430420aa..5e745efddf 100644
--- a/tests/optim/test_optim.py
+++ b/tests/optim/test_optim.py
@@ -458,6 +458,12 @@ def fit(lr, centered_variance, num_iter=5000):
         return torch.Tensor(loss_vec)
 
     def calc_convergence(loss_vec, tail_len=100, threshold=0.01):
+        """
+        Calculate the number of iterations needed to reach the ultimate
+        loss plus a small threshold, and the convergence rate, which is
+        the mean per-iteration improvement of the gap between the loss
+        and the ultimate loss.
+        """
         ultimate_loss = loss_vec[-tail_len:].mean()
         convergence_iter = (loss_vec < (ultimate_loss + threshold)).nonzero().min()
         convergence_vec = loss_vec[:convergence_iter] - ultimate_loss
@@ -465,6 +471,10 @@ def calc_convergence(loss_vec, tail_len=100, threshold=0.01):
         return ultimate_loss, convergence_rate, convergence_iter
 
     def get_convergence_vec(lr_vec, centered_variance):
+        """
+        Fit parameters for a vector of learning rates, with or without
+        centered variance, and calculate the convergence properties for
+        each learning rate.
+        """
         ultimate_loss_vec, convergence_rate_vec, convergence_iter_vec = [], [], []
         for lr in lr_vec:
            loss_vec = fit(lr=lr, centered_variance=centered_variance)
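The hunks above change only the docstring; the centered-variance update rule itself is not visible in this diff. For orientation, here is a minimal PyTorch sketch of how a centered second-moment estimate differs from Adam's standard one, assuming (per equation 2 in [2]) that the gradient is centered by the running first-moment estimate before squaring. The function `update_second_moment` and its signature are illustrative, not the PR's actual code.

```python
import torch


def update_second_moment(grad, exp_avg, exp_avg_sq, beta2, centered_variance=False):
    """Sketch of one Adam second-moment update (illustrative only)."""
    if centered_variance:
        # Assumption: track the variance E[(g - m)^2] around the running
        # first-moment estimate m, instead of the raw second moment E[g^2].
        centered = grad - exp_avg
        exp_avg_sq.mul_(beta2).addcmul_(centered, centered, value=1 - beta2)
    else:
        # Standard Adam: raw second moment E[g^2].
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
    return exp_avg_sq


grad, exp_avg, exp_avg_sq = torch.randn(3), torch.zeros(3), torch.zeros(3)
update_second_moment(grad, exp_avg, exp_avg_sq, beta2=0.999, centered_variance=True)
```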
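Likewise, the `convergence_rate` assignment falls in the unshown line between the two test hunks. The toy example below exercises the visible part of `calc_convergence` on a synthetic, geometrically decaying loss curve; the rate line here is a stand-in consistent with the docstring's "mean per-iteration improvement of the gap", not necessarily the PR's exact formula.

```python
import torch


def calc_convergence(loss_vec, tail_len=100, threshold=0.01):
    ultimate_loss = loss_vec[-tail_len:].mean()
    convergence_iter = (loss_vec < (ultimate_loss + threshold)).nonzero().min()
    convergence_vec = loss_vec[:convergence_iter] - ultimate_loss
    # Stand-in for the elided line: mean ratio of successive gaps, i.e.
    # the average factor by which (loss - ultimate_loss) shrinks per step.
    convergence_rate = (convergence_vec[:-1] / convergence_vec[1:]).mean()
    return ultimate_loss, convergence_rate, convergence_iter


loss_vec = 0.9 ** torch.arange(500.0)  # synthetic loss decaying toward 0
ultimate_loss, rate, iters = calc_convergence(loss_vec)
print(f"converged after {iters.item()} iterations, per-step gap ratio ~{rate.item():.3f}")
```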