Added more detailed comments on ClippedAdam with centered variance and its tests.
pyro-ppl · Jan 24, 2025 · efa56d9 · efa56d9
1 parent b51a14a
commit efa56d9
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 8 deletions.
diff --git a/pyro/optim/clipped_adam.py b/pyro/optim/clipped_adam.py
@@ -23,17 +23,17 @@ class ClippedAdam(Optimizer):
 
     Small modification to the Adam algorithm implemented in torch.optim.Adam
     to include gradient clipping and learning rate decay and an option to use
-    the centered variance.
+    the centered variance (see equation 2 in [2]).
 
-    References
+    **References**
 
-    `A Method for Stochastic Optimization`, Diederik P. Kingma, Jimmy Ba
-    https://arxiv.org/abs/1412.6980
+    [1] `Adam: A Method for Stochastic Optimization`, Diederik P. Kingma, Jimmy Ba
+        https://arxiv.org/abs/1412.6980
 
-    `A Two-Step Machine Learning Method for Predicting the Formation Energy of Ternary Compounds`,
-    Varadarajan Rengaraj, Sebastian Jost, Franz Bethke, Christian Plessl,
-    Hossein Mirhosseini, Andrea Walther, Thomas D. Kühne
-    https://doi.org/10.3390/computation11050095
+    [2] `A Two-Step Machine Learning Method for Predicting the Formation Energy of Ternary Compounds`,
+        Varadarajan Rengaraj, Sebastian Jost, Franz Bethke, Christian Plessl,
+        Hossein Mirhosseini, Andrea Walther, Thomas D. Kühne
+        https://doi.org/10.3390/computation11050095
     """
 
     def __init__(

diff --git a/tests/optim/test_optim.py b/tests/optim/test_optim.py
@@ -458,13 +458,23 @@ def fit(lr, centered_variance, num_iter=5000):
         return torch.Tensor(loss_vec)
 
     def calc_convergence(loss_vec, tail_len=100, threshold=0.01):
+        """
+        Calculate the ultimate loss (mean of the last ``tail_len`` losses), the
+        first iteration at which the loss drops below the ultimate loss plus
+        ``threshold``, and the convergence rate, i.e. the mean log-ratio of
+        consecutive gaps between the loss and the ultimate loss up to that iteration.
+        """
         ultimate_loss = loss_vec[-tail_len:].mean()
         convergence_iter = (loss_vec < (ultimate_loss + threshold)).nonzero().min()
         convergence_vec = loss_vec[:convergence_iter] - ultimate_loss
         convergence_rate = (convergence_vec[:-1] / convergence_vec[1:]).log().mean()
         return ultimate_loss, convergence_rate, convergence_iter
 
     def get_convergence_vec(lr_vec, centered_variance):
+        """
+        Fit parameters for a vector of learning rates, with or without centered variance,
+        and calculate the convergence properties for each learning rate.
+        """
         ultimate_loss_vec, convergence_rate_vec, convergence_iter_vec = [], [], []
         for lr in lr_vec:
             loss_vec = fit(lr=lr, centered_variance=centered_variance)