diff --git a/pyro/optim/clipped_adam.py b/pyro/optim/clipped_adam.py
index 12779d4f8b..14ac268129 100644
--- a/pyro/optim/clipped_adam.py
+++ b/pyro/optim/clipped_adam.py
@@ -23,17 +23,17 @@ class ClippedAdam(Optimizer):
 
     Small modification to the Adam algorithm implemented in torch.optim.Adam
     to include gradient clipping and learning rate decay and an option to use
-    the centered variance.
+    the centered variance (see equation 2 in [2]).
 
-    References
+    **References**
 
-    `A Method for Stochastic Optimization`, Diederik P. Kingma, Jimmy Ba
-    https://arxiv.org/abs/1412.6980
+    [1] `A Method for Stochastic Optimization`, Diederik P. Kingma, Jimmy Ba
+        https://arxiv.org/abs/1412.6980
 
-    `A Two-Step Machine Learning Method for Predicting the Formation Energy of Ternary Compounds`,
-    Varadarajan Rengaraj, Sebastian Jost, Franz Bethke, Christian Plessl,
-    Hossein Mirhosseini, Andrea Walther, Thomas D. Kühne
-    https://doi.org/10.3390/computation11050095
+    [2] `A Two-Step Machine Learning Method for Predicting the Formation Energy of Ternary Compounds`,
+        Varadarajan Rengaraj, Sebastian Jost, Franz Bethke, Christian Plessl,
+        Hossein Mirhosseini, Andrea Walther, Thomas D. Kühne
+        https://doi.org/10.3390/computation11050095
     """
 
     def __init__(
diff --git a/tests/optim/test_optim.py b/tests/optim/test_optim.py
index 91430420aa..5e745efddf 100644
--- a/tests/optim/test_optim.py
+++ b/tests/optim/test_optim.py
@@ -458,6 +458,12 @@ def fit(lr, centered_variance, num_iter=5000):
         return torch.Tensor(loss_vec)
 
     def calc_convergence(loss_vec, tail_len=100, threshold=0.01):
+        """
+        Calculate the number of iterations needed to reach the ultimate
+        loss plus a small threshold, and the convergence rate, which is
+        the mean per-iteration improvement of the gap between the loss
+        and the ultimate loss.
+        """
         ultimate_loss = loss_vec[-tail_len:].mean()
         convergence_iter = (loss_vec < (ultimate_loss + threshold)).nonzero().min()
         convergence_vec = loss_vec[:convergence_iter] - ultimate_loss
@@ -465,6 +471,10 @@ def calc_convergence(loss_vec, tail_len=100, threshold=0.01):
         return ultimate_loss, convergence_rate, convergence_iter
 
     def get_convergence_vec(lr_vec, centered_variance):
+        """
+        Fit parameters for a vector of learning rates, with or without
+        centered variance, and calculate the convergence properties for
+        each learning rate.
+        """
         ultimate_loss_vec, convergence_rate_vec, convergence_iter_vec = [], [], []
         for lr in lr_vec:
            loss_vec = fit(lr=lr, centered_variance=centered_variance)
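The hunks above change only the docstring; the centered-variance update rule itself is not visible in this diff. For orientation, here is a minimal PyTorch sketch of how a centered second-moment estimate differs from Adam's standard one, assuming (per equation 2 in [2]) that the gradient is centered by the running first-moment estimate before squaring. The function `update_second_moment` and its signature are illustrative, not the PR's actual code.

```python
import torch


def update_second_moment(grad, exp_avg, exp_avg_sq, beta2, centered_variance=False):
    """Sketch of one Adam second-moment update (illustrative only)."""
    if centered_variance:
        # Assumption: track the variance E[(g - m)^2] around the running
        # first-moment estimate m, instead of the raw second moment E[g^2].
        centered = grad - exp_avg
        exp_avg_sq.mul_(beta2).addcmul_(centered, centered, value=1 - beta2)
    else:
        # Standard Adam: raw second moment E[g^2].
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
    return exp_avg_sq


grad, exp_avg, exp_avg_sq = torch.randn(3), torch.zeros(3), torch.zeros(3)
update_second_moment(grad, exp_avg, exp_avg_sq, beta2=0.999, centered_variance=True)
```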
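Likewise, the `convergence_rate` assignment falls in the unshown line between the two test hunks. The toy example below exercises the visible part of `calc_convergence` on a synthetic, geometrically decaying loss curve; the rate line here is a stand-in consistent with the docstring's "mean per-iteration improvement of the gap", not necessarily the PR's exact formula.

```python
import torch


def calc_convergence(loss_vec, tail_len=100, threshold=0.01):
    ultimate_loss = loss_vec[-tail_len:].mean()
    convergence_iter = (loss_vec < (ultimate_loss + threshold)).nonzero().min()
    convergence_vec = loss_vec[:convergence_iter] - ultimate_loss
    # Stand-in for the elided line: mean ratio of successive gaps, i.e.
    # the average factor by which (loss - ultimate_loss) shrinks per step.
    convergence_rate = (convergence_vec[:-1] / convergence_vec[1:]).mean()
    return ultimate_loss, convergence_rate, convergence_iter


loss_vec = 0.9 ** torch.arange(500.0)  # synthetic loss decaying toward 0
ultimate_loss, rate, iters = calc_convergence(loss_vec)
print(f"converged after {iters.item()} iterations, per-step gap ratio ~{rate.item():.3f}")
```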