diff --git a/README.md b/README.md
index c1a1e1a..a6782d1 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # The Inherent Limits of Pretrained LLMs: The Unexpected Convergence of Instruction Tuning and In-Context Learning Capabilities
-[![Arxiv](https://img.shields.io/badge/Arxiv-YYMM.NNNNN-red?style=flat-square&logo=arxiv&logoColor=white)](https://put-here-your-paper.com)
+[![Arxiv](https://img.shields.io/badge/Arxiv-2501.08716-red?style=flat-square&logo=arxiv&logoColor=white)](https://arxiv.org/abs/2501.08716)
 [![License](https://img.shields.io/github/license/UKPLab/arxiv2025-inherent-limits-plms)](https://github.com/UKPLab/arxiv2025-inherent-limits-plms/blob/main/LICENSE)
 [![Python Versions](https://img.shields.io/badge/Python-3.9-blue.svg?style=flat&logo=python&logoColor=white)](https://www.python.org/)
 
@@ -171,17 +171,16 @@ The results are written to `eval_logs.csv` and `bertscore_evals.csv`
 If you found this repository helpful, please cite our paper:
 ```
-@InProceedings{smith:20xx:CONFERENCE_TITLE,
-  author    = {},
-  title     = {},
-  booktitle = {},
-  month     = mmm,
-  year      = {20xx},
-  address   = {},
-  publisher = {},
-  pages     = {XXXX--XXXX},
-  url       = {http://xxxx.xxx}
+@misc{bigoulaeva2025inherentlimitspretrainedllms,
+      title={The Inherent Limits of Pretrained LLMs: The Unexpected Convergence of Instruction Tuning and In-Context Learning Capabilities},
+      author={Irina Bigoulaeva and Harish Tayyar Madabushi and Iryna Gurevych},
+      year={2025},
+      eprint={2501.08716},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2501.08716},
 }
+
 ```
 
 ## Disclaimer
diff --git a/data/manual_downloads/README.md b/data/manual_downloads/README.md
new file mode 100644
index 0000000..afd4195
--- /dev/null
+++ b/data/manual_downloads/README.md
@@ -0,0 +1,19 @@
+# Manually-Downloaded Datasets
+Some datasets contained within FLAN must be downloaded manually from the creators.
+
+These are:
+
+* Newsroom: https://lil.nlp.cornell.edu/newsroom/
+    * Fill out the form provided by the authors to get access to the dataset
+
+Paper: [Newsroom: A Dataset of 1.3 Million Summaries with Diverse Extractive Strategies](https://aclanthology.org/N18-1065/) (Grusky et al., 2018)
+
+* Opin iDebate & Opin Movie: http://www.ccs.neu.edu/home/luwang/
+
+Paper: [Neural Network-Based Abstract Generation for Opinions and Arguments](https://aclanthology.org/N16-1007/) (Wang & Ling, 2016)
+
+* Story Cloze: https://cs.rochester.edu/nlp/rocstories/
+    * Fill out the form provided by the authors to get access to the test dataset
+    * Following the original FLAN, we use the 2016 version.
+
+Paper: [A Corpus and Cloze Evaluation for Deeper Understanding of Commonsense Stories](https://aclanthology.org/N16-1098/) (Mostafazadeh et al., 2016)
\ No newline at end of file
diff --git a/data_utils.py b/data_utils.py
index c7b3dd0..679f7d2 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -277,7 +277,7 @@ def make_label_field(example):
 
 def load_hf_dataset(dataset_name, split=None, do_partitioning=False):
     print("LOADING DATA SPLIT:", split)
-    cutoff_num = 100000
+    cutoff_num = 100000  # Set this to avoid fully loading massive datasets
 
     args_dict = {"split": split, "trust_remote_code": True}
 
@@ -305,7 +305,7 @@ def load_hf_dataset(dataset_name, split=None, do_partitioning=False):
 
     elif dataset_name == "cnn_dailymail":
         args_dict["path"] = dataset_name
-        args_dict["name"] = "3.0.0"  # FLAN uses 3.1.0??
+        args_dict["name"] = "3.0.0"  # FLAN uses 3.1.0, but this is unavailable on HF
 
     elif dataset_name == "web_nlg":
         args_dict["path"] = dataset_name
@@ -537,7 +537,7 @@ def enumerate_lines(line_list):
 
     # Some datasets require post-processing and filtering
     if dataset_name == "snli":
-        # Remove "unsure" samples. Source: (TODO: CITE)
+        # Remove samples without a consensus label. Source: https://huggingface.co/datasets/stanfordnlp/snli
         dataset = dataset.filter(lambda item: item["label"] != -1)
         dataset = dataset.add_column("options", [""] * len(dataset))
     elif dataset_name == "fix_punct":
diff --git a/test.py b/test.py
index 049a4d4..f95c176 100644
--- a/test.py
+++ b/test.py
@@ -135,6 +135,8 @@ def format_ic_bigbench(sample, inner_template, task_name, task_dataset, ic_examp
     return sample
 
 def filter_bad_samples(example):
+    # These samples caused the regex search to hang, since they had too many matches.
+    # We can simply remove these.
     val = True
     if "Applied for full membership" in example["text"]:
         val = False
@@ -605,11 +607,11 @@ def run_test(model_name,
     }
 
     # Load a trained or base model
-    saved_model = "/storage/ukp/work/bigoulaeva/CoT_Recovery/src/saved_models/" + config.args.run_name
+    saved_model = config.path + "saved_models/" + config.args.run_name
     if config.args.run_name == "base":
         saved_model = None
     if not from_samplegen:
-        out_file = "/storage/ukp/work/bigoulaeva/CoT_Recovery/src/saved_models/base_model_evals/" + eval_name + ".csv"
+        out_file = config.path + "saved_models/base_model_evals/" + eval_name + ".csv"
     elif from_samplegen:
         target_folder = config.path + "saved_models/" + config.samplegen_model + "/"
         samplegen_eval_path = "samplegen_pipeline_evals"
@@ -861,7 +863,7 @@ def run_test(model_name,
                     responses["gold_options"].append(orig_options[idx])
                 elif from_samplegen:
                     if config.sample_source == "model":
-                        if orig_options[0] != "":  # Otherwise it breaks for tasks without options (batch_start+idx too high by 1).....
+                        if orig_options[0] != "":  # Otherwise it breaks for tasks without options (batch_start+idx too high by 1)
                             responses["gold_options"].append(orig_options[batch_start+idx][0])
                         else:
                             responses["gold_options"].append("")