diff --git a/README.md b/README.md
index c1a1e1a..a6782d1 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # The Inherent Limits of Pretrained LLMs: The Unexpected Convergence of Instruction Tuning and In-Context Learning Capabilities
-[![Arxiv](https://img.shields.io/badge/Arxiv-YYMM.NNNNN-red?style=flat-square&logo=arxiv&logoColor=white)](https://put-here-your-paper.com)
+[![Arxiv](https://img.shields.io/badge/Arxiv-2501.08716-red?style=flat-square&logo=arxiv&logoColor=white)](https://arxiv.org/abs/2501.08716)
 [![License](https://img.shields.io/github/license/UKPLab/arxiv2025-inherent-limits-plms)](https://github.com/UKPLab/arxiv2025-inherent-limits-plms/blob/main/LICENSE)
 [![Python Versions](https://img.shields.io/badge/Python-3.9-blue.svg?style=flat&logo=python&logoColor=white)](https://www.python.org/)
 
@@ -171,17 +171,16 @@ The results are written to `eval_logs.csv` and `bertscore_evals.csv`
 If you found this repository helpful, please cite our paper:
 ```
-@InProceedings{smith:20xx:CONFERENCE_TITLE,
-  author    = {},
-  title     = {},
-  booktitle = {},
-  month     = mmm,
-  year      = {20xx},
-  address   = {},
-  publisher = {},
-  pages     = {XXXX--XXXX},
-  url       = {http://xxxx.xxx}
+@misc{bigoulaeva2025inherentlimitspretrainedllms,
+      title={The Inherent Limits of Pretrained LLMs: The Unexpected Convergence of Instruction Tuning and In-Context Learning Capabilities},
+      author={Irina Bigoulaeva and Harish Tayyar Madabushi and Iryna Gurevych},
+      year={2025},
+      eprint={2501.08716},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2501.08716},
 }
+
 ```
 
 ## Disclaimer
diff --git a/data/manual_downloads/README.md b/data/manual_downloads/README.md
new file mode 100644
index 0000000..afd4195
--- /dev/null
+++ b/data/manual_downloads/README.md
@@ -0,0 +1,19 @@
+# Manually-Downloaded Datasets
+Some datasets contained within FLAN must be downloaded manually from the creators.
+
+These are:
+
+* Newsroom: https://lil.nlp.cornell.edu/newsroom/
+    * Fill out the form provided by the authors to get access to the dataset
+
+Paper: [Newsroom: A Dataset of 1.3 Million Summaries with Diverse Extractive Strategies](https://aclanthology.org/N18-1065/) (Grusky et al., 2018)
+
+* Opin iDebate & Opin Movie: http://www.ccs.neu.edu/home/luwang/
+
+Paper: [Neural Network-Based Abstract Generation for Opinions and Arguments](https://aclanthology.org/N16-1007/) (Wang & Ling, 2016)
+
+* Story Cloze: https://cs.rochester.edu/nlp/rocstories/
+    * Fill out the form provided by the authors to get access to the test dataset
+    * Following the original FLAN, we use the 2016 version.
+
+Paper: [A Corpus and Cloze Evaluation for Deeper Understanding of Commonsense Stories](https://aclanthology.org/N16-1098/) (Mostafazadeh et al., 2016)
\ No newline at end of file
diff --git a/data_utils.py b/data_utils.py
index c7b3dd0..679f7d2 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -277,7 +277,7 @@ def make_label_field(example):
 
 def load_hf_dataset(dataset_name, split=None, do_partitioning=False):
     print("LOADING DATA SPLIT:", split)
-    cutoff_num = 100000
+    cutoff_num = 100000  # Set this to avoid fully loading massive datasets
 
     args_dict = {"split": split, "trust_remote_code": True}
 
@@ -305,7 +305,7 @@ def load_hf_dataset(dataset_name, split=None, do_partitioning=False):
 
     elif dataset_name == "cnn_dailymail":
         args_dict["path"] = dataset_name
-        args_dict["name"] = "3.0.0"  # FLAN uses 3.1.0??
+        args_dict["name"] = "3.0.0"  # FLAN uses 3.1.0, but this is unavailable on HF
 
     elif dataset_name == "web_nlg":
         args_dict["path"] = dataset_name
@@ -537,7 +537,7 @@ def enumerate_lines(line_list):
 
     # Some datasets require post-processing and filtering
     if dataset_name == "snli":
-        # Remove "unsure" samples. Source: (TODO: CITE)
+        # Remove samples without a consensus label. Source: https://huggingface.co/datasets/stanfordnlp/snli
         dataset = dataset.filter(lambda item: item["label"] != -1)
         dataset = dataset.add_column("options", [""] * len(dataset))
     elif dataset_name == "fix_punct":
diff --git a/test.py b/test.py
index 049a4d4..f95c176 100644
--- a/test.py
+++ b/test.py
@@ -135,6 +135,8 @@ def format_ic_bigbench(sample, inner_template, task_name, task_dataset, ic_examp
     return sample
 
 def filter_bad_samples(example):
+    # These samples caused the regex search to hang, since they had too many matches.
+    # We can simply remove these.
     val = True
     if "Applied for full membership" in example["text"]:
         val = False
@@ -605,11 +607,11 @@ def run_test(model_name,
     }
 
     # Load a trained or base model
-    saved_model = "/storage/ukp/work/bigoulaeva/CoT_Recovery/src/saved_models/" + config.args.run_name
+    saved_model = config.path + "saved_models/" + config.args.run_name
     if config.args.run_name == "base":
         saved_model = None
     if not from_samplegen:
-        out_file = "/storage/ukp/work/bigoulaeva/CoT_Recovery/src/saved_models/base_model_evals/" + eval_name + ".csv"
+        out_file = config.path + "saved_models/base_model_evals/" + eval_name + ".csv"
     elif from_samplegen:
         target_folder = config.path + "saved_models/" + config.samplegen_model + "/"
         samplegen_eval_path = "samplegen_pipeline_evals"
@@ -861,7 +863,7 @@ def run_test(model_name,
                     responses["gold_options"].append(orig_options[idx])
                 elif from_samplegen:
                     if config.sample_source == "model":
-                        if orig_options[0] != "":  # Otherwise it breaks for tasks without options (batch_start+idx too high by 1).....
+                        if orig_options[0] != "":  # Otherwise it breaks for tasks without options (batch_start+idx too high by 1)
                             responses["gold_options"].append(orig_options[batch_start+idx][0])
                         else:
                             responses["gold_options"].append("")