From 21b83a916b8e8dde447ede4572c429c1904a0cc3 Mon Sep 17 00:00:00 2001 From: Zdenek Kasner Date: Tue, 14 Mar 2023 16:20:36 +0100 Subject: [PATCH] Add Huggingface dataset upload scripts --- README.md | 4 +- misc/hf_scripts/README.md | 9 ++ misc/hf_scripts/cacapo.py | 146 ++++++++++++++++++++++++++++++ misc/hf_scripts/charttotext-s.py | 117 ++++++++++++++++++++++++ misc/hf_scripts/eventnarrative.py | 83 +++++++++++++++++ misc/hf_scripts/hitab.py | 89 ++++++++++++++++++ misc/hf_scripts/logic2text.py | 86 ++++++++++++++++++ misc/hf_scripts/logicnlg.py | 98 ++++++++++++++++++++ misc/hf_scripts/numericnlg.py | 119 ++++++++++++++++++++++++ misc/hf_scripts/scigen.py | 80 ++++++++++++++++ misc/hf_scripts/wikitabletext.py | 89 ++++++++++++++++++ 11 files changed, 919 insertions(+), 1 deletion(-) create mode 100644 misc/hf_scripts/README.md create mode 100755 misc/hf_scripts/cacapo.py create mode 100755 misc/hf_scripts/charttotext-s.py create mode 100755 misc/hf_scripts/eventnarrative.py create mode 100755 misc/hf_scripts/hitab.py create mode 100755 misc/hf_scripts/logic2text.py create mode 100755 misc/hf_scripts/logicnlg.py create mode 100755 misc/hf_scripts/numericnlg.py create mode 100755 misc/hf_scripts/scigen.py create mode 100755 misc/hf_scripts/wikitabletext.py diff --git a/README.md b/README.md index 0871b2b..441dd35 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,9 @@ The datasets are stored to `HF_DATASETS_CACHE` directory which defaults to `~/.c The datasets are all loaded from [HuggingFace datasets](https://huggingface.co/datasets) instead of their original repositories which allows to use preprocessed datasets and a single unified loader. -Note that there may be some minor changes in the data w.r.t. to the original datasets due to unification, such as adding "subject", "predicate" and "object" headings to RDF triple-to-text datasets. +**Note that preparing the datasets for the first time may take some time since the datasets have to be downloaded to the cache and preprocessed.** This process may take several minutes depending on the dataset size. However, it is only a one-time process (until the dataset is updated or the cache is deleted). + +Also note that there may be some minor changes in the data w.r.t. the original datasets due to unification, such as adding "subject", "predicate" and "object" headings to RDF triple-to-text datasets. ## Adding datasets For adding a new dataset: diff --git a/misc/hf_scripts/README.md b/misc/hf_scripts/README.md new file mode 100644 index 0000000..f6d13a6 --- /dev/null +++ b/misc/hf_scripts/README.md @@ -0,0 +1,9 @@ +# Huggingface datasets scripts + +This folder contains the scripts used for the initial upload of some of the TabGenie datasets to [Huggingface Datasets](https://huggingface.co/datasets). + +The structure of the scripts is based on existing dataset scripts and on the tutorial which can be found [here](https://huggingface.co/docs/datasets/dataset_script). + +The field `DATASET_PATH` has to point to the original dataset repository. + +Note that some newer versions of the `datasets` package create an empty `README.md` file which overrides the information in the loading script, so the info does not show up in the app. To prevent this behavior, either use `datasets==2.5.1` or delete the `README.md` after uploading the dataset.
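+
+For reference, a rough usage sketch (the path below is only illustrative): point `DATASET_PATH` to a local copy of the original dataset repository and run the script, e.g. for HiTab:
+
+```python
+# Sketch of what `python hitab.py` does once DATASET_PATH is set at the top of the script.
+import datasets
+
+DATASET_PATH = "/path/to/HiTab"  # illustrative: local clone of the original dataset repository
+
+if __name__ == "__main__":
+    dataset = datasets.load_dataset(__file__)  # build the dataset using this loading script
+    dataset.push_to_hub("kasnerz/hitab")       # upload the result to the Huggingface Hub
+```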
\ No newline at end of file diff --git a/misc/hf_scripts/cacapo.py b/misc/hf_scripts/cacapo.py new file mode 100755 index 0000000..cde7add --- /dev/null +++ b/misc/hf_scripts/cacapo.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +""" +The script used to load the dataset from the original source. +""" + +import os +import xml.etree.cElementTree as ET +from collections import defaultdict +from glob import glob +from os.path import join as pjoin +from pathlib import Path + +import datasets + +DATASET_PATH = None + +_CITATION = """\ +@inproceedings{van2020cacapo, + title={The CACAPO dataset: A multilingual, multi-domain dataset for neural pipeline and end-to-end data-to-text generation}, + author={van der Lee, Chris and Emmery, Chris and Wubben, Sander and Krahmer, Emiel}, + booktitle={Proceedings of the 13th International Conference on Natural Language Generation}, + pages={68--79}, + year={2020} +} +""" + +_DESCRIPTION = """\ +CACAPO is a data-to-text dataset that contains sentences from news reports for the sports, weather, stock, and incidents domain in English and Dutch, aligned with relevant attribute-value paired data. This is the first data-to-text dataset based on "naturally occurring" human-written texts (i.e., texts that were not collected in a task-based setting), that covers various domains, as well as multiple languages. """ +_URL = "https://github.com/TallChris91/CACAPO-Dataset" +_LICENSE = "CC BY 4.0" + + +def et_to_dict(tree): + dct = {tree.tag: {} if tree.attrib else None} + children = list(tree) + if children: + dd = defaultdict(list) + for dc in map(et_to_dict, children): + for k, v in dc.items(): + dd[k].append(v) + dct = {tree.tag: dd} + if tree.attrib: + dct[tree.tag].update((k, v) for k, v in tree.attrib.items()) + if tree.text: + text = tree.text.strip() + if children or tree.attrib: + if text: + dct[tree.tag]["text"] = text + else: + dct[tree.tag] = text + return dct + + +def parse_entry(entry): + res = {} + otriple_set_list = entry["originaltripleset"] + res["original_triple_sets"] = [{"otriple_set": otriple_set["otriple"]} for otriple_set in otriple_set_list] + mtriple_set_list = entry["modifiedtripleset"] + res["modified_triple_sets"] = [{"mtriple_set": mtriple_set["mtriple"]} for mtriple_set in mtriple_set_list] + res["category"] = entry["category"] + res["eid"] = entry["eid"] + res["size"] = int(entry["size"]) + res["lex"] = { + "comment": [ex.get("comment", "") for ex in entry.get("lex", [])], + "lid": [ex.get("lid", "") for ex in entry.get("lex", [])], + "text": [ex.get("text", "") for ex in entry.get("lex", [])], + } + return res + + +def xml_file_to_examples(filename): + tree = ET.parse(filename).getroot() + + examples = et_to_dict(tree)["benchmark"]["entries"][0]["entry"] + return [parse_entry(entry) for entry in examples] + + +class CACAPO(datasets.GeneratorBasedBuilder): + VERSION = datasets.Version("1.0.0") + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "category": datasets.Value("string"), + "lang": datasets.Value("string"), + "size": datasets.Value("int32"), + "eid": datasets.Value("string"), + "original_triple_sets": datasets.Sequence( + {"otriple_set": datasets.Sequence(datasets.Value("string"))} + ), + "modified_triple_sets": datasets.Sequence( + {"mtriple_set": datasets.Sequence(datasets.Value("string"))} + ), + "lex": datasets.Sequence( + { + "comment": datasets.Value("string"), + "lid": datasets.Value("string"), + "text": datasets.Value("string"), + } + ), + } + ), + 
supervised_keys=None, + homepage=_URL, + citation=_CITATION, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "dev"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "test"}, + ), + ] + + def _generate_examples(self, filedirs, split): + """Yields examples.""" + id_ = 0 + + for lang in ["en", "nl"]: + for filedir in filedirs: + xml_file = os.path.join(DATASET_PATH, lang, filedir, f"WebNLGFormat{split.title()}.xml") + + for exple_dict in xml_file_to_examples(xml_file): + exple_dict["category"] = filedir + exple_dict["lang"] = lang + id_ += 1 + yield id_, exple_dict + + +if __name__ == "__main__": + dataset = datasets.load_dataset(__file__) + dataset.push_to_hub("kasnerz/cacapo") diff --git a/misc/hf_scripts/charttotext-s.py b/misc/hf_scripts/charttotext-s.py new file mode 100755 index 0000000..e29183b --- /dev/null +++ b/misc/hf_scripts/charttotext-s.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 + +""" +The script used to load the dataset from the original source. +""" + +import json +import datasets +import os +import csv + +DATASET_PATH = None + +_CITATION = """\ +@inproceedings{kantharaj2022chart, + title={Chart-to-Text: A Large-Scale Benchmark for Chart Summarization}, + author={Kantharaj, Shankar and Leong, Rixie Tiffany and Lin, Xiang and Masry, Ahmed and Thakkar, Megh and Hoque, Enamul and Joty, Shafiq}, + booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, + pages={4005--4023}, + year={2022} +} +""" +_DESCRIPTION = """\ +Chart-to-Text is a large-scale benchmark with two datasets and a total of 44,096 charts covering a wide range of topics and chart types. +This dataset CONTAINS ONLY the Statista subset from the benchmark. +Statista (statista.com) is an online platform that regularly publishes charts on a wide range of topics including economics, market and opinion research. 
+ +Statistics: +Total charts: 27868 + +=== Chart Type Information === +Number of charts of each chart type +column: 16319 +bar: 8272 +line: 2646 +pie: 408 +table: 223 + +=== Token Information === +Average token count per summary: 53.65027989091431 +Total tokens: 1495126 +Total types (unique tokens): 39598 +=== Sentence Information === +Average sentence count per summary: 2.5596741782689825 +""" + +_URL = "https://github.com/vis-nlp/Chart-to-text/tree/main/statista_dataset/dataset" +_LICENSE = "GNU General Public License v3.0" + + +class ChartToTextS(datasets.GeneratorBasedBuilder): + VERSION = "1.0.0" + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "title": datasets.Value(dtype="string"), + "ref": datasets.Value(dtype="string"), + "content": datasets.Value(dtype="large_string"), + } + ), + supervised_keys=None, + homepage=_URL, + citation=_CITATION, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": DATASET_PATH + "/" + "dataset", "split": "train"} + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, gen_kwargs={"filepath": DATASET_PATH + "/" + "dataset", "split": "dev"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": DATASET_PATH + "/" + "dataset", "split": "test"} + ), + ] + + def _generate_examples(self, filepath, split): + data = [] + mapping_file = split if split != "dev" else "val" + + with open(os.path.join(filepath, "dataset_split", f"{mapping_file}_index_mapping.csv")) as f: + next(f) + for i, line in enumerate(f): + subdir = "." if line.startswith("two_col") else "multiColumn" + filename = line.split("-")[1].split(".")[0] + + with open(os.path.join(filepath, subdir, "data", filename + ".csv")) as g: + content = [] + reader = csv.reader(g, delimiter=",", quotechar='"') + for row in reader: + content.append(row) + + with open(os.path.join(filepath, subdir, "captions", filename + ".txt")) as g: + ref = g.read().rstrip("\n") + + with open(os.path.join(filepath, subdir, "titles", filename + ".txt")) as g: + title = g.read().rstrip("\n") + + data.append({"content": content, "ref": ref, "title": title}) + + if i % 1000 == 0: + print(f"Loaded {i} items") + + for example_idx, entry in enumerate(data): + yield example_idx, {key: str(value) for key, value in entry.items()} + + +if __name__ == "__main__": + dataset = datasets.load_dataset(__file__) + dataset.push_to_hub("kasnerz/charttotext-s") diff --git a/misc/hf_scripts/eventnarrative.py b/misc/hf_scripts/eventnarrative.py new file mode 100755 index 0000000..a9f74e6 --- /dev/null +++ b/misc/hf_scripts/eventnarrative.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +""" +The script used to load the dataset from the original source. +""" + +import os +from collections import defaultdict + +import json +import datasets + +DATASET_PATH = None + +_CITATION = """\ +@inproceedings{colas2021eventnarrative, + title={EventNarrative: A Large-scale Event-centric Dataset for Knowledge Graph-to-Text Generation}, + author={Colas, Anthony and Sadeghian, Ali and Wang, Yue and Wang, Daisy Zhe}, + booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)}, + year={2021} +} +""" + +_DESCRIPTION = """\ +EventNarrative is a knowledge graph-to-text dataset from publicly available open-world knowledge graphs, focusing on event-centric data. 
+ +EventNarrative consists of approximately 230,000 graphs and their corresponding natural language text, 6 times larger than the current largest parallel dataset. +It makes use of a rich ontology and all of the KG entities are linked to the text.""" + +_URL = "https://www.kaggle.com/datasets/acolas1/eventnarration" +_LICENSE = "CC BY 4.0" + + +class EventNarrative(datasets.GeneratorBasedBuilder): + VERSION = datasets.Version("1.0.0") + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "Event_Name": datasets.Value("string"), + "entity_ref_dict": datasets.Value("large_string"), + "keep_triples": datasets.Value("large_string"), + "narration": datasets.Value("large_string"), + "types": datasets.Value("string"), + "wikipediaLabel": datasets.Value("string"), + } + ), + supervised_keys=None, + homepage=_URL, + citation=_CITATION, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"split": "train"}), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"split": "dev"}), + datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"split": "test"}), + ] + + def _generate_examples(self, split): + """Yields examples.""" + id_ = 0 + + with open(DATASET_PATH + "/" + f"{split}_data.json") as f: + j = json.load(f) + + for example in j: + e = {key: str(value) for key, value in example.items()} + id_ += 1 + yield id_, e + + +if __name__ == "__main__": + dataset = datasets.load_dataset(__file__) + dataset.push_to_hub("kasnerz/eventnarrative") diff --git a/misc/hf_scripts/hitab.py b/misc/hf_scripts/hitab.py new file mode 100755 index 0000000..2477269 --- /dev/null +++ b/misc/hf_scripts/hitab.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +""" +The script used to load the dataset from the original source. +""" + +import json +import datasets +import glob +import os + +DATASET_PATH = None + +_CITATION = """\ +@article{cheng2021hitab, + title={HiTab: A Hierarchical Table Dataset for Question Answering and Natural Language Generation}, + author={Cheng, Zhoujun and Dong, Haoyu and Wang, Zhiruo and Jia, Ran and Guo, Jiaqi and Gao, Yan and Han, Shi and Lou, Jian-Guang and Zhang, Dongmei}, + journal={arXiv preprint arXiv:2108.06712}, + year={2021} +} +""" +_DESCRIPTION = """\ +HiTab is a dataset for question answering and data-to-text over hierarchical tables. It contains 10,672 samples and 3,597 tables from statistical reports (StatCan, NSF) and Wikipedia (ToTTo). 98.1% of the tables in HiTab are hierarchical. 
+""" + +_URL = "https://github.com/microsoft/HiTab" +_LICENSE = "C-UDA 1.0" + + +class HiTab(datasets.GeneratorBasedBuilder): + VERSION = datasets.Version("2022.2.7") + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value(dtype="string"), + "table_id": datasets.Value(dtype="string"), + "table_source": datasets.Value(dtype="string"), + "sentence_id": datasets.Value(dtype="string"), + "sub_sentence_id": datasets.Value(dtype="string"), + "sub_sentence": datasets.Value(dtype="string"), + "question": datasets.Value(dtype="string"), + "answer": datasets.Value(dtype="large_string"), + "aggregation": datasets.Value(dtype="large_string"), + "linked_cells": datasets.Value(dtype="large_string"), + "answer_formulas": datasets.Value(dtype="large_string"), + "reference_cells_map": datasets.Value(dtype="large_string"), + "table_content": datasets.Value(dtype="large_string"), + } + ), + supervised_keys=None, + homepage="https://www.microsoft.com/en-us/research/publication/hitab-a-hierarchical-table-dataset-for-question-answering-and-natural-language-generation/", + citation=_CITATION, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": DATASET_PATH + "/" + "data", "split": "train"}), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": DATASET_PATH + "/" + "data", "split": "dev"}), + datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": DATASET_PATH + "/" + "data", "split": "test"}), + ] + + def _generate_examples(self, filepath, split): + table_content = {} + data = [] + + for filename in glob.glob(os.path.join(filepath, "tables", "raw", "*.json")): + with open(filename) as f: + j = json.load(f) + table_name = os.path.basename(filename).rstrip(".json") + table_content[table_name] = j + + with open(os.path.join(filepath, f"{split}_samples.jsonl")) as f: + for i, line in enumerate(f.readlines()): + j = json.loads(line) + data.append(j) + + for example_idx, entry in enumerate(data): + entry["table_content"] = table_content.get(entry["table_id"]) + yield example_idx, {key: str(value) for key, value in entry.items()} + + +if __name__ == "__main__": + dataset = datasets.load_dataset(__file__) + dataset.push_to_hub("kasnerz/hitab") diff --git a/misc/hf_scripts/logic2text.py b/misc/hf_scripts/logic2text.py new file mode 100755 index 0000000..94d9707 --- /dev/null +++ b/misc/hf_scripts/logic2text.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 + +""" +The script used to load the dataset from the original source. +""" + + +import json +import datasets +import glob +import os + +DATASET_PATH = None + +_CITATION = """\ +@inproceedings{chen2020logic2text, + title={Logic2Text: High-Fidelity Natural Language Generation from Logical Forms}, + author={Chen, Zhiyu and Chen, Wenhu and Zha, Hanwen and Zhou, Xiyou and Zhang, Yunkai and Sundaresan, Sairam and Wang, William Yang}, + booktitle={Findings of the Association for Computational Linguistics: EMNLP 2020}, + pages={2096--2111}, + year={2020} +} +""" +_DESCRIPTION = """\ +Logic2Text is a large-scale dataset with 10,753 descriptions involving common logic types paired with the underlying logical forms. +The logical forms show diversified graph structure of free schema, which poses great challenges on the model's ability to understand the semantics. 
+""" + +_URL = "https://github.com/czyssrs/Logic2Text" +_LICENSE = "MIT" + + +class Logic2Text(datasets.GeneratorBasedBuilder): + VERSION = "1.0.0" + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "topic": datasets.Value(dtype="string"), + "wiki": datasets.Value(dtype="string"), + "url": datasets.Value(dtype="string"), + "action": datasets.Value(dtype="string"), + "sent": datasets.Value(dtype="string"), + "annotation": datasets.Value(dtype="string"), + "logic": datasets.Value(dtype="string"), + "logic_str": datasets.Value(dtype="string"), + "interpret": datasets.Value(dtype="string"), + "num_func": datasets.Value(dtype="string"), + "nid": datasets.Value(dtype="string"), + "g_ids": datasets.Value(dtype="string"), + "g_ids_features": datasets.Value(dtype="string"), + "g_adj": datasets.Value(dtype="string"), + "table_header": datasets.Value(dtype="string"), + "table_cont": datasets.Value(dtype="large_string"), + } + ), + supervised_keys=None, + homepage=_URL, + citation=_CITATION, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": DATASET_PATH + "/" + "dataset", "split": "train"}), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": DATASET_PATH + "/" + "dataset", "split": "dev"}), + datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": DATASET_PATH + "/" + "dataset", "split": "test"}), + ] + + def _generate_examples(self, filepath, split): + data = [] + filename = split if split != "dev" else "valid" + + with open(os.path.join(filepath, f"{filename}.json")) as f: + data = json.load(f) + + for example_idx, entry in enumerate(data): + yield example_idx, {key: str(value) for key, value in entry.items()} + + +if __name__ == "__main__": + dataset = datasets.load_dataset(__file__) + dataset.push_to_hub("kasnerz/logic2text") diff --git a/misc/hf_scripts/logicnlg.py b/misc/hf_scripts/logicnlg.py new file mode 100755 index 0000000..94f7da7 --- /dev/null +++ b/misc/hf_scripts/logicnlg.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 + +""" +The script used to load the dataset from the original source. +""" + +import json +import datasets +import glob +import os + +DATASET_PATH = None + +_CITATION = """\ +@inproceedings{chen2020logical, + title={Logical Natural Language Generation from Open-Domain Tables}, + author={Chen, Wenhu and Chen, Jianshu and Su, Yu and Chen, Zhiyu and Wang, William Yang}, + booktitle={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, + pages={7929--7942}, + year={2020} +} +""" +_DESCRIPTION = """\ +LogicNLG is a dataset for natural language generation from open-domain tables. +LogicNLG is based on TabFact (Chen et al., 2019), which is a table-based fact-checking dataset with rich logical inferences in the annotated statements. 
+""" + +_URL = "https://github.com/wenhuchen/LogicNLG" +_LICENSE = "MIT" + + +class LogicNLG(datasets.GeneratorBasedBuilder): + VERSION = "1.0.0" + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "table": datasets.Value(dtype="large_string"), + "ref": datasets.Value(dtype="string"), + "linked_columns": datasets.Value(dtype="string"), + "title": datasets.Value(dtype="string"), + "template": datasets.Value(dtype="string"), + "table_id": datasets.Value(dtype="string"), + } + ), + supervised_keys=None, + homepage="https://wenhuchen.github.io/logicnlg.github.io/", + citation=_CITATION, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": DATASET_PATH + "/" + "data", "split": "train"} + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, gen_kwargs={"filepath": DATASET_PATH + "/" + "data", "split": "dev"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": DATASET_PATH + "/" + "data", "split": "test"} + ), + ] + + def _generate_examples(self, filepath, split): + filename = split if split != "dev" else "val" + data = [] + + with open(os.path.join(filepath, f"{filename}_lm.json")) as f: + j = json.load(f) + + for i, (table_id, examples) in enumerate(j.items()): + table = [] + with open(os.path.join(filepath, "all_csv", table_id)) as f: + for line in f.readlines(): + table.append(line.rstrip("\n").split("#")) + + for example in examples: + data.append( + { + "table": table, + "ref": example[0], + "linked_columns": example[1], + "title": example[2], + "template": example[3], + "table_id": table_id, + } + ) + for example_idx, entry in enumerate(data): + yield example_idx, {key: str(value) for key, value in entry.items()} + + +if __name__ == "__main__": + dataset = datasets.load_dataset(__file__) + dataset.push_to_hub("kasnerz/logicnlg") diff --git a/misc/hf_scripts/numericnlg.py b/misc/hf_scripts/numericnlg.py new file mode 100755 index 0000000..45ad01a --- /dev/null +++ b/misc/hf_scripts/numericnlg.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 + +"""NumericNLG: Towards Table-to-Text Generation with Numerical Reasoning""" + +import json +import datasets +import glob +import os + +DATASET_PATH = None + +_CITATION = """\ +@inproceedings{suadaa-etal-2021-towards, + title = "Towards Table-to-Text Generation with Numerical Reasoning", + author = "Suadaa, Lya Hulliyyatus and + Kamigaito, Hidetaka and + Funakoshi, Kotaro and + Okumura, Manabu and + Takamura, Hiroya", + booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)", + month = aug, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.acl-long.115", + doi = "10.18653/v1/2021.acl-long.115", + pages = "1451--1465" +} +""" +_DESCRIPTION = """\ +NumericNLG is a dataset for table-totext generation focusing on numerical reasoning. +The dataset consists of textual descriptions of numerical tables from scientific papers. 
+""" + +_URL = "https://github.com/titech-nlp/numeric-nlg" +_LICENSE = "CC BY-SA 4.0" + +DATASET_PATH = None + + +class NumericNLG(datasets.GeneratorBasedBuilder): + VERSION = "1.0.0" + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "table_id_paper": datasets.Value(dtype="string"), + "caption": datasets.Value(dtype="string"), + "row_header_level": datasets.Value(dtype="int32"), + "row_headers": datasets.Value(dtype="large_string"), + "column_header_level": datasets.Value(dtype="int32"), + "column_headers": datasets.Value(dtype="large_string"), + "contents": datasets.Value(dtype="large_string"), + "metrics_loc": datasets.Value(dtype="string"), + "metrics_type": datasets.Value(dtype="large_string"), + "target_entity": datasets.Value(dtype="large_string"), + "table_html_clean": datasets.Value(dtype="large_string"), + "table_name": datasets.Value(dtype="string"), + "table_id": datasets.Value(dtype="string"), + "paper_id": datasets.Value(dtype="string"), + "page_no": datasets.Value(dtype="int32"), + "dir": datasets.Value(dtype="string"), + "description": datasets.Value(dtype="large_string"), + "class_sentence": datasets.Value(dtype="string"), + "sentences": datasets.Value(dtype="large_string"), + "header_mention": datasets.Value(dtype="string"), + "valid": datasets.Value(dtype="int32"), + } + ), + supervised_keys=None, + homepage="https://github.com/titech-nlp/numeric-nlg", + citation=_CITATION, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": DATASET_PATH + "/" + "data", "split": "train"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": DATASET_PATH + "/" + "data", "split": "dev"}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": DATASET_PATH + "/" + "data", "split": "test"}, + ), + ] + + def _generate_examples(self, filepath, split): + filename = split if split != "dev" else "val" + + with open(os.path.join(filepath, f"table_{filename}.json")) as f: + j_tables = json.load(f) + + with open(os.path.join(filepath, f"table_desc_{filename}.json")) as f: + j_desc = json.load(f) + + for example_idx, (entry, desc) in enumerate(zip(j_tables, j_desc)): + + assert entry["table_id_paper"] == desc["table_id_paper"] + + e = {key: str(value) for key, value in entry.items()} + + for key in ["description", "class_sentence", "header_mention", "sentences"]: + e[key] = str(desc[key]) + + yield example_idx, e + + +if __name__ == "__main__": + dataset = datasets.load_dataset(__file__) + + dataset.push_to_hub("kasnerz/numericnlg") diff --git a/misc/hf_scripts/scigen.py b/misc/hf_scripts/scigen.py new file mode 100755 index 0000000..8599f66 --- /dev/null +++ b/misc/hf_scripts/scigen.py @@ -0,0 +1,80 @@ +"""Scigen: dataset for reasoning-aware data-to-text generation from scientific tables""" + +import json +import datasets +import glob +import os + +DATASET_PATH = None + +_CITATION = """\ +@article{moosavi:2021:SciGen, + author = {Nafise Sadat Moosavi, Andreas R{\"u}ckl{\'e}, Dan Roth, Iryna Gurevych}, + title = {Learning to Reason for Text Generation from Scientific Tables}, + journal = {arXiv preprint arXiv:2104.08296}, + year = {2021}, + url = {https://arxiv.org/abs/2104.08296} +} +""" +_DESCRIPTION = """\ +SciGen is dataset for the task of reasoning-aware data-to-text generation consisting of tables from scientific articles 
and their corresponding descriptions. +""" + +_URL = "https://github.com/UKPLab/SciGen" +_LICENSE = "CC BY-NC-SA 4.0" + + +class SciGen(datasets.GeneratorBasedBuilder): + VERSION = "1.0.0" + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "paper": datasets.Value(dtype="string"), + "paper_id": datasets.Value(dtype="string"), + "table_caption": datasets.Value(dtype="string"), + "table_column_names": datasets.Value(dtype="large_string"), + "table_content_values": datasets.Value(dtype="large_string"), + "text": datasets.Value(dtype="large_string"), + } + ), + supervised_keys=None, + homepage=_URL, + citation=_CITATION, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": DATASET_PATH + "/" + "dataset", "split": "train"} + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, gen_kwargs={"filepath": DATASET_PATH + "/" + "dataset", "split": "dev"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": DATASET_PATH + "/" + "dataset", "split": "test"} + ), + ] + + def _generate_examples(self, filepath, split): + data_dir = "development" if split == "dev" else split + + if split in ["train", "dev"]: + file_path = os.path.join(filepath, data_dir, "medium", f"{split}.json") + else: + # there is also "test-Other.json", should be looked into + file_path = os.path.join(filepath, data_dir, f"test-CL.json") + + with open(file_path) as f: + j = json.load(f) + for example_idx, entry in enumerate(list(j.values())): + yield example_idx, {key: str(value) for key, value in entry.items()} + + +if __name__ == "__main__": + dataset = datasets.load_dataset(__file__) + dataset.push_to_hub("kasnerz/scigen") diff --git a/misc/hf_scripts/wikitabletext.py b/misc/hf_scripts/wikitabletext.py new file mode 100755 index 0000000..ddaf24a --- /dev/null +++ b/misc/hf_scripts/wikitabletext.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +""" +The script used to load the dataset from the original source. +""" + + +import os +from collections import defaultdict + +import json +import datasets + +DATASET_PATH = None + +_CITATION = """\ +@inproceedings{bao2018table, + title={Table-to-Text: Describing Table Region with Natural Language}, + author={Junwei Bao and Duyu Tang and Nan Duan and Zhao Yan and Yuanhua Lv and Ming Zhou and Tiejun Zhao}, + booktitle={AAAI}, + url={https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/download/16138/16782}, + year={2018} +} +""" + +_DESCRIPTION = """\ +WikiTableText contains 5,000 tables from Wikipedia, each of which has at least 3 rows and 2 columns. 
+For each table, three rows are selected resulting in 15,000 rows that are further used for manual annotation.""" + +_URL = "https://github.com/msra-nlc/Table2Text" +_LICENSE = "CC BY 4.0" + + +class WikiTableText(datasets.GeneratorBasedBuilder): + VERSION = datasets.Version("1.0.0") + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "headers": datasets.Value("string"), + "content": datasets.Value("string"), + "row_number": datasets.Value("string"), + "reference": datasets.Value("string"), + } + ), + supervised_keys=None, + homepage=_URL, + citation=_CITATION, + license=_LICENSE, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"split": "train"}), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"split": "dev"}), + datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"split": "test"}), + ] + + def _normalize(self, lst): + lst = lst.split("_||_") + lst = [x.replace("_$$_", " ") for x in lst] + lst = [x.replace("_", "").strip() for x in lst] + + return lst + + def _generate_examples(self, split): + """Yields examples.""" + id_ = 0 + + with open(DATASET_PATH + "/" + f"MSRA_NLC.Table2Text.{split}") as f: + for line in f.readlines(): + items = line.split("\t") + e = { + "row_number": items[0], + "headers": self._normalize(items[1]), + "content": self._normalize(items[2]), + "reference": self._normalize(items[3])[0], + } + + id_ += 1 + yield id_, e + + +if __name__ == "__main__": + dataset = datasets.load_dataset(__file__) + dataset.push_to_hub("kasnerz/wikitabletext")