Add Huggingface dataset upload scripts
kasnerz committed Mar 14, 2023
1 parent 184b2de commit 21b83a9
Showing 11 changed files with 919 additions and 1 deletion.
4 changes: 3 additions & 1 deletion README.md
@@ -133,7 +133,9 @@ The datasets are stored to `HF_DATASETS_CACHE` directory which defaults to `~/.c

The datasets are all loaded from [HuggingFace datasets](https://huggingface.co/datasets) instead of their original repositories, which makes it possible to use preprocessed datasets and a single unified loader.

**Note that preparing the datasets for the first time may take a while, since the datasets have to be downloaded to the cache and preprocessed.** This process takes several minutes depending on the dataset size. However, it is only a one-time process (until the dataset is updated or the cache is deleted).

Also note that there may be some minor changes in the data w.r.t. the original datasets due to unification, such as adding "subject", "predicate" and "object" headings to RDF triple-to-text datasets.
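
For illustration, here is a minimal sketch of loading one of the preprocessed datasets directly with the `datasets` library (the cache path below is only an example value for the `HF_DATASETS_CACHE` variable):

```python
import os

# Optional: redirect the dataset cache before importing `datasets`
# (it defaults to ~/.cache/huggingface/datasets).
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets_cache"  # example path

import datasets

# The first call downloads and preprocesses the data; subsequent calls reuse the cache.
dataset = datasets.load_dataset("kasnerz/cacapo", split="train")
print(dataset[0])
```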

## Adding datasets
To add a new dataset:
9 changes: 9 additions & 0 deletions misc/hf_scripts/README.md
@@ -0,0 +1,9 @@
# Huggingface datasets scripts

This folder contains the scripts used to upload to [Huggingface Datasets](https://huggingface.co/datasets) those datasets in TabGenie which were not previously available there.

The structure of the scripts is based on existing dataset scripts and on the tutorial which can be found [here](https://huggingface.co/docs/datasets/dataset_script).

In each script, the `DATASET_PATH` constant has to point to the original dataset repository.

Note that some newer versions of the `datasets` package create an empty `README.md` file which overrides the information in the loading script, so the info then does not show up in the app. To prevent this behavior, either use `datasets==2.5.1` or delete the `README.md` after uploading the dataset.
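
For concreteness, a rough sketch of the upload workflow using `cacapo.py` as an example (the local path is a placeholder that has to be filled in):

```python
# Sketch only: first set the DATASET_PATH constant at the top of cacapo.py, e.g.
#     DATASET_PATH = "/path/to/CACAPO-Dataset"
# and then run the script directly; its __main__ block roughly does the following:
import datasets

dataset = datasets.load_dataset("misc/hf_scripts/cacapo.py")  # build all splits from the local files
dataset.push_to_hub("kasnerz/cacapo")  # upload the dataset to the Hugging Face Hub
```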
146 changes: 146 additions & 0 deletions misc/hf_scripts/cacapo.py
@@ -0,0 +1,146 @@
#!/usr/bin/env python3

"""
Script for loading the CACAPO dataset from the original source and uploading it to the Hugging Face Hub.
"""

import os
import xml.etree.cElementTree as ET
from collections import defaultdict
from glob import glob
from os.path import join as pjoin
from pathlib import Path

import datasets

DATASET_PATH = None  # path to the original CACAPO repository (has to be set before running the script)

_CITATION = """\
@inproceedings{van2020cacapo,
title={The CACAPO dataset: A multilingual, multi-domain dataset for neural pipeline and end-to-end data-to-text generation},
author={van der Lee, Chris and Emmery, Chris and Wubben, Sander and Krahmer, Emiel},
booktitle={Proceedings of the 13th International Conference on Natural Language Generation},
pages={68--79},
year={2020}
}
"""

_DESCRIPTION = """\
CACAPO is a data-to-text dataset that contains sentences from news reports for the sports, weather, stock, and incidents domain in English and Dutch, aligned with relevant attribute-value paired data. This is the first data-to-text dataset based on "naturally occurring" human-written texts (i.e., texts that were not collected in a task-based setting), that covers various domains, as well as multiple languages. """
_URL = "https://github.com/TallChris91/CACAPO-Dataset"
_LICENSE = "CC BY 4.0"


def et_to_dict(tree):
    # Recursively convert an ElementTree element into a nested dictionary
    # (attributes are merged in; element text is kept either directly or under the "text" key).
    dct = {tree.tag: {} if tree.attrib else None}
children = list(tree)
if children:
dd = defaultdict(list)
for dc in map(et_to_dict, children):
for k, v in dc.items():
dd[k].append(v)
dct = {tree.tag: dd}
if tree.attrib:
dct[tree.tag].update((k, v) for k, v in tree.attrib.items())
if tree.text:
text = tree.text.strip()
if children or tree.attrib:
if text:
dct[tree.tag]["text"] = text
else:
dct[tree.tag] = text
return dct


def parse_entry(entry):
    # Convert a parsed <entry> element into a flat example dictionary
    # (original/modified triple sets, category, eid, size and lexicalizations).
    res = {}
otriple_set_list = entry["originaltripleset"]
res["original_triple_sets"] = [{"otriple_set": otriple_set["otriple"]} for otriple_set in otriple_set_list]
mtriple_set_list = entry["modifiedtripleset"]
res["modified_triple_sets"] = [{"mtriple_set": mtriple_set["mtriple"]} for mtriple_set in mtriple_set_list]
res["category"] = entry["category"]
res["eid"] = entry["eid"]
res["size"] = int(entry["size"])
res["lex"] = {
"comment": [ex.get("comment", "") for ex in entry.get("lex", [])],
"lid": [ex.get("lid", "") for ex in entry.get("lex", [])],
"text": [ex.get("text", "") for ex in entry.get("lex", [])],
}
return res


def xml_file_to_examples(filename):
    # Parse a single WebNLG-formatted XML file and return a list of example dicts.
    tree = ET.parse(filename).getroot()

examples = et_to_dict(tree)["benchmark"]["entries"][0]["entry"]
return [parse_entry(entry) for entry in examples]


class CACAPO(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("1.0.0")

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"category": datasets.Value("string"),
"lang": datasets.Value("string"),
"size": datasets.Value("int32"),
"eid": datasets.Value("string"),
"original_triple_sets": datasets.Sequence(
{"otriple_set": datasets.Sequence(datasets.Value("string"))}
),
"modified_triple_sets": datasets.Sequence(
{"mtriple_set": datasets.Sequence(datasets.Value("string"))}
),
"lex": datasets.Sequence(
{
"comment": datasets.Value("string"),
"lid": datasets.Value("string"),
"text": datasets.Value("string"),
}
),
}
),
supervised_keys=None,
homepage=_URL,
citation=_CITATION,
license=_LICENSE,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "train"},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "dev"},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "test"},
),
]

def _generate_examples(self, filedirs, split):
"""Yields examples."""
id_ = 0

for lang in ["en", "nl"]:
for filedir in filedirs:
xml_file = os.path.join(DATASET_PATH, lang, filedir, f"WebNLGFormat{split.title()}.xml")

for exple_dict in xml_file_to_examples(xml_file):
exple_dict["category"] = filedir
exple_dict["lang"] = lang
id_ += 1
yield id_, exple_dict


if __name__ == "__main__":
dataset = datasets.load_dataset(__file__)
dataset.push_to_hub("kasnerz/cacapo")
117 changes: 117 additions & 0 deletions misc/hf_scripts/charttotext-s.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python3

"""
Script for loading the Chart-to-Text (Statista subset) dataset from the original source and uploading it to the Hugging Face Hub.
"""

import json
import datasets
import os
import csv

DATASET_PATH = None  # path to the original Chart-to-Text repository (has to be set before running the script)

_CITATION = """\
@inproceedings{kantharaj2022chart,
title={Chart-to-Text: A Large-Scale Benchmark for Chart Summarization},
author={Kantharaj, Shankar and Leong, Rixie Tiffany and Lin, Xiang and Masry, Ahmed and Thakkar, Megh and Hoque, Enamul and Joty, Shafiq},
booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
pages={4005--4023},
year={2022}
}
"""
_DESCRIPTION = """\
Chart-to-Text is a large-scale benchmark with two datasets and a total of 44,096 charts covering a wide range of topics and chart types.
This dataset CONTAINS ONLY the Statista subset from the benchmark.
Statista (statista.com) is an online platform that regularly publishes charts on a wide range of topics including economics, market and opinion research.
Statistics:
Total charts: 27868
=== Chart Type Information ===
Number of charts of each chart type
column: 16319
bar: 8272
line: 2646
pie: 408
table: 223
=== Token Information ===
Average token count per summary: 53.65027989091431
Total tokens: 1495126
Total types (unique tokens): 39598
=== Sentence Information ===
Average sentence count per summary: 2.5596741782689825
"""

_URL = "https://github.com/vis-nlp/Chart-to-text/tree/main/statista_dataset/dataset"
_LICENSE = "GNU General Public License v3.0"


class ChartToTextS(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"title": datasets.Value(dtype="string"),
"ref": datasets.Value(dtype="string"),
"content": datasets.Value(dtype="large_string"),
}
),
supervised_keys=None,
homepage=_URL,
citation=_CITATION,
license=_LICENSE,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(DATASET_PATH, "dataset"), "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": os.path.join(DATASET_PATH, "dataset"), "split": "dev"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": os.path.join(DATASET_PATH, "dataset"), "split": "test"},
            ),
        ]

def _generate_examples(self, filepath, split):
data = []
        mapping_file = split if split != "dev" else "val"  # the dev split is named "val" in the index mapping files

with open(os.path.join(filepath, "dataset_split", f"{mapping_file}_index_mapping.csv")) as f:
next(f)
for i, line in enumerate(f):
subdir = "." if line.startswith("two_col") else "multiColumn"
filename = line.split("-")[1].split(".")[0]

with open(os.path.join(filepath, subdir, "data", filename + ".csv")) as g:
content = []
reader = csv.reader(g, delimiter=",", quotechar='"')
for row in reader:
content.append(row)

with open(os.path.join(filepath, subdir, "captions", filename + ".txt")) as g:
ref = g.read().rstrip("\n")

with open(os.path.join(filepath, subdir, "titles", filename + ".txt")) as g:
title = g.read().rstrip("\n")

data.append({"content": content, "ref": ref, "title": title})

if i % 1000 == 0:
print(f"Loaded {i} items")

for example_idx, entry in enumerate(data):
yield example_idx, {key: str(value) for key, value in entry.items()}


if __name__ == "__main__":
dataset = datasets.load_dataset(__file__)
dataset.push_to_hub("kasnerz/charttotext-s")
83 changes: 83 additions & 0 deletions misc/hf_scripts/eventnarrative.py
@@ -0,0 +1,83 @@
#!/usr/bin/env python3

"""
Script for loading the EventNarrative dataset from the original source and uploading it to the Hugging Face Hub.
"""

import os

import json
import datasets

DATASET_PATH = None  # path to the downloaded EventNarrative data (has to be set before running the script)

_CITATION = """\
@inproceedings{colas2021eventnarrative,
title={EventNarrative: A Large-scale Event-centric Dataset for Knowledge Graph-to-Text Generation},
author={Colas, Anthony and Sadeghian, Ali and Wang, Yue and Wang, Daisy Zhe},
booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)},
year={2021}
}
"""

_DESCRIPTION = """\
EventNarrative is a knowledge graph-to-text dataset from publicly available open-world knowledge graphs, focusing on event-centric data.
EventNarrative consists of approximately 230,000 graphs and their corresponding natural language text, 6 times larger than the current largest parallel dataset.
It makes use of a rich ontology, and all of the KG entities are linked to the text."""

_URL = "https://www.kaggle.com/datasets/acolas1/eventnarration"
_LICENSE = "CC BY 4.0"


class EventNarrative(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("1.0.0")

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"Event_Name": datasets.Value("string"),
"entity_ref_dict": datasets.Value("large_string"),
"keep_triples": datasets.Value("large_string"),
"narration": datasets.Value("large_string"),
"types": datasets.Value("string"),
"wikipediaLabel": datasets.Value("string"),
}
),
supervised_keys=None,
homepage=_URL,
citation=_CITATION,
license=_LICENSE,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
return [
datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"split": "train"}),
datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"split": "dev"}),
datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"split": "test"}),
]

def _generate_examples(self, split):
"""Yields examples."""
id_ = 0

        with open(os.path.join(DATASET_PATH, f"{split}_data.json")) as f:
j = json.load(f)

for example in j:
e = {key: str(value) for key, value in example.items()}
id_ += 1
yield id_, e


if __name__ == "__main__":
    dataset = datasets.load_dataset(__file__)
    dataset.push_to_hub("kasnerz/eventnarrative")
