Add Huggingface dataset upload scripts
kasnerz committed Mar 14, 2023
1 parent 184b2de commit 21b83a9
Showing 11 changed files with 919 additions and 1 deletion.
4 changes: 3 additions & 1 deletion README.md
@@ -133,7 +133,9 @@ The datasets are stored to `HF_DATASETS_CACHE` directory which defaults to `~/.c

The datasets are all loaded from [HuggingFace datasets](https://huggingface.co/datasets) instead of their original repositories, which makes it possible to use preprocessed datasets and a single unified loader.

**Note that preparing the datasets for the first time may take a while, since the datasets have to be downloaded to the cache and preprocessed.** This process takes several minutes depending on the dataset size. However, it is only a one-time process (until the dataset is updated or the cache is deleted).

Also note that there may be some minor changes in the data w.r.t. the original datasets due to unification, such as adding "subject", "predicate" and "object" headings to RDF triple-to-text datasets.
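
For illustration, here is a minimal sketch of loading one of the preprocessed datasets directly with the `datasets` library (the cache path below is only an example value for the `HF_DATASETS_CACHE` variable):

```python
import os

# Optional: redirect the dataset cache before importing `datasets`
# (it defaults to ~/.cache/huggingface/datasets).
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets_cache"  # example path

import datasets

# The first call downloads and preprocesses the data; subsequent calls reuse the cache.
dataset = datasets.load_dataset("kasnerz/cacapo", split="train")
print(dataset[0])
```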

## Adding datasets
To add a new dataset:
9 changes: 9 additions & 0 deletions misc/hf_scripts/README.md
@@ -0,0 +1,9 @@
# Huggingface datasets scripts

This folder contains the scripts used to upload to [Huggingface Datasets](https://huggingface.co/datasets) those datasets in TabGenie which were not previously available there.

The structure of the scripts is based on existing dataset scripts and on the tutorial which can be found [here](https://huggingface.co/docs/datasets/dataset_script).

In each script, the `DATASET_PATH` constant has to point to the original dataset repository.

Note that some newer versions of the `datasets` package create an empty `README.md` file which overrides the information in the loading script, so the info then does not show up in the app. To prevent this behavior, either use `datasets==2.5.1` or delete the `README.md` after uploading the dataset.
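
For concreteness, a rough sketch of the upload workflow using `cacapo.py` as an example (the local path is a placeholder that has to be filled in):

```python
# Sketch only: first set the DATASET_PATH constant at the top of cacapo.py, e.g.
#     DATASET_PATH = "/path/to/CACAPO-Dataset"
# and then run the script directly; its __main__ block roughly does the following:
import datasets

dataset = datasets.load_dataset("misc/hf_scripts/cacapo.py")  # build all splits from the local files
dataset.push_to_hub("kasnerz/cacapo")  # upload the dataset to the Hugging Face Hub
```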
146 changes: 146 additions & 0 deletions misc/hf_scripts/cacapo.py
@@ -0,0 +1,146 @@
#!/usr/bin/env python3

"""
Script for loading the CACAPO dataset from the original source and uploading it to the Hugging Face Hub.
"""

import os
import xml.etree.cElementTree as ET
from collections import defaultdict
from glob import glob
from os.path import join as pjoin
from pathlib import Path

import datasets

DATASET_PATH = None  # path to the original CACAPO repository (has to be set before running the script)

_CITATION = """\
@inproceedings{van2020cacapo,
title={The CACAPO dataset: A multilingual, multi-domain dataset for neural pipeline and end-to-end data-to-text generation},
author={van der Lee, Chris and Emmery, Chris and Wubben, Sander and Krahmer, Emiel},
booktitle={Proceedings of the 13th International Conference on Natural Language Generation},
pages={68--79},
year={2020}
}
"""

_DESCRIPTION = """\
CACAPO is a data-to-text dataset that contains sentences from news reports for the sports, weather, stock, and incidents domain in English and Dutch, aligned with relevant attribute-value paired data. This is the first data-to-text dataset based on "naturally occurring" human-written texts (i.e., texts that were not collected in a task-based setting), that covers various domains, as well as multiple languages. """
_URL = "https://github.com/TallChris91/CACAPO-Dataset"
_LICENSE = "CC BY 4.0"


def et_to_dict(tree):
    # Recursively convert an ElementTree element into a nested dictionary
    # (attributes are merged in; element text is kept either directly or under the "text" key).
    dct = {tree.tag: {} if tree.attrib else None}
children = list(tree)
if children:
dd = defaultdict(list)
for dc in map(et_to_dict, children):
for k, v in dc.items():
dd[k].append(v)
dct = {tree.tag: dd}
if tree.attrib:
dct[tree.tag].update((k, v) for k, v in tree.attrib.items())
if tree.text:
text = tree.text.strip()
if children or tree.attrib:
if text:
dct[tree.tag]["text"] = text
else:
dct[tree.tag] = text
return dct


def parse_entry(entry):
    # Convert a parsed <entry> element into a flat example dictionary
    # (original/modified triple sets, category, eid, size and lexicalizations).
    res = {}
otriple_set_list = entry["originaltripleset"]
res["original_triple_sets"] = [{"otriple_set": otriple_set["otriple"]} for otriple_set in otriple_set_list]
mtriple_set_list = entry["modifiedtripleset"]
res["modified_triple_sets"] = [{"mtriple_set": mtriple_set["mtriple"]} for mtriple_set in mtriple_set_list]
res["category"] = entry["category"]
res["eid"] = entry["eid"]
res["size"] = int(entry["size"])
res["lex"] = {
"comment": [ex.get("comment", "") for ex in entry.get("lex", [])],
"lid": [ex.get("lid", "") for ex in entry.get("lex", [])],
"text": [ex.get("text", "") for ex in entry.get("lex", [])],
}
return res


def xml_file_to_examples(filename):
    # Parse a single WebNLG-formatted XML file and return a list of example dicts.
    tree = ET.parse(filename).getroot()

examples = et_to_dict(tree)["benchmark"]["entries"][0]["entry"]
return [parse_entry(entry) for entry in examples]


class CACAPO(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("1.0.0")

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"category": datasets.Value("string"),
"lang": datasets.Value("string"),
"size": datasets.Value("int32"),
"eid": datasets.Value("string"),
"original_triple_sets": datasets.Sequence(
{"otriple_set": datasets.Sequence(datasets.Value("string"))}
),
"modified_triple_sets": datasets.Sequence(
{"mtriple_set": datasets.Sequence(datasets.Value("string"))}
),
"lex": datasets.Sequence(
{
"comment": datasets.Value("string"),
"lid": datasets.Value("string"),
"text": datasets.Value("string"),
}
),
}
),
supervised_keys=None,
homepage=_URL,
citation=_CITATION,
license=_LICENSE,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "train"},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "dev"},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "test"},
),
]

def _generate_examples(self, filedirs, split):
"""Yields examples."""
id_ = 0

for lang in ["en", "nl"]:
for filedir in filedirs:
xml_file = os.path.join(DATASET_PATH, lang, filedir, f"WebNLGFormat{split.title()}.xml")

for exple_dict in xml_file_to_examples(xml_file):
exple_dict["category"] = filedir
exple_dict["lang"] = lang
id_ += 1
yield id_, exple_dict


if __name__ == "__main__":
dataset = datasets.load_dataset(__file__)
dataset.push_to_hub("kasnerz/cacapo")
117 changes: 117 additions & 0 deletions misc/hf_scripts/charttotext-s.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python3

"""
Script for loading the Chart-to-Text (Statista subset) dataset from the original source and uploading it to the Hugging Face Hub.
"""

import json
import datasets
import os
import csv

DATASET_PATH = None  # path to the original Chart-to-Text repository (has to be set before running the script)

_CITATION = """\
@inproceedings{kantharaj2022chart,
title={Chart-to-Text: A Large-Scale Benchmark for Chart Summarization},
author={Kantharaj, Shankar and Leong, Rixie Tiffany and Lin, Xiang and Masry, Ahmed and Thakkar, Megh and Hoque, Enamul and Joty, Shafiq},
booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
pages={4005--4023},
year={2022}
}
"""
_DESCRIPTION = """\
Chart-to-Text is a large-scale benchmark with two datasets and a total of 44,096 charts covering a wide range of topics and chart types.
This dataset CONTAINS ONLY the Statista subset from the benchmark.
Statista (statista.com) is an online platform that regularly publishes charts on a wide range of topics including economics, market and opinion research.
Statistics:
Total charts: 27868
=== Chart Type Information ===
Number of charts of each chart type
column: 16319
bar: 8272
line: 2646
pie: 408
table: 223
=== Token Information ===
Average token count per summary: 53.65027989091431
Total tokens: 1495126
Total types (unique tokens): 39598
=== Sentence Information ===
Average sentence count per summary: 2.5596741782689825
"""

_URL = "https://github.com/vis-nlp/Chart-to-text/tree/main/statista_dataset/dataset"
_LICENSE = "GNU General Public License v3.0"


class ChartToTextS(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"title": datasets.Value(dtype="string"),
"ref": datasets.Value(dtype="string"),
"content": datasets.Value(dtype="large_string"),
}
),
supervised_keys=None,
homepage=_URL,
citation=_CITATION,
license=_LICENSE,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(DATASET_PATH, "dataset"), "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": os.path.join(DATASET_PATH, "dataset"), "split": "dev"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": os.path.join(DATASET_PATH, "dataset"), "split": "test"},
            ),
        ]

def _generate_examples(self, filepath, split):
data = []
        mapping_file = split if split != "dev" else "val"  # the dev split is named "val" in the index mapping files

with open(os.path.join(filepath, "dataset_split", f"{mapping_file}_index_mapping.csv")) as f:
next(f)
for i, line in enumerate(f):
subdir = "." if line.startswith("two_col") else "multiColumn"
filename = line.split("-")[1].split(".")[0]

with open(os.path.join(filepath, subdir, "data", filename + ".csv")) as g:
content = []
reader = csv.reader(g, delimiter=",", quotechar='"')
for row in reader:
content.append(row)

with open(os.path.join(filepath, subdir, "captions", filename + ".txt")) as g:
ref = g.read().rstrip("\n")

with open(os.path.join(filepath, subdir, "titles", filename + ".txt")) as g:
title = g.read().rstrip("\n")

data.append({"content": content, "ref": ref, "title": title})

if i % 1000 == 0:
print(f"Loaded {i} items")

for example_idx, entry in enumerate(data):
yield example_idx, {key: str(value) for key, value in entry.items()}


if __name__ == "__main__":
dataset = datasets.load_dataset(__file__)
dataset.push_to_hub("kasnerz/charttotext-s")
83 changes: 83 additions & 0 deletions misc/hf_scripts/eventnarrative.py
@@ -0,0 +1,83 @@
#!/usr/bin/env python3

"""
Script for loading the EventNarrative dataset from the original source and uploading it to the Hugging Face Hub.
"""

import os

import json
import datasets

DATASET_PATH = None  # path to the downloaded EventNarrative data (has to be set before running the script)

_CITATION = """\
@inproceedings{colas2021eventnarrative,
title={EventNarrative: A Large-scale Event-centric Dataset for Knowledge Graph-to-Text Generation},
author={Colas, Anthony and Sadeghian, Ali and Wang, Yue and Wang, Daisy Zhe},
booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)},
year={2021}
}
"""

_DESCRIPTION = """\
EventNarrative is a knowledge graph-to-text dataset from publicly available open-world knowledge graphs, focusing on event-centric data.
EventNarrative consists of approximately 230,000 graphs and their corresponding natural language text, 6 times larger than the current largest parallel dataset.
It makes use of a rich ontology, and all of the KG entities are linked to the text."""

_URL = "https://www.kaggle.com/datasets/acolas1/eventnarration"
_LICENSE = "CC BY 4.0"


class EventNarrative(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("1.0.0")

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"Event_Name": datasets.Value("string"),
"entity_ref_dict": datasets.Value("large_string"),
"keep_triples": datasets.Value("large_string"),
"narration": datasets.Value("large_string"),
"types": datasets.Value("string"),
"wikipediaLabel": datasets.Value("string"),
}
),
supervised_keys=None,
homepage=_URL,
citation=_CITATION,
license=_LICENSE,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
return [
datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"split": "train"}),
datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"split": "dev"}),
datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"split": "test"}),
]

def _generate_examples(self, split):
"""Yields examples."""
id_ = 0

        with open(os.path.join(DATASET_PATH, f"{split}_data.json")) as f:
j = json.load(f)

for example in j:
e = {key: str(value) for key, value in example.items()}
id_ += 1
yield id_, e


if __name__ == "__main__":
    dataset = datasets.load_dataset(__file__)
    dataset.push_to_hub("kasnerz/eventnarrative")
