Add Huggingface dataset upload scripts
Showing 11 changed files with 919 additions and 1 deletion.
@@ -0,0 +1,9 @@
# Huggingface datasets scripts

This folder contains scripts used to upload some of the datasets in TabGenie to [Huggingface Datasets](https://huggingface.co/datasets) for the first time.

The structure of the scripts is based on the existing loading scripts and on the tutorial which can be found [here](https://huggingface.co/docs/datasets/dataset_script).

The field `DATASET_PATH` in each script has to point to a local copy of the original dataset repository.
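
For instance (a hypothetical path, for illustration only):

    DATASET_PATH = "/path/to/CACAPO-Dataset"  # local clone of the original dataset repo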

Note that some newer versions of the `datasets` package create an empty `README.md` file which overrides the information in the loading script, so the dataset info does not show up in the app. To prevent this behavior, either use `datasets==2.5.1` or delete the `README.md` after the dataset is uploaded.
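
If the empty `README.md` has already been pushed, it can also be deleted from the Hub afterwards; a minimal sketch using `huggingface_hub` (the repository id below is just an example):

    from huggingface_hub import HfApi

    api = HfApi()
    # remove the auto-generated README.md from the dataset repository on the Hub
    api.delete_file(path_in_repo="README.md", repo_id="kasnerz/cacapo", repo_type="dataset")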
@@ -0,0 +1,146 @@
#!/usr/bin/env python3

"""
The script used to load the dataset from the original source.
"""

import os
import xml.etree.ElementTree as ET  # cElementTree was removed in Python 3.9
from collections import defaultdict

import datasets

DATASET_PATH = None

_CITATION = """\
@inproceedings{van2020cacapo,
    title={The CACAPO dataset: A multilingual, multi-domain dataset for neural pipeline and end-to-end data-to-text generation},
    author={van der Lee, Chris and Emmery, Chris and Wubben, Sander and Krahmer, Emiel},
    booktitle={Proceedings of the 13th International Conference on Natural Language Generation},
    pages={68--79},
    year={2020}
}
"""

_DESCRIPTION = """\
CACAPO is a data-to-text dataset that contains sentences from news reports for the sports, weather, stocks, and incidents domains in English and Dutch, aligned with relevant attribute-value paired data. This is the first data-to-text dataset based on "naturally occurring" human-written texts (i.e., texts that were not collected in a task-based setting) that covers various domains, as well as multiple languages.
"""
_URL = "https://github.com/TallChris91/CACAPO-Dataset"
_LICENSE = "CC BY 4.0"


def et_to_dict(tree):
    """Recursively convert an ElementTree element into a nested dict."""
    dct = {tree.tag: {} if tree.attrib else None}
    children = list(tree)
    if children:
        dd = defaultdict(list)
        for dc in map(et_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        dct = {tree.tag: dd}
    if tree.attrib:
        dct[tree.tag].update((k, v) for k, v in tree.attrib.items())
    if tree.text:
        text = tree.text.strip()
        if children or tree.attrib:
            if text:
                dct[tree.tag]["text"] = text
        else:
            dct[tree.tag] = text
    return dct
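
# Illustration (added comment, not in the original script): for an element such as
#   <entry eid="Id1"><lex lid="L1">Some text.</lex></entry>
# et_to_dict returns
#   {"entry": {"lex": [{"lid": "L1", "text": "Some text."}], "eid": "Id1"}}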


def parse_entry(entry):
    """Flatten a single <entry> dict into the fields declared in _info()."""
    res = {}
    otriple_set_list = entry["originaltripleset"]
    res["original_triple_sets"] = [{"otriple_set": otriple_set["otriple"]} for otriple_set in otriple_set_list]
    mtriple_set_list = entry["modifiedtripleset"]
    res["modified_triple_sets"] = [{"mtriple_set": mtriple_set["mtriple"]} for mtriple_set in mtriple_set_list]
    res["category"] = entry["category"]
    res["eid"] = entry["eid"]
    res["size"] = int(entry["size"])
    res["lex"] = {
        "comment": [ex.get("comment", "") for ex in entry.get("lex", [])],
        "lid": [ex.get("lid", "") for ex in entry.get("lex", [])],
        "text": [ex.get("text", "") for ex in entry.get("lex", [])],
    }
    return res
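
# Added comment: "lex" is stored as three parallel lists (comment/lid/text),
# matching the datasets.Sequence feature declared in _info() below.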


def xml_file_to_examples(filename):
    tree = ET.parse(filename).getroot()

    examples = et_to_dict(tree)["benchmark"]["entries"][0]["entry"]
    return [parse_entry(entry) for entry in examples]


class CACAPO(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "category": datasets.Value("string"),
                    "lang": datasets.Value("string"),
                    "size": datasets.Value("int32"),
                    "eid": datasets.Value("string"),
                    "original_triple_sets": datasets.Sequence(
                        {"otriple_set": datasets.Sequence(datasets.Value("string"))}
                    ),
                    "modified_triple_sets": datasets.Sequence(
                        {"mtriple_set": datasets.Sequence(datasets.Value("string"))}
                    ),
                    "lex": datasets.Sequence(
                        {
                            "comment": datasets.Value("string"),
                            "lid": datasets.Value("string"),
                            "text": datasets.Value("string"),
                        }
                    ),
                }
            ),
            supervised_keys=None,
            homepage=_URL,
            citation=_CITATION,
            license=_LICENSE,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "dev"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filedirs": ["Incidents", "Sports", "Stocks", "Weather"], "split": "test"},
            ),
        ]

    def _generate_examples(self, filedirs, split):
        """Yields examples."""
        id_ = 0

        for lang in ["en", "nl"]:
            for filedir in filedirs:
                xml_file = os.path.join(DATASET_PATH, lang, filedir, f"WebNLGFormat{split.title()}.xml")

                for exple_dict in xml_file_to_examples(xml_file):
                    exple_dict["category"] = filedir
                    exple_dict["lang"] = lang
                    id_ += 1
                    yield id_, exple_dict


if __name__ == "__main__":
    dataset = datasets.load_dataset(__file__)
    dataset.push_to_hub("kasnerz/cacapo")
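
A minimal usage sketch (not part of the commit itself): once the script above has pushed the dataset, it can be loaded back from the Hub by its repository id.

    import datasets

    # "kasnerz/cacapo" is the repository id passed to push_to_hub() above
    ds = datasets.load_dataset("kasnerz/cacapo")
    print(ds["train"][0]["lex"]["text"])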
@@ -0,0 +1,117 @@
#!/usr/bin/env python3

"""
The script used to load the dataset from the original source.
"""

import csv
import os

import datasets

DATASET_PATH = None

_CITATION = """\
@inproceedings{kantharaj2022chart,
    title={Chart-to-Text: A Large-Scale Benchmark for Chart Summarization},
    author={Kantharaj, Shankar and Leong, Rixie Tiffany and Lin, Xiang and Masry, Ahmed and Thakkar, Megh and Hoque, Enamul and Joty, Shafiq},
    booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
    pages={4005--4023},
    year={2022}
}
"""
_DESCRIPTION = """\
Chart-to-Text is a large-scale benchmark with two datasets and a total of 44,096 charts covering a wide range of topics and chart types.
This dataset contains ONLY the Statista subset of the benchmark.
Statista (statista.com) is an online platform that regularly publishes charts on a wide range of topics, including economics, market and opinion research.
Statistics:
Total charts: 27868
=== Chart Type Information ===
Number of charts of each chart type
column: 16319
bar: 8272
line: 2646
pie: 408
table: 223
=== Token Information ===
Average token count per summary: 53.65027989091431
Total tokens: 1495126
Total types (unique tokens): 39598
=== Sentence Information ===
Average sentence count per summary: 2.5596741782689825
"""

_URL = "https://github.com/vis-nlp/Chart-to-text/tree/main/statista_dataset/dataset"
_LICENSE = "GNU General Public License v3.0"


class ChartToTextS(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")  # wrapped in datasets.Version for consistency with the other scripts

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "title": datasets.Value(dtype="string"),
                    "ref": datasets.Value(dtype="string"),
                    "content": datasets.Value(dtype="large_string"),
                }
            ),
            supervised_keys=None,
            homepage=_URL,
            citation=_CITATION,
            license=_LICENSE,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": os.path.join(DATASET_PATH, "dataset"), "split": "train"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION, gen_kwargs={"filepath": os.path.join(DATASET_PATH, "dataset"), "split": "dev"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"filepath": os.path.join(DATASET_PATH, "dataset"), "split": "test"}
            ),
        ]

    def _generate_examples(self, filepath, split):
        data = []
        mapping_file = split if split != "dev" else "val"  # the dev split is stored as "val" in the mapping files

        with open(os.path.join(filepath, "dataset_split", f"{mapping_file}_index_mapping.csv")) as f:
            next(f)  # skip the CSV header line
            for i, line in enumerate(f):
                subdir = "." if line.startswith("two_col") else "multiColumn"
                filename = line.split("-")[1].split(".")[0]
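                # Added comment (an assumption about the mapping-file format): a line such as
                # "two_col-101.csv,..." yields subdir "." and filename "101"; anything not
                # starting with "two_col" is looked up under "multiColumn" instead.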

                with open(os.path.join(filepath, subdir, "data", filename + ".csv")) as g:
                    content = []
                    reader = csv.reader(g, delimiter=",", quotechar='"')
                    for row in reader:
                        content.append(row)

                with open(os.path.join(filepath, subdir, "captions", filename + ".txt")) as g:
                    ref = g.read().rstrip("\n")

                with open(os.path.join(filepath, subdir, "titles", filename + ".txt")) as g:
                    title = g.read().rstrip("\n")

                data.append({"content": content, "ref": ref, "title": title})

                if i % 1000 == 0:
                    print(f"Loaded {i} items")

        for example_idx, entry in enumerate(data):
            yield example_idx, {key: str(value) for key, value in entry.items()}


if __name__ == "__main__":
    dataset = datasets.load_dataset(__file__)
    dataset.push_to_hub("kasnerz/charttotext-s")
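
Since `_generate_examples` stringifies every value, the `content` field of the pushed dataset holds the chart table as the string representation of a list of CSV rows. A hedged sketch of recovering it (assumes the dataset was pushed as "kasnerz/charttotext-s" as in the script above):

    import ast
    import datasets

    ds = datasets.load_dataset("kasnerz/charttotext-s")
    rows = ast.literal_eval(ds["train"][0]["content"])  # back to a list of CSV rows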
@@ -0,0 +1,83 @@
#!/usr/bin/env python3

"""
The script used to load the dataset from the original source.
"""

import json
import os

import datasets

DATASET_PATH = None

_CITATION = """\
@inproceedings{colas2021eventnarrative,
    title={EventNarrative: A Large-scale Event-centric Dataset for Knowledge Graph-to-Text Generation},
    author={Colas, Anthony and Sadeghian, Ali and Wang, Yue and Wang, Daisy Zhe},
    booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)},
    year={2021}
}
"""

_DESCRIPTION = """\
EventNarrative is a knowledge graph-to-text dataset built from publicly available open-world knowledge graphs, focusing on event-centric data.
EventNarrative consists of approximately 230,000 graphs and their corresponding natural language text, 6 times larger than the current largest parallel dataset.
It makes use of a rich ontology, and all of the KG entities are linked to the text."""

_URL = "https://www.kaggle.com/datasets/acolas1/eventnarration"
_LICENSE = "CC BY 4.0"


class EventNarrative(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "Event_Name": datasets.Value("string"),
                    "entity_ref_dict": datasets.Value("large_string"),
                    "keep_triples": datasets.Value("large_string"),
                    "narration": datasets.Value("large_string"),
                    "types": datasets.Value("string"),
                    "wikipediaLabel": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage=_URL,
            citation=_CITATION,
            license=_LICENSE,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"split": "train"}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"split": "dev"}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"split": "test"}),
        ]

    def _generate_examples(self, split):
        """Yields examples."""
        id_ = 0

        with open(os.path.join(DATASET_PATH, f"{split}_data.json")) as f:
            j = json.load(f)

        for example in j:
            # every value is stringified to match the string features declared above
            e = {key: str(value) for key, value in example.items()}
            id_ += 1
            yield id_, e


if __name__ == "__main__":
    dataset = datasets.load_dataset(__file__)

    # drop into a debugger to inspect the loaded dataset before pushing it to the Hub
    import pdb

    pdb.set_trace()

    # dataset.push_to_hub("kasnerz/eventnarrative")
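
The `entity_ref_dict` and `keep_triples` fields are likewise stringified in `_generate_examples`. A hedged sketch of the kind of inspection the breakpoint above allows (assuming the underlying JSON values are lists/dicts, so their string form is a Python literal):

    import ast

    ex = dataset["train"][0]
    triples = ast.literal_eval(ex["keep_triples"])  # recover the stringified structure
    print(ex["Event_Name"], len(triples))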