Skip to content

Commit

Permalink
Merge pull request #177 from neulab/datalab_jsonl_reformatting
Browse files: browse the repository at this point in the history
Import new datalab jsonl format
  • Loading branch information
neubig authored May 13, 2022
2 parents fd696fd + c2cfd6f commit edb047f
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 28 deletions.
56 changes: 28 additions & 28 deletions backend/src/impl/db_utils/dataset_db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,34 +21,34 @@ def __init__(self, data: dict):
self.task_dict: dict[str, list[int]] = {}
self.id_dict: dict[str, int] = {}
self.metadatas: list[DatasetMetadata] = []
for dataset_name, v_dataset in data.items():
for sub_dataset, v_sub in v_dataset["sub_datasets"].items():
metadata_id = len(self.metadatas)
# Names
if dataset_name not in self.name_dict:
self.name_dict[dataset_name] = []
self.name_dict[dataset_name].append(metadata_id)
# Ids
dataset_id = f"{dataset_name}---{sub_dataset}"
self.id_dict[dataset_id] = metadata_id
# Tasks
tasks = v_dataset.get("tasks")
tasks = set([] if tasks is None else tasks)
task_cats = v_dataset.get("task_categories")
tasks = tasks.union([] if task_cats is None else task_cats)
for task in tasks:
if task not in self.task_dict:
self.task_dict[task] = []
self.task_dict[task].append(metadata_id)
# Create document
doc = {
"dataset_id": dataset_id,
"dataset_name": dataset_name,
"sub_dataset": None if sub_dataset == "__NONE__" else sub_dataset,
"split": v_sub["splits"],
"tasks": tasks,
}
self.metadatas.append(DatasetMetadata.from_dict(doc))
for metadata_id, (dataset_id, v_dataset) in enumerate(data.items()):
# Names
dataset_name = v_dataset["dataset_name"]
sub_dataset = v_dataset.get("sub_dataset")
if dataset_name not in self.name_dict:
self.name_dict[dataset_name] = []
self.name_dict[dataset_name].append(metadata_id)
# Ids
self.id_dict[dataset_id] = metadata_id
# Tasks
tasks = v_dataset.get("tasks")
tasks = set([] if tasks is None else tasks)
task_cats = v_dataset.get("task_categories")
tasks = tasks.union([] if task_cats is None else task_cats)
for task in tasks:
if task not in self.task_dict:
self.task_dict[task] = []
self.task_dict[task].append(metadata_id)
# Create document
doc = {
"dataset_id": dataset_id,
"dataset_name": dataset_name,
"sub_dataset": None if sub_dataset == "__NONE__" else sub_dataset,
"split": v_dataset["splits"],
"tasks": tasks,
"languages": v_dataset.get("languages"),
}
self.metadatas.append(DatasetMetadata.from_dict(doc))
self.name_trie = marisa_trie.Trie(self.name_dict.keys())


Expand Down
11 changes: 11 additions & 0 deletions frontend/src/pages/DatasetsPage/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,17 @@ const columns: ColumnsType<DatasetMetadata> = [
</span>
),
},
{
  dataIndex: "languages",
  title: "Languages",
  // Renders one <Tag> per language entry of the dataset row.
  // The backend populates this field via `v_dataset.get("languages")`, so it
  // can arrive as null/undefined when a dataset record carries no language
  // information. Fall back to an empty list so the cell renders empty instead
  // of throwing on `.map` of a nullish value.
  render: (value) => (
    <span>
      {(value || []).map((language: string, i: number) => (
        <Tag key={i}>{language}</Tag>
      ))}
    </span>
  ),
},
{
dataIndex: "",
title: "Leaderboard",
Expand Down

0 comments on commit edb047f

Please sign in to comment.