support GenerateInstructionMapper
Cathy0908 committed Jul 4, 2024
1 parent 1244d4f commit de7c3b5
Showing 15 changed files with 413 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,3 +15,4 @@ dist
.idea/
wandb/
__pycache__
.vscode/
5 changes: 5 additions & 0 deletions configs/config_all.yaml
@@ -71,6 +71,11 @@ process:
- extract_qa_mapper: # mapper to extract question and answer pair from text.
hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'
- fix_unicode_mapper: # fix unicode errors in text.
- generate_instruction_mapper: # generate new instruction text data.
hf_model: 'Qwen/Qwen-7B-Chat'
seed_file: 'demos/data/demo-dataset-chatml.jsonl'
instruct_num: 3
similarity_threshold: 0.7
- image_blur_mapper: # mapper to blur images.
p: 0.2 # probability of the image being blurred
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
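As a quick illustration (not part of this commit), data-juicer's usual convention is that the YAML keys under an op map one-to-one onto that op's constructor arguments, so building the new mapper directly in Python would look roughly like the sketch below; the comments on the two numeric parameters are assumed meanings, not taken from this diff.

from data_juicer.ops.mapper import GenerateInstructionMapper

# hypothetical direct construction mirroring the YAML keys above
op = GenerateInstructionMapper(
    hf_model='Qwen/Qwen-7B-Chat',
    seed_file='demos/data/demo-dataset-chatml.jsonl',
    instruct_num=3,            # assumed: how many instruction samples are involved per generation round
    similarity_threshold=0.7)  # assumed: generated samples too similar to existing ones are dropped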
9 changes: 8 additions & 1 deletion data_juicer/config/config.py
@@ -120,6 +120,13 @@ def init_configs(args=None):
help='Path to datasets with optional weights(0.0-1.0), 1.0 as '
'default. Accepted format:<w1> dataset1-path <w2> dataset2-path '
'<w3> dataset3-path ...')
parser.add_argument(
'--dataset_config',
type=Dict,
default=None,
help='Configuration used to create a dataset. '
'The dataset will be created from this configuration if provided. '
'It must contain the `type` field to specify the dataset name.')
parser.add_argument(
'--export_path',
type=str,
@@ -404,7 +411,7 @@ def init_setup_from_cfg(cfg):
redirect=cfg.executor_type == 'default')

# check and get dataset dir
if os.path.exists(cfg.dataset_path):
if cfg.get('dataset_path', None) and os.path.exists(cfg.dataset_path):
cfg.dataset_path = os.path.abspath(cfg.dataset_path)
if os.path.isdir(cfg.dataset_path):
cfg.dataset_dir = cfg.dataset_path
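For reference, a dataset_config passed through this new option is just a mapping whose `type` selects a registered formatter and whose remaining keys are handed to that formatter's constructor. A hypothetical sketch, assuming the registry keys formatters by class name (which is what the plain FORMATTERS.register_module() decorator in this commit suggests):

# hypothetical dataset_config; only 'type' is mandatory
dataset_config = {
    'type': 'EmptyFormatter',   # assumed registry key: the registered class name
    'length': 10,               # forwarded to EmptyFormatter.__init__
    'feature_keys': ['text'],   # forwarded to EmptyFormatter.__init__
}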
1 change: 1 addition & 0 deletions data_juicer/core/executor.py
@@ -53,6 +53,7 @@ def __init__(self, cfg=None):
# setup formatter
logger.info('Setting up data formatter...')
self.formatter = load_formatter(self.cfg.dataset_path,
self.cfg.dataset_config,
self.cfg.text_keys, self.cfg.suffixes,
self.cfg.add_suffix)

14 changes: 13 additions & 1 deletion data_juicer/core/ray_executor.py
@@ -155,6 +155,8 @@ def run_op(self, op, op_cfg, dataset):
'Ray executor only support Filter and Mapper OPs for '
'now')
raise NotImplementedError

return dataset
except: # noqa: E722
logger.error(f'An error occurred during Op [{op_name}].')
import traceback
@@ -170,7 +172,17 @@ def run(self, load_data_np=None):
"""
# 1. load data
logger.info('Loading dataset with Ray...')
dataset = rd.read_json(self.cfg.dataset_path)

if self.cfg.get('dataset_config', None):
dataset_config = self.cfg.dataset_config
assert isinstance(dataset_config,
dict) and 'type' in dataset_config
args = dataset_config.copy()
obj_name = args.pop('type')
from data_juicer.format.formatter import FORMATTERS
dataset = FORMATTERS.modules[obj_name](**args).load_dataset()
else:
dataset = rd.read_json(self.cfg.dataset_path)

# convert all the path in dataset to absolute path
dataset = set_dataset_to_absolute_path(dataset, self.cfg.dataset_path,
8 changes: 5 additions & 3 deletions data_juicer/format/__init__.py
@@ -1,6 +1,8 @@
from . import (csv_formatter, json_formatter, mixture_formatter,
parquet_formatter, text_formatter, tsv_formatter)
from . import (csv_formatter, empty_formatter, json_formatter,
mixture_formatter, parquet_formatter, text_formatter,
tsv_formatter)
from .csv_formatter import CsvFormatter
from .empty_formatter import EmptyFormatter, RayEmptyFormatter
from .formatter import LocalFormatter, RemoteFormatter
from .json_formatter import JsonFormatter
from .load import load_formatter
Expand All @@ -12,5 +14,5 @@
__all__ = [
'load_formatter', 'JsonFormatter', 'LocalFormatter', 'RemoteFormatter',
'TextFormatter', 'ParquetFormatter', 'CsvFormatter', 'TsvFormatter',
'MixtureFormatter'
'MixtureFormatter', 'EmptyFormatter', 'RayEmptyFormatter'
]
78 changes: 78 additions & 0 deletions data_juicer/format/empty_formatter.py
@@ -0,0 +1,78 @@
from typing import List

import pandas as pd
import ray
from datasets import Dataset, Features, Value

from .formatter import FORMATTERS, BaseFormatter


@FORMATTERS.register_module()
class EmptyFormatter(BaseFormatter):
"""
The class is used to create empty data.
"""

def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
"""
Initialization method.
:param length: The empty dataset length.
:param feature_keys: feature key name list.
"""
self.length = length
self.feature_keys = feature_keys
if isinstance(self.feature_keys, str):
self.feature_keys = [self.feature_keys]

@property
def null_value(self):
return {}

def load_dataset(self, *args, **kwargs):
data_dict = {}
features = Features()

for key in self.feature_keys:
features.update({key: Value('string')})
data_dict.update({key: [self.null_value] * self.length})

empty_dataset = Dataset.from_dict(data_dict, features=features)

return empty_dataset


@FORMATTERS.register_module()
class RayEmptyFormatter(BaseFormatter):
"""
The class is used to create empty data for ray.
"""

def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
"""
Initialization method.
:param length: The empty dataset length.
:param feature_keys: feature key name list.
"""
self.length = length
self.feature_keys = feature_keys
if isinstance(self.feature_keys, str):
self.feature_keys = [self.feature_keys]

@property
def null_value(self):
return {}

def load_dataset(self, *args, **kwargs):
if len(self.feature_keys):
df = pd.DataFrame({
col: [self.null_value for _ in range(self.length)]
for col in self.feature_keys
})
else:
df = pd.DataFrame([self.null_value for _ in range(self.length)])

empty_dataset = ray.data.from_pandas(df)

return empty_dataset
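A minimal usage sketch for the Ray variant added above, assuming a local Ray runtime can be started; it materializes the requested number of placeholder rows:

from data_juicer.format import RayEmptyFormatter

# build a Ray dataset with 4 rows; each 'text' cell holds the empty-dict placeholder
ds = RayEmptyFormatter(length=4, feature_keys=['text']).load_dataset()
print(ds.count())  # expected: 4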
13 changes: 13 additions & 0 deletions data_juicer/format/load.py
@@ -3,6 +3,7 @@


def load_formatter(dataset_path,
dataset_config=None,
text_keys=None,
suffixes=[],
add_suffix=False,
@@ -12,13 +13,25 @@ def load_formatter(dataset_path,
weight(default 1.0) according to their formats.
:param dataset_path: path to a dataset file or a dataset directory
:param dataset_config: Configuration used to create a dataset.
The dataset will be created from this configuration if provided.
It must contain the `type` field to specify the dataset name.
:param text_keys: key names of field that stores sample text.
Default: None
:param suffixes: files with specified suffixes to be processed.
:param add_suffix: whether to add the file suffix to dataset meta
info
:return: a dataset formatter.
"""
if dataset_config:
assert isinstance(dataset_config, dict) and 'type' in dataset_config
args = dataset_config.copy()
obj_name = args.pop('type')
args.update(kwargs)

from .formatter import FORMATTERS
return FORMATTERS.modules[obj_name](**args)

formatter = MixtureFormatter(dataset_path=dataset_path,
text_keys=text_keys,
suffixes=suffixes,
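Putting it together, the new branch can be exercised roughly as in the sketch below; the 'RayEmptyFormatter' key is assumed to be the registered class name, and dataset_path is effectively ignored once dataset_config is supplied:

from data_juicer.format import load_formatter

formatter = load_formatter(
    dataset_path='',                    # unused when dataset_config is provided
    dataset_config={
        'type': 'RayEmptyFormatter',    # assumed registry key
        'length': 4,
        'feature_keys': ['text'],
    })
# formatter is the instantiated formatter; formatter.load_dataset()
# would then yield the 4 placeholder rows as a Ray dataset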
2 changes: 2 additions & 0 deletions data_juicer/ops/mapper/__init__.py
@@ -34,6 +34,7 @@
from .expand_macro_mapper import ExpandMacroMapper
from .extract_qa_mapper import ExtractQAMapper
from .fix_unicode_mapper import FixUnicodeMapper
from .generate_instruction_mapper import GenerateInstructionMapper
from .image_blur_mapper import ImageBlurMapper
from .image_captioning_from_gpt4v_mapper import ImageCaptioningFromGPT4VMapper
from .image_captioning_mapper import ImageCaptioningMapper
@@ -108,6 +109,7 @@
'RemoveWordsWithIncorrectSubstringsMapper',
'VideoCaptioningFromVideoMapper',
'VideoCaptioningFromSummarizerMapper',
'GenerateInstructionMapper',
'FixUnicodeMapper',
'NlpaugEnMapper',
'VideoCaptioningFromFramesMapper',