support GenerateInstructionMapper
Cathy0908 committed Jul 4, 2024
1 parent 1244d4f commit de7c3b5
Showing 15 changed files with 413 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,3 +15,4 @@ dist
.idea/
wandb/
__pycache__
.vscode/
5 changes: 5 additions & 0 deletions configs/config_all.yaml
@@ -71,6 +71,11 @@ process:
- extract_qa_mapper: # mapper to extract question and answer pair from text.
hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'
- fix_unicode_mapper: # fix unicode errors in text.
- generate_instruction_mapper: # generate new instruction text data.
hf_model: 'Qwen/Qwen-7B-Chat'
seed_file: 'demos/data/demo-dataset-chatml.jsonl'
instruct_num: 3
similarity_threshold: 0.7
- image_blur_mapper: # mapper to blur images.
p: 0.2 # probability of the image being blurred
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
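As a quick illustration (not part of this commit), data-juicer's usual convention is that the YAML keys under an op map one-to-one onto that op's constructor arguments, so building the new mapper directly in Python would look roughly like the sketch below; the comments on the two numeric parameters are assumed meanings, not taken from this diff.

from data_juicer.ops.mapper import GenerateInstructionMapper

# hypothetical direct construction mirroring the YAML keys above
op = GenerateInstructionMapper(
    hf_model='Qwen/Qwen-7B-Chat',
    seed_file='demos/data/demo-dataset-chatml.jsonl',
    instruct_num=3,            # assumed: how many instruction samples are involved per generation round
    similarity_threshold=0.7)  # assumed: generated samples too similar to existing ones are dropped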
9 changes: 8 additions & 1 deletion data_juicer/config/config.py
@@ -120,6 +120,13 @@ def init_configs(args=None):
help='Path to datasets with optional weights(0.0-1.0), 1.0 as '
'default. Accepted format:<w1> dataset1-path <w2> dataset2-path '
'<w3> dataset3-path ...')
parser.add_argument(
'--dataset_config',
type=Dict,
default=None,
help='Configuration used to create a dataset. '
'The dataset will be created from this configuration if provided. '
'It must contain the `type` field to specify the dataset name.')
parser.add_argument(
'--export_path',
type=str,
@@ -404,7 +411,7 @@ def init_setup_from_cfg(cfg):
redirect=cfg.executor_type == 'default')

# check and get dataset dir
if os.path.exists(cfg.dataset_path):
if cfg.get('dataset_path', None) and os.path.exists(cfg.dataset_path):
cfg.dataset_path = os.path.abspath(cfg.dataset_path)
if os.path.isdir(cfg.dataset_path):
cfg.dataset_dir = cfg.dataset_path
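For reference, a dataset_config passed through this new option is just a mapping whose `type` selects a registered formatter and whose remaining keys are handed to that formatter's constructor. A hypothetical sketch, assuming the registry keys formatters by class name (which is what the plain FORMATTERS.register_module() decorator in this commit suggests):

# hypothetical dataset_config; only 'type' is mandatory
dataset_config = {
    'type': 'EmptyFormatter',   # assumed registry key: the registered class name
    'length': 10,               # forwarded to EmptyFormatter.__init__
    'feature_keys': ['text'],   # forwarded to EmptyFormatter.__init__
}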
1 change: 1 addition & 0 deletions data_juicer/core/executor.py
@@ -53,6 +53,7 @@ def __init__(self, cfg=None):
# setup formatter
logger.info('Setting up data formatter...')
self.formatter = load_formatter(self.cfg.dataset_path,
self.cfg.dataset_config,
self.cfg.text_keys, self.cfg.suffixes,
self.cfg.add_suffix)

14 changes: 13 additions & 1 deletion data_juicer/core/ray_executor.py
@@ -155,6 +155,8 @@ def run_op(self, op, op_cfg, dataset):
'Ray executor only support Filter and Mapper OPs for '
'now')
raise NotImplementedError

return dataset
except: # noqa: E722
logger.error(f'An error occurred during Op [{op_name}].')
import traceback
@@ -170,7 +172,17 @@ def run(self, load_data_np=None):
"""
# 1. load data
logger.info('Loading dataset with Ray...')
dataset = rd.read_json(self.cfg.dataset_path)

if self.cfg.get('dataset_config', None):
dataset_config = self.cfg.dataset_config
assert isinstance(dataset_config,
dict) and 'type' in dataset_config
args = dataset_config.copy()
obj_name = args.pop('type')
from data_juicer.format.formatter import FORMATTERS
dataset = FORMATTERS.modules[obj_name](**args).load_dataset()
else:
dataset = rd.read_json(self.cfg.dataset_path)

# convert all the path in dataset to absolute path
dataset = set_dataset_to_absolute_path(dataset, self.cfg.dataset_path,
8 changes: 5 additions & 3 deletions data_juicer/format/__init__.py
@@ -1,6 +1,8 @@
from . import (csv_formatter, json_formatter, mixture_formatter,
parquet_formatter, text_formatter, tsv_formatter)
from . import (csv_formatter, empty_formatter, json_formatter,
mixture_formatter, parquet_formatter, text_formatter,
tsv_formatter)
from .csv_formatter import CsvFormatter
from .empty_formatter import EmptyFormatter, RayEmptyFormatter
from .formatter import LocalFormatter, RemoteFormatter
from .json_formatter import JsonFormatter
from .load import load_formatter
Expand All @@ -12,5 +14,5 @@
__all__ = [
'load_formatter', 'JsonFormatter', 'LocalFormatter', 'RemoteFormatter',
'TextFormatter', 'ParquetFormatter', 'CsvFormatter', 'TsvFormatter',
'MixtureFormatter'
'MixtureFormatter', 'EmptyFormatter', 'RayEmptyFormatter'
]
78 changes: 78 additions & 0 deletions data_juicer/format/empty_formatter.py
@@ -0,0 +1,78 @@
from typing import List

import pandas as pd
import ray
from datasets import Dataset, Features, Value

from .formatter import FORMATTERS, BaseFormatter


@FORMATTERS.register_module()
class EmptyFormatter(BaseFormatter):
"""
The class is used to create empty data.
"""

def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
"""
Initialization method.
:param length: The empty dataset length.
:param feature_keys: feature key name list.
"""
self.length = length
self.feature_keys = feature_keys
if isinstance(self.feature_keys, str):
self.feature_keys = [self.feature_keys]

@property
def null_value(self):
return {}

def load_dataset(self, *args, **kwargs):
data_dict = {}
features = Features()

for key in self.feature_keys:
features.update({key: Value('string')})
data_dict.update({key: [self.null_value] * self.length})

empty_dataset = Dataset.from_dict(data_dict, features=features)

return empty_dataset


@FORMATTERS.register_module()
class RayEmptyFormatter(BaseFormatter):
"""
The class is used to create empty data for ray.
"""

def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
"""
Initialization method.
:param length: The empty dataset length.
:param feature_keys: feature key name list.
"""
self.length = length
self.feature_keys = feature_keys
if isinstance(self.feature_keys, str):
self.feature_keys = [self.feature_keys]

@property
def null_value(self):
return {}

def load_dataset(self, *args, **kwargs):
if len(self.feature_keys):
df = pd.DataFrame({
col: [self.null_value for _ in range(self.length)]
for col in self.feature_keys
})
else:
df = pd.DataFrame([self.null_value for _ in range(self.length)])

empty_dataset = ray.data.from_pandas(df)

return empty_dataset
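A minimal usage sketch for the Ray variant added above, assuming a local Ray runtime can be started; it materializes the requested number of placeholder rows:

from data_juicer.format import RayEmptyFormatter

# build a Ray dataset with 4 rows; each 'text' cell holds the empty-dict placeholder
ds = RayEmptyFormatter(length=4, feature_keys=['text']).load_dataset()
print(ds.count())  # expected: 4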
13 changes: 13 additions & 0 deletions data_juicer/format/load.py
@@ -3,6 +3,7 @@


def load_formatter(dataset_path,
dataset_config=None,
text_keys=None,
suffixes=[],
add_suffix=False,
@@ -12,13 +13,25 @@ def load_formatter(dataset_path,
weight(default 1.0) according to their formats.
:param dataset_path: path to a dataset file or a dataset directory
:param dataset_config: Configuration used to create a dataset.
The dataset will be created from this configuration if provided.
It must contain the `type` field to specify the dataset name.
:param text_keys: key names of field that stores sample text.
Default: None
:param suffixes: files with specified suffixes to be processed.
:param add_suffix: whether to add the file suffix to dataset meta
info
:return: a dataset formatter.
"""
if dataset_config:
assert isinstance(dataset_config, dict) and 'type' in dataset_config
args = dataset_config.copy()
obj_name = args.pop('type')
args.update(kwargs)

from .formatter import FORMATTERS
return FORMATTERS.modules[obj_name](**args)

formatter = MixtureFormatter(dataset_path=dataset_path,
text_keys=text_keys,
suffixes=suffixes,
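Putting it together, the new branch can be exercised roughly as in the sketch below; the 'RayEmptyFormatter' key is assumed to be the registered class name, and dataset_path is effectively ignored once dataset_config is supplied:

from data_juicer.format import load_formatter

formatter = load_formatter(
    dataset_path='',                    # unused when dataset_config is provided
    dataset_config={
        'type': 'RayEmptyFormatter',    # assumed registry key
        'length': 4,
        'feature_keys': ['text'],
    })
# formatter is the instantiated formatter; formatter.load_dataset()
# would then yield the 4 placeholder rows as a Ray dataset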
2 changes: 2 additions & 0 deletions data_juicer/ops/mapper/__init__.py
@@ -34,6 +34,7 @@
from .expand_macro_mapper import ExpandMacroMapper
from .extract_qa_mapper import ExtractQAMapper
from .fix_unicode_mapper import FixUnicodeMapper
from .generate_instruction_mapper import GenerateInstructionMapper
from .image_blur_mapper import ImageBlurMapper
from .image_captioning_from_gpt4v_mapper import ImageCaptioningFromGPT4VMapper
from .image_captioning_mapper import ImageCaptioningMapper
@@ -108,6 +109,7 @@
'RemoveWordsWithIncorrectSubstringsMapper',
'VideoCaptioningFromVideoMapper',
'VideoCaptioningFromSummarizerMapper',
'GenerateInstructionMapper',
'FixUnicodeMapper',
'NlpaugEnMapper',
'VideoCaptioningFromFramesMapper',