Fix (#571)

modelscope · Feb 8, 2025 · 3408e1f · 3408e1f
1 parent 449cac1
commit 3408e1f
Show file tree

Hide file tree

Showing 9 changed files with 24 additions and 24 deletions.
diff --git a/data_juicer/analysis/draw.py b/data_juicer/analysis/draw.py
@@ -5,7 +5,7 @@
 
 def draw_heatmap(data, xlabels, ylables=None, figsize=None, triangle=False):
     """
-    Draw heatmap of input data with special lables.
+    Draw heatmap of input data with special labels.
 
     :param data: input data, now support
         [`list`, `tuple`, `numpy array`, `torch tensor`]

diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py
@@ -31,7 +31,7 @@ def init_configs(args: Optional[List[str]] = None, which_entry: object = None):
         3. environment variables
         4. hard-coded defaults
 
-    :param args: list of params, e.g., ['--conifg', 'cfg.yaml'], defaut None.
+    :param args: list of params, e.g., ['--config', 'cfg.yaml'], default None.
     :param which_entry: which entry to init configs (executor/analyzer)
     :return: a global cfg object used by the Executor or Analyzer
     """

diff --git a/data_juicer/core/tracer.py b/data_juicer/core/tracer.py
@@ -140,7 +140,7 @@ def trace_filter(self, op_name: str, previous_ds: Dataset,
         # index of the current sample in the previous dataset
         i = 0
         filter_dict = []
-        # number of found filtered samples. It's the offset bewteen two
+        # number of found filtered samples. It's the offset between two
         # datasets as well.
         num = 0
         while i < len(previous_ds):

diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
@@ -141,7 +141,7 @@ def __init__(self, *args, **kwargs):
             to be processed
         :param video_key: the key name of field that stores sample video list
             to be processed
-        :param query_key: the key name of field that stores sample queris
+        :param query_key: the key name of field that stores sample queries
         :param response_key: the key name of field that stores responses
         :param history_key: the key name of field that stores history of
             queries and responses
@@ -175,7 +175,7 @@ def __init__(self, *args, **kwargs):
         else:
             self.accelerator = self._accelerator
 
-        # parameters to determind the number of procs for this op
+        # parameters to determine the number of procs for this op
         self.num_proc = kwargs.get('num_proc', None)
         self.cpu_required = kwargs.get('cpu_required', 1)
         self.mem_required = kwargs.get('mem_required', 0)
@@ -212,7 +212,7 @@ def runtime_np(self):
 
     def remove_extra_parameters(self, param_dict, keys=None):
         """
-            at the begining of the init of the mapper op, call
+            at the beginning of the init of the mapper op, call
             self.remove_extra_parameters(locals())
             to get the init parameter dict of the op for convenience
 
@@ -280,7 +280,7 @@ def __init__(self, *args, **kwargs):
             to be processed
         :param video_key: the key name of field that stores sample video list
             to be processed
-        :param query_key: the key name of field that stores sample queris
+        :param query_key: the key name of field that stores sample queries
         :param response_key: the key name of field that stores responses
         :param history_key: the key name of field that stores history of
             queries and responses
@@ -373,7 +373,7 @@ def __init__(self, *args, **kwargs):
             to be processed
         :param video_key: the key name of field that stores sample video list
             to be processed
-        :param query_key: the key name of field that stores sample queris
+        :param query_key: the key name of field that stores sample queries
         :param response_key: the key name of field that stores responses
         :param history_key: the key name of field that stores history of
             queries and responses
@@ -500,7 +500,7 @@ def __init__(self, *args, **kwargs):
             to be processed
         :param video_key: the key name of field that stores sample video list
             to be processed
-        :param query_key: the key name of field that stores sample queris
+        :param query_key: the key name of field that stores sample queries
         :param response_key: the key name of field that stores responses
         :param history_key: the key name of field that stores history of
             queries and responses
@@ -569,7 +569,7 @@ def __init__(self, *args, **kwargs):
             to be processed
         :param video_key: the key name of field that stores sample video list
             to be processed
-        :param query_key: the key name of field that stores sample queris
+        :param query_key: the key name of field that stores sample queries
         :param response_key: the key name of field that stores responses
         :param history_key: the key name of field that stores history of
             queries and responses
@@ -607,7 +607,7 @@ def __init__(self, *args, **kwargs):
             to be processed
         :param video_key: the key name of field that stores sample video list
             to be processed
-        :param query_key: the key name of field that stores sample queris
+        :param query_key: the key name of field that stores sample queries
         :param response_key: the key name of field that stores responses
         :param history_key: the key name of field that stores history of
             queries and responses
@@ -647,7 +647,7 @@ def __init__(self, *args, **kwargs):
             to be processed
         :param video_key: the key name of field that stores sample video list
             to be processed
-        :param query_key: the key name of field that stores sample queris
+        :param query_key: the key name of field that stores sample queries
         :param response_key: the key name of field that stores responses
         :param history_key: the key name of field that stores history of
             queries and responses

diff --git a/data_juicer/ops/common/helper_func.py b/data_juicer/ops/common/helper_func.py
@@ -57,7 +57,7 @@ def split_on_whitespace(document, new_line=False, tab=False):
     """
     This method also removes concatenated spaces.
 
-    :param document: document to be splited
+    :param document: document to be split
     :param new_line: whether to split document with '\\\\n'
     :param tab: whether to split document with '\\\\t'
     :return: word list obtained after splitting document
@@ -75,7 +75,7 @@ def split_on_newline_tab_whitespace(document):
     sentences.
 
     First split on "\\\\n", then on "\\\\t", then on " ".
-    :param document: document to be splited
+    :param document: document to be split
     :return: sentence list obtained after splitting document
     """
     sentences = document.split('\n')
@@ -189,7 +189,7 @@ def get_sentences_from_document(document, model_func=None):
 
     :param document: document that need to split sentences
     :param model_func: function of sentence model, if specified, the
-        function will be used for spliting document into different
+        function will be used for splitting document into different
         sentences.
     :return: document with the sentences separated by '\\\\n'
     """
@@ -204,8 +204,8 @@ def split_text_by_punctuation(text):
     """
     Split text by any zh and en punctuation
 
-    :param text: text to be splitted.
-    :return: sub texts splitted by any zh and en punctuation
+    :param text: text to be split.
+    :return: sub texts split by any zh and en punctuation
     """
     # any zh and en punctuation
     punctuation_pattern = r'[\u3000-\u303f\uff00-\uffef]|[!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~]'  # noqa: E501

diff --git a/data_juicer/ops/deduplicator/ray_bts_minhash_deduplicator.py b/data_juicer/ops/deduplicator/ray_bts_minhash_deduplicator.py
@@ -297,7 +297,7 @@ def __init__(
             union-find algorithm. Default it's 'auto', and it will be
             determined by half of the number of CPUs.
         :param union_threshold: threshold for minhash values group to
-            perform union-find algorightm. Default it's 256.
+            perform union-find algorithm. Default it's 256.
         :param max_pending_edge_buffer_task: max number of pending edge buffer
             ray tasks. Default it's 20.
         :param num_edge_buffer_task_returns: number of edge buffer tasks for

diff --git a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py
@@ -29,7 +29,7 @@ def VideoCapture(*args, **kwargs):
 @OPERATORS.register_module(OP_NAME)
 class VideoMotionScoreFilter(Filter):
     """Filter to keep samples with video motion scores within a specific range. The
-    Farneback's algorith from OpenCV is used to compute dense optical flow.
+    Farneback's algorithm from OpenCV is used to compute dense optical flow.
     """
 
     _default_kwargs = {

diff --git a/data_juicer/ops/mapper/pair_preference_mapper.py b/data_juicer/ops/mapper/pair_preference_mapper.py
@@ -58,7 +58,7 @@ def __init__(self,
             Defaults to 'choices.0.message.content'.
         :param system_prompt: System prompt for guiding the generation task.
         :param input_template: Template for building the model input. It must
-            contain placeholders '{query}' and '{reponse}', and can optionally
+            contain placeholders '{query}' and '{response}', and can optionally
             include '{reference}'.
         :param output_pattern: Regular expression for parsing model output.
         :param rejected_key: The field name in the sample to store the

diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py
@@ -353,7 +353,7 @@ def prepare_huggingface_model(pretrained_model_name_or_path,
                               pipe_task='text-generation',
                               **model_params):
     """
-    Prepare and load a HuggingFace model with the correspoding processor.
+    Prepare and load a HuggingFace model with the corresponding processor.
 
     :param pretrained_model_name_or_path: model name or path
     :param return_model: return model or not
@@ -521,7 +521,7 @@ def prepare_sentencepiece_for_lang(lang,
                                    name_pattern='{}.sp.model',
                                    **model_params):
     """
-    Prepare and load a sentencepiece model for specific langauge.
+    Prepare and load a sentencepiece model for specific language.
 
     :param lang: language to render model name
     :param name_pattern: pattern to render the model name
@@ -634,7 +634,7 @@ def prepare_video_blip_model(pretrained_model_name_or_path,
                              return_model=True,
                              **model_params):
     """
-    Prepare and load a video-clip model with the correspoding processor.
+    Prepare and load a video-clip model with the corresponding processor.
 
     :param pretrained_model_name_or_path: model name or path
     :param return_model: return model or not
@@ -775,7 +775,7 @@ def __init__(self, config: transformers.Blip2Config) -> None:
 
 def prepare_vllm_model(pretrained_model_name_or_path, **model_params):
     """
-    Prepare and load a HuggingFace model with the correspoding processor.
+    Prepare and load a HuggingFace model with the corresponding processor.
 
     :param pretrained_model_name_or_path: model name or path
     :param model_params: LLM initialization parameters.