Skip to content

Commit

Permalink
Fix (#571)
Browse files Browse the repository at this point in the history
  • Loading branch information
co63oc authored Feb 8, 2025
1 parent 449cac1 commit 3408e1f
Show file tree
Hide file tree
Showing 9 changed files with 24 additions and 24 deletions.
2 changes: 1 addition & 1 deletion data_juicer/analysis/draw.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

def draw_heatmap(data, xlabels, ylables=None, figsize=None, triangle=False):
"""
Draw heatmap of input data with special lables.
Draw heatmap of input data with special labels.
:param data: input data; currently supports
[`list`, `tuple`, `numpy array`, `torch tensor`]
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def init_configs(args: Optional[List[str]] = None, which_entry: object = None):
3. environment variables
4. hard-coded defaults
:param args: list of params, e.g., ['--conifg', 'cfg.yaml'], defaut None.
:param args: list of params, e.g., ['--config', 'cfg.yaml'], default None.
:param which_entry: which entry to init configs (executor/analyzer)
:return: a global cfg object used by the Executor or Analyzer
"""
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/core/tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def trace_filter(self, op_name: str, previous_ds: Dataset,
# index of the current sample in the previous dataset
i = 0
filter_dict = []
# number of found filtered samples. It's the offset bewteen two
# number of found filtered samples. It's the offset between two
# datasets as well.
num = 0
while i < len(previous_ds):
Expand Down
18 changes: 9 additions & 9 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def __init__(self, *args, **kwargs):
to be processed
:param video_key: the key name of field that stores sample video list
to be processed
:param query_key: the key name of field that stores sample queris
:param query_key: the key name of field that stores sample queries
:param response_key: the key name of field that stores responses
:param history_key: the key name of field that stores history of
queries and responses
Expand Down Expand Up @@ -175,7 +175,7 @@ def __init__(self, *args, **kwargs):
else:
self.accelerator = self._accelerator

# parameters to determind the number of procs for this op
# parameters to determine the number of procs for this op
self.num_proc = kwargs.get('num_proc', None)
self.cpu_required = kwargs.get('cpu_required', 1)
self.mem_required = kwargs.get('mem_required', 0)
Expand Down Expand Up @@ -212,7 +212,7 @@ def runtime_np(self):

def remove_extra_parameters(self, param_dict, keys=None):
"""
at the begining of the init of the mapper op, call
at the beginning of the init of the mapper op, call
self.remove_extra_parameters(locals())
to get the init parameter dict of the op for convenience
Expand Down Expand Up @@ -280,7 +280,7 @@ def __init__(self, *args, **kwargs):
to be processed
:param video_key: the key name of field that stores sample video list
to be processed
:param query_key: the key name of field that stores sample queris
:param query_key: the key name of field that stores sample queries
:param response_key: the key name of field that stores responses
:param history_key: the key name of field that stores history of
queries and responses
Expand Down Expand Up @@ -373,7 +373,7 @@ def __init__(self, *args, **kwargs):
to be processed
:param video_key: the key name of field that stores sample video list
to be processed
:param query_key: the key name of field that stores sample queris
:param query_key: the key name of field that stores sample queries
:param response_key: the key name of field that stores responses
:param history_key: the key name of field that stores history of
queries and responses
Expand Down Expand Up @@ -500,7 +500,7 @@ def __init__(self, *args, **kwargs):
to be processed
:param video_key: the key name of field that stores sample video list
to be processed
:param query_key: the key name of field that stores sample queris
:param query_key: the key name of field that stores sample queries
:param response_key: the key name of field that stores responses
:param history_key: the key name of field that stores history of
queries and responses
Expand Down Expand Up @@ -569,7 +569,7 @@ def __init__(self, *args, **kwargs):
to be processed
:param video_key: the key name of field that stores sample video list
to be processed
:param query_key: the key name of field that stores sample queris
:param query_key: the key name of field that stores sample queries
:param response_key: the key name of field that stores responses
:param history_key: the key name of field that stores history of
queries and responses
Expand Down Expand Up @@ -607,7 +607,7 @@ def __init__(self, *args, **kwargs):
to be processed
:param video_key: the key name of field that stores sample video list
to be processed
:param query_key: the key name of field that stores sample queris
:param query_key: the key name of field that stores sample queries
:param response_key: the key name of field that stores responses
:param history_key: the key name of field that stores history of
queries and responses
Expand Down Expand Up @@ -647,7 +647,7 @@ def __init__(self, *args, **kwargs):
to be processed
:param video_key: the key name of field that stores sample video list
to be processed
:param query_key: the key name of field that stores sample queris
:param query_key: the key name of field that stores sample queries
:param response_key: the key name of field that stores responses
:param history_key: the key name of field that stores history of
queries and responses
Expand Down
10 changes: 5 additions & 5 deletions data_juicer/ops/common/helper_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def split_on_whitespace(document, new_line=False, tab=False):
"""
This method also removes concatenated spaces.
:param document: document to be splited
:param document: document to be split
:param new_line: whether to split document with '\\\\n'
:param tab: whether to split document with '\\\\t'
:return: word list obtained after splitting document
Expand All @@ -75,7 +75,7 @@ def split_on_newline_tab_whitespace(document):
sentences.
First split on "\\\\n", then on "\\\\t", then on " ".
:param document: document to be splited
:param document: document to be split
:return: sentence list obtained after splitting document
"""
sentences = document.split('\n')
Expand Down Expand Up @@ -189,7 +189,7 @@ def get_sentences_from_document(document, model_func=None):
:param document: document to be split into sentences
:param model_func: function of sentence model, if specified, the
function will be used for spliting document into different
function will be used for splitting document into different
sentences.
:return: document with the sentences separated by '\\\\n'
"""
Expand All @@ -204,8 +204,8 @@ def split_text_by_punctuation(text):
"""
Split text by any zh and en punctuation
:param text: text to be splitted.
:return: sub texts splitted by any zh and en punctuation
:param text: text to be split.
:return: sub texts split by any zh and en punctuation
"""
# any zh and en punctuation
punctuation_pattern = r'[\u3000-\u303f\uff00-\uffef]|[!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~]' # noqa: E501
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ def __init__(
union-find algorithm. Default it's 'auto', and it will be
determined by half of the number of CPUs.
:param union_threshold: threshold for minhash values group to
perform union-find algorightm. Default it's 256.
perform union-find algorithm. Default it's 256.
:param max_pending_edge_buffer_task: max number of pending edge buffer
ray tasks. Default it's 20.
:param num_edge_buffer_task_returns: number of edge buffer tasks for
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/video_motion_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def VideoCapture(*args, **kwargs):
@OPERATORS.register_module(OP_NAME)
class VideoMotionScoreFilter(Filter):
"""Filter to keep samples with video motion scores within a specific range. The
Farneback's algorith from OpenCV is used to compute dense optical flow.
Farneback's algorithm from OpenCV is used to compute dense optical flow.
"""

_default_kwargs = {
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/pair_preference_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def __init__(self,
Defaults to 'choices.0.message.content'.
:param system_prompt: System prompt for guiding the generation task.
:param input_template: Template for building the model input. It must
contain placeholders '{query}' and '{reponse}', and can optionally
contain placeholders '{query}' and '{response}', and can optionally
include '{reference}'.
:param output_pattern: Regular expression for parsing model output.
:param rejected_key: The field name in the sample to store the
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/utils/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def prepare_huggingface_model(pretrained_model_name_or_path,
pipe_task='text-generation',
**model_params):
"""
Prepare and load a HuggingFace model with the correspoding processor.
Prepare and load a HuggingFace model with the corresponding processor.
:param pretrained_model_name_or_path: model name or path
:param return_model: return model or not
Expand Down Expand Up @@ -521,7 +521,7 @@ def prepare_sentencepiece_for_lang(lang,
name_pattern='{}.sp.model',
**model_params):
"""
Prepare and load a sentencepiece model for specific langauge.
Prepare and load a sentencepiece model for specific language.
:param lang: language to render model name
:param name_pattern: pattern to render the model name
Expand Down Expand Up @@ -634,7 +634,7 @@ def prepare_video_blip_model(pretrained_model_name_or_path,
return_model=True,
**model_params):
"""
Prepare and load a video-clip model with the correspoding processor.
Prepare and load a video-clip model with the corresponding processor.
:param pretrained_model_name_or_path: model name or path
:param return_model: return model or not
Expand Down Expand Up @@ -775,7 +775,7 @@ def __init__(self, config: transformers.Blip2Config) -> None:

def prepare_vllm_model(pretrained_model_name_or_path, **model_params):
"""
Prepare and load a HuggingFace model with the correspoding processor.
Prepare and load a HuggingFace model with the corresponding processor.
:param pretrained_model_name_or_path: model name or path
:param model_params: LLM initialization parameters.
Expand Down

0 comments on commit 3408e1f

Please sign in to comment.