diff --git a/configs/config_all.yaml b/configs/config_all.yaml index a28861c77..5b817c4d0 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -311,6 +311,10 @@ process: min_ratio: 0.333 # the min aspect ratio of filter range max_ratio: 3.0 # the max aspect ratio of filter range any_or_all: any # keep this sample when any/all images meet the filter condition + - image_face_count_filter: # filter samples according to the face count in images + cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'. + min_face_count: 1 # the minimum number of faces required in samples. + max_face_count: 1 # the maximum number of faces allowed in samples. - image_face_ratio_filter: # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one. cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'. 
min_ratio: 0.0 # the min face area ratio of filter range diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py index 68e9ba521..f21c81546 100644 --- a/data_juicer/ops/filter/__init__.py +++ b/data_juicer/ops/filter/__init__.py @@ -3,17 +3,17 @@ audio_nmf_snr_filter, audio_size_filter, average_line_length_filter, character_repetition_filter, flagged_words_filter, image_aesthetics_filter, - image_aspect_ratio_filter, image_face_ratio_filter, - image_nsfw_filter, image_pair_similarity_filter, - image_shape_filter, image_size_filter, - image_text_matching_filter, image_text_similarity_filter, - image_watermark_filter, language_id_score_filter, - maximum_line_length_filter, perplexity_filter, - phrase_grounding_recall_filter, special_characters_filter, - specified_field_filter, specified_numeric_field_filter, - stopwords_filter, suffix_filter, text_action_filter, - text_entity_dependency_filter, text_length_filter, - token_num_filter, video_aesthetics_filter, + image_aspect_ratio_filter, image_face_count_filter, + image_face_ratio_filter, image_nsfw_filter, + image_pair_similarity_filter, image_shape_filter, + image_size_filter, image_text_matching_filter, + image_text_similarity_filter, image_watermark_filter, + language_id_score_filter, maximum_line_length_filter, + perplexity_filter, phrase_grounding_recall_filter, + special_characters_filter, specified_field_filter, + specified_numeric_field_filter, stopwords_filter, suffix_filter, + text_action_filter, text_entity_dependency_filter, + text_length_filter, token_num_filter, video_aesthetics_filter, video_aspect_ratio_filter, video_duration_filter, video_frames_text_similarity_filter, video_motion_score_filter, video_nsfw_filter, video_ocr_area_ratio_filter, @@ -29,6 +29,7 @@ from .flagged_words_filter import FlaggedWordFilter from .image_aesthetics_filter import ImageAestheticsFilter from .image_aspect_ratio_filter import ImageAspectRatioFilter +from .image_face_count_filter import 
ImageFaceCountFilter from .image_face_ratio_filter import ImageFaceRatioFilter from .image_nsfw_filter import ImageNSFWFilter from .image_pair_similarity_filter import ImagePairSimilarityFilter @@ -102,6 +103,7 @@ 'ImageSizeFilter', 'VideoWatermarkFilter', 'WordsNumFilter', + 'ImageFaceCountFilter', 'ImageFaceRatioFilter', 'FlaggedWordFilter', 'WordRepetitionFilter', diff --git a/docs/Operators.md b/docs/Operators.md index dd56871c4..27915dc09 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -12,7 +12,7 @@ The operators in Data-Juicer are categorized into 5 types. |-----------------------------------|:------:|-------------------------------------------------| | [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data | | [ Mapper ]( #mapper ) | 47 | Edits and transforms samples | -| [ Filter ]( #filter ) | 42 | Filters out low-quality samples | +| [ Filter ]( #filter ) | 43 | Filters out low-quality samples | | [ Deduplicator ]( #deduplicator ) | 5 | Detects and removes duplicate samples | | [ Selector ]( #selector ) | 4 | Selects top samples based on ranking | @@ -101,50 +101,51 @@ All the specific operators are listed below, each featured with several capabili ## Filter -| Operator | Domain | Lang | Description | -|--------------------------------|------------|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------| -| alphanumeric_filter | General | en, zh | Keeps samples with alphanumeric ratio within the specified range | -| audio_duration_filter | Audio | - | Keep data samples whose audios' durations are within a specified range | -| audio_nmf_snr_filter | Audio | - | Keep data samples whose audios' Signal-to-Noise Ratios (SNRs, computed based on Non-Negative Matrix Factorization, NMF) are within a specified range| -| audio_size_filter | Audio | - | Keep data samples whose audios' sizes are within a specified range | -| 
average_line_length_filter | Code | en, zh | Keeps samples with average line length within the specified range | -| character_repetition_filter | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range | -| flagged_words_filter | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold | -| image_aesthetics_filter | Image | - | Keeps samples containing images whose aesthetics scores are within the specified range | -| image_aspect_ratio_filter | Image | - | Keeps samples containing images with aspect ratios within the specified range | -| image_face_ratio_filter | Image | - | Keeps samples containing images with face area ratios within the specified range | -| image_nsfw_filter | Image | - | Keeps samples containing images with NSFW scores below the threshold | -| image_pair_similarity_filter | Image | - | Keeps image pairs with image feature cosine similarity within the specified range based on a CLIP model | -| image_shape_filter | Image | - | Keeps samples containing images with widths and heights within the specified range | -| image_size_filter | Image | - | Keeps samples containing images whose size in bytes are within the specified range | -| image_text_matching_filter | Multimodal | - | Keeps samples with image-text classification matching score within the specified range based on a BLIP model | -| image_text_similarity_filter | Multimodal | - | Keeps samples with image-text feature cosine similarity within the specified range based on a CLIP model | -| image_watermark_filter | Image | - | Keeps samples containing images with predicted watermark probabilities below the threshold | -| language_id_score_filter | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score | -| maximum_line_length_filter | Code | en, zh | Keeps samples with maximum line length within the specified range | -| perplexity_filter | General | en, zh | Keeps samples with 
perplexity score below the specified threshold | -| phrase_grounding_recall_filter | Multimodal | - | Keeps samples whose locating recalls of phrases extracted from text in the images are within a specified range | -| special_characters_filter | General | en, zh | Keeps samples with special-char ratio within the specified range | -| specified_field_filter | General | en, zh | Filters samples based on field, with value lies in the specified targets | -| specified_numeric_field_filter | General | en, zh | Filters samples based on field, with value lies in the specified range (for numeric types) | -| stopwords_filter | General | en, zh | Keeps samples with stopword ratio above the specified threshold | -| suffix_filter | General | en, zh | Keeps samples with specified suffixes | -| text_action_filter | General | en, zh | Keeps samples containing action verbs in their texts | -| text_entity_dependency_filter | General | en, zh | Keeps samples containing dependency edges for an entity in the dependency tree of the texts | -| text_length_filter | General | en, zh | Keeps samples with total text length within the specified range | -| token_num_filter | General | en, zh | Keeps samples with token count within the specified range | -| video_aesthetics_filter | Video | - | Keeps samples whose specified frames have aesthetics scores within the specified range | -| video_aspect_ratio_filter | Video | - | Keeps samples containing videos with aspect ratios within the specified range | -| video_duration_filter | Video | - | Keep data samples whose videos' durations are within a specified range | -| video_frames_text_similarity_filter | Multimodal | - | Keep data samples whose similarities between sampled video frame images and text are within a specific range | -| video_motion_score_filter | Video | - | Keep samples with video motion scores within a specific range | -| video_nsfw_filter | Video | - | Keeps samples containing videos with NSFW scores below the threshold | -| 
video_ocr_area_ratio_filter | Video | - | Keep data samples whose detected text area ratios for specified frames in the video are within a specified range | -| video_resolution_filter | Video | - | Keeps samples containing videos with horizontal and vertical resolutions within the specified range | -| video_watermark_filter | Video | - | Keeps samples containing videos with predicted watermark probabilities below the threshold | -| video_tagging_from_frames_filter | Video | - | Keep samples containing videos with given tags | -| words_num_filter | General | en, zh | Keeps samples with word count within the specified range | -| word_repetition_filter | General | en, zh | Keeps samples with word-level n-gram repetition ratio within the specified range | +| Operator | Domain | Lang | Description | +|-------------------------------------|------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------| +| alphanumeric_filter | General | en, zh | Keeps samples with alphanumeric ratio within the specified range | +| audio_duration_filter | Audio | - | Keep data samples whose audios' durations are within a specified range | +| audio_nmf_snr_filter | Audio | - | Keep data samples whose audios' Signal-to-Noise Ratios (SNRs, computed based on Non-Negative Matrix Factorization, NMF) are within a specified range | +| audio_size_filter | Audio | - | Keep data samples whose audios' sizes are within a specified range | +| average_line_length_filter | Code | en, zh | Keeps samples with average line length within the specified range | +| character_repetition_filter | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range | +| flagged_words_filter | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold | +| image_aesthetics_filter | Image | - | Keeps samples containing images whose aesthetics scores 
are within the specified range | +| image_aspect_ratio_filter | Image | - | Keeps samples containing images with aspect ratios within the specified range | +| image_face_count_filter | Image | - | Keeps samples containing images with face counts within the specified range | +| image_face_ratio_filter | Image | - | Keeps samples containing images with face area ratios within the specified range | +| image_nsfw_filter | Image | - | Keeps samples containing images with NSFW scores below the threshold | +| image_pair_similarity_filter | Image | - | Keeps image pairs with image feature cosine similarity within the specified range based on a CLIP model | +| image_shape_filter | Image | - | Keeps samples containing images with widths and heights within the specified range | +| image_size_filter | Image | - | Keeps samples containing images whose size in bytes are within the specified range | +| image_text_matching_filter | Multimodal | - | Keeps samples with image-text classification matching score within the specified range based on a BLIP model | +| image_text_similarity_filter | Multimodal | - | Keeps samples with image-text feature cosine similarity within the specified range based on a CLIP model | +| image_watermark_filter | Image | - | Keeps samples containing images with predicted watermark probabilities below the threshold | +| language_id_score_filter | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score | +| maximum_line_length_filter | Code | en, zh | Keeps samples with maximum line length within the specified range | +| perplexity_filter | General | en, zh | Keeps samples with perplexity score below the specified threshold | +| phrase_grounding_recall_filter | Multimodal | - | Keeps samples whose locating recalls of phrases extracted from text in the images are within a specified range | +| special_characters_filter | General | en, zh | Keeps samples with special-char ratio within the specified range | +| 
specified_field_filter | General | en, zh | Filters samples based on a field whose value lies in the specified targets | +| specified_numeric_field_filter | General | en, zh | Filters samples based on a field whose value lies in the specified range (for numeric types) | +| stopwords_filter | General | en, zh | Keeps samples with stopword ratio above the specified threshold | +| suffix_filter | General | en, zh | Keeps samples with specified suffixes | +| text_action_filter | General | en, zh | Keeps samples containing action verbs in their texts | +| text_entity_dependency_filter | General | en, zh | Keeps samples containing dependency edges for an entity in the dependency tree of the texts | +| text_length_filter | General | en, zh | Keeps samples with total text length within the specified range | +| token_num_filter | General | en, zh | Keeps samples with token count within the specified range | +| video_aesthetics_filter | Video | - | Keeps samples whose specified frames have aesthetics scores within the specified range | +| video_aspect_ratio_filter | Video | - | Keeps samples containing videos with aspect ratios within the specified range | +| video_duration_filter | Video | - | Keep data samples whose videos' durations are within a specified range | +| video_frames_text_similarity_filter | Multimodal | - | Keep data samples whose similarities between sampled video frame images and text are within a specific range | +| video_motion_score_filter | Video | - | Keep samples with video motion scores within a specific range | +| video_nsfw_filter | Video | - | Keeps samples containing videos with NSFW scores below the threshold | +| video_ocr_area_ratio_filter | Video | - | Keep data samples whose detected text area ratios for specified frames in the video are within a specified range | +| video_resolution_filter | Video | - | Keeps samples containing videos with horizontal and vertical resolutions within the specified range | +| video_watermark_filter | Video | - | 
Keeps samples containing videos with predicted watermark probabilities below the threshold | +| video_tagging_from_frames_filter | Video | - | Keep samples containing videos with given tags | +| words_num_filter | General | en, zh | Keeps samples with word count within the specified range | +| word_repetition_filter | General | en, zh | Keeps samples with word-level n-gram repetition ratio within the specified range | ## Deduplicator diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md index f5d598f54..115e6f446 100644 --- a/docs/Operators_ZH.md +++ b/docs/Operators_ZH.md @@ -12,7 +12,7 @@ Data-Juicer 中的算子分为以下 5 种类型。 |------------------------------------|:--:|---------------| | [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 | | [ Mapper ]( #mapper ) | 47 | 对数据样本进行编辑和转换 | -| [ Filter ]( #filter ) | 42 | 过滤低质量样本 | +| [ Filter ]( #filter ) | 43 | 过滤低质量样本 | | [ Deduplicator ]( #deduplicator ) | 5 | 识别、删除重复样本 | | [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 | @@ -99,50 +99,51 @@ Data-Juicer 中的算子分为以下 5 种类型。 ## Filter -| 算子 | 场景 | 语言 | 描述 | -|--------------------------------|------------|--------|---------------------------------------------| -| alphanumeric_filter | General | en, zh | 保留字母数字比例在指定范围内的样本 | -| audio_duration_filter | Audio | - | 保留包含音频的时长在指定范围内的样本 | -| audio_nmf_snr_filter | Audio | - | 保留包含音频信噪比SNR(基于非负矩阵分解方法NMF计算)在指定范围内的样本 | -| audio_size_filter | Audio | - | 保留包含音频的大小(bytes)在指定范围内的样本 | -| average_line_length_filter | Code | en, zh | 保留平均行长度在指定范围内的样本 | -| character_repetition_filter | General | en, zh | 保留 char-level n-gram 重复比率在指定范围内的样本 | -| flagged_words_filter | General | en, zh | 保留使标记字比率保持在指定阈值以下的样本 | -| image_aesthetics_filter | Image | - | 保留包含美学分数在指定范围内的图像的样本 | -| image_aspect_ratio_filter | Image | - | 保留样本中包含的图片的宽高比在指定范围内的样本 | -| image_face_ratio_filter | Image | - | 保留样本中包含的图片的最大脸部区域在指定范围内的样本 | -| image_nsfw_filter | Image | - | 保留包含NSFW分数在指定阈值之下的图像的样本 | -| image_pair_similarity_filter | Image | - | 保留图像特征余弦相似度(基于CLIP模型)在指定范围内的样本 | -| 
image_shape_filter | Image | - | 保留样本中包含的图片的形状(即宽和高)在指定范围内的样本 | -| image_size_filter | Image | - | 保留样本中包含的图片的大小(bytes)在指定范围内的样本 | -| image_text_matching_filter | Multimodal | - | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本 | -| image_text_similarity_filter | Multimodal | - | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 | -| image_watermark_filter | Image | - | 保留包含有水印概率在指定阈值之下的图像的样本 | -| language_id_score_filter | General | en, zh | 保留特定语言的样本,通过预测的置信度得分来判断 | -| maximum_line_length_filter | Code | en, zh | 保留最大行长度在指定范围内的样本 | -| perplexity_filter | General | en, zh | 保留困惑度低于指定阈值的样本 | -| phrase_grounding_recall_filter | Multimodal | - | 保留从文本中提取的名词短语在图像中的定位召回率在一定范围内的样本 | -| special_characters_filter | General | en, zh | 保留 special-char 比率的在指定范围内的样本 | -| specified_field_filter | General | en, zh | 根据字段过滤样本,要求字段的值处于指定目标中 | -| specified_numeric_field_filter | General | en, zh | 根据字段过滤样本,要求字段的值处于指定范围(针对数字类型) | -| stopwords_filter | General | en, zh | 保留停用词比率高于指定阈值的样本 | -| suffix_filter | General | en, zh | 保留包含特定后缀的样本 | -| text_action_filter | General | en, zh | 保留文本部分包含动作的样本 | -| text_entity_dependency_filter | General | en, zh | 保留文本部分的依存树中具有非独立实体的样本 | -| text_length_filter | General | en, zh | 保留总文本长度在指定范围内的样本 | -| token_num_filter | General | en, zh | 保留token数在指定范围内的样本 | -| video_aspect_ratio_filter | Video | - | 保留包含视频的宽高比在指定范围内的样本 | -| video_duration_filter | Video | - | 保留包含视频的时长在指定范围内的样本 | -| video_aesthetics_filter | Video | - | 保留指定帧的美学分数在指定范围内的样本| -| video_frames_text_similarity_filter | Multimodal | - | 保留视频中指定帧的图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 | -| video_motion_score_filter | Video | - | 保留包含视频的运动分数(基于稠密光流)在指定范围内的样本 | -| video_nsfw_filter | Video | - | 保留包含视频的NSFW分数在指定阈值之下的样本 | -| video_ocr_area_ratio_filter | Video | - | 保留包含视频的特定帧中检测出的文本的面积占比在指定范围内的样本 | -| video_resolution_filter | Video | - | 保留包含视频的分辨率(包括横向分辨率和纵向分辨率)在指定范围内的样本 | -| video_watermark_filter | Video | - | 保留包含视频有水印的概率在指定阈值之下的样本 | -| video_tagging_from_frames_filter | Video | - | 保留包含具有给定标签视频的样本 | -| 
words_num_filter | General | en, zh | 保留字数在指定范围内的样本 | -| word_repetition_filter | General | en, zh | 保留 word-level n-gram 重复比率在指定范围内的样本 | +| 算子 | 场景 | 语言 | 描述 | +|-------------------------------------|------------|--------|-------------------------------------------| +| alphanumeric_filter | General | en, zh | 保留字母数字比例在指定范围内的样本 | +| audio_duration_filter | Audio | - | 保留包含音频的时长在指定范围内的样本 | +| audio_nmf_snr_filter | Audio | - | 保留包含音频信噪比SNR(基于非负矩阵分解方法NMF计算)在指定范围内的样本 | +| audio_size_filter | Audio | - | 保留包含音频的大小(bytes)在指定范围内的样本 | +| average_line_length_filter | Code | en, zh | 保留平均行长度在指定范围内的样本 | +| character_repetition_filter | General | en, zh | 保留 char-level n-gram 重复比率在指定范围内的样本 | +| flagged_words_filter | General | en, zh | 保留使标记字比率保持在指定阈值以下的样本 | +| image_aesthetics_filter | Image | - | 保留包含美学分数在指定范围内的图像的样本 | +| image_aspect_ratio_filter | Image | - | 保留样本中包含的图片的宽高比在指定范围内的样本 | +| image_face_count_filter | Image | - | 保留样本中包含的图片中检测到的人脸数目在指定范围内的样本 | +| image_face_ratio_filter | Image | - | 保留样本中包含的图片的最大脸部区域在指定范围内的样本 | +| image_nsfw_filter | Image | - | 保留包含NSFW分数在指定阈值之下的图像的样本 | +| image_pair_similarity_filter | Image | - | 保留图像特征余弦相似度(基于CLIP模型)在指定范围内的样本 | +| image_shape_filter | Image | - | 保留样本中包含的图片的形状(即宽和高)在指定范围内的样本 | +| image_size_filter | Image | - | 保留样本中包含的图片的大小(bytes)在指定范围内的样本 | +| image_text_matching_filter | Multimodal | - | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本 | +| image_text_similarity_filter | Multimodal | - | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 | +| image_watermark_filter | Image | - | 保留包含有水印概率在指定阈值之下的图像的样本 | +| language_id_score_filter | General | en, zh | 保留特定语言的样本,通过预测的置信度得分来判断 | +| maximum_line_length_filter | Code | en, zh | 保留最大行长度在指定范围内的样本 | +| perplexity_filter | General | en, zh | 保留困惑度低于指定阈值的样本 | +| phrase_grounding_recall_filter | Multimodal | - | 保留从文本中提取的名词短语在图像中的定位召回率在一定范围内的样本 | +| special_characters_filter | General | en, zh | 保留 special-char 比率在指定范围内的样本 | +| specified_field_filter | General | en, zh | 根据字段过滤样本,要求字段的值处于指定目标中 | +| 
specified_numeric_field_filter | General | en, zh | 根据字段过滤样本,要求字段的值处于指定范围(针对数字类型) | +| stopwords_filter | General | en, zh | 保留停用词比率高于指定阈值的样本 | +| suffix_filter | General | en, zh | 保留包含特定后缀的样本 | +| text_action_filter | General | en, zh | 保留文本部分包含动作的样本 | +| text_entity_dependency_filter | General | en, zh | 保留文本部分的依存树中具有非独立实体的样本 | +| text_length_filter | General | en, zh | 保留总文本长度在指定范围内的样本 | +| token_num_filter | General | en, zh | 保留token数在指定范围内的样本 | +| video_aspect_ratio_filter | Video | - | 保留包含视频的宽高比在指定范围内的样本 | +| video_duration_filter | Video | - | 保留包含视频的时长在指定范围内的样本 | +| video_aesthetics_filter | Video | - | 保留指定帧的美学分数在指定范围内的样本 | +| video_frames_text_similarity_filter | Multimodal | - | 保留视频中指定帧的图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 | +| video_motion_score_filter | Video | - | 保留包含视频的运动分数(基于稠密光流)在指定范围内的样本 | +| video_nsfw_filter | Video | - | 保留包含视频的NSFW分数在指定阈值之下的样本 | +| video_ocr_area_ratio_filter | Video | - | 保留包含视频的特定帧中检测出的文本的面积占比在指定范围内的样本 | +| video_resolution_filter | Video | - | 保留包含视频的分辨率(包括横向分辨率和纵向分辨率)在指定范围内的样本 | +| video_watermark_filter | Video | - | 保留包含视频有水印的概率在指定阈值之下的样本 | +| video_tagging_from_frames_filter | Video | - | 保留包含具有给定标签视频的样本 | +| words_num_filter | General | en, zh | 保留字数在指定范围内的样本 | +| word_repetition_filter | General | en, zh | 保留 word-level n-gram 重复比率在指定范围内的样本 | ## Deduplicator