From f2999866361fd181a21bf580c3e31a0689f74941 Mon Sep 17 00:00:00 2001 From: zhijianma Date: Fri, 15 Dec 2023 11:07:23 +0800 Subject: [PATCH] fix: 1. change relpath to abspath of dataset (#137) * fix: 1. change relpath to abspath of dataset 2. torch set_num_threads in python3.8 on Mac system. * fix: import fingerprint_warnings from datasets --- data_juicer/config/config.py | 14 +++++++------- data_juicer/utils/availability_utils.py | 21 ++++++++++++++++++++- data_juicer/utils/fingerprint_utils.py | 4 ++-- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index a3611c56f..7069cf087 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -311,6 +311,7 @@ def init_setup_from_cfg(cfg): :param cfg: a updated cfg """ + cfg.export_path = os.path.abspath(cfg.export_path) export_path = cfg.export_path cfg.work_dir = os.path.dirname(export_path) log_dir = os.path.join(cfg.work_dir, 'log') @@ -325,16 +326,15 @@ def init_setup_from_cfg(cfg): # check and get dataset dir if os.path.exists(cfg.dataset_path): + cfg.dataset_path = os.path.abspath(cfg.dataset_path) if os.path.isdir(cfg.dataset_path): - cfg.dataset_dir = os.path.abspath(cfg.dataset_path) + cfg.dataset_dir = cfg.dataset_path else: - cfg.dataset_dir = os.path.abspath(os.path.dirname( - cfg.dataset_path)) + cfg.dataset_dir = os.path.dirname(cfg.dataset_path) else: - logger.error(f'Input dataset_path [{cfg.dataset_path}] is invalid. ' - f'Please check and retry.') - raise ValueError(f'Input dataset_path [{cfg.dataset_path}] is ' - f'invalid. Please check and retry.') + logger.warning(f'dataset_path [{cfg.dataset_path}] not found in local.' + 'Please check and retry, otherwise we will treat it ' + 'as a remote dataset.') # whether or not to use cache management # disabling the cache or using checkpoint explicitly will turn off the diff --git a/data_juicer/utils/availability_utils.py b/data_juicer/utils/availability_utils.py index bc3ca475d..17b718403 100644 --- a/data_juicer/utils/availability_utils.py +++ b/data_juicer/utils/availability_utils.py @@ -1,6 +1,9 @@ + + from loguru import logger UNAVAILABLE_OPERATORS = {} +CHECK_SYSTEM_INFO_ONCE = True class UnavailableOperator: @@ -52,7 +55,23 @@ def __init__( f'`pip install -v -e .[{self.requires_type}]`' def __enter__(self): - pass + + # only for python3.8 on mac + global CHECK_SYSTEM_INFO_ONCE + if CHECK_SYSTEM_INFO_ONCE: + import os + import platform + import sys + major, minor = sys.version_info[:2] + system = platform.system() + if major == 3 and minor == 8 and system == 'Darwin': + logger.warning( + 'The torch.set_num_threads function does not ' + 'work in python3.8 version on Mac systems. We will set ' + 'OMP_NUM_THREADS to 1 manually before importing torch') + + os.environ['OMP_NUM_THREADS'] = str(1) + CHECK_SYSTEM_INFO_ONCE = False def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is ModuleNotFoundError: diff --git a/data_juicer/utils/fingerprint_utils.py b/data_juicer/utils/fingerprint_utils.py index 0dfc814c0..c984ca1c4 100644 --- a/data_juicer/utils/fingerprint_utils.py +++ b/data_juicer/utils/fingerprint_utils.py @@ -2,7 +2,7 @@ import dill import xxhash -from datasets.fingerprint import (_CACHING_ENABLED, +from datasets.fingerprint import (_CACHING_ENABLED, fingerprint_warnings, format_kwargs_for_fingerprint, format_transform_for_fingerprint, generate_random_fingerprint, @@ -54,7 +54,7 @@ def update_fingerprint(fingerprint, transform, transform_args): """ Combining various objects to update the fingerprint. """ - global fingerprint_warnings + hasher = Hasher() hasher.update(fingerprint) try: