Skip to content

Commit

Permalink
fix: 1. change relpath to abspath of dataset (#137)
Browse files Browse the repository at this point in the history
* fix: 1. change relpath to abspath of dataset
2. torch set_num_threads in python3.8 on Mac system.

* fix: import fingerprint_warnings from datasets
  • Loading branch information
zhijianma authored Dec 15, 2023
1 parent 48c081e commit f299986
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 10 deletions.
14 changes: 7 additions & 7 deletions data_juicer/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ def init_setup_from_cfg(cfg):
:param cfg: an updated cfg
"""

cfg.export_path = os.path.abspath(cfg.export_path)
export_path = cfg.export_path
cfg.work_dir = os.path.dirname(export_path)
log_dir = os.path.join(cfg.work_dir, 'log')
Expand All @@ -325,16 +326,15 @@ def init_setup_from_cfg(cfg):

# check and get dataset dir
if os.path.exists(cfg.dataset_path):
cfg.dataset_path = os.path.abspath(cfg.dataset_path)
if os.path.isdir(cfg.dataset_path):
cfg.dataset_dir = os.path.abspath(cfg.dataset_path)
cfg.dataset_dir = cfg.dataset_path
else:
cfg.dataset_dir = os.path.abspath(os.path.dirname(
cfg.dataset_path))
cfg.dataset_dir = os.path.dirname(cfg.dataset_path)
else:
logger.error(f'Input dataset_path [{cfg.dataset_path}] is invalid. '
f'Please check and retry.')
raise ValueError(f'Input dataset_path [{cfg.dataset_path}] is '
f'invalid. Please check and retry.')
logger.warning(f'dataset_path [{cfg.dataset_path}] not found in local.'
'Please check and retry, otherwise we will treat it '
'as a remote dataset.')

# whether or not to use cache management
# disabling the cache or using checkpoint explicitly will turn off the
Expand Down
21 changes: 20 additions & 1 deletion data_juicer/utils/availability_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@


from loguru import logger

UNAVAILABLE_OPERATORS = {}
CHECK_SYSTEM_INFO_ONCE = True


class UnavailableOperator:
Expand Down Expand Up @@ -52,7 +55,23 @@ def __init__(
f'`pip install -v -e .[{self.requires_type}]`'

def __enter__(self):
pass

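# only for python3.8 on mac: torch.set_num_threads does not work there,
# so OMP_NUM_THREADS is set to 1 before torch is imported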
global CHECK_SYSTEM_INFO_ONCE
if CHECK_SYSTEM_INFO_ONCE:
import os
import platform
import sys
major, minor = sys.version_info[:2]
system = platform.system()
if major == 3 and minor == 8 and system == 'Darwin':
logger.warning(
'The torch.set_num_threads function does not '
'work in python3.8 version on Mac systems. We will set '
'OMP_NUM_THREADS to 1 manually before importing torch')

os.environ['OMP_NUM_THREADS'] = str(1)
CHECK_SYSTEM_INFO_ONCE = False

def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is ModuleNotFoundError:
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/utils/fingerprint_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import dill
import xxhash
from datasets.fingerprint import (_CACHING_ENABLED,
from datasets.fingerprint import (_CACHING_ENABLED, fingerprint_warnings,
format_kwargs_for_fingerprint,
format_transform_for_fingerprint,
generate_random_fingerprint,
Expand Down Expand Up @@ -54,7 +54,7 @@ def update_fingerprint(fingerprint, transform, transform_args):
"""
Combining various objects to update the fingerprint.
"""
global fingerprint_warnings

hasher = Hasher()
hasher.update(fingerprint)
try:
Expand Down

0 comments on commit f299986

Please sign in to comment.