diff --git a/configs/datasets/ondisk_json.yaml b/configs/datasets/local_json.yaml
similarity index 83%
rename from configs/datasets/ondisk_json.yaml
rename to configs/datasets/local_json.yaml
index a01e3b5a1..791ec9f32 100644
--- a/configs/datasets/ondisk_json.yaml
+++ b/configs/datasets/local_json.yaml
@@ -2,5 +2,5 @@ project_name: 'dataset-ondisk-json'
 dataset:
   configs:
-    - type: 'ondisk'
+    - type: 'local'
       path: 'path/to/json/file'
 
diff --git a/configs/datasets/ondisk_parquet.yaml b/configs/datasets/local_parquet.yaml
similarity index 84%
rename from configs/datasets/ondisk_parquet.yaml
rename to configs/datasets/local_parquet.yaml
index e0f2fb144..bfded66f8 100644
--- a/configs/datasets/ondisk_parquet.yaml
+++ b/configs/datasets/local_parquet.yaml
@@ -2,5 +2,5 @@ project_name: 'dataset-ondisk-parquet'
 dataset:
   configs:
-    - type: 'ondisk'
+    - type: 'local'
       path: 'path/to/parquet/file'
 
diff --git a/configs/datasets/mixture.yaml b/configs/datasets/mixture.yaml
index da2f077e8..14999ca1a 100644
--- a/configs/datasets/mixture.yaml
+++ b/configs/datasets/mixture.yaml
@@ -2,9 +2,9 @@ project_name: 'dataset-mixture'
 dataset:
   max_sample_num: 10000
   configs:
-    - type: 'ondisk'
+    - type: 'local'
       weight: 1.0
       path: 'path/to/json/file'
-    - type: 'ondisk'
+    - type: 'local'
       weight: 1.0
       path: 'path/to/csv/file'
diff --git a/configs/datasets/validation.yaml b/configs/datasets/validation.yaml
index 77947e48d..10aa138b2 100644
--- a/configs/datasets/validation.yaml
+++ b/configs/datasets/validation.yaml
@@ -1,6 +1,6 @@
 dataset:
   configs:
-    - type: ondisk
+    - type: local
       path: path/to/data.json
 
 validators:
diff --git a/data_juicer/core/analyzer.py b/data_juicer/core/analyzer.py
index ef1fabb34..8b4d0cdcb 100644
--- a/data_juicer/core/analyzer.py
+++ b/data_juicer/core/analyzer.py
@@ -46,7 +46,7 @@ def __init__(self, cfg: Optional[Namespace] = None):
 
         # setup dataset builder
         logger.info('Setting up dataset builder...')
-        self.dataset_builder = DatasetBuilder(cfg, executor_type='local')
+        self.dataset_builder = DatasetBuilder(cfg, executor_type='default')
 
         # prepare exporter and check export path suffix
         # NOTICE: no need to export dataset texts for analyzer
@@ -86,7 +86,7 @@ def run(self,
         load_data_np = self.cfg.np
         if dataset is None:
-            logger.info('Loading dataset from data formatter...')
-            dataset = self.formatter.load_dataset(load_data_np, self.cfg)
+            logger.info('Loading dataset with dataset builder...')
+            dataset = self.dataset_builder.load_dataset(num_proc=load_data_np)
         else:
             logger.info(f'Using existing dataset {dataset}')
         if self.cfg.auto:
diff --git a/data_juicer/core/data/dataset_builder.py b/data_juicer/core/data/dataset_builder.py
index 3c25f1257..0203554f2 100644
--- a/data_juicer/core/data/dataset_builder.py
+++ b/data_juicer/core/data/dataset_builder.py
@@ -21,7 +21,7 @@ class DatasetBuilder(object):
     DatasetBuilder is a class that builds a dataset from a configuration.
""" - def __init__(self, cfg: Namespace, executor_type: str = 'local'): + def __init__(self, cfg: Namespace, executor_type: str = 'default'): # if generated_dataset_config present, prioritize if hasattr( cfg, @@ -133,11 +133,12 @@ def load_dataset(self, **kwargs) -> Union[NestedDataset, RayDataset]: _datasets.append(dataset) # handle data mixture - if self.executor_type == 'local': + if self.executor_type == 'default': return NestedDataset(concatenate_datasets(_datasets)) elif self.executor_type == 'ray': # TODO: support multiple datasets and mixing for ray - assert len(_datasets) == 1, 'Ray setup supports one dataset now' + assert len( + _datasets) == 1, 'Ray setup only supports one dataset now' return _datasets[0] @classmethod @@ -177,7 +178,7 @@ def rewrite_cli_datapath(dataset_path, max_sample_num=None) -> List: for p, w in zip(paths, weights): if os.path.isdir(p) or os.path.isfile(p): # local files - ret['configs'].append({'type': 'ondisk', 'path': p, 'weight': w}) + ret['configs'].append({'type': 'local', 'path': p, 'weight': w}) elif (not is_absolute_path(p) and not p.startswith('.') and p.count('/') <= 1): # remote huggingface diff --git a/data_juicer/core/data/load_strategy.py b/data_juicer/core/data/load_strategy.py index aebf7d6b9..9473356ac 100644 --- a/data_juicer/core/data/load_strategy.py +++ b/data_juicer/core/data/load_strategy.py @@ -123,8 +123,8 @@ def register(cls, executor_type: str, data_type: str, data_source: str): """ Decorator for registering data load strategies with wildcard support - :param executor_type: Type of executor (e.g., 'local', 'ray') - :param data_type: Type of data (e.g., 'ondisk', 'remote') + :param executor_type: Type of executor (e.g., 'default', 'ray') + :param data_type: Type of data (e.g., 'local', 'remote') :param data_source: Specific data source (e.g., 'arxiv', 's3') :return: Decorator function """ @@ -153,7 +153,7 @@ def load_data(self, **kwargs) -> RayDataset: pass -class LocalDataLoadStrategy(DataLoadStrategy): +class DefaultDataLoadStrategy(DataLoadStrategy): """ abstract class for data load strategy for LocalExecutor """ @@ -176,8 +176,8 @@ def load_data(self, **kwargs) -> DJDataset: # pass -@DataLoadStrategyRegistry.register('ray', 'ondisk', '*') -class RayOndiskJsonDataLoadStrategy(RayDataLoadStrategy): +@DataLoadStrategyRegistry.register('ray', 'local', '*') +class RayLocalJsonDataLoadStrategy(RayDataLoadStrategy): # TODO ray defaults to json @@ -212,8 +212,8 @@ def load_data(self, **kwargs): 'Huggingface data load strategy is not implemented') -@DataLoadStrategyRegistry.register('local', 'ondisk', '*') -class LocalOndiskDataLoadStrategy(LocalDataLoadStrategy): +@DataLoadStrategyRegistry.register('default', 'local', '*') +class DefaultLocalDataLoadStrategy(DefaultDataLoadStrategy): """ data load strategy for on disk data for LocalExecutor rely on AutoFormatter for actual data loading @@ -239,8 +239,8 @@ def load_data(self, **kwargs): return formatter.load_dataset() -@DataLoadStrategyRegistry.register('local', 'remote', 'huggingface') -class LocalHuggingfaceDataLoadStrategy(LocalDataLoadStrategy): +@DataLoadStrategyRegistry.register('default', 'remote', 'huggingface') +class DefaultHuggingfaceDataLoadStrategy(DefaultDataLoadStrategy): """ data load strategy for Huggingface dataset for LocalExecutor """ @@ -268,19 +268,19 @@ def load_data(self, **kwargs): global_cfg=self.cfg) -@DataLoadStrategyRegistry.register('local', 'remote', 'modelscope') -class LocalModelScopeDataLoadStrategy(LocalDataLoadStrategy): 
+@DataLoadStrategyRegistry.register('default', 'remote', 'modelscope') +class DefaultModelScopeDataLoadStrategy(DefaultDataLoadStrategy): """ data load strategy for ModelScope dataset for LocalExecutor """ - def load_data(self): + def load_data(self, **kwargs): raise NotImplementedError( 'ModelScope data load strategy is not implemented') -@DataLoadStrategyRegistry.register('local', 'remote', 'arxiv') -class LocalArxivDataLoadStrategy(LocalDataLoadStrategy): +@DataLoadStrategyRegistry.register('default', 'remote', 'arxiv') +class DefaultArxivDataLoadStrategy(DefaultDataLoadStrategy): """ data load strategy for arxiv dataset for LocalExecutor """ @@ -293,13 +293,13 @@ class LocalArxivDataLoadStrategy(LocalDataLoadStrategy): 'custom_validators': {} } - def load_data(self): + def load_data(self, **kwargs): raise NotImplementedError( 'Arxiv data load strategy is not implemented') -@DataLoadStrategyRegistry.register('local', 'remote', 'wiki') -class LocalWikiDataLoadStrategy(LocalDataLoadStrategy): +@DataLoadStrategyRegistry.register('default', 'remote', 'wiki') +class DefaultWikiDataLoadStrategy(DefaultDataLoadStrategy): """ data load strategy for wiki dataset for LocalExecutor """ @@ -312,12 +312,12 @@ class LocalWikiDataLoadStrategy(LocalDataLoadStrategy): 'custom_validators': {} } - def load_data(self): + def load_data(self, **kwargs): raise NotImplementedError('Wiki data load strategy is not implemented') -@DataLoadStrategyRegistry.register('local', 'remote', 'commoncrawl') -class LocalCommonCrawlDataLoadStrategy(LocalDataLoadStrategy): +@DataLoadStrategyRegistry.register('default', 'remote', 'commoncrawl') +class DefaultCommonCrawlDataLoadStrategy(DefaultDataLoadStrategy): """ data load strategy for commoncrawl dataset for LocalExecutor """ @@ -336,6 +336,6 @@ class LocalCommonCrawlDataLoadStrategy(LocalDataLoadStrategy): } } - def load_data(self): + def load_data(self, **kwargs): raise NotImplementedError( 'CommonCrawl data load strategy is not implemented') diff --git a/data_juicer/core/executor/__init__.py b/data_juicer/core/executor/__init__.py index 58402bf9b..75ed7676a 100644 --- a/data_juicer/core/executor/__init__.py +++ b/data_juicer/core/executor/__init__.py @@ -1,6 +1,6 @@ from .base import ExecutorBase +from .default_executor import Executor from .factory import ExecutorFactory -from .local_executor import Executor from .ray_executor import RayExecutor __all__ = ['ExecutorBase' diff --git a/data_juicer/core/executor/local_executor.py b/data_juicer/core/executor/default_executor.py similarity index 100% rename from data_juicer/core/executor/local_executor.py rename to data_juicer/core/executor/default_executor.py diff --git a/data_juicer/core/executor/factory.py b/data_juicer/core/executor/factory.py index a97e49291..4ca162350 100644 --- a/data_juicer/core/executor/factory.py +++ b/data_juicer/core/executor/factory.py @@ -1,6 +1,6 @@ from typing import Union -from .local_executor import Executor +from .default_executor import Executor from .ray_executor import RayExecutor diff --git a/tests/core/data/test_config.yaml b/tests/core/data/test_config.yaml index 65db30be3..3a1679585 100644 --- a/tests/core/data/test_config.yaml +++ b/tests/core/data/test_config.yaml @@ -1,5 +1,5 @@ -project_name: 'dataset-ondisk-json' +project_name: 'dataset-local-json' dataset: configs: - - type: 'ondisk' + - type: 'local' path: 'sample.json' \ No newline at end of file diff --git a/tests/core/data/test_config_list.yaml b/tests/core/data/test_config_list.yaml index 15b52b036..d32964672 100644 --- 
+++ b/tests/core/data/test_config_list.yaml
@@ -1,7 +1,7 @@
-project_name: 'dataset-ondisk-list'
+project_name: 'dataset-local-list'
 dataset:
   configs:
-    - type: 'ondisk'
+    - type: 'local'
       path: 'sample.json'
-    - type: 'ondisk'
+    - type: 'local'
       path: 'sample.txt'
\ No newline at end of file
diff --git a/tests/core/data/test_config_ray.yaml b/tests/core/data/test_config_ray.yaml
index e394f0b26..ff3220c15 100644
--- a/tests/core/data/test_config_ray.yaml
+++ b/tests/core/data/test_config_ray.yaml
@@ -3,7 +3,7 @@ project_name: 'ray-demo-new-config'
 
 dataset:
   configs:
-    - type: ondisk
+    - type: local
       path: ./demos/process_on_ray/data/demo-dataset.jsonl  # path to your dataset directory or file
       weight: 1.0
 
diff --git a/tests/core/test_dataload_strategy.py b/tests/core/test_dataload_strategy.py
index 773b06eae..a9a8f7087 100644
--- a/tests/core/test_dataload_strategy.py
+++ b/tests/core/test_dataload_strategy.py
@@ -14,27 +14,27 @@ def setUp(self):
 
     def test_exact_match(self):
         # Register a specific strategy
-        @DataLoadStrategyRegistry.register("local", 'ondisk', 'json')
+        @DataLoadStrategyRegistry.register("default", 'local', 'json')
         class TestStrategy(MockStrategy):
             pass
 
         # Test exact match
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'ondisk', 'json')
+            "default", 'local', 'json')
         self.assertEqual(strategy, TestStrategy)
 
         # Test no match
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'ondisk', 'csv')
+            "default", 'local', 'csv')
         self.assertIsNone(strategy)
 
     def test_wildcard_matching(self):
         # Register strategies with different wildcard patterns
-        @DataLoadStrategyRegistry.register("local", 'ondisk', '*')
+        @DataLoadStrategyRegistry.register("default", 'local', '*')
         class AllFilesStrategy(MockStrategy):
             pass
 
-        @DataLoadStrategyRegistry.register("local", '*', '*')
+        @DataLoadStrategyRegistry.register("default", '*', '*')
         class AllLocalStrategy(MockStrategy):
             pass
 
@@ -44,11 +44,11 @@ class FallbackStrategy(MockStrategy):
 
         # Test specific matches
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'ondisk', 'json')
+            "default", 'local', 'json')
         self.assertEqual(strategy, AllFilesStrategy)  # Should match most specific wildcard
 
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'remote', 'json')
+            "default", 'remote', 'json')
         self.assertEqual(strategy, AllLocalStrategy)  # Should match second level wildcard
 
         strategy = DataLoadStrategyRegistry.get_strategy_class(
@@ -60,29 +60,29 @@ def test_specificity_priority(self):
         class GeneralStrategy(MockStrategy):
             pass
 
-        @DataLoadStrategyRegistry.register("local", '*', '*')
+        @DataLoadStrategyRegistry.register("default", '*', '*')
         class LocalStrategy(MockStrategy):
             pass
 
-        @DataLoadStrategyRegistry.register("local", 'ondisk', '*')
+        @DataLoadStrategyRegistry.register("default", 'local', '*')
         class LocalOndiskStrategy(MockStrategy):
             pass
 
-        @DataLoadStrategyRegistry.register("local", 'ondisk', 'json')
+        @DataLoadStrategyRegistry.register("default", 'local', 'json')
         class ExactStrategy(MockStrategy):
             pass
 
         # Test matching priority
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'ondisk', 'json')
+            "default", 'local', 'json')
         self.assertEqual(strategy, ExactStrategy)  # Should match exact first
 
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'ondisk', 'csv')
+            "default", 'local', 'csv')
         self.assertEqual(strategy, LocalOndiskStrategy)  # Should match one wildcard
 
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'remote', 'json')
+            "default", 'remote', 'json')
         self.assertEqual(strategy, LocalStrategy)  # Should match two wildcards
 
         strategy = DataLoadStrategyRegistry.get_strategy_class(
@@ -91,41 +91,41 @@ class ExactStrategy(MockStrategy):
 
     def test_pattern_matching(self):
         @DataLoadStrategyRegistry.register(
-            "local", 'ondisk', '*.json')
+            "default", 'local', '*.json')
         class JsonStrategy(MockStrategy):
             pass
 
         @DataLoadStrategyRegistry.register(
-            "local", 'ondisk', 'data_[0-9]*')
+            "default", 'local', 'data_[0-9]*')
         class NumberedDataStrategy(MockStrategy):
             pass
 
         # Test pattern matching
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'ondisk', 'test.json')
+            "default", 'local', 'test.json')
         self.assertEqual(strategy, JsonStrategy)
 
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'ondisk', 'data_123')
+            "default", 'local', 'data_123')
         self.assertEqual(strategy, NumberedDataStrategy)
 
         strategy = DataLoadStrategyRegistry.get_strategy_class(
-            "local", 'ondisk', 'test.csv')
+            "default", 'local', 'test.csv')
         self.assertIsNone(strategy)
 
     def test_strategy_key_matches(self):
         # Test StrategyKey matching directly
-        wildcard_key = StrategyKey("*", 'ondisk', '*.json')
-        specific_key = StrategyKey("local", 'ondisk', 'test.json')
+        wildcard_key = StrategyKey("*", 'local', '*.json')
+        specific_key = StrategyKey("default", 'local', 'test.json')
 
         # Exact keys don't match wildcards
         self.assertTrue(wildcard_key.matches(specific_key))
         self.assertFalse(specific_key.matches(wildcard_key))
 
         # Test pattern matching
-        pattern_key = StrategyKey("local", '*', 'data_[0-9]*')
-        match_key = StrategyKey("local", 'ondisk', 'data_123')
-        no_match_key = StrategyKey("local", 'ondisk', 'data_abc')
+        pattern_key = StrategyKey("default", '*', 'data_[0-9]*')
+        match_key = StrategyKey("default", 'local', 'data_123')
+        no_match_key = StrategyKey("default", 'local', 'data_abc')
 
         self.assertTrue(pattern_key.matches(match_key))
         self.assertFalse(pattern_key.matches(no_match_key))
diff --git a/tests/core/test_dataset_builder.py b/tests/core/test_dataset_builder.py
index 7ac101d16..c0964fe55 100644
--- a/tests/core/test_dataset_builder.py
+++ b/tests/core/test_dataset_builder.py
@@ -11,7 +11,7 @@
 from data_juicer.core.data.config_validator import ConfigValidationError
 from data_juicer.utils.unittest_utils import (DataJuicerTestCaseBase,
                                               SKIPPED_TESTS)
-from data_juicer.core.data.load_strategy import RayOndiskJsonDataLoadStrategy
+from data_juicer.core.data.load_strategy import RayLocalJsonDataLoadStrategy
 
 WORK_DIR = os.path.dirname(os.path.realpath(__file__))
 
@@ -24,7 +24,7 @@ def setUp(self):
         """Setup basic configuration for tests"""
         self.base_cfg = Namespace()
         self.base_cfg.dataset_path = None
-        self.executor_type = 'local'
+        self.executor_type = 'default'
 
         # Get the directory where this test file is located
         test_file_dir = os.path.dirname(os.path.abspath(__file__))
@@ -36,7 +36,7 @@ def test_rewrite_cli_datapath_local_single_file(self):
         ans = rewrite_cli_datapath(dataset_path)
         self.assertEqual(
             {'configs': [
-                {'path': dataset_path, 'type': 'ondisk', 'weight': 1.0}]},
+                {'path': dataset_path, 'type': 'local', 'weight': 1.0}]},
             ans)
 
     def test_rewrite_cli_datapath_local_directory(self):
@@ -44,7 +44,7 @@ def test_rewrite_cli_datapath_local_directory(self):
         ans = rewrite_cli_datapath(dataset_path)
         self.assertEqual(
             {'configs': [
-                {'path': dataset_path, 'type': 'ondisk', 'weight': 1.0}]},
+                {'path': dataset_path, 'type': 'local', 'weight': 1.0}]},
             ans)
 
     def test_rewrite_cli_datapath_hf(self):
@@ -67,8 +67,8 @@ def test_rewrite_cli_datapath_with_weights(self):
         ans = rewrite_cli_datapath(dataset_path)
         self.assertEqual(
             {'configs': [
-                {'path': './data/sample.json', 'type': 'ondisk', 'weight': 0.5},
-                {'path': './data/sample.txt', 'type': 'ondisk', 'weight': 1.0}]},
+                {'path': './data/sample.json', 'type': 'local', 'weight': 0.5},
+                {'path': './data/sample.txt', 'type': 'local', 'weight': 1.0}]},
             ans)
 
     @patch('os.path.isdir')
@@ -81,9 +81,9 @@ def test_rewrite_cli_datapath_local_files(self, mock_isfile, mock_isdir):
         dataset_path = "1.0 ds1.jsonl 2.0 ds2_dir 3.0 ds3.jsonl"
         expected = {
             'configs': [
-                {'type': 'ondisk', 'path': 'ds1.jsonl', 'weight': 1.0},
-                {'type': 'ondisk', 'path': 'ds2_dir', 'weight': 2.0},
-                {'type': 'ondisk', 'path': 'ds3.jsonl', 'weight': 3.0}
+                {'type': 'local', 'path': 'ds1.jsonl', 'weight': 1.0},
+                {'type': 'local', 'path': 'ds2_dir', 'weight': 2.0},
+                {'type': 'local', 'path': 'ds3.jsonl', 'weight': 3.0}
             ]
         }
         result = rewrite_cli_datapath(dataset_path)
@@ -113,7 +113,7 @@ def test_rewrite_cli_datapath_with_max_samples(self):
 
         expected = {
             'configs': [{
-                'type': 'ondisk',
+                'type': 'local',
                 'path': './data/sample.txt',
                 'weight': 1.0
             }],
@@ -129,7 +129,7 @@ def test_rewrite_cli_datapath_without_max_samples(self):
 
         expected = {
             'configs': [{
-                'type': 'ondisk',
+                'type': 'local',
                 'path': './data/sample.txt',
                 'weight': 1.0
             }]
@@ -255,7 +255,7 @@ def test_builder_single_dataset_config(self):
         self.base_cfg.dataset = {
             'configs': [
                 {
-                    'type': 'ondisk',
+                    'type': 'local',
                     'path': 'test.jsonl'
                 }
             ]
@@ -269,7 +269,7 @@ def test_builder_single_dataset_config(self):
 
         # Verify config content preserved
         strategy = builder.load_strategies[0]
-        self.assertEqual(strategy.ds_config['type'], 'ondisk')
+        self.assertEqual(strategy.ds_config['type'], 'local')
         self.assertEqual(strategy.ds_config['path'], 'test.jsonl')
 
     def test_builder_multiple_dataset_config(self):
@@ -278,11 +278,11 @@ def test_builder_multiple_dataset_config(self):
         self.base_cfg.dataset = {
             'configs': [
                 {
-                    'type': 'ondisk',
+                    'type': 'local',
                     'path': 'test1.jsonl'
                 },
                 {
-                    'type': 'ondisk',
+                    'type': 'local',
                     'path': 'test2.jsonl'
                 }
             ]
@@ -311,7 +311,7 @@ def test_builder_mixed_dataset_types(self):
         self.base_cfg.dataset = {
             'configs': [
                 {
-                    'type': 'ondisk',
+                    'type': 'local',
                     'path': 'test1.jsonl'
                 },
                 {
@@ -373,9 +373,9 @@ def test_builder_ondisk_config(self):
         with redirect_stdout(out):
             cfg = init_configs(args=f'--config {test_config_file}'.split())
         self.assertIsInstance(cfg, Namespace)
-        self.assertEqual(cfg.project_name, 'dataset-ondisk-json')
+        self.assertEqual(cfg.project_name, 'dataset-local-json')
         self.assertEqual(cfg.dataset,
-                         {'configs': [{'path': 'sample.json', 'type': 'ondisk'}]})
+                         {'configs': [{'path': 'sample.json', 'type': 'local'}]})
         self.assertEqual(not cfg.dataset_path, True)
 
     def test_builder_ondisk_config_list(self):
@@ -384,11 +384,11 @@ def test_builder_ondisk_config_list(self):
         with redirect_stdout(out):
             cfg = init_configs(args=f'--config {test_config_file}'.split())
         self.assertIsInstance(cfg, Namespace)
-        self.assertEqual(cfg.project_name, 'dataset-ondisk-list')
+        self.assertEqual(cfg.project_name, 'dataset-local-list')
         self.assertEqual(cfg.dataset,
                          {'configs': [
-                             {'path': 'sample.json', 'type': 'ondisk'},
-                             {'path': 'sample.txt', 'type': 'ondisk'}
+                             {'path': 'sample.json', 'type': 'local'},
+                             {'path': 'sample.txt', 'type': 'local'}
                          ]})
         self.assertEqual(not cfg.dataset_path, True)
 
@@ -396,7 +396,7 @@ def test_builder_with_max_samples(self):
         """Test DatasetBuilder with max_sample_num"""
         self.base_cfg.dataset = {
             'configs': [{
-                'type': 'ondisk',
+                'type': 'local',
                 'path': 'test.jsonl',
                 'weight': 1.0
             }],
@@ -412,7 +412,7 @@ def test_builder_without_max_samples(self):
         """Test DatasetBuilder without max_sample_num"""
         self.base_cfg.dataset = {
             'configs': [{
-                'type': 'ondisk',
+                'type': 'local',
                 'path': 'test.jsonl',
                 'weight': 1.0
             }]
@@ -428,12 +428,12 @@ def test_mixed_dataset_configs(self):
         self.base_cfg.dataset = {
             'configs': [
                 {
-                    'type': 'ondisk',
+                    'type': 'local',
                     'path': 'test1.jsonl',
                     'weight': 1.0
                 },
                 {
-                    'type': 'ondisk',
+                    'type': 'local',
                     'path': 'test2.jsonl',
                     'weight': 2.0
                 }
@@ -461,7 +461,7 @@ def test_invalid_max_sample_num(self):
         for value in invalid_values:
             self.base_cfg.dataset = {
                 'configs': [{
-                    'type': 'ondisk',
+                    'type': 'local',
                     'path': 'test.jsonl',
                     'weight': 1.0
                 }],
@@ -489,7 +489,7 @@ def test_builder_ray_config(self):
         # Verify dataset config
         self.assertEqual(cfg.dataset, {
             'configs': [{
-                'type': 'ondisk',
+                'type': 'local',
                 'path': './demos/process_on_ray/data/demo-dataset.jsonl',
                 'weight': 1.0
             }]
@@ -498,7 +498,7 @@ def test_builder_ray_config(self):
         # Create builder and verify
         builder = DatasetBuilder(cfg, executor_type=cfg.executor_type)
         self.assertEqual(len(builder.load_strategies), 1)
-        self.assertIsInstance(builder.load_strategies[0], RayOndiskJsonDataLoadStrategy)
+        self.assertIsInstance(builder.load_strategies[0], RayLocalJsonDataLoadStrategy)
 
 if __name__ == '__main__':
     unittest.main()
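
Reviewer note: the rename is mechanical (executor type 'local' -> 'default', dataset type 'ondisk' -> 'local'), but it changes the registry keys, so any out-of-tree strategies registered under the old keys must be re-registered. Below is a minimal sketch of the new spelling, assuming this patch is applied; `DemoStrategy` and the sample path are hypothetical stand-ins, while `register`, `get_strategy_class`, and `rewrite_cli_datapath` are the entry points touched above (usage mirrors tests/core/test_dataload_strategy.py):

```python
from data_juicer.core.data.dataset_builder import rewrite_cli_datapath
from data_juicer.core.data.load_strategy import DataLoadStrategyRegistry

# The old key ('local', 'ondisk', 'json') is now spelled ('default', 'local', 'json').
@DataLoadStrategyRegistry.register('default', 'local', 'json')
class DemoStrategy:  # hypothetical; the registry just stores the class object
    pass

# Exact keys take priority over wildcard registrations such as the
# built-in ('default', 'local', '*') used by DefaultLocalDataLoadStrategy.
cls = DataLoadStrategyRegistry.get_strategy_class('default', 'local', 'json')
assert cls is DemoStrategy

# CLI data paths are rewritten to the new 'local' type as well
# (assuming ./data/sample.json actually exists on disk):
print(rewrite_cli_datapath('./data/sample.json'))
# {'configs': [{'type': 'local', 'path': './data/sample.json', 'weight': 1.0}]}
```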