
Commit

Merge branch 'milo/harbor-checkpointer' of github.com:mosaicml/llm-foundry into milo/harbor-checkpointer
milocress committed Jan 24, 2025
2 parents c857319 + f81d87f commit ec56132
Showing 20 changed files with 127 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
additional_dependencies:
- toml
- repo: https://github.com/hadialqattan/pycln
- rev: v2.1.2
+ rev: v2.5.0
hooks:
- id: pycln
args: [. --all]
2 changes: 1 addition & 1 deletion llmfoundry/_version.py
@@ -3,4 +3,4 @@

"""The LLM Foundry Version."""

- __version__ = '0.16.0.dev0'
+ __version__ = '0.17.0.dev0'
10 changes: 5 additions & 5 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -619,9 +619,7 @@ def tensor_hook(

hooks = []
for _, module in state_dict_model.named_modules():
- hooks.append(
-     module._register_state_dict_hook(tensor_hook),
- )
+ hooks.append(module._register_state_dict_hook(tensor_hook),)

state_dict = get_model_state_dict(
state_dict_model,
@@ -840,8 +838,10 @@ def _save_checkpoint(
assert new_model_instance is not None
if register_to_mlflow:
self._register_hf_model(
- temp_save_dir, original_tokenizer, use_temp_dir,
- new_model_instance
+ temp_save_dir,
+ original_tokenizer,
+ use_temp_dir,
+ new_model_instance,
)
else:
# Clean up the temporary directory if we don't need to register to mlflow.
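
Note on the hook pattern in this file: _register_state_dict_hook is a private PyTorch nn.Module API that runs a callback as each module contributes to the state dict and returns a removable handle. A minimal standalone sketch of the pattern, assuming PyTorch's hook signature (module, state_dict, prefix, local_metadata); the tensor transformation here is illustrative, not the actual hf_checkpointer logic:

import torch
import torch.nn as nn

def tensor_hook(module, state_dict, prefix, local_metadata):
    # Illustrative transformation: detach tensors and move them to CPU
    # as the state dict is assembled.
    for key, value in state_dict.items():
        if isinstance(value, torch.Tensor):
            state_dict[key] = value.detach().cpu()
    return state_dict  # returning a dict replaces the state dict

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
hooks = [
    module._register_state_dict_hook(tensor_hook)
    for _, module in model.named_modules()
]
state_dict = model.state_dict()  # hooks fire during collection
for hook in hooks:
    hook.remove()  # remove handles so later saves are unaffected

Collecting the handles into a list, as the diff above does, keeps them removable once the checkpoint state dict has been gathered.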
8 changes: 8 additions & 0 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -730,6 +730,14 @@ def fetch_DT(
message=
f'The data preparation cluster you provided is not usable. Please retry with a cluster that is healthy and alive. {e}',
) from e
+ if isinstance(
+     e,
+     spark_errors.SparkConnectGrpcException,
+ ) and 'do not have permission to attach to cluster' in str(e):
+     raise FaultyDataPrepCluster(
+         message=
+         f'You do not have permission to attach to the data preparation cluster you provided. {e}',
+     ) from e
if isinstance(e, grpc.RpcError) and e.code(
) == grpc.StatusCode.INTERNAL and 'Job aborted due to stage failure' in e.details(
):
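
The new branch follows the same translate-and-re-raise pattern as the surrounding handlers: match a known failure signature on the caught exception, then raise FaultyDataPrepCluster with an actionable message while chaining the original error. A condensed, self-contained sketch of the pattern (the exception class is redefined here for illustration; llm-foundry's own, imported in the tests below, lives in llmfoundry.utils.exceptions):

class FaultyDataPrepCluster(Exception):
    """Stand-in for llm-foundry's user-facing data-prep cluster error."""

def translate_cluster_error(e: Exception) -> None:
    # Spark Connect surfaces permission failures as generic gRPC
    # exceptions, so the handler keys off the message text.
    if 'do not have permission to attach to cluster' in str(e):
        raise FaultyDataPrepCluster(
            'You do not have permission to attach to the data '
            f'preparation cluster you provided. {e}',
        ) from e
    raise e  # unrecognized errors propagate unchanged

Chaining with "from e" preserves the original traceback, so the underlying gRPC detail stays visible in logs while users see the clearer message first.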
35 changes: 35 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
@@ -601,6 +601,9 @@ class StreamingFinetuningDataset(StreamingDataset):
replication (int, optional): Determines how many consecutive devices will receive the same
samples. Useful for training with tensor or sequence parallelism, where multiple
devices need to see the same partition of the dataset. Defaults to ``None``.
+ stream_name (str): The name of the Stream to use, as registered in
+     ``streaming.base.stream.streams_registry``. Defaults to ``stream``.
+ stream_config (dict[str, Any], optional): Additional arguments to pass to the Stream constructor.
"""

def __init__(
@@ -632,6 +635,8 @@ def __init__(
allow_unsafe_types: bool = False,
replication: Optional[int] = None,
packing_ratio: Optional[float] = None,
+ stream_name: str = 'stream',
+ stream_config: Optional[dict[str, Any]] = None,
**kwargs: Any,
):

@@ -675,6 +680,8 @@ def __init__(
batching_method=batching_method,
allow_unsafe_types=allow_unsafe_types,
replication=replication,
+ stream_name=stream_name,
+ stream_config=stream_config,
**kwargs,
)

@@ -1161,3 +1168,31 @@ def shareGPT_format_preprocessor(inp: dict) -> ChatFormattedDict:
except Exception as e:
raise UnableToProcessPromptResponseError(inp) from e
return {'messages': messages}


+ @dataset_constructor.register('math-ai/StackMathQA')
+ def QA_format_preprocessor(inp: dict) -> ChatFormattedDict:
+     """Convert from QA format to our chat format."""
+     try:
+         Q = inp['Q']
+         A = inp['A']
+         messages: list[dict[str, str]] = [{
+             'role': 'user',
+             'content': Q,
+         }, {
+             'role': 'assistant',
+             'content': A,
+         }]
+     except Exception as e:
+         raise UnableToProcessPromptResponseError(inp) from e
+     return {'messages': messages}


+ @dataset_constructor.register('AI-MO/NuminaMath-CoT')
+ def messages_format_preprocessor(inp: dict) -> ChatFormattedDict:
+     """Convert from messages format to our chat format."""
+     try:
+         messages = inp['messages']
+     except Exception as e:
+         raise UnableToProcessPromptResponseError(inp) from e
+     return {'messages': messages}
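
The registry key ties each preprocessor to a Hugging Face dataset ID, so finetuning runs that point at math-ai/StackMathQA or AI-MO/NuminaMath-CoT pick up the matching converter automatically. The functions are also plain callables; a small usage sketch with an invented sample row:

from llmfoundry.data.finetuning.tasks import QA_format_preprocessor

sample = {'Q': 'What is 2 + 2?', 'A': '4'}  # hypothetical StackMathQA-style row
print(QA_format_preprocessor(sample))
# {'messages': [{'role': 'user', 'content': 'What is 2 + 2?'},
#               {'role': 'assistant', 'content': '4'}]}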
7 changes: 7 additions & 0 deletions llmfoundry/data/text_data.py
@@ -105,6 +105,9 @@ class StreamingTextDataset(StreamingDataset):
replication (int, optional): Determines how many consecutive devices will receive the same
samples. Useful for training with tensor or sequence parallelism, where multiple
devices need to see the same partition of the dataset. Defaults to ``None``.
+ stream_name (str): The name of the Stream to use, as registered in
+     ``streaming.base.stream.streams_registry``. Defaults to ``stream``.
+ stream_config (dict[str, Any], optional): Additional arguments to pass to the Stream constructor.
"""

def __init__(
@@ -135,6 +138,8 @@ def __init__(
batching_method: str = 'random',
allow_unsafe_types: bool = False,
replication: Optional[int] = None,
+ stream_name: str = 'stream',
+ stream_config: Optional[dict[str, Any]] = None,
**kwargs: Any,
):

@@ -183,6 +188,8 @@ def __init__(
batching_method=batching_method,
allow_unsafe_types=allow_unsafe_types,
replication=replication,
+ stream_name=stream_name,
+ stream_config=stream_config,
**kwargs,
)
self.tokenizer = tokenizer
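
Together with the identical change in StreamingFinetuningDataset above, these two keyword arguments let callers select a custom Stream implementation registered in streaming's streams_registry and forward extra constructor arguments to it. A construction sketch; the tokenizer, paths, and stream_config contents are illustrative assumptions, and mosaicml-streaming's docs cover how to register a custom Stream under a new name:

from transformers import AutoTokenizer

from llmfoundry.data.text_data import StreamingTextDataset

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # any HF tokenizer

dataset = StreamingTextDataset(
    tokenizer=tokenizer,
    max_seq_len=2048,
    remote='s3://my-bucket/my-text-dataset',  # illustrative remote path
    local='/tmp/my-text-dataset',  # illustrative local cache dir
    stream_name='stream',  # the default; a custom name selects a custom Stream
    stream_config={'download_retry': 3},  # forwarded to the Stream constructor
)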
2 changes: 1 addition & 1 deletion mcli/mcli-1b-eval.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-1b-max-seq-len-8k.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-1b.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-benchmark-mpt.yaml
@@ -11,7 +11,7 @@ image: mosaicml/llm-foundry:2.5.1_cu124-latest
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]

2 changes: 1 addition & 1 deletion mcli/mcli-convert-composer-to-hf.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-hf-generate.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-llama2-finetune.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-llama3-70b-instruct-finetune.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-openai-eval.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu,openai]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-pretokenize-oci-upload.yaml
@@ -14,7 +14,7 @@ integrations:
- oci-cli==3.23.2
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .
ssh_clone: false # Should be true if using a private repo
6 changes: 3 additions & 3 deletions setup.py
@@ -56,9 +56,9 @@
'mlflow>=2.14.1,<2.19',
'accelerate>=0.25,<1.2', # for HF inference `device_map`
'transformers>=4.43.2,<4.47',
- 'mosaicml-streaming>=0.10.0,<0.11',
+ 'mosaicml-streaming>=0.11.0,<0.12',
'torch>=2.5.1,<2.5.2',
- 'datasets>=2.20.0,<3.2',
+ 'datasets>=2.20.0,<3.3',
'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data
'sentencepiece==0.2.0',
'einops==0.8.0',
@@ -79,7 +79,7 @@
extra_deps = {}

extra_deps['dev'] = [
- 'coverage[toml]==7.6.4',
+ 'coverage[toml]==7.6.10',
'pre-commit>=3.4.0,<4',
'pytest>=7.2.1,<9',
'pytest_codeblocks>=0.16.1,<0.18',
9 changes: 9 additions & 0 deletions tests/a_scripts/data_prep/test_convert_delta_to_json.py
@@ -625,6 +625,15 @@ def test_fetch_DT_catches_grpc_errors(
'The data preparation cluster you provided is not usable. Please retry with a cluster that is healthy and alive.',
],
),
+ (
+     SparkConnectGrpcException(
+         'do not have permission to attach to cluster etc...',
+     ),
+     FaultyDataPrepCluster,
+     [
+         'You do not have permission to attach to the data preparation cluster you provided.',
+     ],
+ ),
(
grpc_lib_error,
FaultyDataPrepCluster,
47 changes: 47 additions & 0 deletions tests/data/test_dataset.py
@@ -8,8 +8,10 @@
import pytest

from llmfoundry.data.finetuning.tasks import (
+ QA_format_preprocessor,
_get_num_processes,
dataset_constructor,
+ messages_format_preprocessor,
)
from llmfoundry.utils.exceptions import DatasetTooSmallError

@@ -60,3 +62,48 @@ def get_local_world_size(self):
new=MockDataset,
):
dataset_constructor.build_from_streaming()


+ def test_QA_format_preprocessor():
+     inp = {
+         'Q': 'What is the capital of France?',
+         'A': 'Paris',
+         'meta': {
+             'a': 'b',
+         },
+     }

+     expected_messages = [{
+         'role': 'user',
+         'content': 'What is the capital of France?',
+     }, {
+         'role': 'assistant',
+         'content': 'Paris',
+     }]
+     output = QA_format_preprocessor(inp)
+     assert len(output) == 1
+     assert 'messages' in output
+     for i, message in enumerate(output['messages']):
+         expected_message = expected_messages[i]
+         for k, v in message.items():
+             assert k in expected_message
+             assert v == expected_message[k]


+ def test_messages_format_preprocessor():
+     messages = [{
+         'role': 'user',
+         'content': 'What is the capital of France?',
+     }, {
+         'role': 'assistant',
+         'content': 'Paris',
+     }]
+     inp = {
+         'messages': messages,
+         'other_key': 'other_value',
+     }

+     output = messages_format_preprocessor(inp)
+     assert len(output) == 1
+     assert 'messages' in output
+     assert output['messages'] == messages
