
Commit

Merge branch 'milo/harbor-checkpointer' of github.com:mosaicml/llm-foundry into milo/harbor-checkpointer
milocress committed Jan 24, 2025
2 parents c857319 + f81d87f commit ec56132
Showing 20 changed files with 127 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
additional_dependencies:
- toml
- repo: https://github.com/hadialqattan/pycln
- rev: v2.1.2
+ rev: v2.5.0
hooks:
- id: pycln
args: [. --all]
2 changes: 1 addition & 1 deletion llmfoundry/_version.py
@@ -3,4 +3,4 @@

"""The LLM Foundry Version."""

- __version__ = '0.16.0.dev0'
+ __version__ = '0.17.0.dev0'
10 changes: 5 additions & 5 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -619,9 +619,7 @@ def tensor_hook(

hooks = []
for _, module in state_dict_model.named_modules():
- hooks.append(
-     module._register_state_dict_hook(tensor_hook),
- )
+ hooks.append(module._register_state_dict_hook(tensor_hook),)

state_dict = get_model_state_dict(
state_dict_model,
@@ -840,8 +838,10 @@ def _save_checkpoint(
assert new_model_instance is not None
if register_to_mlflow:
self._register_hf_model(
- temp_save_dir, original_tokenizer, use_temp_dir,
- new_model_instance
+ temp_save_dir,
+ original_tokenizer,
+ use_temp_dir,
+ new_model_instance,
)
else:
# Clean up the temporary directory if we don't need to register to mlflow.
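
Note on the hook pattern in this file: _register_state_dict_hook is a private PyTorch nn.Module API that runs a callback as each module contributes to the state dict and returns a removable handle. A minimal standalone sketch of the pattern, assuming PyTorch's hook signature (module, state_dict, prefix, local_metadata); the tensor transformation here is illustrative, not the actual hf_checkpointer logic:

import torch
import torch.nn as nn

def tensor_hook(module, state_dict, prefix, local_metadata):
    # Illustrative transformation: detach tensors and move them to CPU
    # as the state dict is assembled.
    for key, value in state_dict.items():
        if isinstance(value, torch.Tensor):
            state_dict[key] = value.detach().cpu()
    return state_dict  # returning a dict replaces the state dict

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
hooks = [
    module._register_state_dict_hook(tensor_hook)
    for _, module in model.named_modules()
]
state_dict = model.state_dict()  # hooks fire during collection
for hook in hooks:
    hook.remove()  # remove handles so later saves are unaffected

Collecting the handles into a list, as the diff above does, keeps them removable once the checkpoint state dict has been gathered.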
8 changes: 8 additions & 0 deletions llmfoundry/command_utils/data_prep/convert_delta_to_json.py
@@ -730,6 +730,14 @@ def fetch_DT(
message=
f'The data preparation cluster you provided is not usable. Please retry with a cluster that is healthy and alive. {e}',
) from e
+ if isinstance(
+     e,
+     spark_errors.SparkConnectGrpcException,
+ ) and 'do not have permission to attach to cluster' in str(e):
+     raise FaultyDataPrepCluster(
+         message=
+         f'You do not have permission to attach to the data preparation cluster you provided. {e}',
+     ) from e
if isinstance(e, grpc.RpcError) and e.code(
) == grpc.StatusCode.INTERNAL and 'Job aborted due to stage failure' in e.details(
):
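
The new branch follows the same translate-and-re-raise pattern as the surrounding handlers: match a known failure signature on the caught exception, then raise FaultyDataPrepCluster with an actionable message while chaining the original error. A condensed, self-contained sketch of the pattern (the exception class is redefined here for illustration; llm-foundry's own, imported in the tests below, lives in llmfoundry.utils.exceptions):

class FaultyDataPrepCluster(Exception):
    """Stand-in for llm-foundry's user-facing data-prep cluster error."""

def translate_cluster_error(e: Exception) -> None:
    # Spark Connect surfaces permission failures as generic gRPC
    # exceptions, so the handler keys off the message text.
    if 'do not have permission to attach to cluster' in str(e):
        raise FaultyDataPrepCluster(
            'You do not have permission to attach to the data '
            f'preparation cluster you provided. {e}',
        ) from e
    raise e  # unrecognized errors propagate unchanged

Chaining with "from e" preserves the original traceback, so the underlying gRPC detail stays visible in logs while users see the clearer message first.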
35 changes: 35 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
@@ -601,6 +601,9 @@ class StreamingFinetuningDataset(StreamingDataset):
replication (int, optional): Determines how many consecutive devices will receive the same
samples. Useful for training with tensor or sequence parallelism, where multiple
devices need to see the same partition of the dataset. Defaults to ``None``.
+ stream_name (str): The name of the Stream to use, as registered in
+     ``streaming.base.stream.streams_registry``. Defaults to ``stream``.
+ stream_config (dict[str, Any], optional): Additional arguments to pass to the Stream constructor.
"""

def __init__(
@@ -632,6 +635,8 @@ def __init__(
allow_unsafe_types: bool = False,
replication: Optional[int] = None,
packing_ratio: Optional[float] = None,
+ stream_name: str = 'stream',
+ stream_config: Optional[dict[str, Any]] = None,
**kwargs: Any,
):

@@ -675,6 +680,8 @@ def __init__(
batching_method=batching_method,
allow_unsafe_types=allow_unsafe_types,
replication=replication,
+ stream_name=stream_name,
+ stream_config=stream_config,
**kwargs,
)

@@ -1161,3 +1168,31 @@ def shareGPT_format_preprocessor(inp: dict) -> ChatFormattedDict:
except Exception as e:
raise UnableToProcessPromptResponseError(inp) from e
return {'messages': messages}


+ @dataset_constructor.register('math-ai/StackMathQA')
+ def QA_format_preprocessor(inp: dict) -> ChatFormattedDict:
+     """Convert from QA format to our chat format."""
+     try:
+         Q = inp['Q']
+         A = inp['A']
+         messages: list[dict[str, str]] = [{
+             'role': 'user',
+             'content': Q,
+         }, {
+             'role': 'assistant',
+             'content': A,
+         }]
+     except Exception as e:
+         raise UnableToProcessPromptResponseError(inp) from e
+     return {'messages': messages}


+ @dataset_constructor.register('AI-MO/NuminaMath-CoT')
+ def messages_format_preprocessor(inp: dict) -> ChatFormattedDict:
+     """Convert from messages format to our chat format."""
+     try:
+         messages = inp['messages']
+     except Exception as e:
+         raise UnableToProcessPromptResponseError(inp) from e
+     return {'messages': messages}
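
The registry key ties each preprocessor to a Hugging Face dataset ID, so finetuning runs that point at math-ai/StackMathQA or AI-MO/NuminaMath-CoT pick up the matching converter automatically. The functions are also plain callables; a small usage sketch with an invented sample row:

from llmfoundry.data.finetuning.tasks import QA_format_preprocessor

sample = {'Q': 'What is 2 + 2?', 'A': '4'}  # hypothetical StackMathQA-style row
print(QA_format_preprocessor(sample))
# {'messages': [{'role': 'user', 'content': 'What is 2 + 2?'},
#               {'role': 'assistant', 'content': '4'}]}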
7 changes: 7 additions & 0 deletions llmfoundry/data/text_data.py
@@ -105,6 +105,9 @@ class StreamingTextDataset(StreamingDataset):
replication (int, optional): Determines how many consecutive devices will receive the same
samples. Useful for training with tensor or sequence parallelism, where multiple
devices need to see the same partition of the dataset. Defaults to ``None``.
+ stream_name (str): The name of the Stream to use, as registered in
+     ``streaming.base.stream.streams_registry``. Defaults to ``stream``.
+ stream_config (dict[str, Any], optional): Additional arguments to pass to the Stream constructor.
"""

def __init__(
@@ -135,6 +138,8 @@ def __init__(
batching_method: str = 'random',
allow_unsafe_types: bool = False,
replication: Optional[int] = None,
+ stream_name: str = 'stream',
+ stream_config: Optional[dict[str, Any]] = None,
**kwargs: Any,
):

@@ -183,6 +188,8 @@ def __init__(
batching_method=batching_method,
allow_unsafe_types=allow_unsafe_types,
replication=replication,
+ stream_name=stream_name,
+ stream_config=stream_config,
**kwargs,
)
self.tokenizer = tokenizer
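
Together with the identical change in StreamingFinetuningDataset above, these two keyword arguments let callers select a custom Stream implementation registered in streaming's streams_registry and forward extra constructor arguments to it. A construction sketch; the tokenizer, paths, and stream_config contents are illustrative assumptions, and mosaicml-streaming's docs cover how to register a custom Stream under a new name:

from transformers import AutoTokenizer

from llmfoundry.data.text_data import StreamingTextDataset

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # any HF tokenizer

dataset = StreamingTextDataset(
    tokenizer=tokenizer,
    max_seq_len=2048,
    remote='s3://my-bucket/my-text-dataset',  # illustrative remote path
    local='/tmp/my-text-dataset',  # illustrative local cache dir
    stream_name='stream',  # the default; a custom name selects a custom Stream
    stream_config={'download_retry': 3},  # forwarded to the Stream constructor
)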
2 changes: 1 addition & 1 deletion mcli/mcli-1b-eval.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-1b-max-seq-len-8k.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-1b.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-benchmark-mpt.yaml
@@ -11,7 +11,7 @@ image: mosaicml/llm-foundry:2.5.1_cu124-latest
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]

2 changes: 1 addition & 1 deletion mcli/mcli-convert-composer-to-hf.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-hf-generate.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-llama2-finetune.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-llama3-70b-instruct-finetune.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-openai-eval.yaml
@@ -1,7 +1,7 @@
integrations:
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .[gpu,openai]
ssh_clone: false # Should be true if using a private repo
2 changes: 1 addition & 1 deletion mcli/mcli-pretokenize-oci-upload.yaml
@@ -14,7 +14,7 @@ integrations:
- oci-cli==3.23.2
- integration_type: git_repo
git_repo: mosaicml/llm-foundry
- git_branch: v0.15.1
+ git_branch: v0.16.0
# git_commit: # OR use your commit hash
pip_install: .
ssh_clone: false # Should be true if using a private repo
6 changes: 3 additions & 3 deletions setup.py
@@ -56,9 +56,9 @@
'mlflow>=2.14.1,<2.19',
'accelerate>=0.25,<1.2', # for HF inference `device_map`
'transformers>=4.43.2,<4.47',
- 'mosaicml-streaming>=0.10.0,<0.11',
+ 'mosaicml-streaming>=0.11.0,<0.12',
'torch>=2.5.1,<2.5.2',
- 'datasets>=2.20.0,<3.2',
+ 'datasets>=2.20.0,<3.3',
'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data
'sentencepiece==0.2.0',
'einops==0.8.0',
@@ -79,7 +79,7 @@
extra_deps = {}

extra_deps['dev'] = [
- 'coverage[toml]==7.6.4',
+ 'coverage[toml]==7.6.10',
'pre-commit>=3.4.0,<4',
'pytest>=7.2.1,<9',
'pytest_codeblocks>=0.16.1,<0.18',
9 changes: 9 additions & 0 deletions tests/a_scripts/data_prep/test_convert_delta_to_json.py
@@ -625,6 +625,15 @@ def test_fetch_DT_catches_grpc_errors(
'The data preparation cluster you provided is not usable. Please retry with a cluster that is healthy and alive.',
],
),
+ (
+     SparkConnectGrpcException(
+         'do not have permission to attach to cluster etc...',
+     ),
+     FaultyDataPrepCluster,
+     [
+         'You do not have permission to attach to the data preparation cluster you provided.',
+     ],
+ ),
(
grpc_lib_error,
FaultyDataPrepCluster,
47 changes: 47 additions & 0 deletions tests/data/test_dataset.py
@@ -8,8 +8,10 @@
import pytest

from llmfoundry.data.finetuning.tasks import (
+ QA_format_preprocessor,
_get_num_processes,
dataset_constructor,
+ messages_format_preprocessor,
)
from llmfoundry.utils.exceptions import DatasetTooSmallError

@@ -60,3 +62,48 @@ def get_local_world_size(self):
new=MockDataset,
):
dataset_constructor.build_from_streaming()


+ def test_QA_format_preprocessor():
+     inp = {
+         'Q': 'What is the capital of France?',
+         'A': 'Paris',
+         'meta': {
+             'a': 'b',
+         },
+     }

+     expected_messages = [{
+         'role': 'user',
+         'content': 'What is the capital of France?',
+     }, {
+         'role': 'assistant',
+         'content': 'Paris',
+     }]
+     output = QA_format_preprocessor(inp)
+     assert len(output) == 1
+     assert 'messages' in output
+     for i, message in enumerate(output['messages']):
+         expected_message = expected_messages[i]
+         for k, v in message.items():
+             assert k in expected_message
+             assert v == expected_message[k]


+ def test_messages_format_preprocessor():
+     messages = [{
+         'role': 'user',
+         'content': 'What is the capital of France?',
+     }, {
+         'role': 'assistant',
+         'content': 'Paris',
+     }]
+     inp = {
+         'messages': messages,
+         'other_key': 'other_value',
+     }

+     output = messages_format_preprocessor(inp)
+     assert len(output) == 1
+     assert 'messages' in output
+     assert output['messages'] == messages
