Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
Signed-off-by: Mayank Mishra <[email protected]>
  • Loading branch information
mayank31398 committed Oct 9, 2024
2 parents 4e6f95f + 3819967 commit 5ed79f3
Show file tree
Hide file tree
Showing 161 changed files with 5,645 additions and 4,245 deletions.
158 changes: 1 addition & 157 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -1,157 +1 @@
# copied from https://github.com/microsoft/DeepSpeed

---
# Refer to the following link for the explanation of each params:
# http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
# This is deprecated
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
# Brace-wrapping sub-options (attached braces everywhere, WebKit-style)
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
  # disabling the below splits, else, they'll just add to the vertical length of source files!
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: WebKit
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 119
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IncludeBlocks: Preserve
# Ordering buckets for #include sorting (lower Priority sorts first)
IncludeCategories:
  - Regex: '^<ext/.*\.h>'
    Priority: 2
  - Regex: '^<.*\.h>'
    Priority: 1
  - Regex: '^<.*'
    Priority: 2
  - Regex: '.*'
    Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 4
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
# Per-language formatting of raw string literals (R"cc(...)cc" etc.)
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
  - Language: TextProto
    Delimiters:
      - pb
      - PB
      - proto
      - PROTO
    EnclosingFunctions:
      - EqualsProto
      - EquivToProto
      - PARSE_PARTIAL_TEXT_PROTO
      - PARSE_TEST_PROTO
      - PARSE_TEXT_PROTO
      - ParseTextOrDie
      - ParseTextProtoOrDie
    CanonicalDelimiter: ''
    # this BasedOnStyle applies only to TextProto raw strings, not the whole file
    BasedOnStyle: google
# Enabling comment reflow causes doxygen comments to be messed up in their formats!
# NOTE(review): the comment above warns against reflow, yet ReflowComments is true — confirm the intended value.
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
# Be consistent with indent-width, even for people who use tab for indentation!
TabWidth: 4
UseTab: Never
# NOTE(review): a file-level BasedOnStyle at the end conflicts with the
# "# BasedOnStyle: Google" note near the top; possibly residue from a second
# YAML document lost in extraction — confirm which base style is intended.
BasedOnStyle: LLVM
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
- name: Installation
run: |
make install-dev
git clone -b granitemoe https://github.com/mayank31398/transformers && cd transformers && pip install . && cd ..
git clone https://github.com/huggingface/transformers && cd transformers && pip install . && cd ..
- name: Unit Tests
run: make test
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ repos:
- id: isort
name: isort (python)
- repo: https://github.com/psf/black
rev: 24.4.2
rev: 24.8.0
hooks:
- id: black
args: [--line-length=119,--target-version=py311]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v17.0.3
rev: v18.1.8
hooks:
- id: clang-format
types_or: [c++, c, cuda]
Expand Down
14 changes: 11 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
install:
pip install .
pip install --extra-index-url https://download.pytorch.org/whl/nightly/cpu .

install-dev:
pip install -e .
pip install --extra-index-url https://download.pytorch.org/whl/nightly/cpu -e .
pip install -r requirements-dev.txt

git clone https://github.com/sustcsonglin/flash-linear-attention
cd flash-linear-attention
pip install .
cd ..

test:
pytest tests
RUN_SLOW=True pytest tests

test-fast:
RUN_SLOW=False pytest tests

update-precommit:
pre-commit autoupdate
Expand Down
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
<!-- Topic -->
[Efficient Training]: https://img.shields.io/static/v1?label=&message=Efficient%20Training&color=blueviolet
[Efficient Inference]: https://img.shields.io/static/v1?label=&message=Efficient%20Inference&color=blueviolet
[Learning Rate Scheduler]: https://img.shields.io/static/v1?label=&message=Learning%20Rate%20Scheduler&color=blueviolet
[Instruction Finetuning]: https://img.shields.io/static/v1?label=&message=Instruction%20Finetuning&color=blueviolet
[Mixture of Experts]: https://img.shields.io/static/v1?label=&message=Mixture%20of%20Experts&color=blueviolet
[Model Architecture]: https://img.shields.io/static/v1?label=&message=Model%20Architecture&color=blueviolet
Expand All @@ -19,15 +20,15 @@ _Mayank Mishra_
1. [Reducing Transformer Key-Value Cache Size with Cross-Layer Attention](https://arxiv.org/abs/2405.12981)
_William Brandon, Mayank Mishra, Aniruddha Nrusimha, Rameswar Panda, Jonathan Ragan Kelly_
![image][Efficient Inference] ![image][Model Architecture]
1. [Dense Training, Sparse Inference: Rethinking Training of Mixture-of-Experts Language Models](https://arxiv.org/abs/2404.05567)
_Bowen Pan, Yikang Shen, Haokun Liu, Mayank Mishra, Gaoyuan Zhang, Aude Oliva, Colin Raffel, Rameswar Panda_
![image][Mixture of Experts] ![image][Efficient Inference] ![image][Model Architecture]
1. [NEFTune: Noisy Embeddings Improve Instruction Finetuning](https://arxiv.org/abs/2310.05914)
_Neel Jain, Ping-yeh Chiang, Yuxin Wen, John Kirchenbauer, Hong-Min Chu, Gowthami Somepalli, Brian R. Bartoldson, Bhavya Kailkhura, Avi Schwarzschild, Aniruddha Saha, Micah Goldblum, Jonas Geiping, Tom Goldstein_
![image][Instruction Finetuning]
1. [Parallelizing Linear Transformers with the Delta Rule over Sequence Length](https://arxiv.org/abs/2406.06484)
_Songlin Yang, Bailin Wang, Yu Zhang, Yikang Shen, Yoon Kim_
![image][Model Architecture] ![image][Efficient Training] ![image][Efficient Inference]
1. [Power scheduler: a batch size and token number agnostic learning rate scheduler](https://arxiv.org/abs/2408.13359)
_Yikang Shen, Matthew Stallone, Mayank Mishra, Gaoyuan Zhang, Shawn Tan, Aditya Prasad, Adriana Meza Soria, David D. Cox, Rameswar Panda_
![image][Learning Rate Scheduler]
1. [Scattered Mixture-of-Experts Implementation](https://arxiv.org/abs/2403.08245)
_Shawn Tan, Yikang Shen, Rameswar Panda, Aaron Courville_
![image][Mixture of Experts] ![image][Efficient Training] ![image][Efficient Inference]
Expand Down Expand Up @@ -226,7 +227,7 @@ If you find this repository useful, please consider citing it in your research:
author = {Mishra, Mayank},
month = jun,
title = {{Dolomite Engine: A Hyper-Optimized Library for Pretraining and Finetuning}},
url = {https://github.com/ibm-granite/dolomite-engine},
url = {https://github.com/ibm/dolomite-engine},
year = {2024}
}
```
73 changes: 73 additions & 0 deletions configs/distillation/power-3b.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Distillation config: distill ibm/PowerLM-3b (teacher) into a same-architecture student.
datasets:
  # class_name, data_name & data_sampling_ratio are not used but need to be passed to avoid errors
  - class_name: MegatronDataset
    data_name: Megatron
    data_sampling_ratio: 1
    class_args:
      eval_steps: 2
      data_cache_path: /proj/checkpoints/mayank/cache
      # Option 1: data loading using --data-path with single file
      data_path:
        - /proj/datasets/training_data_starcoder_cleaned_0324/fineweb-edu
      # quoted so the comma-separated train/val/test ratio is always read as a string
      split: "98,1,1"
      sequence_length: 2048

tokenizer_args:
  tokenizer_name: bigcode/starcoder

model_args:
  model_class: AutoModelForCausalLM
  model_name: ibm/PowerLM-3b
  efficient_initialization: false
  attention_implementation: sdpa
  use_padding_free_transformer: false

teacher_args:
  model_class: AutoModelForCausalLM
  model_name: ibm/PowerLM-3b
  dtype: bf16
  kl_divergence_method: forward
  kl_divergence_weight: 1

tuning_args:
  tuning_method: distillation

save_args:
  save_path: /proj/checkpoints/mayank/tmp
  save_interval: 5000

logging_args:
  log_interval: 10

training_parameters:
  num_training_steps: 25000
  eval_interval: 2500000
  micro_batch_size: 2
  gradient_accumulation_steps: 4

optimizer_args:
  class_name: TorchAdamW
  class_args:
    # written with a mantissa dot: YAML 1.1 loaders (e.g. PyYAML) parse a dotless
    # `3e-4` as a *string*, not a float
    lr: 3.0e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    # dotless `1e-10` would likewise be loaded as a string by YAML 1.1 parsers
    eps: 1.0e-10

lr_scheduler_args:
  lr_decay_style: cosine
  num_warmup_steps: 2000
  num_decay_steps: 23000

mixed_precision_args:
  dtype: bf16

distributed_args:
  distributed_backend: torch
  communication_dtype: fp32
  stage: 3
  fsdp_algorithm: 2
  zero_topology:
    data_parallel_replication_world_size: 1
    data_parallel_sharding_world_size: 8
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ model_args:
attention_softmax_in_fp32: true
add_bias: true
position_embedding_type: learned_absolute
rope_theta: 10000
attention_implementation: flash_attention_2
use_padding_free_transformer: true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ model_args:
attention_softmax_in_fp32: true
add_bias: true
position_embedding_type: learned_absolute
rope_theta: 10000
attention_implementation: flash_attention_2
use_padding_free_transformer: true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ model_args:
attention_softmax_in_fp32: true
add_bias: true
position_embedding_type: learned_absolute
rope_theta: 10000
attention_implementation: flash_attention_2
use_padding_free_transformer: true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ model_args:
attention_softmax_in_fp32: true
add_bias: true
position_embedding_type: learned_absolute
rope_theta: 10000
attention_implementation: flash_attention_2
use_padding_free_transformer: true

Expand Down
Loading

0 comments on commit 5ed79f3

Please sign in to comment.