Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
Signed-off-by: Mayank Mishra <[email protected]>
  • Loading branch information
mayank31398 committed Oct 9, 2024
2 parents 4e6f95f + 3819967 commit 5ed79f3
Show file tree
Hide file tree
Showing 161 changed files with 5,645 additions and 4,245 deletions.
158 changes: 1 addition & 157 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -1,157 +1 @@
# copied from https://github.com/microsoft/DeepSpeed

---
# Refer to the following link for the explanation of each params:
# http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
# This is deprecated
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
# Brace-wrapping sub-options (attached braces everywhere, WebKit-style)
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
  # disabling the below splits, else, they'll just add to the vertical length of source files!
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: WebKit
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 119
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IncludeBlocks: Preserve
# Ordering buckets for #include sorting (lower Priority sorts first)
IncludeCategories:
  - Regex: '^<ext/.*\.h>'
    Priority: 2
  - Regex: '^<.*\.h>'
    Priority: 1
  - Regex: '^<.*'
    Priority: 2
  - Regex: '.*'
    Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 4
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
# Per-language formatting of raw string literals (R"cc(...)cc" etc.)
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
  - Language: TextProto
    Delimiters:
      - pb
      - PB
      - proto
      - PROTO
    EnclosingFunctions:
      - EqualsProto
      - EquivToProto
      - PARSE_PARTIAL_TEXT_PROTO
      - PARSE_TEST_PROTO
      - PARSE_TEXT_PROTO
      - ParseTextOrDie
      - ParseTextProtoOrDie
    CanonicalDelimiter: ''
    # this BasedOnStyle applies only to TextProto raw strings, not the whole file
    BasedOnStyle: google
# Enabling comment reflow causes doxygen comments to be messed up in their formats!
# NOTE(review): the comment above warns against reflow, yet ReflowComments is true — confirm the intended value.
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
# Be consistent with indent-width, even for people who use tab for indentation!
TabWidth: 4
UseTab: Never
# NOTE(review): a file-level BasedOnStyle at the end conflicts with the
# "# BasedOnStyle: Google" note near the top; possibly residue from a second
# YAML document lost in extraction — confirm which base style is intended.
BasedOnStyle: LLVM
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
- name: Installation
run: |
make install-dev
git clone -b granitemoe https://github.com/mayank31398/transformers && cd transformers && pip install . && cd ..
git clone https://github.com/huggingface/transformers && cd transformers && pip install . && cd ..
- name: Unit Tests
run: make test
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ repos:
- id: isort
name: isort (python)
- repo: https://github.com/psf/black
rev: 24.4.2
rev: 24.8.0
hooks:
- id: black
args: [--line-length=119,--target-version=py311]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v17.0.3
rev: v18.1.8
hooks:
- id: clang-format
types_or: [c++, c, cuda]
Expand Down
14 changes: 11 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
install:
pip install .
pip install --extra-index-url https://download.pytorch.org/whl/nightly/cpu .

install-dev:
pip install -e .
pip install --extra-index-url https://download.pytorch.org/whl/nightly/cpu -e .
pip install -r requirements-dev.txt

git clone https://github.com/sustcsonglin/flash-linear-attention
cd flash-linear-attention
pip install .
cd ..

test:
pytest tests
RUN_SLOW=True pytest tests

test-fast:
RUN_SLOW=False pytest tests

update-precommit:
pre-commit autoupdate
Expand Down
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
<!-- Topic -->
[Efficient Training]: https://img.shields.io/static/v1?label=&message=Efficient%20Training&color=blueviolet
[Efficient Inference]: https://img.shields.io/static/v1?label=&message=Efficient%20Inference&color=blueviolet
[Learning Rate Scheduler]: https://img.shields.io/static/v1?label=&message=Learning%20Rate%20Scheduler&color=blueviolet
[Instruction Finetuning]: https://img.shields.io/static/v1?label=&message=Instruction%20Finetuning&color=blueviolet
[Mixture of Experts]: https://img.shields.io/static/v1?label=&message=Mixture%20of%20Experts&color=blueviolet
[Model Architecture]: https://img.shields.io/static/v1?label=&message=Model%20Architecture&color=blueviolet
Expand All @@ -19,15 +20,15 @@ _Mayank Mishra_
1. [Reducing Transformer Key-Value Cache Size with Cross-Layer Attention](https://arxiv.org/abs/2405.12981)
_William Brandon, Mayank Mishra, Aniruddha Nrusimha, Rameswar Panda, Jonathan Ragan Kelly_
![image][Efficient Inference] ![image][Model Architecture]
1. [Dense Training, Sparse Inference: Rethinking Training of Mixture-of-Experts Language Models](https://arxiv.org/abs/2404.05567)
_Bowen Pan, Yikang Shen, Haokun Liu, Mayank Mishra, Gaoyuan Zhang, Aude Oliva, Colin Raffel, Rameswar Panda_
![image][Mixture of Experts] ![image][Efficient Inference] ![image][Model Architecture]
1. [NEFTune: Noisy Embeddings Improve Instruction Finetuning](https://arxiv.org/abs/2310.05914)
_Neel Jain, Ping-yeh Chiang, Yuxin Wen, John Kirchenbauer, Hong-Min Chu, Gowthami Somepalli, Brian R. Bartoldson, Bhavya Kailkhura, Avi Schwarzschild, Aniruddha Saha, Micah Goldblum, Jonas Geiping, Tom Goldstein_
![image][Instruction Finetuning]
1. [Parallelizing Linear Transformers with the Delta Rule over Sequence Length](https://arxiv.org/abs/2406.06484)
_Songlin Yang, Bailin Wang, Yu Zhang, Yikang Shen, Yoon Kim_
![image][Model Architecture] ![image][Efficient Training] ![image][Efficient Inference]
1. [Power scheduler: a batch size and token number agnostic learning rate scheduler](https://arxiv.org/abs/2408.13359)
_Yikang Shen, Matthew Stallone, Mayank Mishra, Gaoyuan Zhang, Shawn Tan, Aditya Prasad, Adriana Meza Soria, David D. Cox, Rameswar Panda_
![image][Learning Rate Scheduler]
1. [Scattered Mixture-of-Experts Implementation](https://arxiv.org/abs/2403.08245)
_Shawn Tan, Yikang Shen, Rameswar Panda, Aaron Courville_
![image][Mixture of Experts] ![image][Efficient Training] ![image][Efficient Inference]
Expand Down Expand Up @@ -226,7 +227,7 @@ If you find this repository useful, please consider citing it in your research:
author = {Mishra, Mayank},
month = jun,
title = {{Dolomite Engine: A Hyper-Optimized Library for Pretraining and Finetuning}},
url = {https://github.com/ibm-granite/dolomite-engine},
url = {https://github.com/ibm/dolomite-engine},
year = {2024}
}
```
73 changes: 73 additions & 0 deletions configs/distillation/power-3b.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Distillation config: distill ibm/PowerLM-3b (teacher) into a same-architecture student.
datasets:
  # class_name, data_name & data_sampling_ratio are not used but need to be passed to avoid errors
  - class_name: MegatronDataset
    data_name: Megatron
    data_sampling_ratio: 1
    class_args:
      eval_steps: 2
      data_cache_path: /proj/checkpoints/mayank/cache
      # Option 1: data loading using --data-path with single file
      data_path:
        - /proj/datasets/training_data_starcoder_cleaned_0324/fineweb-edu
      # quoted so the comma-separated train/val/test ratio is always read as a string
      split: "98,1,1"
      sequence_length: 2048

tokenizer_args:
  tokenizer_name: bigcode/starcoder

model_args:
  model_class: AutoModelForCausalLM
  model_name: ibm/PowerLM-3b
  efficient_initialization: false
  attention_implementation: sdpa
  use_padding_free_transformer: false

teacher_args:
  model_class: AutoModelForCausalLM
  model_name: ibm/PowerLM-3b
  dtype: bf16
  kl_divergence_method: forward
  kl_divergence_weight: 1

tuning_args:
  tuning_method: distillation

save_args:
  save_path: /proj/checkpoints/mayank/tmp
  save_interval: 5000

logging_args:
  log_interval: 10

training_parameters:
  num_training_steps: 25000
  eval_interval: 2500000
  micro_batch_size: 2
  gradient_accumulation_steps: 4

optimizer_args:
  class_name: TorchAdamW
  class_args:
    # written with a mantissa dot: YAML 1.1 loaders (e.g. PyYAML) parse a dotless
    # `3e-4` as a *string*, not a float
    lr: 3.0e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    # dotless `1e-10` would likewise be loaded as a string by YAML 1.1 parsers
    eps: 1.0e-10

lr_scheduler_args:
  lr_decay_style: cosine
  num_warmup_steps: 2000
  num_decay_steps: 23000

mixed_precision_args:
  dtype: bf16

distributed_args:
  distributed_backend: torch
  communication_dtype: fp32
  stage: 3
  fsdp_algorithm: 2
  zero_topology:
    data_parallel_replication_world_size: 1
    data_parallel_sharding_world_size: 8
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ model_args:
attention_softmax_in_fp32: true
add_bias: true
position_embedding_type: learned_absolute
rope_theta: 10000
attention_implementation: flash_attention_2
use_padding_free_transformer: true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ model_args:
attention_softmax_in_fp32: true
add_bias: true
position_embedding_type: learned_absolute
rope_theta: 10000
attention_implementation: flash_attention_2
use_padding_free_transformer: true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ model_args:
attention_softmax_in_fp32: true
add_bias: true
position_embedding_type: learned_absolute
rope_theta: 10000
attention_implementation: flash_attention_2
use_padding_free_transformer: true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ model_args:
attention_softmax_in_fp32: true
add_bias: true
position_embedding_type: learned_absolute
rope_theta: 10000
attention_implementation: flash_attention_2
use_padding_free_transformer: true

Expand Down
Loading

0 comments on commit 5ed79f3

Please sign in to comment.