Starcoder2: KVCache and flash attention (FusedSDPA) enablement (huggingface#1149)

Co-authored-by: Colabrese <[email protected]>
Co-authored-by: Abhilash Majumder <[email protected]>
Co-authored-by: Sayantan Sarkar <[email protected]>
Co-authored-by: regisss <[email protected]>
5 people authored Aug 6, 2024
1 parent ec90e05 commit 13b6452
Showing 8 changed files with 665 additions and 308 deletions.
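
Context for the title: FusedSDPA is the HPU fused scaled-dot-product-attention (flash-attention-style) kernel that Gaudi-optimized attention classes call in place of the eager softmax(QK^T)V path. Below is a minimal sketch of the import-with-fallback pattern, assuming the Habana PyTorch bridge is installed; the exact FusedSDPA.apply() argument list varies across Synapse releases, so the call shown is illustrative only.

    import torch
    import torch.nn.functional as F

    try:
        # HPU fused scaled dot-product attention kernel.
        from habana_frameworks.torch.hpex.kernels import FusedSDPA
    except ImportError:
        FusedSDPA = None

    def sdpa(query, key, value, attn_mask=None):
        # All tensors: (batch, num_heads, seq_len, head_dim).
        if FusedSDPA is not None:
            # Illustrative argument order (mask, dropout_p, is_causal, scale);
            # check the installed habana_frameworks release for the real signature.
            return FusedSDPA.apply(query, key, value, attn_mask, 0.0, False, None)
        # Fallback with equivalent semantics on CPU/GPU.
        return F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask)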
13 changes: 11 additions & 2 deletions examples/text-generation/run_lm_eval.py
@@ -102,13 +102,22 @@ def __init__(self, tokenizer, model, args, options):
        self.options = options
        self._device = args.device
        self.model_inputs = {"use_cache": self.options.use_cache}
-        if self.model.config.model_type in ["llama", "mistral", "falcon", "phi", "mixtral", "qwen2", "gptj"]:
+        if self.model.config.model_type in [
+            "llama",
+            "mistral",
+            "falcon",
+            "phi",
+            "mixtral",
+            "qwen2",
+            "gptj",
+            "starcoder2",
+        ]:
            self.model_inputs.update(
                {
                    "reuse_cache": self.options.reuse_cache,
                }
            )
-        if self.model.config.model_type in ["llama", "mistral", "qwen2", "falcon"]:
+        if self.model.config.model_type in ["llama", "mistral", "qwen2", "falcon", "starcoder2"]:
            if self.model.config.model_type != "falcon":
                self.model_inputs.update(
                    {
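The reuse_cache flag wired in above is forwarded to generate(), so the lm-eval harness exercises the same preallocated KV cache path as the text-generation example. A hypothetical direct call for comparison (model, tokenizer, and sizes are placeholders, not from this diff):

    # Sketch: generation with the preallocated ("reused") KV cache.
    # reuse_cache is an optimum-habana generation kwarg, not a stock transformers one.
    inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to("hpu")
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        reuse_cache=True,  # allocate the KV cache once, reuse it across decode steps
    )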
2 changes: 1 addition & 1 deletion examples/text-generation/utils.py
@@ -381,7 +381,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):

    model = deepspeed.init_inference(model, **ds_inference_kwargs)
    model = model.module
-    if model.config.model_type in ["llama", "falcon", "qwen2"]:
+    if model.config.model_type in ["llama", "falcon", "qwen2", "starcoder2"]:
        patch_scoped_linear_all_reduce(model)

    if args.quant_config:
8 changes: 5 additions & 3 deletions optimum/habana/transformers/generation/utils.py
@@ -94,6 +94,7 @@
"starcoder2",
"persimmon",
"qwen2",
"starcoder2",
"llava",
"llava_next",
"stablelm",
@@ -435,7 +436,7 @@ def create_pad_arg(pad_amount, i, j):
            else:
                assert False
        elif model_kwargs["past_key_values"][0][0].dim() == 4:
-            return (0, 0, 0, pad_amount)  # llama, falcon, qwen2
+            return (0, 0, 0, pad_amount)  # llama, falcon, qwen2, starcoder2
        else:
            assert False, "Unknown case, please handle, or dont use bucketing"

@@ -860,7 +861,8 @@ def generate(
"phi",
"qwen2",
"gptj",
], "reuse_cache only supported by llama, mistral, falcon, mixtral, phi, qwen2 and gptj at the moment"
"starcoder2",
], "reuse_cache only supported by llama, mistral, falcon, mixtral, phi, qwen2 and starcoder2 at the moment"
if not generation_config.bucket_internal:
assert (
generation_config.bucket_size <= 0
@@ -1016,7 +1018,7 @@ def generate(
            model_kwargs["kv_cache_len"] = calculated_max_length
            model_kwargs["kv_cache_pad_len"] = generation_config.max_new_tokens

-        if self.config.model_type in ["llama", "falcon", "mistral", "qwen2", "gptj"]:
+        if self.config.model_type in ["llama", "falcon", "mistral", "qwen2", "gptj", "starcoder2"]:
            if self.config.max_position_embeddings < calculated_max_length:
                unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length)

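Why (0, 0, 0, pad_amount) is the right pad argument in the 4-D case above: KV cache tensors are laid out as (batch, num_heads, seq_len, head_dim), and torch.nn.functional.pad consumes its pad tuple from the last dimension backwards, so the first pair (0, 0) leaves head_dim untouched and the second pair (0, pad_amount) grows the sequence axis up to the bucket size. A self-contained check (shapes are illustrative):

    import torch
    import torch.nn.functional as F

    # Illustrative 4-D KV cache slice: (batch, num_heads, seq_len, head_dim).
    kv = torch.zeros(1, 8, 16, 64)

    # Pairs apply last-dim-first: (head_dim_left, head_dim_right, seq_left, seq_right).
    padded = F.pad(kv, (0, 0, 0, 48))
    print(padded.shape)  # torch.Size([1, 8, 64, 64]): only seq_len grew, 16 -> 64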
8 changes: 4 additions & 4 deletions optimum/habana/transformers/modeling_utils.py
@@ -88,8 +88,10 @@
    GaudiQwen2Model,
    GaudiStableLmDecoderLayer,
    GaudiStableLmForCausalLM,
+    GaudiStarcoder2Attention,
    GaudiStarcoder2DecoderLayer,
    GaudiStarcoder2ForCausalLM,
+    GaudiStarcoder2Model,
    LlamaConfig,
    MistralConfig,
    MixtralConfig,
@@ -175,8 +177,6 @@
    gaudi_SpeechT5DecoderLayer_forward,
    gaudi_stablelm_attention_forward,
    gaudi_stablelm_model_forward,
-    gaudi_starcoder2_attention_forward,
-    gaudi_starcoder2_model_forward,
    gaudi_swin_get_attn_mask,
    gaudi_t5_layernorm_forward,
    gaudi_T5Attention_forward,
@@ -517,8 +517,8 @@ def adapt_transformers_to_gaudi():

    # Optimization for starcoder2 on Gaudi
    transformers.models.starcoder2.modeling_starcoder2.Starcoder2ForCausalLM = GaudiStarcoder2ForCausalLM
-    transformers.models.starcoder2.modeling_starcoder2.Starcoder2Model.forward = gaudi_starcoder2_model_forward
-    transformers.models.starcoder2.modeling_starcoder2.Starcoder2Attention.forward = gaudi_starcoder2_attention_forward
+    transformers.models.starcoder2.modeling_starcoder2.Starcoder2Model = GaudiStarcoder2Model
+    transformers.models.starcoder2.modeling_starcoder2.Starcoder2Attention = GaudiStarcoder2Attention
    transformers.models.starcoder2.modeling_starcoder2.Starcoder2DecoderLayer = GaudiStarcoder2DecoderLayer

# Optimization for qwen2 on Gaudi
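Note that the Starcoder2 hooks now swap whole classes (which carry the KV cache state and the FusedSDPA call path) instead of patching individual forward methods, so adapt_transformers_to_gaudi() must run before the model is instantiated. A usage sketch, with the checkpoint name chosen purely for illustration:

    import torch
    from transformers import AutoModelForCausalLM
    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

    # Patch transformers in place so Starcoder2* resolves to the Gaudi classes above.
    adapt_transformers_to_gaudi()

    model = AutoModelForCausalLM.from_pretrained(
        "bigcode/starcoder2-3b",  # illustrative checkpoint
        torch_dtype=torch.bfloat16,
    ).to("hpu")  # requires a Gaudi device and the Habana PyTorch bridge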
4 changes: 2 additions & 2 deletions optimum/habana/transformers/models/__init__.py
@@ -191,10 +191,10 @@
    gaudi_stablelm_model_forward,
)
from .starcoder2 import (
+    GaudiStarcoder2Attention,
    GaudiStarcoder2DecoderLayer,
    GaudiStarcoder2ForCausalLM,
-    gaudi_starcoder2_attention_forward,
-    gaudi_starcoder2_model_forward,
+    GaudiStarcoder2Model,
)
from .swin import gaudi_swin_get_attn_mask
from .t5 import (
4 changes: 2 additions & 2 deletions optimum/habana/transformers/models/starcoder2/__init__.py
@@ -1,6 +1,6 @@
from .modeling_starcoder2 import (
+    GaudiStarcoder2Attention,
    GaudiStarcoder2DecoderLayer,
    GaudiStarcoder2ForCausalLM,
-    gaudi_starcoder2_attention_forward,
-    gaudi_starcoder2_model_forward,
+    GaudiStarcoder2Model,
)