From 4be284bd79d2c4ffab378b93d7282b54f96647e9 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 5 Jul 2024 23:48:42 -0700
Subject: [PATCH] Gemma 2 bug fixes + All RoPE Scaling Support (#736)

* Update gemma2.py

* Update llama.py

* Update llama.py

* Update gemma2.py

* init

* Update gemma2.py

* Update gemma2.py

* Update _utils.py

* Update _utils.py

* Update _utils.py

* Update _utils.py

* Update _utils.py

* Update gemma2.py

* Update gemma2.py

* Update gemma2.py

* All RoPE Scaling support

* cleanup

* Update llama.py

* Update llama.py

* Update _utils.py

* Update _utils.py

* exec

* exec

* Attention_Module

* attention_module

* imports

* exec

* Update llama.py

* Update llama.py

* boolean mask

* revert masking

* Update llama.py

* Update save.py

* Update llama.py

* Update gemma2.py

* Update gemma2.py

* Update gemma2.py

* Update utils.py

* retry

* Update gemma2.py

* Update gemma2.py

* Update gemma2.py

* Update _utils.py

* Update _utils.py

* Update gemma2.py

* Update chat_templates.py

* Gemma 2 Ollama support

* Update llama.py

* Update llama.py
---
 unsloth/chat_templates.py |  27 ++++-
 unsloth/models/_utils.py  | 211 ++++++++++++++++++++++++++++++++------
 unsloth/models/gemma.py   |  43 ++++++++
 unsloth/models/gemma2.py  |  18 +++-
 unsloth/models/llama.py   |  15 ++-
 unsloth/models/mistral.py |  13 ++-
 unsloth/models/qwen2.py   |  13 ++-
 unsloth/save.py           |   7 +-
 8 files changed, 306 insertions(+), 41 deletions(-)

diff --git a/unsloth/chat_templates.py b/unsloth/chat_templates.py
index 5f5b4e16c..596548df3 100644
--- a/unsloth/chat_templates.py
+++ b/unsloth/chat_templates.py
@@ -416,6 +416,21 @@
 CHAT_TEMPLATES["gemma_chatml"] = (gemma_chatml_template, gemma_chatml_eos_token, True, gemma_chatml_ollama,)
 pass
 
+# =========================================== Gemma 2
+# Same chat template as Gemma 1; the model itself adds sliding window attention.
+# https://ollama.com/library/gemma2/blobs/6522ca797f47
+gemma2_template = gemma_template
+gemma2_ollama = gemma_ollama + "PARAMETER num_ctx 4096\n"  # Gemma 1 Ollama text plus a num_ctx (context length) line
+gemma2_eos_token = "<end_of_turn>"
+CHAT_TEMPLATES["gemma2"] = (gemma2_template, gemma2_eos_token, True, gemma2_ollama,)
+
+# =========================================== Gemma 2 with ChatML instead
+gemma2_chatml_template = gemma_chatml_template  # reuses the Gemma 1 ChatML template unchanged
+gemma2_chatml_ollama = gemma_chatml_ollama + "PARAMETER num_ctx 4096\n"  # same num_ctx line as for "gemma2"
+gemma2_chatml_eos_token = gemma_chatml_eos_token
+CHAT_TEMPLATES["gemma2_chatml"] = (gemma2_chatml_template, gemma2_chatml_eos_token, True, gemma2_chatml_ollama,)
+pass
+
 # =========================================== Llama-3
 # Weirdly \n\n is needed?
 llama3_template = \
@@ -1014,7 +1029,17 @@ def get_ollama_eos_tokens(tokenizer, extra_eos_tokens = []):
     pass
     final_eos_tokens += extra_eos_tokens
     final_eos_tokens += repeatted_tokens
-    return final_eos_tokens
+
+    # Drop tokens made only of newlines or "▁" (spaces), and short tag fragments like "<", "<x" or "</x"
+    filtered_eos_tokens = []
+    for token in final_eos_tokens:
+        if   token.count("\n") == len(token): continue
+        elif token.count("▁") == len(token): continue
+        elif token.startswith("<") and len(token) <= 2: continue
+        elif token.startswith("</") and len(token) == 3: continue
+        filtered_eos_tokens.append(token)
+    pass
+    return filtered_eos_tokens
 pass
 
 
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 73aa0c6c9..fd1b87c53 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -12,9 +12,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+__version__ = "2024.7"
+
+__all__ = [
+    "prepare_model_for_kbit_training",
+    "xformers",
+    "xformers_attention",
+    "xformers_version",
+    "__version__",
+    "HAS_FLASH_ATTENTION",
+    "platform_system",
+    "patch_tokenizer",
+    "get_statistics",
+    "Unsloth_Offloaded_Gradient_Checkpointer",
+    "offload_to_disk",
+    "offload_input_embeddings",
+    "offload_output_embeddings",
+    "is_bfloat16_supported",
+    "unsloth_offloaded_gradient_checkpoint",
+    "torch_compile_options",
+    "patch_linear_scaling",
+    "create_boolean_mask",
+]
+
 import torch
-from typing import Union, Optional, List, Any, Callable
+from typing import Union, Optional, List, Any, Callable, Tuple
 import warnings
+from platform import system as platform_system
+platform_system = platform_system()
+import math
+import numpy as np
+import os
+import psutil
+import inspect
+import re
+
+# =============================================
+# Disable some warnings which can get annoying
 warnings.filterwarnings(action = "ignore", category = UserWarning,    module = "torch")
 warnings.filterwarnings(action = "ignore", category = UserWarning,    module = "huggingface_hub")
 warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "subprocess")
@@ -26,20 +60,42 @@
 # Stop "Special tokens have been added in the vocabulary, ..."
 import logging
 logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.CRITICAL+1)
+# =============================================
+
+# =============================================
+# Edits all Config files to enable RoPE Scaling for all models
+from transformers import PretrainedConfig
+
+model_architectures = ["llama", "mistral", "gemma", "gemma2", "qwen2",]
+
+for model_name in model_architectures:  # rewrite each architecture's config class in turn
+    config_filepath = f"transformers.models.{model_name}.configuration_{model_name}"
+    model_filepath = f"transformers.models.{model_name}.modeling_{model_name}"  # NOTE(review): never used in this loop
+    config_filename = f"{model_name.title()}Config"  # e.g. "llama" -> "LlamaConfig"
+    exec(f"from {config_filepath} import {config_filename}", globals())  # bind the config class as a module global
+
+    config = inspect.getsource(eval(config_filename))  # source text of the config class
+    if "rope_scaling" in config: continue  # class already mentions rope_scaling: leave it untouched
+    config = re.sub(
+        r"(\*\*kwargs)[\s]{0,}\,[\s]{0,}\)[\s]{0,}\:",  # the "**kwargs, ):" tail of a signature
+        r"rope_scaling=None,"\
+        r"\n        **kwargs):\n"\
+        r"\n        self.rope_scaling = rope_scaling\n",
+        config,
+    )
+    exec(config, globals())  # re-define the class in this module from the edited source
+
+    exec(f"import {config_filepath}", globals())
+    exec(f"{config_filepath}.{config_filename} = {config_filename}", globals())  # publish the rewritten class on the transformers configuration module
+pass
+# =============================================
 
+# =============================================
+# Get Flash Attention v2 if Ampere (RTX 30xx, A100)
 import bitsandbytes as bnb
 from transformers.models.llama.modeling_llama import logger
 from transformers import AutoTokenizer
-from platform import system as platform_system
-platform_system = platform_system()
-import math
-import numpy as np
-import os
-import psutil
 
-__version__ = "2024.7"
-
-# Get Flash Attention v2 if Ampere (RTX 30xx, A100)
 major_version, minor_version = torch.cuda.get_device_capability()
 SUPPORTS_BFLOAT16 = False
 
@@ -69,25 +125,10 @@
 import xformers.ops.fmha as xformers
 xformers_attention = xformers.memory_efficient_attention
 from xformers import __version__ as xformers_version
+# =============================================
 
-__all__ = [
-    "prepare_model_for_kbit_training",
-    "xformers",
-    "xformers_attention",
-    "xformers_version",
-    "__version__",
-    "HAS_FLASH_ATTENTION",
-    "platform_system",
-    "patch_tokenizer",
-    "get_statistics",
-    "Unsloth_Offloaded_Gradient_Checkpointer",
-    "offload_to_disk",
-    "offload_input_embeddings",
-    "offload_output_embeddings",
-    "is_bfloat16_supported",
-    "unsloth_offloaded_gradient_checkpoint",
-    "torch_compile_options",
-]
+# =============================================
+# Torch compile settings
 
 # Just remove max_autotune_gemm warning
 import functools
@@ -128,7 +169,7 @@ def is_big_gpu(index):
     "trace.enabled"     : False, # Output Triton kernel outputs!
     "triton.cudagraphs" : False,
 }
-
+# =============================================
 
 def prepare_model_for_kbit_training(
     model                      : Any,
@@ -266,6 +307,7 @@ def patch_tokenizer(model, tokenizer):
 pass
 
 
+# =============================================
 # Weirdly LoraLayer.update_layer downcasts PEFT layers to float16??
 # For mixed precision, we need it to be in float32 not float16.
 from peft.tuners.lora.layer import LoraLayer
@@ -295,6 +337,7 @@ def patch_tokenizer(model, tokenizer):
         "Luckily, your training run will still work in the meantime!"
     )
 pass
+# =============================================
 
 
 def get_statistics():
@@ -456,9 +499,8 @@ def unsloth_offloaded_gradient_checkpoint(function, *args, use_reentrant = None,
 pass
 
 
-"""
-    Remove warnings about missing kwargs and patch stuff
-"""
+# =============================================
+# Fixes Bitsandbytes to remove missing warnings
 from transformers.utils.quantization_config import BitsAndBytesConfig, QuantizationMethod
 from inspect import getsource
 from accelerate.utils.dataclasses import DistributedType
@@ -501,7 +543,7 @@ def _prepare_backend(
 
 import transformers.utils.quantization_config
 transformers.utils.quantization_config.BitsAndBytesConfig.__init__ = _BitsAndBytesConfig__init__
-
+# =============================================
 
 # Offloading to disk for modules (lm_head, embed_tokens)
 import pickle
@@ -549,3 +591,106 @@ def offload_output_embeddings(model, temporary_location : str = "_unsloth_tempor
 def is_bfloat16_supported():
     return SUPPORTS_BFLOAT16
 pass
+
+
+# Patches models to add RoPE Scaling
+def patch_linear_scaling(
+    model_name = "gemma2",
+    rope_module = None,
+    scaled_rope_module = None,
+    attention_module = None,
+):
+    """Return `(init_name, source)` for a `rope_scaling`-aware attention `__init__`.
+    `source` is `attention_module.__init__` renamed to `init_name` with its rotary
+    embedding setup rewritten; callers `exec` it and then `eval(init_name)`.
+    """
+    assert(rope_module is not None and scaled_rope_module is not None)
+    assert(attention_module is not None)
+
+    rope_name = rope_module.__name__
+    scaled_rope_name = scaled_rope_module.__name__
+    model_filepath = f"transformers.models.{model_name}.modeling_{model_name}"
+    exec_code = \
+        f"import torch.nn as nn\n"\
+        f"from typing import Union, Optional, List, Any, Callable, Tuple\n"\
+        f"from {model_filepath} import logger, "\
+        f"{model_name.title()}Attention, {model_name.title()}Config"
+
+    function = inspect.getsource(attention_module.__init__)
+    where = function.find("def")  # method indentation to strip from every line
+    function = "\n".join(x[where:] for x in function.split("\n"))
+    init_name = f"{model_name.title()}Attention__init__"
+    function = function.replace("def __init__", f"def {init_name}")
+    # Zero-argument super() needs a class body, which the exec'd source lacks
+    function = function.replace(
+        "super().__init__()",
+        f"super({model_name.title()}Attention, self).__init__()",
+    )
+    fix_rope_function = """
+    if getattr(self.config, "rope_scaling", None) is None:
+        self.rotary_emb = {rope_function}(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+    else:
+        scaling_type = self.config.rope_scaling["type"]
+        scaling_factor = self.config.rope_scaling["factor"]
+        if scaling_type == "linear":
+            self.rotary_emb = {scaled_rope_function}(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                scaling_factor=scaling_factor,
+                base=self.rope_theta,
+            )
+        else:
+            raise ValueError(f"Unknown RoPE scaling type {{scaling_type}}")
+    pass
+    """
+    fix_rope_function = fix_rope_function.format(
+        rope_function = rope_name, scaled_rope_function = scaled_rope_name,
+    )
+    rotary_emb = re.search(r"self\.rotary_emb = .+?\)", function, flags = re.DOTALL)
+    if rotary_emb is None:
+        # No rotary embedding to rewrite: still hand back a pair callers can unpack
+        return init_name, exec_code + "\n\n" + function
+    function = function.replace(rotary_emb.group(0), fix_rope_function, 1)
+    return init_name, exec_code + "\n\n" + function
+pass
+
+
+def create_boolean_mask(n = 4096, sliding_window = 2048):
+    # Builds an (n, n) bool attention mask; True appears to mark blocked positions (test_mask_creation compares it that way)
+    mask = torch.ones(n, n, dtype = torch.bool)
+    if sliding_window == 0:  # 0 selects the plain causal mask, with no window
+        return torch.triu(mask, diagonal = 1, out = mask)  # True strictly above the diagonal
+    pass
+    torch.triu(mask, diagonal = 0, out = mask)  # keep the diagonal and everything above it
+    torch.triu(mask.T, diagonal = -sliding_window, out = mask.T)  # on the transposed view: clear entries more than sliding_window below the diagonal
+    mask = mask.T  # continue with the transposed view; no copy is made
+    torch.logical_not(mask, out = mask)  # flip in place: the True band on/left of the diagonal becomes False
+    return mask
+pass
+
+
+def test_mask_creation():
+    # Cross-checks create_boolean_mask against transformers' AttentionMaskConverter
+    from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+    # Windows 1..22 first, then 0, which is compared against sliding_window = None
+    windows = (*range(1, 23), 0)
+    for size in range(2, 23):
+        for window in windows:
+            converter = AttentionMaskConverter(
+                is_causal = True,
+                sliding_window = None if window == 0 else window,
+            )
+            reference = converter.to_causal_4d(1, size, size, dtype = torch.float16,)
+            reference = reference.squeeze(0).squeeze(0)
+            # Entries equal to the minimum of the float mask are taken as masked
+            reference = (reference == reference.min())
+            candidate = create_boolean_mask(n = size, sliding_window = window)
+            matches = torch.all(reference == candidate)
+            assert(matches)
+        pass
+    pass
+pass
diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py
index 4c4515b79..4d3db8d39 100644
--- a/unsloth/models/gemma.py
+++ b/unsloth/models/gemma.py
@@ -236,10 +236,53 @@ def forward(self, x, position_ids=None, seq_len=None):
 pass
 
 
+class GemmaFixedLinearScalingRotaryEmbedding(GemmaFixedRotaryEmbedding):
+    """GemmaFixedRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+    # Fixes https://github.com/huggingface/transformers/pull/28837
+    # https://github.com/microsoft/DeepSpeed/issues/4932
+    # The precision of RoPE buffers is not correct, so we cast to int64.
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor  # set first: the parent __init__ presumably builds the cache via _set_cos_sin_cache — TODO confirm
+        super().__init__(dim, max_position_embeddings, base, device)
+    pass
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        # Note: on the original Llama codebase, these tensors are created on the target device (and not on CPU) and
+        # in FP32. They are applied (multiplied) in FP32 as well.
+        self.max_seq_len_cached = seq_len
+
+        # The difference is we do division explicitly instead of t * (1/x), i.e. we do t/x.
+        freq_exponents = (2.0 / self.dim) * (
+            torch.arange(self.dim // 2, dtype = torch.int64, device = "cpu").float()
+        )
+        timescale = self.base**freq_exponents
+        positions = torch.arange(self.max_seq_len_cached, device = "cpu", dtype = torch.int64).float()
+        positions = positions /  self.scaling_factor  # linear scaling: positions are divided by scaling_factor
+        radians_new = positions[..., None] / timescale[None, None, :]  # (1, seq_len, dim // 2)
+        radians_new = radians_new.squeeze(0)  # (seq_len, dim // 2)
+
+        emb = torch.cat((radians_new, radians_new), dim = -1)
+        # We must do RoPE in float32! NOTE(review): the `device` and `dtype` arguments are unused; buffers always go to "cuda:0"
+        cos = emb.cos().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
+        sin = emb.sin().to(device = "cuda:0", non_blocking = True)#, dtype = dtype)
+        self.register_buffer("cos_cached", cos, persistent = False)
+        self.register_buffer("sin_cached", sin, persistent = False)
+    pass
+pass
+
+
 class FastGemmaModel(FastLlamaModel):
 
     @staticmethod
     def pre_patch():
+        init_name, function = patch_linear_scaling(
+            model_name         = "gemma",
+            rope_module        = GemmaFixedRotaryEmbedding,
+            scaled_rope_module = GemmaFixedLinearScalingRotaryEmbedding,
+            attention_module   = GemmaAttention,
+        )
+        exec(function, globals())
+        GemmaAttention.__init__      = eval(init_name)
         GemmaAttention      .forward = LlamaAttention_fast_forward
         GemmaSdpaAttention  .forward = LlamaAttention_fast_forward
         GemmaFlashAttention2.forward = LlamaAttention_fast_forward
diff --git a/unsloth/models/gemma2.py b/unsloth/models/gemma2.py
index 0669e4220..4a1420fb4 100644
--- a/unsloth/models/gemma2.py
+++ b/unsloth/models/gemma2.py
@@ -16,6 +16,7 @@
 from ._utils import __version__
 from .gemma import (
     GemmaFixedRotaryEmbedding,
+    GemmaFixedLinearScalingRotaryEmbedding,
     fast_geglu_inference,
 )
 from transformers.models.gemma2.modeling_gemma2 import (
@@ -27,7 +28,6 @@
     apply_rotary_pos_emb,
     repeat_kv,
 )
-from transformers.models.gemma2.modeling_gemma2 import *
 from transformers.modeling_attn_mask_utils import (
     _prepare_4d_causal_attention_mask_for_sdpa,
 )
@@ -46,7 +46,7 @@
 # [TODO] We must randomnly use torch.compile?
 # I checked the gradients and formulas and I'm sure it's correct.
 # I'm stumped :(
-@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
+@torch.compile(fullgraph = True, dynamic = True)#, options = torch_compile_options)
 def fast_rms_layernorm_gemma2_compiled(layernorm, X, gemma = True):
     old_dtype = X.dtype
     X = X.float()
@@ -77,6 +77,8 @@ def gemma2_attention(Q, K, V, causal_mask, self, bsz, q_len):
     A = torch.matmul(Q, K.transpose(2, 3))
     A = t * torch.tanh(A / t) # Logit softcapping
     A += causal_mask[:q_len, :q_len]
+    # Much slower in torch compile!
+    # A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
     A = torch.nn.functional.softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
     A = torch.matmul(A, V)
     A = A.transpose(1, 2).contiguous()
@@ -255,6 +257,8 @@ def Gemma2Attention_fast_forward_inference(
         self.temp_QA = torch.empty((2, bsz, 1, attention_size), dtype = dtype, device = "cuda:0")
         self.temp_KV = torch.empty((2, bsz, 1, n_kv_heads*head_dim), dtype = dtype, device = "cuda:0")
         self.RH_Q = torch.empty((bsz, n_heads, 1, head_dim), dtype = dtype, device = "cuda:0")
+        # Only for Gemma2
+        self.temp_O  = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0")
         self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0")
         self.scalar = 1.0 / math_sqrt(self.config.hidden_size // self.config.num_attention_heads)
         self.half_head_dim = head_dim // 2
@@ -341,7 +345,7 @@ def Gemma2Attention_fast_forward_inference(
     # pass
     A = A.transpose(1, 2)
     A = A.reshape(bsz, 1, attention_size)
-    A = fast_linear_forward(self.o_proj, A, out = self.temp_QA[1][:,:,:self.hidden_size])
+    A = fast_linear_forward(self.o_proj, A, out = self.temp_O)
     return A, (Kn, Vn)
 pass
 
@@ -426,6 +430,14 @@ class FastGemma2Model(FastLlamaModel):
 
     @staticmethod
     def pre_patch():
+        init_name, function = patch_linear_scaling(
+            model_name         = "gemma2",
+            rope_module        = GemmaFixedRotaryEmbedding,
+            scaled_rope_module = GemmaFixedLinearScalingRotaryEmbedding,
+            attention_module   = Gemma2Attention,
+        )
+        exec(function, globals())
+        Gemma2Attention.__init__      = eval(init_name)
         Gemma2Attention      .forward = Gemma2Attention_fast_forward
         Gemma2SdpaAttention  .forward = Gemma2Attention_fast_forward
         Gemma2FlashAttention2.forward = Gemma2Attention_fast_forward
diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index e19b85726..c7ae67e42 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -663,8 +663,11 @@ def LlamaModel_fast_forward(
 
     # Gemma2 has alternating SWA and global attn
     if IS_GEMMA2 and not hasattr(self, "SWA_mask"):
-        from transformers.modeling_attn_mask_utils import AttentionMaskConverter
         n = self.config.max_position_embeddings
+        # masked_fill is making stuff slower!
+        # self. GA_mask = create_boolean_mask(n = n, sliding_window = 0)
+        # self.SWA_mask = create_boolean_mask(n = n, sliding_window = self.config.sliding_window)
+        from transformers.modeling_attn_mask_utils import AttentionMaskConverter
         self.SWA_mask = AttentionMaskConverter(
             is_causal = True,
             sliding_window = self.config.sliding_window,
@@ -1099,6 +1102,13 @@ def from_pretrained(
         trust_remote_code = False,
         **kwargs,
     ):
+        if trust_remote_code:
+            print(
+                "Unsloth: WARNING `trust_remote_code` is True.\n"\
+                "Are you certain you want to do remote code execution?"
+            )
+        pass
+
         if token is None and "HF_TOKEN" in os.environ:
             token = os.environ["HF_TOKEN"]
 
@@ -1139,6 +1149,7 @@ def from_pretrained(
             with open(inspect.getfile(model_function), "r") as file:
                 has_rope_scaling = "self.config.rope_scaling" in file.read()
         except: pass
+        has_rope_scaling = True
 
         # If max_seq_length is not specified, use maximum fron config
         if max_seq_length is None:
@@ -1183,6 +1194,7 @@ def from_pretrained(
         # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/discussions/12
         # RoPE Scaling's max_position_embeddings must be updated
         max_position_embeddings = max(max_seq_length, model_max_seq_length)
+        kwargs.pop("attn_implementation", None); # No need since we auto call it
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             device_map              = device_map,
@@ -1191,6 +1203,7 @@ def from_pretrained(
             token                   = token,
             max_position_embeddings = max_position_embeddings,
             trust_remote_code       = trust_remote_code,
+            attn_implementation     = "eager",
             **kwargs,
         )
 
diff --git a/unsloth/models/mistral.py b/unsloth/models/mistral.py
index e0b51a16e..28f664ca2 100644
--- a/unsloth/models/mistral.py
+++ b/unsloth/models/mistral.py
@@ -15,7 +15,10 @@
 from .llama import *
 import os
 from ._utils import __version__
-
+from .llama import (
+    LlamaRotaryEmbedding,
+    LlamaLinearScalingRotaryEmbedding,
+)
 from transformers.models.mistral.modeling_mistral import (
     MistralAttention,
     MistralDecoderLayer,
@@ -268,6 +271,14 @@ class FastMistralModel(FastLlamaModel):
 
     @staticmethod
     def pre_patch():
+        init_name, function = patch_linear_scaling(
+            model_name         = "mistral",
+            rope_module        = LlamaRotaryEmbedding,
+            scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
+            attention_module   = MistralAttention,
+        )
+        exec(function, globals())
+        MistralAttention.__init__      = eval(init_name)
         MistralAttention      .forward = MistralAttention_fast_forward
         MistralSdpaAttention  .forward = MistralAttention_fast_forward
         MistralFlashAttention2.forward = MistralAttention_fast_forward
diff --git a/unsloth/models/qwen2.py b/unsloth/models/qwen2.py
index 5b9fff5d5..dcd05af60 100644
--- a/unsloth/models/qwen2.py
+++ b/unsloth/models/qwen2.py
@@ -13,7 +13,10 @@
 # limitations under the License.
 
 from .llama import *
-
+from .llama import (
+    LlamaRotaryEmbedding,
+    LlamaLinearScalingRotaryEmbedding,
+)
 from transformers.models.qwen2.modeling_qwen2 import (
     Qwen2Attention,
     Qwen2DecoderLayer,
@@ -36,6 +39,14 @@ class FastQwen2Model(FastLlamaModel):
 
     @staticmethod
     def pre_patch():
+        init_name, function = patch_linear_scaling(
+            model_name         = "qwen2",
+            rope_module        = LlamaRotaryEmbedding,
+            scaled_rope_module = LlamaLinearScalingRotaryEmbedding,
+            attention_module   = Qwen2Attention,
+        )
+        exec(function, globals())
+        Qwen2Attention.__init__      = eval(init_name)
         Qwen2Attention      .forward = LlamaAttention_fast_forward
         Qwen2SdpaAttention  .forward = LlamaAttention_fast_forward
         Qwen2FlashAttention2.forward = LlamaAttention_fast_forward
diff --git a/unsloth/save.py b/unsloth/save.py
index 1ceea3c19..293e43060 100644
--- a/unsloth/save.py
+++ b/unsloth/save.py
@@ -49,6 +49,7 @@
 )
 LLAMA_LAYERNORMS = (
     "input_layernorm", "post_attention_layernorm",
+    "pre_feedforward_layernorm", "post_feedforward_layernorm",
 )
 
 # https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/quantize.cpp#L19
@@ -557,7 +558,11 @@ def unsloth_save_model(
                 state_dict[name] = torch.load(filename, map_location = "cpu", mmap = True)
         pass
         for item in LLAMA_LAYERNORMS:
-            state_dict[f"model.layers.{j}.{item}.weight"] = eval(f"layer.{item}.weight.data")
+            try:
+                # The layernorms added for Gemma 2 may not exist on other models; skip any that are missing
+                state_dict[f"model.layers.{j}.{item}.weight"] = eval(f"layer.{item}.weight.data")
+            except:
+                continue
         pass
     pass