From 432b3e6f0580dd4cb4afd6e479da99df506df7a7 Mon Sep 17 00:00:00 2001
From: Nagico2
Date: Wed, 24 Jul 2024 15:24:27 +0800
Subject: [PATCH 1/3] fix: add position_embeddings arg to LlamaAttention

A new argument was added upstream: https://github.com/huggingface/transformers/blame/main/src/transformers/models/llama/modeling_llama.py#L316
---
 intel_npu_acceleration_library/nn/llm.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py
index 8cf6cd3..ff97a15 100644
--- a/intel_npu_acceleration_library/nn/llm.py
+++ b/intel_npu_acceleration_library/nn/llm.py
@@ -12,7 +12,7 @@
 from intel_npu_acceleration_library.nn import Linear
 from intel_npu_acceleration_library.backend import run_factory, MLP
 from functools import partial
-from typing import Optional, List, Generator
+from typing import Optional, List, Generator, Tuple
 from transformers.cache_utils import Cache
 import torch
 import uuid
@@ -169,6 +169,7 @@ def forward(
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in Transformers v4.45
     ):
         """Torch module forward method.
 
@@ -180,6 +181,7 @@ def forward(
             output_attentions (Optional[bool], optional): Whether or not to return the attentions tensors of all attention layers.. Defaults to False.
             use_cache (Optional[bool], optional): If set to `True`, `past_key_values` key value states are returned. Defaults to False.
             cache_position (Optional[torch.LongTensor], optional): Cache position useful for static cache applications . Defaults to None.
+            position_embeddings (Optional[Tuple[torch.Tensor, torch.Tensor]], optional): Precomputed `cos` and `sin` rotary embeddings, calculated once by the outer `LlamaModel` and passed in; computed locally when None. Defaults to None.
 
         Returns:
             _type_: result
@@ -202,7 +204,10 @@ def forward(
             bsz, q_len, self.num_key_value_heads, self.head_dim
         ).transpose(1, 2)
 
-        cos, sin = self.rotary_emb(value_states, position_ids)
+        if position_embeddings is None:
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
 
         query_states, key_states = apply_rotary_pos_emb(
             query_states, key_states, cos, sin, position_ids

From b6a9e8b727246606ce64a7407f8a1a91c9e2123d Mon Sep 17 00:00:00 2001
From: Nagico
Date: Thu, 25 Jul 2024 12:53:08 +0800
Subject: [PATCH 2/3] feat: update transformers version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d22de26..14609e2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 numpy
 torch
-transformers>=4.39.3
+transformers>=4.43.0
 neural-compressor
\ No newline at end of file

From a5d7bf3c6c972113eac647ebd9f68196ab197eba Mon Sep 17 00:00:00 2001
From: Nagico
Date: Thu, 25 Jul 2024 12:57:11 +0800
Subject: [PATCH 3/3] style: fix incorrect style

---
 intel_npu_acceleration_library/nn/llm.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py
index ff97a15..eeee94d 100644
--- a/intel_npu_acceleration_library/nn/llm.py
+++ b/intel_npu_acceleration_library/nn/llm.py
@@ -169,7 +169,9 @@ def forward(
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in Transformers v4.45
+        position_embeddings: Optional[
+            Tuple[torch.Tensor, torch.Tensor]
+        ] = None,  # will become mandatory in Transformers v4.45
     ):
         """Torch module forward method.
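
Note for reviewers (illustration only, not part of the patch series): below is a minimal, self-contained sketch of the calling convention PATCH 1/3 enables. ToyRotaryEmbedding and ToyAttention are hypothetical stand-ins written for this note; they are not from intel_npu_acceleration_library or transformers, but the branch inside ToyAttention.forward mirrors the fallback added by the patch.

from typing import Optional, Tuple

import torch


class ToyRotaryEmbedding(torch.nn.Module):
    """Minimal LLaMA-style rotary embedding: returns (cos, sin) tables."""

    def __init__(self, dim: int):
        super().__init__()
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, x: torch.Tensor, position_ids: torch.LongTensor):
        # angles: (batch, seq_len, dim // 2), duplicated to dim as in LLaMA RoPE
        angles = position_ids[..., None].float() * self.inv_freq
        emb = torch.cat((angles, angles), dim=-1)
        return emb.cos(), emb.sin()


class ToyAttention(torch.nn.Module):
    """Stand-in for the patched attention module (rotary part only)."""

    def __init__(self, head_dim: int = 64):
        super().__init__()
        self.rotary_emb = ToyRotaryEmbedding(head_dim)

    def forward(
        self,
        value_states: torch.Tensor,
        position_ids: torch.LongTensor,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        # Same fallback introduced by PATCH 1/3: prefer the (cos, sin) computed
        # once by the outer model, otherwise compute them locally.
        if position_embeddings is None:
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        return cos, sin


attn = ToyAttention(head_dim=64)
hidden = torch.zeros(1, 8, 64)
position_ids = torch.arange(8).unsqueeze(0)

# Old-style call: no position_embeddings, the module computes RoPE itself.
cos_a, sin_a = attn(hidden, position_ids)
# New-style call: pass the precomputed (cos, sin) tuple through.
cos_b, sin_b = attn(hidden, position_ids, position_embeddings=(cos_a, sin_a))
assert torch.equal(cos_a, cos_b) and torch.equal(sin_a, sin_b)

The upstream change this series tracks has LlamaModel compute the rotary cos/sin once and hand the tuple to every decoder layer, so keeping the local rotary_emb fallback lets older callers that omit position_embeddings keep working while new transformers versions pass it in.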