Commit

pre-commit: running and fixing...
github-actions[bot] committed Nov 22, 2023
1 parent a1acab4 commit 6ba9332
Showing 19 changed files with 66 additions and 221 deletions.
5 changes: 1 addition & 4 deletions notebooks/examples/gpt_model_training.ipynb

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion src/__init__.py
@@ -1 +0,0 @@
from src.utils.config import config
2 changes: 0 additions & 2 deletions src/data/__init__.py
@@ -1,2 +0,0 @@
from src.data.dataset import NextTokenDataset, NextTokenRandomDataset
from src.data.tokenizer import CharTokenizer
7 changes: 0 additions & 7 deletions src/model/__init__.py
@@ -1,7 +0,0 @@
from src.model.bigram_language_model.bigram import BigramLanguageModel
from src.model.gpt_language_model.attention import MultiHeadAttention, SelfAttentionHead
from src.model.gpt_language_model.feed_forward import FeedForward
from src.model.gpt_language_model.gpt import GPTLanguageModel
from src.model.gpt_language_model.transformer_block import TransformerBlock
from src.model.lr_schedulers import CosineWarmupLRScheduler
from src.model.trainer import Trainer
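
These hunks only delete package-level re-exports; after the change, callers presumably import classes from their defining modules directly. A hedged illustration reusing module paths taken from the removed lines:

# Illustrative only: import from the defining modules instead of the package root
from src.data.tokenizer import CharTokenizer
from src.model.gpt_language_model.gpt import GPTLanguageModel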
11 changes: 2 additions & 9 deletions src/model/bigram_language_model/bigram.py
@@ -50,17 +50,10 @@ def loss(self, logits: Tensor, targets: Tensor) -> Tensor:
tensor with loss value (of how good model's predictions are)
"""
B, T, C = logits.shape # noqa: N806
return F.cross_entropy(
logits.view(B * T, C),
targets.view(B * T),
)
return F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

def generate(
self,
idx: Tensor,
max_new_tokens: int,
temperature: float = 1.0,
top_k_logits: Optional[int] = None,
self, idx: Tensor, max_new_tokens: int, temperature: float = 1.0, top_k_logits: Optional[int] = None
) -> Tensor:
"""Generate new character after the current one.
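
The hunk above collapses the cross-entropy call onto one line; for context, that call flattens the batch and time dimensions before computing the loss. A minimal standalone sketch of the reshape, with illustrative sizes:

import torch
import torch.nn.functional as F

B, T, C = 2, 8, 65  # batch size, context length, vocabulary size (illustrative values)
logits = torch.randn(B, T, C)
targets = torch.randint(0, C, (B, T))

# F.cross_entropy expects (N, C) logits and (N,) targets,
# so batch and time are flattened into a single dimension
loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))
print(loss.item())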
57 changes: 11 additions & 46 deletions src/model/generate.py
@@ -83,7 +83,7 @@ def generate_new_tokens(
elif size and gpt2_config:
log_error(
"For GPT language model either size or gpt2_config has to be provided, not both, "
f"but was provided size={size} and gpt2_config={gpt2_config}",
f"but was provided size={size} and gpt2_config={gpt2_config}"
)

# if all checks are passed, that means that either size or gpt2_config is provided
@@ -107,11 +107,7 @@ def generate_new_tokens(
# use only those kwargs that are accepted by function
kwargs = grab_arguments(
func=model.generate,
kwargs={
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"use_kv_cache": use_kv_cache,
},
kwargs={"max_new_tokens": max_new_tokens, "temperature": temperature, "use_kv_cache": use_kv_cache},
)
# model returns indices of the dictionary
new_token_indices = model.generate(init_context, **kwargs).squeeze().tolist()
@@ -125,24 +121,16 @@ def main() -> None:
"""Generate new tokens from either GPT or a simple bigram language model."""
# main parser will store subparsers, shared parser - arguments that are shared between subparsers
main_parser = argparse.ArgumentParser(
description="Generate new tokens with Bigram or GPT model",
formatter_class=argparse.RawTextHelpFormatter,
description="Generate new tokens with Bigram or GPT model", formatter_class=argparse.RawTextHelpFormatter
)
shared_parser = argparse.ArgumentParser(add_help=False)
# ordering matters: first shared arguments, then - subparsers
# ---------- shared arguments ----------
shared_parser.add_argument(
"--device",
help="Optionally you can select device on which the model will be trained",
required=False,
type=str,
"--device", help="Optionally you can select device on which the model will be trained", required=False, type=str
)
shared_parser.add_argument(
"--max-new-tokens",
default=100,
help="How many new tokens do you want to generate",
required=False,
type=int,
"--max-new-tokens", default=100, help="How many new tokens do you want to generate", required=False, type=int
)
shared_parser.add_argument(
"--temperature",
@@ -153,39 +141,22 @@ def main() -> None:
type=float,
)
shared_parser.add_argument(
"--fix-seed",
help="Make token generation deterministic",
action="store_true",
required=False,
"--fix-seed", help="Make token generation deterministic", action="store_true", required=False
)
shared_parser.add_argument(
"--continue-tokens",
default=" ",
help="Generation should continue these tokens",
required=False,
type=str,
"--continue-tokens", default=" ", help="Generation should continue these tokens", required=False, type=str
)
# ---------- subparsers ----------
subparsers = main_parser.add_subparsers(dest="model", description="Choose model type")
# bigram subparser
bigram_subparser = subparsers.add_parser("bigram", parents=[shared_parser])
bigram_subparser.add_argument(
"--size",
"-s",
choices=["large"],
help="The size of the Bigram model",
required=True,
type=str,
"--size", "-s", choices=["large"], help="The size of the Bigram model", required=True, type=str
)
# gpt subparser
gpt_subparser = subparsers.add_parser("gpt", parents=[shared_parser])
gpt_subparser.add_argument(
"--size",
"-s",
choices=["small", "medium", "large"],
help="The size of the GPT model",
required=False,
type=str,
"--size", "-s", choices=["small", "medium", "large"], help="The size of the GPT model", required=False, type=str
)
gpt_subparser.add_argument(
"--gpt2-config",
@@ -195,10 +166,7 @@ def main() -> None:
type=str,
)
gpt_subparser.add_argument(
"--use-kv-cache",
help="Use kv-value cache to speed up token generation",
action="store_true",
required=False,
"--use-kv-cache", help="Use kv-value cache to speed up token generation", action="store_true", required=False
)
# combining 'help' output from both argparsers
shared_parser_help = (
@@ -209,10 +177,7 @@ def main() -> None:

# parser arguments
args = vars(main_parser.parse_args())
model_name = {
"bigram": BigramLanguageModel,
"gpt": GPTLanguageModel,
}[args.pop("model")]
model_name = {"bigram": BigramLanguageModel, "gpt": GPTLanguageModel}[args.pop("model")]

# run token generation
generate_new_tokens(model_name, **args)
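
The reflowed CLI in this file relies on argparse's parent-parser mechanism so that shared flags appear on every subcommand. A condensed sketch of that pattern with a reduced set of arguments (flag names mirror the diff, values are illustrative):

import argparse

main_parser = argparse.ArgumentParser(description="Generate new tokens with Bigram or GPT model")
# add_help=False keeps the shared parser from clashing with each subparser's own -h/--help
shared_parser = argparse.ArgumentParser(add_help=False)
shared_parser.add_argument("--max-new-tokens", default=100, required=False, type=int)

subparsers = main_parser.add_subparsers(dest="model")
subparsers.add_parser("bigram", parents=[shared_parser])
gpt_subparser = subparsers.add_parser("gpt", parents=[shared_parser])
gpt_subparser.add_argument("--use-kv-cache", action="store_true", required=False)

# everything except the chosen subcommand is forwarded as keyword arguments
args = vars(main_parser.parse_args(["gpt", "--max-new-tokens", "50", "--use-kv-cache"]))
model = args.pop("model")
print(model, args)  # gpt {'max_new_tokens': 50, 'use_kv_cache': True}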
26 changes: 7 additions & 19 deletions src/model/gpt_language_model/attention.py
@@ -10,14 +10,7 @@

class SelfAttentionHead(nn.Module):
def __init__(
self,
embeddings_size: int,
context_size: int,
head_size: int,
bias: bool,
dropout: float,
*,
is_decoder: bool,
self, embeddings_size: int, context_size: int, head_size: int, bias: bool, dropout: float, *, is_decoder: bool
) -> None:
"""Single self-attention head.
@@ -186,7 +179,7 @@ def __init__(
if embeddings_size % num_heads != 0:
log_error(
"Embeddings size should be divisible by number of heads without remainder, "
f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}",
f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}"
)
head_size = embeddings_size // num_heads

@@ -209,7 +202,7 @@ def __init__(
is_decoder=self.is_decoder,
)
for _ in range(self.num_heads)
],
]
)

# if after concatenation the size of channels is bigger than embeddings size
@@ -260,15 +253,11 @@ def forward(self, x: Tensor, kv_cache: Optional[Tensor]) -> Tensor:

if all(x is not None for x in kv_cache):
kv_cache = torch.stack(
kv_cache,
dim=-2,
kv_cache, dim=-2
) # num_heads * (2, B, T, head_size) -> (2, B, T, num_heads, head_size)
kv_cache = kv_cache.transpose(2, 3) # (2, B, num_heads, T, head_size)

return (
output, # (B, T, num_heads * head_size)
kv_cache, # num_heads * None | (2, B, num_heads, T, head_size)
)
return (output, kv_cache) # (B, T, num_heads * head_size) # num_heads * None | (2, B, num_heads, T, head_size)


class CausalSelfAttention(nn.Module):
@@ -319,7 +308,7 @@ def __init__(
if embeddings_size % num_heads != 0:
log_error(
"Embeddings size should be divisible by the number of heads without a residual, "
f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}",
f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}"
)
head_size = embeddings_size // num_heads

@@ -373,8 +362,7 @@ def forward(self, x: Tensor, kv_cache: Optional[Tensor]) -> Tensor:

# single pass for query, key and value; that's why we need to split into 3 parts
query, key, value = self.causal_self_attention(x).split(
self.head_size * self.num_heads,
dim=-1,
self.head_size * self.num_heads, dim=-1
) # (B, T, C) -> (B, T, 3 * hs * nh) -> (B, T, hs * nh)

# transform (B, T, nh * hs) -> (B, nh, T, hs) so it's similar to multi-head attention
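
The split call just above separates a single fused projection into query, key and value. A shape-only sketch of that step, with illustrative dimensions rather than the repository's configuration:

import torch
from torch import nn

B, T, num_heads, head_size = 2, 8, 4, 16  # illustrative sizes
embeddings_size = num_heads * head_size

# one fused linear layer produces query, key and value in a single matmul
causal_self_attention = nn.Linear(embeddings_size, 3 * embeddings_size, bias=False)

x = torch.randn(B, T, embeddings_size)
query, key, value = causal_self_attention(x).split(head_size * num_heads, dim=-1)

# (B, T, nh * hs) -> (B, nh, T, hs), the layout multi-head attention works with
query = query.view(B, T, num_heads, head_size).transpose(1, 2)
print(query.shape)  # torch.Size([2, 4, 8, 16])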
25 changes: 7 additions & 18 deletions src/model/gpt_language_model/gpt.py
@@ -94,7 +94,7 @@ def __init__(
use_causal_self_attention=True,
)
for _ in range(self.num_layers)
],
]
)
self.layer_norm_final = LayerNorm(self.embeddings_size, bias=self.bias) # final layer norm
self.language_model_head = nn.Linear(self.embeddings_size, self.vocab_size, bias=False)
@@ -114,8 +114,8 @@ def __init__(
# report number of parameters
logger.debug(
"GPT language model is created with number of parameters: {:.2f} million".format(
self.__get_parameters_number() / 1e6,
),
self.__get_parameters_number() / 1e6
)
)

def __get_parameters_number(self, exclude_positional_embeddings: bool = True) -> int:
@@ -147,13 +147,7 @@ def __init_weights(self, module: torch.nn.modules) -> None:
if hasattr(module, "bias") and module.bias is not None:
torch.nn.init.zeros_(module.bias)

def forward(
self,
idx: Tensor,
*,
inference: bool = False,
kv_cache: Optional[List[Tensor]] = None,
) -> Tensor:
def forward(self, idx: Tensor, *, inference: bool = False, kv_cache: Optional[List[Tensor]] = None) -> Tensor:
"""Do the whole forward pass for decoder part of transformer.
This forward method includes all steps for decoder:
@@ -279,10 +273,7 @@ def loss(self, logits: Tensor, targets: Tensor) -> Tensor:
tensor with loss value (of how good model's predictions are)
"""
B, T, C = logits.shape # noqa: N806
return F.cross_entropy(
logits.view(B * T, C),
targets.view(B * T),
)
return F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

@classmethod
def from_pretrained(cls: "GPTLanguageModel", gpt2_type: str) -> "GPTLanguageModel":
@@ -398,7 +389,7 @@ def sync_name(name: str) -> str:
if source_weights.shape != target_state_dict[target_key].shape:
log_error(
f"Shape mismatch: shape of source '{source_weights.shape}' and destination - "
f"'{target_state_dict[target_key].shape}'",
f"'{target_state_dict[target_key].shape}'"
)
with torch.no_grad():
target_state_dict[target_key].copy_(source_weights)
@@ -473,9 +464,7 @@ def generate(
context = idx[:, -1:]
# get the predictions
logits, kv_cache = self(
context,
inference=True,
kv_cache=kv_cache if use_kv_cache else None,
context, inference=True, kv_cache=kv_cache if use_kv_cache else None
) # (B, T, C), with inference=True -> (1, 1, C)
# focus only on the last time step and scale by desired temperature
logits = logits[:, -1, :] / temperature # becomes (B, C)
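
The generation loop shown above keeps only the last time step, scales it by the temperature and samples from the resulting distribution. A minimal sketch of that sampling step with made-up logits:

import torch
import torch.nn.functional as F

B, T, C = 1, 8, 65  # illustrative batch, context length and vocabulary size
logits = torch.randn(B, T, C)
temperature = 0.8

logits = logits[:, -1, :] / temperature               # focus on the last time step -> (B, C)
probs = F.softmax(logits, dim=-1)                     # turn logits into probabilities
next_token = torch.multinomial(probs, num_samples=1)  # sample one token index -> (B, 1)
print(next_token.shape)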
24 changes: 7 additions & 17 deletions src/model/gpt_language_model/peft/lora.py
@@ -56,13 +56,7 @@


class LoRALayer:
def __init__(
self,
r: int,
lora_alpha: int,
lora_dropout: float,
merge_weights: bool,
) -> None:
def __init__(self, r: int, lora_alpha: int, lora_dropout: float, merge_weights: bool) -> None:
"""Store LoRA specific attributes in a class.
Parameters
@@ -160,7 +154,7 @@ def __init__(
if r > 0 and any(enable_lora):
self.lora_A = nn.Parameter(self.weight.new_zeros((r * sum(enable_lora), in_features))) # (4, 128)
self.lora_B = nn.Parameter(
self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r)), # (256, 2)
self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r)) # (256, 2)
) # weights for Conv1D with groups=sum(enable_lora)
# Notes about shapes above
# - self.lora_A has shape (4, 128): 4 because rank is 2 and LoRA is applied only to two matrices;
@@ -194,8 +188,7 @@ def __init__(
# | query | key | value |
# ----------------------------------------
self.lora_ind = self.weight.new_zeros((out_features,), dtype=torch.bool).view(
len(enable_lora),
-1,
len(enable_lora), -1
) # (3, 128)
self.lora_ind[enable_lora, :] = True # (3, 128)
self.lora_ind = self.lora_ind.view(-1) # (384,)
@@ -246,8 +239,7 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor:
result = x.new_zeros((*x.shape[:-1], self.out_features)) # (64, 64, 384)
result = result.view(-1, self.out_features) # (4096, 384)
result[:, self.lora_ind] = x.reshape(
-1,
self.out_features // len(self.enable_lora) * sum(self.enable_lora),
-1, self.out_features // len(self.enable_lora) * sum(self.enable_lora)
) # (4096, 256)
return result.view((*x.shape[:-1], self.out_features)).transpose(0, 1) # (64, 64, 384)

@@ -290,7 +282,7 @@ def T(w: torch.Tensor) -> torch.Tensor: # noqa: N802
self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1)
groups=sum(self.enable_lora),
).squeeze( # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128)
0,
0
) # (1, 256, 128) -> (256, 128)
# -1: W = W - delta_W (unmerge), +1: W = W + delta_W (merge)
sign = -1 if mode else 1
@@ -335,8 +327,7 @@ def T(w: torch.Tensor) -> torch.Tensor: # noqa: N802
result = F.linear(x, T(self.weight), bias=self.bias) # (64, 64, 128) @ (384, 128) -> (64, 64, 384)
if self.r > 0:
after_A = F.linear( # noqa: N806
self.lora_dropout(x),
self.lora_A,
self.lora_dropout(x), self.lora_A
) # (64, 64, 128) @ (4, 128) -> (64, 64, 4)

# For F.conv1d:
@@ -349,8 +340,7 @@ def T(w: torch.Tensor) -> torch.Tensor: # noqa: N802
self.lora_B.unsqueeze(-1), # (256, 2) -> (256, 2, 1)
groups=sum(self.enable_lora),
).transpose( # (64, 4, 64) @ (256, 2, 1) -> (64, 256, 64)
-2,
-1,
-2, -1
) # (64, 256, 64) -> (64, 64, 256)

# (64, 64, 256) after zero_pad (64, 64, 384)
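
The LoRA hunks above deal with a merged qkv projection; the underlying idea is easier to see on a plain linear layer. A hedged sketch of a generic low-rank update (W x + scaling * B A x), not the repository's MergedLinear implementation:

import torch
import torch.nn.functional as F
from torch import nn

in_features, out_features, r, lora_alpha = 128, 256, 2, 4  # illustrative sizes
base = nn.Linear(in_features, out_features, bias=False)    # stands in for the frozen pretrained weight

lora_A = nn.Parameter(torch.randn(r, in_features) * 0.01)  # low-rank factors (toy initialisation)
lora_B = nn.Parameter(torch.zeros(out_features, r))        # zero at start, so the update begins as a no-op
scaling = lora_alpha / r

x = torch.randn(8, in_features)
# frozen base projection plus the trainable low-rank correction
y = F.linear(x, base.weight) + F.linear(F.linear(x, lora_A), lora_B) * scaling
print(y.shape)  # torch.Size([8, 256])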
5 changes: 1 addition & 4 deletions src/model/gpt_language_model/transformer_block.py
@@ -2,10 +2,7 @@

from torch import Tensor, nn

from src.model.gpt_language_model.attention import (
CausalSelfAttention,
MultiHeadAttention,
)
from src.model.gpt_language_model.attention import CausalSelfAttention, MultiHeadAttention
from src.model.gpt_language_model.feed_forward import FeedForward


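
This file only consolidates imports for the block that combines attention with a feed-forward network. As a rough, generic illustration of such a block (a pre-norm residual layout using PyTorch's built-in attention, which may differ from the repository's exact implementation):

import torch
from torch import nn

class ToyTransformerBlock(nn.Module):
    """Generic pre-norm residual block: attention sub-layer followed by a feed-forward sub-layer."""

    def __init__(self, embeddings_size: int, num_heads: int) -> None:
        super().__init__()
        self.norm_1 = nn.LayerNorm(embeddings_size)
        self.attention = nn.MultiheadAttention(embeddings_size, num_heads, batch_first=True)
        self.norm_2 = nn.LayerNorm(embeddings_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embeddings_size, 4 * embeddings_size),
            nn.GELU(),
            nn.Linear(4 * embeddings_size, embeddings_size),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normed = self.norm_1(x)
        attn_out, _ = self.attention(normed, normed, normed, need_weights=False)
        x = x + attn_out                                   # residual connection around attention
        return x + self.feed_forward(self.norm_2(x))       # residual connection around feed-forward

x = torch.randn(2, 8, 64)
print(ToyTransformerBlock(64, 4)(x).shape)  # torch.Size([2, 8, 64])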