From bbe43fa1a7ad4c4215578edd64f6a7ba6f0b0528 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 22 Nov 2023 10:56:02 +0000 Subject: [PATCH] pre-commit: running and fixing... --- notebooks/examples/gpt_model_training.ipynb | 5 +- src/__init__.py | 1 - src/data/__init__.py | 2 - src/model/__init__.py | 7 --- src/model/bigram_language_model/bigram.py | 11 +--- src/model/generate.py | 57 ++++--------------- src/model/gpt_language_model/attention.py | 26 +++------ src/model/gpt_language_model/gpt.py | 25 +++----- src/model/gpt_language_model/peft/lora.py | 24 +++----- .../gpt_language_model/transformer_block.py | 5 +- src/model/lr_schedulers.py | 13 +---- src/model/train.py | 52 +++-------------- src/model/trainer.py | 14 +---- src/utils/__init__.py | 11 ---- src/utils/model.py | 7 +-- tests/smoke/dataset_test.py | 2 +- tests/smoke/generate_test.py | 10 ++-- tests/smoke/model_test.py | 11 ++-- tests/smoke/train_test.py | 4 +- 19 files changed, 66 insertions(+), 221 deletions(-) diff --git a/notebooks/examples/gpt_model_training.ipynb b/notebooks/examples/gpt_model_training.ipynb index 3486112..598ac1e 100644 --- a/notebooks/examples/gpt_model_training.ipynb +++ b/notebooks/examples/gpt_model_training.ipynb @@ -281,10 +281,7 @@ "source": [ "set_seed(config.seed)\n", "\n", - "model = GPTLanguageModel(\n", - " vocab_size=tokenizer.vocab_size,\n", - " **grab_arguments(GPTLanguageModel, model_config),\n", - ")\n", + "model = GPTLanguageModel(vocab_size=tokenizer.vocab_size, **grab_arguments(GPTLanguageModel, model_config))\n", "optimizer = torch.optim.AdamW(model.parameters(), lr=model_config.learning_rate)\n", "trainer = Trainer(model, optimizer, train_dataloader, test_dataloader, DEVICE)\n", "trainer.train(epochs=model_config.epochs)" diff --git a/src/__init__.py b/src/__init__.py index b119375..e69de29 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1 +0,0 @@ -from src.utils.config import config diff --git a/src/data/__init__.py b/src/data/__init__.py index 0df61d5..e69de29 100644 --- a/src/data/__init__.py +++ b/src/data/__init__.py @@ -1,2 +0,0 @@ -from src.data.dataset import NextTokenDataset, NextTokenRandomDataset -from src.data.tokenizer import CharTokenizer diff --git a/src/model/__init__.py b/src/model/__init__.py index ebd6c82..e69de29 100644 --- a/src/model/__init__.py +++ b/src/model/__init__.py @@ -1,7 +0,0 @@ -from src.model.bigram_language_model.bigram import BigramLanguageModel -from src.model.gpt_language_model.attention import MultiHeadAttention, SelfAttentionHead -from src.model.gpt_language_model.feed_forward import FeedForward -from src.model.gpt_language_model.gpt import GPTLanguageModel -from src.model.gpt_language_model.transformer_block import TransformerBlock -from src.model.lr_schedulers import CosineWarmupLRScheduler -from src.model.trainer import Trainer diff --git a/src/model/bigram_language_model/bigram.py b/src/model/bigram_language_model/bigram.py index ab5146b..c463e35 100644 --- a/src/model/bigram_language_model/bigram.py +++ b/src/model/bigram_language_model/bigram.py @@ -50,17 +50,10 @@ def loss(self, logits: Tensor, targets: Tensor) -> Tensor: tensor with loss value (of how good model's predictions are) """ B, T, C = logits.shape # noqa: N806 - return F.cross_entropy( - logits.view(B * T, C), - targets.view(B * T), - ) + return F.cross_entropy(logits.view(B * T, C), targets.view(B * T)) def generate( - self, - idx: Tensor, - max_new_tokens: int, - temperature: float = 1.0, - top_k_logits: Optional[int] = None, + self, idx: Tensor, max_new_tokens: int, temperature: float = 1.0, top_k_logits: Optional[int] = None ) -> Tensor: """Generate new character after the current one. diff --git a/src/model/generate.py b/src/model/generate.py index 6988f0b..441351b 100644 --- a/src/model/generate.py +++ b/src/model/generate.py @@ -83,7 +83,7 @@ def generate_new_tokens( elif size and gpt2_config: log_error( "For GPT language model either size or gpt2_config has to be provided, not both, " - f"but was provided size={size} and gpt2_config={gpt2_config}", + f"but was provided size={size} and gpt2_config={gpt2_config}" ) # if all checks are passed, that means that either size of gpt2_config is provided @@ -107,11 +107,7 @@ def generate_new_tokens( # use only those kwargs that are accepted by function kwargs = grab_arguments( func=model.generate, - kwargs={ - "max_new_tokens": max_new_tokens, - "temperature": temperature, - "use_kv_cache": use_kv_cache, - }, + kwargs={"max_new_tokens": max_new_tokens, "temperature": temperature, "use_kv_cache": use_kv_cache}, ) # model returns indices of the dictionary new_token_indices = model.generate(init_context, **kwargs).squeeze().tolist() @@ -125,24 +121,16 @@ def main() -> None: """Generate new tokens from either GPT or a simple bigram language model.""" # main parser will store subparsers, shared parser - arguments that are shared between subparsers main_parser = argparse.ArgumentParser( - description="Generate new tokens with Bigram or GPT model", - formatter_class=argparse.RawTextHelpFormatter, + description="Generate new tokens with Bigram or GPT model", formatter_class=argparse.RawTextHelpFormatter ) shared_parser = argparse.ArgumentParser(add_help=False) # ordering matters: first shared arguments, then - subparsers # ---------- shared arguments ---------- shared_parser.add_argument( - "--device", - help="Optionally you can select device on which the model will be trained", - required=False, - type=str, + "--device", help="Optionally you can select device on which the model will be trained", required=False, type=str ) shared_parser.add_argument( - "--max-new-tokens", - default=100, - help="How many new tokens do you want to generate", - required=False, - type=int, + "--max-new-tokens", default=100, help="How many new tokens do you want to generate", required=False, type=int ) shared_parser.add_argument( "--temperature", @@ -153,39 +141,22 @@ def main() -> None: type=float, ) shared_parser.add_argument( - "--fix-seed", - help="Make token generation deterministic", - action="store_true", - required=False, + "--fix-seed", help="Make token generation deterministic", action="store_true", required=False ) shared_parser.add_argument( - "--continue-tokens", - default=" ", - help="Generation should continue these tokens", - required=False, - type=str, + "--continue-tokens", default=" ", help="Generation should continue these tokens", required=False, type=str ) # ---------- subparsers ---------- subparsers = main_parser.add_subparsers(dest="model", description="Choose model type") # bigram subparser bigram_subparser = subparsers.add_parser("bigram", parents=[shared_parser]) bigram_subparser.add_argument( - "--size", - "-s", - choices=["large"], - help="The size of the Bigram model", - required=True, - type=str, + "--size", "-s", choices=["large"], help="The size of the Bigram model", required=True, type=str ) # gpt subparser gpt_subparser = subparsers.add_parser("gpt", parents=[shared_parser]) gpt_subparser.add_argument( - "--size", - "-s", - choices=["small", "medium", "large"], - help="The size of the GPT model", - required=False, - type=str, + "--size", "-s", choices=["small", "medium", "large"], help="The size of the GPT model", required=False, type=str ) gpt_subparser.add_argument( "--gpt2-config", @@ -195,10 +166,7 @@ def main() -> None: type=str, ) gpt_subparser.add_argument( - "--use-kv-cache", - help="Use kv-value cache to speed up token generation", - action="store_true", - required=False, + "--use-kv-cache", help="Use kv-value cache to speed up token generation", action="store_true", required=False ) # combining 'help' output from both argparsers shared_parser_help = ( @@ -209,10 +177,7 @@ def main() -> None: # parser arguments args = vars(main_parser.parse_args()) - model_name = { - "bigram": BigramLanguageModel, - "gpt": GPTLanguageModel, - }[args.pop("model")] + model_name = {"bigram": BigramLanguageModel, "gpt": GPTLanguageModel}[args.pop("model")] # run token generation generate_new_tokens(model_name, **args) diff --git a/src/model/gpt_language_model/attention.py b/src/model/gpt_language_model/attention.py index 7a8e220..1357bb5 100644 --- a/src/model/gpt_language_model/attention.py +++ b/src/model/gpt_language_model/attention.py @@ -10,14 +10,7 @@ class SelfAttentionHead(nn.Module): def __init__( - self, - embeddings_size: int, - context_size: int, - head_size: int, - bias: bool, - dropout: float, - *, - is_decoder: bool, + self, embeddings_size: int, context_size: int, head_size: int, bias: bool, dropout: float, *, is_decoder: bool ) -> None: """Single self-attention head. @@ -186,7 +179,7 @@ def __init__( if embeddings_size % num_heads != 0: log_error( "Embeddings size should be divisible by number of heads without remainder, " - f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}", + f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}" ) head_size = embeddings_size // num_heads @@ -209,7 +202,7 @@ def __init__( is_decoder=self.is_decoder, ) for _ in range(self.num_heads) - ], + ] ) # if after concatenation the size of channels is bigger than embeddings size @@ -260,15 +253,11 @@ def forward(self, x: Tensor, kv_cache: Optional[Tensor]) -> Tensor: if all(x is not None for x in kv_cache): kv_cache = torch.stack( - kv_cache, - dim=-2, + kv_cache, dim=-2 ) # num_heads * (2, B, T, head_size) -> (2, B, T, num_heads, head_size) kv_cache = kv_cache.transpose(2, 3) # (2, B, num_heads, T, head_size) - return ( - output, # (B, T, num_heads * head_size) - kv_cache, # num_heads * None | (2, B, num_heads, T, head_size) - ) + return (output, kv_cache) # (B, T, num_heads * head_size) # num_heads * None | (2, B, num_heads, T, head_size) class CausalSelfAttention(nn.Module): @@ -319,7 +308,7 @@ def __init__( if embeddings_size % num_heads != 0: log_error( "Embeddings size should be divisible by the number of heads without a residual, " - f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}", + f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}" ) head_size = embeddings_size // num_heads @@ -373,8 +362,7 @@ def forward(self, x: Tensor, kv_cache: Optional[Tensor]) -> Tensor: # single pass for query, key and value; that's why we need to split into 3 parts query, key, value = self.causal_self_attention(x).split( - self.head_size * self.num_heads, - dim=-1, + self.head_size * self.num_heads, dim=-1 ) # (B, T, C) -> (B, T, 3 * hs * nh) -> (B, T, hs * nh) # transform (B, T, nh * hs) -> (B, nh, T, hs) so it's similar to multi-head attention diff --git a/src/model/gpt_language_model/gpt.py b/src/model/gpt_language_model/gpt.py index 37e9f78..23c899d 100644 --- a/src/model/gpt_language_model/gpt.py +++ b/src/model/gpt_language_model/gpt.py @@ -94,7 +94,7 @@ def __init__( use_causal_self_attention=True, ) for _ in range(self.num_layers) - ], + ] ) self.layer_norm_final = LayerNorm(self.embeddings_size, bias=self.bias) # final layer norm self.language_model_head = nn.Linear(self.embeddings_size, self.vocab_size, bias=False) @@ -114,8 +114,8 @@ def __init__( # report number of parameters logger.debug( "GPT language model is created with number of parameters: {:.2f} million".format( - self.__get_parameters_number() / 1e6, - ), + self.__get_parameters_number() / 1e6 + ) ) def __get_parameters_number(self, exclude_positional_embeddings: bool = True) -> int: @@ -147,13 +147,7 @@ def __init_weights(self, module: torch.nn.modules) -> None: if hasattr(module, "bias") and module.bias is not None: torch.nn.init.zeros_(module.bias) - def forward( - self, - idx: Tensor, - *, - inference: bool = False, - kv_cache: Optional[List[Tensor]] = None, - ) -> Tensor: + def forward(self, idx: Tensor, *, inference: bool = False, kv_cache: Optional[List[Tensor]] = None) -> Tensor: """Do the whole forward pass for decoder part of transformer. This forward method includes all steps for decoder: @@ -279,10 +273,7 @@ def loss(self, logits: Tensor, targets: Tensor) -> Tensor: tensor with loss value (of how good model's predictions are) """ B, T, C = logits.shape # noqa: N806 - return F.cross_entropy( - logits.view(B * T, C), - targets.view(B * T), - ) + return F.cross_entropy(logits.view(B * T, C), targets.view(B * T)) @classmethod def from_pretrained(cls: "GPTLanguageModel", gpt2_type: str) -> "GPTLanguageModel": @@ -398,7 +389,7 @@ def sync_name(name: str) -> str: if source_weights.shape != target_state_dict[target_key].shape: log_error( f"Shape mismatch: shape of source '{source_weights.shape}' and destination - " - f"'{target_state_dict[target_key].shape}'", + f"'{target_state_dict[target_key].shape}'" ) with torch.no_grad(): target_state_dict[target_key].copy_(source_weights) @@ -473,9 +464,7 @@ def generate( context = idx[:, -1:] # get the predictions logits, kv_cache = self( - context, - inference=True, - kv_cache=kv_cache if use_kv_cache else None, + context, inference=True, kv_cache=kv_cache if use_kv_cache else None ) # (B, T, C), with inference=True -> (1, 1, C) # focus only on the last time step and scale by desired temperature logits = logits[:, -1, :] / temperature # becomes (B, C) diff --git a/src/model/gpt_language_model/peft/lora.py b/src/model/gpt_language_model/peft/lora.py index d40e490..7bd701b 100644 --- a/src/model/gpt_language_model/peft/lora.py +++ b/src/model/gpt_language_model/peft/lora.py @@ -56,13 +56,7 @@ class LoRALayer: - def __init__( - self, - r: int, - lora_alpha: int, - lora_dropout: float, - merge_weights: bool, - ) -> None: + def __init__(self, r: int, lora_alpha: int, lora_dropout: float, merge_weights: bool) -> None: """Store LoRA specific attributes in a class. Parameters @@ -160,7 +154,7 @@ def __init__( if r > 0 and any(enable_lora): self.lora_A = nn.Parameter(self.weight.new_zeros((r * sum(enable_lora), in_features))) # (4, 128) self.lora_B = nn.Parameter( - self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r)), # (256, 2) + self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r)) # (256, 2) ) # weights for Conv1D with groups=sum(enable_lora) # Notes about shapes above # - self.lora_A has shape (4, 128): 4 because rank is 2 and LoRA is applied only to two matrices; @@ -194,8 +188,7 @@ def __init__( # | query | key | value | # ---------------------------------------- self.lora_ind = self.weight.new_zeros((out_features,), dtype=torch.bool).view( - len(enable_lora), - -1, + len(enable_lora), -1 ) # (3, 128) self.lora_ind[enable_lora, :] = True # (3, 128) self.lora_ind = self.lora_ind.view(-1) # (384,) @@ -246,8 +239,7 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: result = x.new_zeros((*x.shape[:-1], self.out_features)) # (64, 64, 384) result = result.view(-1, self.out_features) # (4096, 384) result[:, self.lora_ind] = x.reshape( - -1, - self.out_features // len(self.enable_lora) * sum(self.enable_lora), + -1, self.out_features // len(self.enable_lora) * sum(self.enable_lora) ) # (4096, 256) return result.view((*x.shape[:-1], self.out_features)).transpose(0, 1) # (64, 64, 384) @@ -290,7 +282,7 @@ def T(w: torch.Tensor) -> torch.Tensor: # noqa: N802 self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1) groups=sum(self.enable_lora), ).squeeze( # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) - 0, + 0 ) # (1, 256, 128) -> (256, 128) # -1: W = W - delta_W (unmerge), +1: W = W + delta_W (merge) sign = -1 if mode else 1 @@ -335,8 +327,7 @@ def T(w: torch.Tensor) -> torch.Tensor: # noqa: N802 result = F.linear(x, T(self.weight), bias=self.bias) # (64, 64, 128) @ (384, 128) -> (64, 64, 384) if self.r > 0: after_A = F.linear( # noqa: N806 - self.lora_dropout(x), - self.lora_A, + self.lora_dropout(x), self.lora_A ) # (64, 64, 128) @ (4, 128) -> (64, 64, 4) # For F.conv1d: @@ -349,8 +340,7 @@ def T(w: torch.Tensor) -> torch.Tensor: # noqa: N802 self.lora_B.unsqueeze(-1), # (256, 2) -> (256, 2, 1) groups=sum(self.enable_lora), ).transpose( # (64, 4, 64) @ (256, 2, 1) -> (64, 256, 64) - -2, - -1, + -2, -1 ) # (64, 256, 64) -> (64, 64, 256) # (64, 64, 256) after zero_pad (64, 64, 384) diff --git a/src/model/gpt_language_model/transformer_block.py b/src/model/gpt_language_model/transformer_block.py index c1f9554..a50d280 100644 --- a/src/model/gpt_language_model/transformer_block.py +++ b/src/model/gpt_language_model/transformer_block.py @@ -2,10 +2,7 @@ from torch import Tensor, nn -from src.model.gpt_language_model.attention import ( - CausalSelfAttention, - MultiHeadAttention, -) +from src.model.gpt_language_model.attention import CausalSelfAttention, MultiHeadAttention from src.model.gpt_language_model.feed_forward import FeedForward diff --git a/src/model/lr_schedulers.py b/src/model/lr_schedulers.py index 73c37ff..354f0e9 100644 --- a/src/model/lr_schedulers.py +++ b/src/model/lr_schedulers.py @@ -36,11 +36,7 @@ def step(self, iteration: int) -> None: class CosineWarmupLRScheduler(LRSchedulerBase): def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_iters: int, - lr_decay_iters: int, - min_lr: Optional[float] = None, + self, optimizer: torch.optim.Optimizer, warmup_iters: int, lr_decay_iters: int, min_lr: Optional[float] = None ) -> None: """Cosine learning rate schedular with warmup. @@ -78,12 +74,7 @@ def _get_lr(self, iteration: int) -> float: class CustomLRScheduler(LRSchedulerBase): - def __init__( - self, - optimizer: torch.optim.Optimizer, - model_dim: int, - warmup_iters: int, - ) -> None: + def __init__(self, optimizer: torch.optim.Optimizer, model_dim: int, warmup_iters: int) -> None: """Learning rate scheduler according to the formula in the original Transformer. https://www.tensorflow.org/text/tutorials/transformer#set_up_the_optimizer diff --git a/src/model/train.py b/src/model/train.py index 8317150..da7d059 100644 --- a/src/model/train.py +++ b/src/model/train.py @@ -10,21 +10,9 @@ from src import config from src.data import CharTokenizer, NextTokenDataset -from src.model import ( - BigramLanguageModel, - CosineWarmupLRScheduler, - GPTLanguageModel, - Trainer, -) +from src.model import BigramLanguageModel, CosineWarmupLRScheduler, GPTLanguageModel, Trainer from src.model.gpt_language_model.peft.lora import lora, mark_only_lora_as_trainable -from src.utils import ( - RangeChecker, - get_device, - get_model_config, - grab_arguments, - pickle_dump, - set_seed, -) +from src.utils import RangeChecker, get_device, get_model_config, grab_arguments, pickle_dump, set_seed # TODO: perhaps it's about time to split this func into a set of smaller ones @@ -131,10 +119,7 @@ def train( # noqa: PLR0915 else: extra_args = {} optimizer = torch.optim.AdamW( - optimizer_parameters, - lr=model_config.learning_rate, - betas=model_config.betas, - **extra_args, + optimizer_parameters, lr=model_config.learning_rate, betas=model_config.betas, **extra_args ) # Step 4.3 Configure LR schedular @@ -154,9 +139,7 @@ def train( # noqa: PLR0915 lr_decay_iters = int(len(train_dataloader) * lr_decay_iters) logger.debug("LR decay iters: {}".format(lr_decay_iters)) lr_scheduler = CosineWarmupLRScheduler( - optimizer=optimizer, - warmup_iters=warmup_iters, - lr_decay_iters=lr_decay_iters, + optimizer=optimizer, warmup_iters=warmup_iters, lr_decay_iters=lr_decay_iters ) # Step 4.4. Start training trainer = Trainer( @@ -178,17 +161,13 @@ def main() -> None: """Train either GPT or a simple bigram language model on tiny-shakespeare dataset.""" # main parser will store subparsers, shared parser - arguments that are shared between subparsers main_parser = argparse.ArgumentParser( - description="Train bigram or GPT language model", - formatter_class=argparse.RawTextHelpFormatter, + description="Train bigram or GPT language model", formatter_class=argparse.RawTextHelpFormatter ) shared_parser = argparse.ArgumentParser(add_help=False) # ordering matters: first shared arguments, then - subparsers # ---------- shared arguments ---------- shared_parser.add_argument( - "--device", - help="Optionally you can select device on which the model will be trained", - required=False, - type=str, + "--device", help="Optionally you can select device on which the model will be trained", required=False, type=str ) shared_parser.add_argument( "--dataset-fraction", @@ -202,22 +181,12 @@ def main() -> None: # bigram subparser bigram_subparser = subparsers.add_parser("bigram", parents=[shared_parser]) bigram_subparser.add_argument( - "--size", - "-s", - choices=["large"], - help="The size of the Bigram model", - required=True, - type=str, + "--size", "-s", choices=["large"], help="The size of the Bigram model", required=True, type=str ) # gpt subparser gpt_subparser = subparsers.add_parser("gpt", parents=[shared_parser]) gpt_subparser.add_argument( - "--size", - "-s", - choices=["small", "medium", "large"], - help="The size of the GPT model", - required=True, - type=str, + "--size", "-s", choices=["small", "medium", "large"], help="The size of the GPT model", required=True, type=str ) gpt_subparser.add_argument( "--use-lora", @@ -235,10 +204,7 @@ def main() -> None: # parser arguments args = vars(main_parser.parse_args()) - model_name = { - "bigram": BigramLanguageModel, - "gpt": GPTLanguageModel, - }[args.pop("model")] + model_name = {"bigram": BigramLanguageModel, "gpt": GPTLanguageModel}[args.pop("model")] # run model training train(model_name, **args) diff --git a/src/model/trainer.py b/src/model/trainer.py index b6c2965..99b1905 100644 --- a/src/model/trainer.py +++ b/src/model/trainer.py @@ -86,12 +86,7 @@ def __init__( def __move_batch_to(self, batch: List[Tensor]) -> List[Tensor]: return [x.to(self.device) for x in batch] - def _train_step( - self, - mode: str, - idx: int, - batch: Union[tuple, list], - ) -> Tensor: + def _train_step(self, mode: str, idx: int, batch: Union[tuple, list]) -> Tensor: # data should be on the same device as the model inputs, targets = self.__move_batch_to(batch) # during evaluation there is no need to store any information for backpropagation @@ -127,10 +122,7 @@ def train(self, epochs: int) -> None: for epoch in range(epochs): tqdm.write(f" Epoch: {epoch} ".center(40, "=")) # reuse code for training and evaluation - for mode, dataloader in zip( - ["train", "eval"], - [self.train_dataloader, self.eval_dataloader], - ): + for mode, dataloader in zip(["train", "eval"], [self.train_dataloader, self.eval_dataloader]): # set model into train or eval mode: required for BatchNorm or Dropout self.model.train() if mode == "train" else self.model.eval() tqdm_loop = tqdm(dataloader, desc=mode, ascii=True) @@ -152,7 +144,7 @@ def train(self, epochs: int) -> None: if eval_loss < best_eval_loss: logger.info( "Current eval loss is `{:.4f}` which is smaller than current best loss of `{:.4f}`; " - "saving the model...".format(eval_loss, best_eval_loss), + "saving the model...".format(eval_loss, best_eval_loss) ) best_eval_loss = eval_loss save_checkpoint(state=self.model.state_dict(), path=self.checkpoint_model_path) diff --git a/src/utils/__init__.py b/src/utils/__init__.py index 20cbf18..e69de29 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -1,11 +0,0 @@ -from src.utils.arguments import RangeChecker, grab_arguments -from src.utils.device import get_device -from src.utils.error import log_error -from src.utils.model import ( - get_model_config, - load_checkpoint, - pickle_dump, - pickle_load, - save_checkpoint, -) -from src.utils.seed import set_seed diff --git a/src/utils/model.py b/src/utils/model.py index 4877e41..50900b9 100644 --- a/src/utils/model.py +++ b/src/utils/model.py @@ -94,10 +94,9 @@ class of the model if there is no config in the config file for the provided model class """ model_class_name = model_class.__name__ - model_config = { - "BigramLanguageModel": config.model.bigram, - "GPTLanguageModel": config.model.gpt, - }.get(model_class_name) + model_config = {"BigramLanguageModel": config.model.bigram, "GPTLanguageModel": config.model.gpt}.get( + model_class_name + ) if model_config is None: log_error(f"There is no config for class '{model_class_name}'") return model_config.size[size] diff --git a/tests/smoke/dataset_test.py b/tests/smoke/dataset_test.py index 97d8e35..3f19997 100644 --- a/tests/smoke/dataset_test.py +++ b/tests/smoke/dataset_test.py @@ -6,7 +6,7 @@ from src.data.downloader import download -@pytest.mark.smoke +@pytest.mark.smoke() @pytest.mark.order(1) class TestDataset: def test_tiny_shakespeare_download(self) -> None: diff --git a/tests/smoke/generate_test.py b/tests/smoke/generate_test.py index cc730fa..ae16f31 100644 --- a/tests/smoke/generate_test.py +++ b/tests/smoke/generate_test.py @@ -7,7 +7,7 @@ @pytest.mark.order(4) class TestTokenGeneration: - @pytest.mark.smoke + @pytest.mark.smoke() @pytest.mark.parametrize("model_size", list(config.model.bigram.size.keys())) def test_bigram_token_generation(self, model_size: str) -> None: completed_process = subprocess.run( @@ -25,7 +25,7 @@ def test_bigram_token_generation(self, model_size: str) -> None: ) assert completed_process.returncode == 0 - @pytest.mark.smoke + @pytest.mark.smoke() @pytest.mark.parametrize("model_size", list(config.model.gpt.size.keys())) def test_gpt_token_generation(self, model_size: str) -> None: completed_process = subprocess.run( @@ -43,7 +43,7 @@ def test_gpt_token_generation(self, model_size: str) -> None: ) assert completed_process.returncode == 0 - @pytest.mark.smoke + @pytest.mark.smoke() @pytest.mark.parametrize("model_type", ["gpt2"]) def test_gpt2_pretrained_token_generation_fast(self, model_type: str) -> None: completed_process = subprocess.run( @@ -61,7 +61,7 @@ def test_gpt2_pretrained_token_generation_fast(self, model_type: str) -> None: ) assert completed_process.returncode == 0 - @pytest.mark.slow + @pytest.mark.slow() @pytest.mark.parametrize("model_type", ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"]) def test_gpt2_pretrained_token_generation_slow(self, model_type: str) -> None: completed_process = subprocess.run( @@ -80,7 +80,7 @@ def test_gpt2_pretrained_token_generation_slow(self, model_type: str) -> None: assert completed_process.returncode == 0 # testing key-value cache - @pytest.mark.smoke + @pytest.mark.smoke() @pytest.mark.parametrize("model_type", ["gpt2"]) def test_gpt2_pretrained_token_generation_fast_kv_cache(self, model_type: str) -> None: completed_process = subprocess.run( diff --git a/tests/smoke/model_test.py b/tests/smoke/model_test.py index 2e28b0a..490186a 100644 --- a/tests/smoke/model_test.py +++ b/tests/smoke/model_test.py @@ -13,24 +13,23 @@ def setup_class(cls: "TestModel") -> None: # so for these tests it's totally arbitrary cls.vocab_size = 10 - @pytest.mark.smoke + @pytest.mark.smoke() def test_bigram_configs_load(self) -> None: _ = BigramLanguageModel(vocab_size=TestModel.vocab_size) - @pytest.mark.smoke + @pytest.mark.smoke() @pytest.mark.parametrize("model_size", list(config.model.gpt.size.keys())) def test_gpt_configs_load(self, model_size: str) -> None: _ = GPTLanguageModel( - vocab_size=TestModel.vocab_size, - **grab_arguments(GPTLanguageModel, config.model.gpt.size[model_size]), + vocab_size=TestModel.vocab_size, **grab_arguments(GPTLanguageModel, config.model.gpt.size[model_size]) ) - @pytest.mark.smoke + @pytest.mark.smoke() @pytest.mark.parametrize("model_type", ["gpt2"]) def test_gpt2_configs_load_fast(self, model_type: str) -> None: _ = GPTLanguageModel.from_pretrained(model_type) - @pytest.mark.slow + @pytest.mark.slow() @pytest.mark.parametrize("model_type", ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"]) def test_gpt2_configs_load_slow(self, model_type: str) -> None: _ = GPTLanguageModel.from_pretrained(model_type) diff --git a/tests/smoke/train_test.py b/tests/smoke/train_test.py index e696ee2..405e2ad 100644 --- a/tests/smoke/train_test.py +++ b/tests/smoke/train_test.py @@ -5,7 +5,7 @@ from src import config -@pytest.mark.smoke +@pytest.mark.smoke() @pytest.mark.order(3) class TestTraining: @pytest.mark.parametrize("model_size", list(config.model.bigram.size.keys())) @@ -38,7 +38,7 @@ def test_gpt_training(self, model_size: str) -> None: # LoRA testing should be done after training without it to not confuse saved checkpoints -@pytest.mark.smoke +@pytest.mark.smoke() @pytest.mark.order(5) # Smoke tests of Low Ranking Adaptation (LoRA) class TestTrainingWithLoRA: