Commit

pre-commit: running and fixing...
github-actions[bot] committed Nov 22, 2023
1 parent a1acab4 commit 6ba9332
Showing 19 changed files with 66 additions and 221 deletions.
5 changes: 1 addition & 4 deletions notebooks/examples/gpt_model_training.ipynb

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion src/__init__.py
@@ -1 +0,0 @@
from src.utils.config import config
2 changes: 0 additions & 2 deletions src/data/__init__.py
@@ -1,2 +0,0 @@
from src.data.dataset import NextTokenDataset, NextTokenRandomDataset
from src.data.tokenizer import CharTokenizer
7 changes: 0 additions & 7 deletions src/model/__init__.py
@@ -1,7 +0,0 @@
from src.model.bigram_language_model.bigram import BigramLanguageModel
from src.model.gpt_language_model.attention import MultiHeadAttention, SelfAttentionHead
from src.model.gpt_language_model.feed_forward import FeedForward
from src.model.gpt_language_model.gpt import GPTLanguageModel
from src.model.gpt_language_model.transformer_block import TransformerBlock
from src.model.lr_schedulers import CosineWarmupLRScheduler
from src.model.trainer import Trainer
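
These hunks only delete package-level re-exports; after the change, callers presumably import classes from their defining modules directly. A hedged illustration reusing module paths taken from the removed lines:

# Illustrative only: import from the defining modules instead of the package root
from src.data.tokenizer import CharTokenizer
from src.model.gpt_language_model.gpt import GPTLanguageModel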
11 changes: 2 additions & 9 deletions src/model/bigram_language_model/bigram.py
@@ -50,17 +50,10 @@ def loss(self, logits: Tensor, targets: Tensor) -> Tensor:
tensor with loss value (of how good model's predictions are)
"""
B, T, C = logits.shape # noqa: N806
return F.cross_entropy(
logits.view(B * T, C),
targets.view(B * T),
)
return F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

def generate(
self,
idx: Tensor,
max_new_tokens: int,
temperature: float = 1.0,
top_k_logits: Optional[int] = None,
self, idx: Tensor, max_new_tokens: int, temperature: float = 1.0, top_k_logits: Optional[int] = None
) -> Tensor:
"""Generate new character after the current one.
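
The hunk above collapses the cross-entropy call onto one line; for context, that call flattens the batch and time dimensions before computing the loss. A minimal standalone sketch of the reshape, with illustrative sizes:

import torch
import torch.nn.functional as F

B, T, C = 2, 8, 65  # batch size, context length, vocabulary size (illustrative values)
logits = torch.randn(B, T, C)
targets = torch.randint(0, C, (B, T))

# F.cross_entropy expects (N, C) logits and (N,) targets,
# so batch and time are flattened into a single dimension
loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))
print(loss.item())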
57 changes: 11 additions & 46 deletions src/model/generate.py
@@ -83,7 +83,7 @@ def generate_new_tokens(
elif size and gpt2_config:
log_error(
"For GPT language model either size or gpt2_config has to be provided, not both, "
f"but was provided size={size} and gpt2_config={gpt2_config}",
f"but was provided size={size} and gpt2_config={gpt2_config}"
)

# if all checks are passed, that means that either size or gpt2_config is provided
@@ -107,11 +107,7 @@ def generate_new_tokens(
# use only those kwargs that are accepted by function
kwargs = grab_arguments(
func=model.generate,
kwargs={
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"use_kv_cache": use_kv_cache,
},
kwargs={"max_new_tokens": max_new_tokens, "temperature": temperature, "use_kv_cache": use_kv_cache},
)
# model returns indices of the dictionary
new_token_indices = model.generate(init_context, **kwargs).squeeze().tolist()
@@ -125,24 +121,16 @@ def main() -> None:
"""Generate new tokens from either GPT or a simple bigram language model."""
# main parser will store subparsers, shared parser - arguments that are shared between subparsers
main_parser = argparse.ArgumentParser(
description="Generate new tokens with Bigram or GPT model",
formatter_class=argparse.RawTextHelpFormatter,
description="Generate new tokens with Bigram or GPT model", formatter_class=argparse.RawTextHelpFormatter
)
shared_parser = argparse.ArgumentParser(add_help=False)
# ordering matters: first shared arguments, then - subparsers
# ---------- shared arguments ----------
shared_parser.add_argument(
"--device",
help="Optionally you can select device on which the model will be trained",
required=False,
type=str,
"--device", help="Optionally you can select device on which the model will be trained", required=False, type=str
)
shared_parser.add_argument(
"--max-new-tokens",
default=100,
help="How many new tokens do you want to generate",
required=False,
type=int,
"--max-new-tokens", default=100, help="How many new tokens do you want to generate", required=False, type=int
)
shared_parser.add_argument(
"--temperature",
@@ -153,39 +141,22 @@ def main() -> None:
type=float,
)
shared_parser.add_argument(
"--fix-seed",
help="Make token generation deterministic",
action="store_true",
required=False,
"--fix-seed", help="Make token generation deterministic", action="store_true", required=False
)
shared_parser.add_argument(
"--continue-tokens",
default=" ",
help="Generation should continue these tokens",
required=False,
type=str,
"--continue-tokens", default=" ", help="Generation should continue these tokens", required=False, type=str
)
# ---------- subparsers ----------
subparsers = main_parser.add_subparsers(dest="model", description="Choose model type")
# bigram subparser
bigram_subparser = subparsers.add_parser("bigram", parents=[shared_parser])
bigram_subparser.add_argument(
"--size",
"-s",
choices=["large"],
help="The size of the Bigram model",
required=True,
type=str,
"--size", "-s", choices=["large"], help="The size of the Bigram model", required=True, type=str
)
# gpt subparser
gpt_subparser = subparsers.add_parser("gpt", parents=[shared_parser])
gpt_subparser.add_argument(
"--size",
"-s",
choices=["small", "medium", "large"],
help="The size of the GPT model",
required=False,
type=str,
"--size", "-s", choices=["small", "medium", "large"], help="The size of the GPT model", required=False, type=str
)
gpt_subparser.add_argument(
"--gpt2-config",
@@ -195,10 +166,7 @@ def main() -> None:
type=str,
)
gpt_subparser.add_argument(
"--use-kv-cache",
help="Use kv-value cache to speed up token generation",
action="store_true",
required=False,
"--use-kv-cache", help="Use kv-value cache to speed up token generation", action="store_true", required=False
)
# combining 'help' output from both argparsers
shared_parser_help = (
@@ -209,10 +177,7 @@ def main() -> None:

# parser arguments
args = vars(main_parser.parse_args())
model_name = {
"bigram": BigramLanguageModel,
"gpt": GPTLanguageModel,
}[args.pop("model")]
model_name = {"bigram": BigramLanguageModel, "gpt": GPTLanguageModel}[args.pop("model")]

# run token generation
generate_new_tokens(model_name, **args)
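
The reflowed CLI in this file relies on argparse's parent-parser mechanism so that shared flags appear on every subcommand. A condensed sketch of that pattern with a reduced set of arguments (flag names mirror the diff, values are illustrative):

import argparse

main_parser = argparse.ArgumentParser(description="Generate new tokens with Bigram or GPT model")
# add_help=False keeps the shared parser from clashing with each subparser's own -h/--help
shared_parser = argparse.ArgumentParser(add_help=False)
shared_parser.add_argument("--max-new-tokens", default=100, required=False, type=int)

subparsers = main_parser.add_subparsers(dest="model")
subparsers.add_parser("bigram", parents=[shared_parser])
gpt_subparser = subparsers.add_parser("gpt", parents=[shared_parser])
gpt_subparser.add_argument("--use-kv-cache", action="store_true", required=False)

# everything except the chosen subcommand is forwarded as keyword arguments
args = vars(main_parser.parse_args(["gpt", "--max-new-tokens", "50", "--use-kv-cache"]))
model = args.pop("model")
print(model, args)  # gpt {'max_new_tokens': 50, 'use_kv_cache': True}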
26 changes: 7 additions & 19 deletions src/model/gpt_language_model/attention.py
@@ -10,14 +10,7 @@

class SelfAttentionHead(nn.Module):
def __init__(
self,
embeddings_size: int,
context_size: int,
head_size: int,
bias: bool,
dropout: float,
*,
is_decoder: bool,
self, embeddings_size: int, context_size: int, head_size: int, bias: bool, dropout: float, *, is_decoder: bool
) -> None:
"""Single self-attention head.
@@ -186,7 +179,7 @@ def __init__(
if embeddings_size % num_heads != 0:
log_error(
"Embeddings size should be divisible by number of heads without remainder, "
f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}",
f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}"
)
head_size = embeddings_size // num_heads

@@ -209,7 +202,7 @@ def __init__(
is_decoder=self.is_decoder,
)
for _ in range(self.num_heads)
],
]
)

# if after concatenation the size of channels is bigger than embeddings size
@@ -260,15 +253,11 @@ def forward(self, x: Tensor, kv_cache: Optional[Tensor]) -> Tensor:

if all(x is not None for x in kv_cache):
kv_cache = torch.stack(
kv_cache,
dim=-2,
kv_cache, dim=-2
) # num_heads * (2, B, T, head_size) -> (2, B, T, num_heads, head_size)
kv_cache = kv_cache.transpose(2, 3) # (2, B, num_heads, T, head_size)

return (
output, # (B, T, num_heads * head_size)
kv_cache, # num_heads * None | (2, B, num_heads, T, head_size)
)
return (output, kv_cache) # (B, T, num_heads * head_size) # num_heads * None | (2, B, num_heads, T, head_size)


class CausalSelfAttention(nn.Module):
@@ -319,7 +308,7 @@ def __init__(
if embeddings_size % num_heads != 0:
log_error(
"Embeddings size should be divisible by the number of heads without a residual, "
f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}",
f"but was provided: embeddings_size={embeddings_size}; num_heads={num_heads}"
)
head_size = embeddings_size // num_heads

@@ -373,8 +362,7 @@ def forward(self, x: Tensor, kv_cache: Optional[Tensor]) -> Tensor:

# single pass for query, key and value; that's why we need to split into 3 parts
query, key, value = self.causal_self_attention(x).split(
self.head_size * self.num_heads,
dim=-1,
self.head_size * self.num_heads, dim=-1
) # (B, T, C) -> (B, T, 3 * hs * nh) -> (B, T, hs * nh)

# transform (B, T, nh * hs) -> (B, nh, T, hs) so it's similar to multi-head attention
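
The split call just above separates a single fused projection into query, key and value. A shape-only sketch of that step, with illustrative dimensions rather than the repository's configuration:

import torch
from torch import nn

B, T, num_heads, head_size = 2, 8, 4, 16  # illustrative sizes
embeddings_size = num_heads * head_size

# one fused linear layer produces query, key and value in a single matmul
causal_self_attention = nn.Linear(embeddings_size, 3 * embeddings_size, bias=False)

x = torch.randn(B, T, embeddings_size)
query, key, value = causal_self_attention(x).split(head_size * num_heads, dim=-1)

# (B, T, nh * hs) -> (B, nh, T, hs), the layout multi-head attention works with
query = query.view(B, T, num_heads, head_size).transpose(1, 2)
print(query.shape)  # torch.Size([2, 4, 8, 16])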
25 changes: 7 additions & 18 deletions src/model/gpt_language_model/gpt.py
@@ -94,7 +94,7 @@ def __init__(
use_causal_self_attention=True,
)
for _ in range(self.num_layers)
],
]
)
self.layer_norm_final = LayerNorm(self.embeddings_size, bias=self.bias) # final layer norm
self.language_model_head = nn.Linear(self.embeddings_size, self.vocab_size, bias=False)
@@ -114,8 +114,8 @@ def __init__(
# report number of parameters
logger.debug(
"GPT language model is created with number of parameters: {:.2f} million".format(
self.__get_parameters_number() / 1e6,
),
self.__get_parameters_number() / 1e6
)
)

def __get_parameters_number(self, exclude_positional_embeddings: bool = True) -> int:
@@ -147,13 +147,7 @@ def __init_weights(self, module: torch.nn.modules) -> None:
if hasattr(module, "bias") and module.bias is not None:
torch.nn.init.zeros_(module.bias)

def forward(
self,
idx: Tensor,
*,
inference: bool = False,
kv_cache: Optional[List[Tensor]] = None,
) -> Tensor:
def forward(self, idx: Tensor, *, inference: bool = False, kv_cache: Optional[List[Tensor]] = None) -> Tensor:
"""Do the whole forward pass for decoder part of transformer.
This forward method includes all steps for decoder:
@@ -279,10 +273,7 @@ def loss(self, logits: Tensor, targets: Tensor) -> Tensor:
tensor with loss value (of how good model's predictions are)
"""
B, T, C = logits.shape # noqa: N806
return F.cross_entropy(
logits.view(B * T, C),
targets.view(B * T),
)
return F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

@classmethod
def from_pretrained(cls: "GPTLanguageModel", gpt2_type: str) -> "GPTLanguageModel":
@@ -398,7 +389,7 @@ def sync_name(name: str) -> str:
if source_weights.shape != target_state_dict[target_key].shape:
log_error(
f"Shape mismatch: shape of source '{source_weights.shape}' and destination - "
f"'{target_state_dict[target_key].shape}'",
f"'{target_state_dict[target_key].shape}'"
)
with torch.no_grad():
target_state_dict[target_key].copy_(source_weights)
@@ -473,9 +464,7 @@ def generate(
context = idx[:, -1:]
# get the predictions
logits, kv_cache = self(
context,
inference=True,
kv_cache=kv_cache if use_kv_cache else None,
context, inference=True, kv_cache=kv_cache if use_kv_cache else None
) # (B, T, C), with inference=True -> (1, 1, C)
# focus only on the last time step and scale by desired temperature
logits = logits[:, -1, :] / temperature # becomes (B, C)
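
The generation loop shown above keeps only the last time step, scales it by the temperature and samples from the resulting distribution. A minimal sketch of that sampling step with made-up logits:

import torch
import torch.nn.functional as F

B, T, C = 1, 8, 65  # illustrative batch, context length and vocabulary size
logits = torch.randn(B, T, C)
temperature = 0.8

logits = logits[:, -1, :] / temperature               # focus on the last time step -> (B, C)
probs = F.softmax(logits, dim=-1)                     # turn logits into probabilities
next_token = torch.multinomial(probs, num_samples=1)  # sample one token index -> (B, 1)
print(next_token.shape)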
24 changes: 7 additions & 17 deletions src/model/gpt_language_model/peft/lora.py
@@ -56,13 +56,7 @@


class LoRALayer:
def __init__(
self,
r: int,
lora_alpha: int,
lora_dropout: float,
merge_weights: bool,
) -> None:
def __init__(self, r: int, lora_alpha: int, lora_dropout: float, merge_weights: bool) -> None:
"""Store LoRA specific attributes in a class.
Parameters
@@ -160,7 +154,7 @@ def __init__(
if r > 0 and any(enable_lora):
self.lora_A = nn.Parameter(self.weight.new_zeros((r * sum(enable_lora), in_features))) # (4, 128)
self.lora_B = nn.Parameter(
self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r)), # (256, 2)
self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r)) # (256, 2)
) # weights for Conv1D with groups=sum(enable_lora)
# Notes about shapes above
# - self.lora_A has shape (4, 128): 4 because rank is 2 and LoRA is applied only to two matrices;
@@ -194,8 +188,7 @@ def __init__(
# | query | key | value |
# ----------------------------------------
self.lora_ind = self.weight.new_zeros((out_features,), dtype=torch.bool).view(
len(enable_lora),
-1,
len(enable_lora), -1
) # (3, 128)
self.lora_ind[enable_lora, :] = True # (3, 128)
self.lora_ind = self.lora_ind.view(-1) # (384,)
@@ -246,8 +239,7 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor:
result = x.new_zeros((*x.shape[:-1], self.out_features)) # (64, 64, 384)
result = result.view(-1, self.out_features) # (4096, 384)
result[:, self.lora_ind] = x.reshape(
-1,
self.out_features // len(self.enable_lora) * sum(self.enable_lora),
-1, self.out_features // len(self.enable_lora) * sum(self.enable_lora)
) # (4096, 256)
return result.view((*x.shape[:-1], self.out_features)).transpose(0, 1) # (64, 64, 384)

@@ -290,7 +282,7 @@ def T(w: torch.Tensor) -> torch.Tensor: # noqa: N802
self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1)
groups=sum(self.enable_lora),
).squeeze( # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128)
0,
0
) # (1, 256, 128) -> (256, 128)
# -1: W = W - delta_W (unmerge), +1: W = W + delta_W (merge)
sign = -1 if mode else 1
@@ -335,8 +327,7 @@ def T(w: torch.Tensor) -> torch.Tensor: # noqa: N802
result = F.linear(x, T(self.weight), bias=self.bias) # (64, 64, 128) @ (384, 128) -> (64, 64, 384)
if self.r > 0:
after_A = F.linear( # noqa: N806
self.lora_dropout(x),
self.lora_A,
self.lora_dropout(x), self.lora_A
) # (64, 64, 128) @ (4, 128) -> (64, 64, 4)

# For F.conv1d:
@@ -349,8 +340,7 @@ def T(w: torch.Tensor) -> torch.Tensor: # noqa: N802
self.lora_B.unsqueeze(-1), # (256, 2) -> (256, 2, 1)
groups=sum(self.enable_lora),
).transpose( # (64, 4, 64) @ (256, 2, 1) -> (64, 256, 64)
-2,
-1,
-2, -1
) # (64, 256, 64) -> (64, 64, 256)

# (64, 64, 256) after zero_pad (64, 64, 384)
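
The LoRA hunks above deal with a merged qkv projection; the underlying idea is easier to see on a plain linear layer. A hedged sketch of a generic low-rank update (W x + scaling * B A x), not the repository's MergedLinear implementation:

import torch
import torch.nn.functional as F
from torch import nn

in_features, out_features, r, lora_alpha = 128, 256, 2, 4  # illustrative sizes
base = nn.Linear(in_features, out_features, bias=False)    # stands in for the frozen pretrained weight

lora_A = nn.Parameter(torch.randn(r, in_features) * 0.01)  # low-rank factors (toy initialisation)
lora_B = nn.Parameter(torch.zeros(out_features, r))        # zero at start, so the update begins as a no-op
scaling = lora_alpha / r

x = torch.randn(8, in_features)
# frozen base projection plus the trainable low-rank correction
y = F.linear(x, base.weight) + F.linear(F.linear(x, lora_A), lora_B) * scaling
print(y.shape)  # torch.Size([8, 256])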
5 changes: 1 addition & 4 deletions src/model/gpt_language_model/transformer_block.py
@@ -2,10 +2,7 @@

from torch import Tensor, nn

from src.model.gpt_language_model.attention import (
CausalSelfAttention,
MultiHeadAttention,
)
from src.model.gpt_language_model.attention import CausalSelfAttention, MultiHeadAttention
from src.model.gpt_language_model.feed_forward import FeedForward


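
This file only consolidates imports for the block that combines attention with a feed-forward network. As a rough, generic illustration of such a block (a pre-norm residual layout using PyTorch's built-in attention, which may differ from the repository's exact implementation):

import torch
from torch import nn

class ToyTransformerBlock(nn.Module):
    """Generic pre-norm residual block: attention sub-layer followed by a feed-forward sub-layer."""

    def __init__(self, embeddings_size: int, num_heads: int) -> None:
        super().__init__()
        self.norm_1 = nn.LayerNorm(embeddings_size)
        self.attention = nn.MultiheadAttention(embeddings_size, num_heads, batch_first=True)
        self.norm_2 = nn.LayerNorm(embeddings_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embeddings_size, 4 * embeddings_size),
            nn.GELU(),
            nn.Linear(4 * embeddings_size, embeddings_size),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normed = self.norm_1(x)
        attn_out, _ = self.attention(normed, normed, normed, need_weights=False)
        x = x + attn_out                                   # residual connection around attention
        return x + self.feed_forward(self.norm_2(x))       # residual connection around feed-forward

x = torch.randn(2, 8, 64)
print(ToyTransformerBlock(64, 4)(x).shape)  # torch.Size([2, 8, 64])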