
Commit

added doc strings for the training files
debrupf2946 committed Aug 21, 2024
1 parent 7ac02a5 commit 124a1f9
Showing 2 changed files with 181 additions and 16 deletions.
@@ -1,24 +1,48 @@
"""
This script fine-tunes a language model using a QLoRA (Quantized Low-Rank Adaptation) adapter.
The main functionalities include:
- Preparing data from a specified repository with specific file extensions.
- Tokenizing the data for model training.
- Loading and configuring a pre-trained language model.
- Applying PEFT (Parameter-Efficient Fine-Tuning) using QLoRA.
- Defining training arguments and creating a Trainer instance.
- Executing the training process with the Trainer.
Requirements:
- A YAML configuration file that specifies model, training, and data parameters.
"""

import argparse
import yaml
import os
import glob
import torch
from datasets import Dataset
from transformers import Trainer, DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
Trainer,
TrainingArguments,
logging,
set_seed,
BitsAndBytesConfig,
)


def prepare_data(repo_path, extensions, output_file):
def prepare_data(repo_path: str, extensions: list, output_file: str):
"""
Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file.
Args:
repo_path: Path to the repository to collect files from.
extensions: List of file extensions to include in the data preparation.
output_file: Path to the output file where the concatenated content will be saved.
Returns:
A string containing the entire content written to the output file.
"""

files = []
for ext in extensions:
@@ -38,7 +62,17 @@ def prepare_data(repo_path, extensions, output_file):
return f.read()
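
The middle of prepare_data is collapsed in this hunk; a minimal sketch consistent with the docstring and the visible lines (collect files per extension, concatenate them into output_file, return the result) could look like the following. This is an illustrative reconstruction, not the committed implementation.

import glob
import os

def prepare_data_sketch(repo_path: str, extensions: list, output_file: str) -> str:
    files = []
    for ext in extensions:
        # Recursively gather files with the given extension.
        files.extend(glob.glob(os.path.join(repo_path, "**", f"*.{ext}"), recursive=True))
    with open(output_file, "w", encoding="utf-8") as out:
        for path in files:
            with open(path, "r", encoding="utf-8", errors="ignore") as src:
                out.write(src.read() + "\n")
    with open(output_file, "r", encoding="utf-8") as f:
        return f.read()
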


def data_for_training(content, config):
def data_for_training(content: str, config: dict):
"""
Tokenizes the content and prepares it for language model training, including creating a data collator.
Args:
content: The concatenated text content to be tokenized.
config: Dictionary containing the model and training configuration.
Returns:
A tuple containing the tokenized dataset, the data collator, and the tokenizer for language model training.
"""
tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"])
context_length = config["Model"]["context_length"]
outputs = tokenizer(
@@ -52,15 +86,27 @@ def data_for_training(content, config):
print(f"Input chunk lengths: {outputs['length']}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
ds = Dataset.from_dict(outputs)
ds_removed = ds.remove_columns(["attention_mask", "length", "overflow_to_sample_mapping"])
ds_removed = ds.remove_columns(
["attention_mask", "length", "overflow_to_sample_mapping"]
)
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(
tokenizer, mlm=config["Training"]["masked_language_modelling"]
)
return ds_removed, data_collator, tokenizer
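
The tokenizer call's arguments are collapsed above; given the fields the script prints (length, overflow_to_sample_mapping), it evidently splits the corpus into fixed-size chunks. A self-contained example of that pattern, using standard Hugging Face tokenizer options and a placeholder checkpoint, is:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
outputs = tok(
    "some long concatenated source text ... " * 100,
    truncation=True,
    max_length=128,                  # analogous to config["Model"]["context_length"]
    return_overflowing_tokens=True,  # emit one entry per fixed-size chunk
    return_length=True,              # report each chunk's token count
)
print(outputs["length"])
print(outputs["overflow_to_sample_mapping"])
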


def load_base_model(config):
def load_base_model(config: dict):
"""
Loads the base language model with specified configurations, including quantization settings.
Args:
config: The configuration dictionary containing model and BNB (BitsAndBytes) parameters.
Returns:
PreTrainedModel: The loaded pre-trained language model ready for training.
"""

compute_dtype = getattr(torch, config["BNB_CONFIG"]["BNB_4BIT_COMPUTE_DTYPE"])

bnb_config = BitsAndBytesConfig(
@@ -82,7 +128,18 @@ def load_base_model(config):
return model
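
Most of the BitsAndBytesConfig arguments sit in the collapsed part of this hunk; only the compute dtype lookup is visible. A typical 4-bit QLoRA quantization setup with that dtype looks roughly like the sketch below; the specific quantization options (nf4, double quantization, device_map) are assumptions rather than values read from the hidden lines, and the checkpoint name is a placeholder.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # quantize weights to 4 bits on load
    bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
    bnb_4bit_compute_dtype=compute_dtype,  # dtype used for the actual matmuls
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
)
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",                                # placeholder; requires a CUDA GPU and bitsandbytes
    quantization_config=bnb_config,
    device_map="auto",
)
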


def load_peft_model(model, config):
def load_peft_model(model: object, config: dict):
"""
Applies PEFT (Parameter-Efficient Fine-Tuning) using QLoRA to the given model.
Args:
model: The pre-trained language model to be fine-tuned.
config: The configuration dictionary containing LoRA (Low-Rank Adaptation) parameters.
Returns:
PreTrainedModel: The PEFT-configured model ready for training.
"""

model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
lora_alpha=config["LORA"]["LORA_ALPHA"],
@@ -97,7 +154,17 @@ def load_peft_model(model, config):
return model
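
Only lora_alpha is visible here; the remaining LoraConfig fields are collapsed. A representative LoRA setup for causal-LM fine-tuning, with illustrative values and a stand-in base model, would be:

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM

# Stand-in for the quantized model returned by load_base_model(config).
model = AutoModelForCausalLM.from_pretrained("gpt2")
model = prepare_model_for_kbit_training(model)  # freeze base weights, cast norms, enable checkpointing
peft_config = LoraConfig(
    lora_alpha=16,      # scaling factor applied to the LoRA updates
    lora_dropout=0.1,   # dropout on the LoRA layers
    r=8,                # rank of the low-rank update matrices
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
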


def create_training_arguments(config):
def create_training_arguments(config: dict):
"""
Creates and returns the training arguments for the Trainer.
Args:
config: The configuration dictionary containing training arguments.
Returns:
TrainingArguments: The configured training arguments.
"""

training_args = TrainingArguments(
output_dir=f"results/{config['TRAINING_ARGUMENTS']['OUTPUT_DIR']}",
num_train_epochs=3,
@@ -123,7 +190,21 @@ def create_training_arguments(config):
return training_args
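
Apart from output_dir and num_train_epochs, the arguments are collapsed; a common set for QLoRA runs, with illustrative values, is sketched below.

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="results/qlora-adapter",   # placeholder for the configured OUTPUT_DIR
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,                            # assumes a CUDA-capable GPU
    logging_steps=10,
    save_strategy="epoch",
)
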


def create_trainer(tokenizer, train_data, data_collator, model):
def create_trainer(
tokenizer: object, train_data: object, data_collator: object, model: object, config: dict
):
"""
Creates a Trainer instance with the provided tokenizer, training data, data collator, and model.
Args:
tokenizer: The tokenizer to be used during training.
train_data: The tokenized training dataset.
data_collator: The data collator for language modeling.
model: The pre-trained and fine-tuned model.
config: The configuration dictionary containing the training arguments.
Returns:
Trainer: The Trainer instance for model training.
"""
training_args = create_training_arguments(config)
trainer = Trainer(
model=model,
@@ -137,6 +218,11 @@ def create_trainer(tokenizer, train_data, data_collator, model):


def main():
"""
The main function that orchestrates the data preparation, model loading,
and training processes using the provided YAML configuration.
"""

parser = argparse.ArgumentParser(
description="Training script for QLoRA adapter tuning"
)
@@ -1,3 +1,19 @@
"""
This script prepares data from a repository for training a P-tuning model using the PEFT library.
It reads source files, processes them into tokenized chunks, and trains a language model using the specified configuration.
Functions:
- prepare_data: Collects files from a repository, concatenates their content, and saves it to an output file.
- data_for_training: Tokenizes the concatenated content and prepares it for language model training.
- get_peft_model: Initializes and configures a P-tuning model using the specified configuration.
- create_training_arguments: Generates training arguments for the Trainer using the configuration settings.
- create_trainer: Creates a Trainer object with the model, data, and training arguments.
- main: Parses the YAML configuration file and runs the training process.
Requirements:
- A YAML configuration file that specifies model, training, and data parameters.
"""

import argparse
import yaml
import os
@@ -9,7 +25,18 @@
from transformers import TrainingArguments


def prepare_data(repo_path, extensions, output_file):
def prepare_data(repo_path: str, extensions: list, output_file: str):
"""
Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file.
Args:
repo_path: Path to the repository to collect files from.
extensions: List of file extensions to include in the data preparation.
output_file: Path to the output file where the concatenated content will be saved.
Returns:
A string containing the entire content written to the output file.
"""

files = []
for ext in extensions:
@@ -29,7 +56,18 @@ def prepare_data(repo_path, extensions, output_file):
return f.read()


def data_for_training(content, config):
def data_for_training(content: str, config: dict):
"""
Tokenizes the content and prepares it for language model training, including creating a data collator.
Args:
content: The concatenated text content to be tokenized.
config: Dictionary containing the model and training configuration.
Returns:
A tuple containing the tokenized dataset and the data collator for language model training.
"""

tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"])
context_length = config["Model"]["context_length"]
outputs = tokenizer(
@@ -43,15 +81,27 @@ def data_for_training(content, config):
print(f"Input chunk lengths: {outputs['length']}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
ds = Dataset.from_dict(outputs)
ds_removed = ds.remove_columns(["attention_mask", "length", "overflow_to_sample_mapping"])
ds_removed = ds.remove_columns(
["attention_mask", "length", "overflow_to_sample_mapping"]
)
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(
tokenizer, mlm=config["Training"]["masked_language_modelling"]
)
return ds_removed, data_collator


def get_peft_model(config):
def get_peft_model(config: dict):
"""
Initializes and configures a P-tuning model using the specified foundational model and prompt tuning configuration.
Args:
config: Dictionary containing the model and training configuration.
Returns:
A P-tuned model ready for training.
"""

foundational_model = AutoModelForCausalLM.from_pretrained(
config["Model"]["model"], trust_remote_code=True
)
@@ -66,7 +116,17 @@ def get_peft_model(config):
return peft_model_prompt
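
The prompt-tuning configuration itself is collapsed above. In the PEFT library this is typically a PromptTuningConfig (prompt tuning) or a PromptEncoderConfig (P-tuning proper); which of the two this script uses is not visible in the diff. A minimal prompt-tuning sketch with illustrative values:

from peft import PromptTuningConfig, PromptTuningInit, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

foundational_model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder checkpoint
prompt_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.RANDOM,  # or TEXT, seeded with an initialization prompt
    num_virtual_tokens=8,                        # number of trainable soft-prompt tokens
)
peft_model_prompt = get_peft_model(foundational_model, prompt_config)
peft_model_prompt.print_trainable_parameters()

Note that peft.get_peft_model here is the library function, distinct from the get_peft_model wrapper defined in this script.
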


def create_training_arguments(config):
def create_training_arguments(config: dict):
"""
Creates and configures the training arguments for the Trainer object.
Args:
config: Dictionary containing the training configuration.
Returns:
A TrainingArguments object with the specified settings.
"""

training_args = TrainingArguments(
output_dir=config["Training"]["output_dir"],
save_strategy="steps",
@@ -79,7 +139,22 @@ def create_trainer(config, train_data, data_collator, model):
return training_args


def create_trainer(config, train_data, data_collator, model):
def create_trainer(
config: dict, train_data: object, data_collator: object, model: object
):
"""
Creates a Trainer object for training the model with the provided data and configuration.
Args:
config: Dictionary containing the training configuration.
train_data: The tokenized Hugging Face Dataset object to be used for training.
data_collator: The data collator for handling the tokenized data during training.
model: The P-tuned model to be trained.
Returns:
A Trainer object configured for training the model.
"""

training_args = create_training_arguments(config)
trainer = Trainer(
model=model,
@@ -91,6 +166,10 @@ def create_trainer(config, train_data, data_collator, model):


def main():
"""
Main function to execute the training pipeline. It parses the YAML configuration file, prepares the data, initializes
the model, and starts the training process.
"""
parser = argparse.ArgumentParser(description="Training script for P-tuning model")
parser.add_argument(
"--config", type=str, required=True, help="Path to the YAML configuration file"
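
The remainder of main is truncated in this view; based on the module docstring and the functions above, the orchestration plausibly proceeds as sketched below. The config key names under Data are hypothetical; the flow simply mirrors the documented steps.

import argparse
import yaml

def main():
    parser = argparse.ArgumentParser(description="Training script for P-tuning model")
    parser.add_argument(
        "--config", type=str, required=True, help="Path to the YAML configuration file"
    )
    args = parser.parse_args()
    with open(args.config, "r") as f:
        config = yaml.safe_load(f)

    # End-to-end pipeline, mirroring the functions defined in this file.
    content = prepare_data(
        config["Data"]["repo_path"],    # hypothetical key names; the real ones
        config["Data"]["extensions"],   # are not visible in this diff
        config["Data"]["output_file"],
    )
    train_data, data_collator = data_for_training(content, config)
    model = get_peft_model(config)      # the wrapper defined above, not peft.get_peft_model
    trainer = create_trainer(config, train_data, data_collator, model)
    trainer.train()

if __name__ == "__main__":
    main()
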
