From 124a1f936bc6af6d1e0c42c5406466db5e2892ae Mon Sep 17 00:00:00 2001
From: debrupf2946
Date: Wed, 21 Aug 2024 13:13:23 +0530
Subject: [PATCH] added doc strings for the training files

---
 .../QLoRA_tuning/qlora_adapter.py | 106 ++++++++++++++++--
 .../prompt_tuning/p_tuning.py     |  91 ++++++++++++++-
 2 files changed, 181 insertions(+), 16 deletions(-)

diff --git a/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py b/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py
index 740e2f5..b4e30cf 100644
--- a/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py
+++ b/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py
@@ -1,3 +1,19 @@
+"""
+This script facilitates the fine-tuning of a language model using QLoRA (Quantized Low-Rank Adaptation)
+adapter tuning.
+
+The main functionalities include:
+- Preparing data from a specified repository with specific file extensions.
+- Tokenizing the data for model training.
+- Loading and configuring a pre-trained language model.
+- Applying PEFT (Parameter-Efficient Fine-Tuning) using QLoRA.
+- Defining training arguments and creating a Trainer instance.
+- Executing the training process with the Trainer.
+
+Requirements:
+- A YAML configuration file that specifies model, training, and data parameters.
+"""
+
 import argparse
 import yaml
 import os
@@ -5,20 +21,28 @@
 import torch
 from datasets import Dataset
 from transformers import Trainer, DataCollatorForLanguageModeling
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     Trainer,
     TrainingArguments,
-    logging,
-    set_seed,
     BitsAndBytesConfig,
 )


-def prepare_data(repo_path, extensions, output_file):
+def prepare_data(repo_path: str, extensions: list, output_file: str):
+    """
+    Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file.
+
+    Args:
+        repo_path: Path to the repository to collect files from.
+        extensions: List of file extensions to include in the data preparation.
+        output_file: Path to the output file where the concatenated content will be saved.
+
+    Returns:
+        A string containing the entire content written to the output file.
+    """

     files = []
     for ext in extensions:
@@ -38,7 +62,17 @@
     return f.read()


-def data_for_training(content, config):
+def data_for_training(content: str, config: dict):
+    """
+    Tokenizes the content and prepares it for language model training, including creating a data collator.
+
+    Args:
+        content: The concatenated text content to be tokenized.
+        config: Dictionary containing the model and training configuration.
+
+    Returns:
+        A tuple containing the tokenized dataset, the data collator, and the tokenizer for language model training.
+    """
     tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"])
     context_length = config["Model"]["context_length"]
     outputs = tokenizer(
@@ -52,7 +86,9 @@
     print(f"Input chunk lengths: {outputs['length']}")
     print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
     ds = Dataset.from_dict(outputs)
-    ds_removed = ds.remove_columns(["attention_mask", "length", "overflow_to_sample_mapping"])
+    ds_removed = ds.remove_columns(
+        ["attention_mask", "length", "overflow_to_sample_mapping"]
+    )
     tokenizer.pad_token = tokenizer.eos_token
     data_collator = DataCollatorForLanguageModeling(
         tokenizer, mlm=config["Training"]["masked_language_modelling"]
@@ -60,7 +96,17 @@
     return ds_removed, data_collator, tokenizer


-def load_base_model(config):
+def load_base_model(config: dict):
+    """
+    Loads the base language model with specified configurations, including quantization settings.
+
+    Args:
+        config: The configuration dictionary containing model and BNB (BitsAndBytes) parameters.
+
+    Returns:
+        PreTrainedModel: The loaded pre-trained language model ready for training.
+    """
+
     compute_dtype = getattr(torch, config["BNB_CONFIG"]["BNB_4BIT_COMPUTE_DTYPE"])

     bnb_config = BitsAndBytesConfig(
@@ -82,7 +128,18 @@
     return model


-def load_peft_model(model, config):
+def load_peft_model(model: object, config: dict):
+    """
+    Applies PEFT (Parameter-Efficient Fine-Tuning) using QLoRA to the given model.
+
+    Args:
+        model: The pre-trained language model to be fine-tuned.
+        config: The configuration dictionary containing LoRA (Low-Rank Adaptation) parameters.
+
+    Returns:
+        PreTrainedModel: The PEFT-configured model ready for training.
+    """
+
     model = prepare_model_for_kbit_training(model)
     peft_config = LoraConfig(
         lora_alpha=config["LORA"]["LORA_ALPHA"],
@@ -97,7 +154,17 @@
     return model


-def create_training_arguments(config):
+def create_training_arguments(config: dict):
+    """
+    Creates and returns the training arguments for the Trainer.
+
+    Args:
+        config: The configuration dictionary containing training arguments.
+
+    Returns:
+        TrainingArguments: The configured training arguments.
+    """
+
     training_args = TrainingArguments(
         output_dir=f"results/{config['TRAINING_ARGUMENTS']['OUTPUT_DIR']}",
         num_train_epochs=3,
@@ -123,7 +190,21 @@
     return training_args


-def create_trainer(tokenizer, train_data, data_collator, model):
+def create_trainer(
+    tokenizer: object, train_data: object, data_collator: object, model: object
+):
+    """
+    Creates a Trainer instance with the provided tokenizer, training data, data collator, and model.
+
+    Args:
+        tokenizer: The tokenizer to be used during training.
+        train_data: The tokenized training dataset.
+        data_collator: The data collator for language modeling.
+        model: The PEFT-configured model to be trained.
+
+    Returns:
+        Trainer: The Trainer instance for model training.
+    """
     training_args = create_training_arguments()
     trainer = Trainer(
         model=model,
@@ -137,6 +218,11 @@


 def main():
+    """
+    The main function that orchestrates the data preparation, model loading,
+    and training processes using the provided YAML configuration.
+    """
+
     parser = argparse.ArgumentParser(
         description="Training script for QLoRA adapter tuning"
     )
diff --git a/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py b/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py
index 149c973..f24d954 100644
--- a/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py
+++ b/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py
@@ -1,3 +1,19 @@
+"""
+This script prepares data from a repository for P-tuning a language model using the PEFT library.
+It reads source files, processes them into tokenized chunks, and trains a language model using the specified configuration.
+
+Functions:
+- prepare_data: Collects files from a repository, concatenates their content, and saves it to an output file.
+- data_for_training: Tokenizes the concatenated content and prepares it for language model training.
+- get_peft_model: Initializes and configures a P-tuning model using the specified configuration.
+- create_training_arguments: Generates training arguments for the Trainer using the configuration settings.
+- create_trainer: Creates a Trainer object with the model, data, and training arguments.
+- main: Parses the YAML configuration file and runs the training process.
+
+Requirements:
+- A YAML configuration file that specifies model, training, and data parameters.
+"""
+
 import argparse
 import yaml
 import os
@@ -9,7 +25,18 @@
 from transformers import TrainingArguments


-def prepare_data(repo_path, extensions, output_file):
+def prepare_data(repo_path: str, extensions: list, output_file: str):
+    """
+    Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file.
+
+    Args:
+        repo_path: Path to the repository to collect files from.
+        extensions: List of file extensions to include in the data preparation.
+        output_file: Path to the output file where the concatenated content will be saved.
+
+    Returns:
+        A string containing the entire content written to the output file.
+    """

     files = []
     for ext in extensions:
@@ -29,7 +56,18 @@
     return f.read()


-def data_for_training(content, config):
+def data_for_training(content: str, config: dict):
+    """
+    Tokenizes the content and prepares it for language model training, including creating a data collator.
+
+    Args:
+        content: The concatenated text content to be tokenized.
+        config: Dictionary containing the model and training configuration.
+
+    Returns:
+        A tuple containing the tokenized dataset and the data collator for language model training.
+    """
+
     tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"])
     context_length = config["Model"]["context_length"]
     outputs = tokenizer(
@@ -43,7 +81,9 @@
     print(f"Input chunk lengths: {outputs['length']}")
     print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
     ds = Dataset.from_dict(outputs)
-    ds_removed = ds.remove_columns(["attention_mask", "length", "overflow_to_sample_mapping"])
+    ds_removed = ds.remove_columns(
+        ["attention_mask", "length", "overflow_to_sample_mapping"]
+    )
     tokenizer.pad_token = tokenizer.eos_token
     data_collator = DataCollatorForLanguageModeling(
         tokenizer, mlm=config["Training"]["masked_language_modelling"]
@@ -51,7 +91,17 @@
     return ds_removed, data_collator


-def get_peft_model(config):
+def get_peft_model(config: dict):
+    """
+    Initializes and configures a P-tuning model using the specified foundational model and prompt tuning configuration.
+
+    Args:
+        config: Dictionary containing the model and training configuration.
+
+    Returns:
+        A PEFT model configured for P-tuning, ready for training.
+    """
+
     foundational_model = AutoModelForCausalLM.from_pretrained(
         config["Model"]["model"], trust_remote_code=True
     )
@@ -66,7 +116,17 @@
     return peft_model_prompt


-def create_training_arguments(config):
+def create_training_arguments(config: dict):
+    """
+    Creates and configures the training arguments for the Trainer object.
+
+    Args:
+        config: Dictionary containing the training configuration.
+
+    Returns:
+        A TrainingArguments object with the specified settings.
+    """
+
     training_args = TrainingArguments(
         output_dir=config["Training"]["output_dir"],
         save_strategy="steps",
@@ -79,7 +139,22 @@
     return training_args


-def create_trainer(config, train_data, data_collator, model):
+def create_trainer(
+    config: dict, train_data: object, data_collator: object, model: object
+):
+    """
+    Creates a Trainer object for training the model with the provided data and configuration.
+
+    Args:
+        config: Dictionary containing the training configuration.
+        train_data: The tokenized Hugging Face Dataset object to be used for training.
+        data_collator: The data collator for handling the tokenized data during training.
+        model: The P-tuned model to be trained.
+
+    Returns:
+        A Trainer object configured for training the model.
+    """
+
     training_args = create_training_arguments(config)
     trainer = Trainer(
         model=model,
@@ -91,6 +166,10 @@


 def main():
+    """
+    Main function to execute the training pipeline. It parses the YAML configuration file, prepares the data, initializes
+    the model, and starts the training process.
+    """
     parser = argparse.ArgumentParser(description="Training script for P-tuning model")
     parser.add_argument(
         "--config", type=str, required=True, help="Path to the YAML configuration file"