From 124a1f936bc6af6d1e0c42c5406466db5e2892ae Mon Sep 17 00:00:00 2001
From: debrupf2946
Date: Wed, 21 Aug 2024 13:13:23 +0530
Subject: [PATCH] added doc strings for the training files

---
 .../QLoRA_tuning/qlora_adapter.py | 106 ++++++++++++++++--
 .../prompt_tuning/p_tuning.py     |  91 ++++++++++++++-
 2 files changed, 181 insertions(+), 16 deletions(-)

diff --git a/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py b/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py
index 740e2f5..b4e30cf 100644
--- a/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py
+++ b/graph_rag/graph_retrieval/training_scripts/QLoRA_tuning/qlora_adapter.py
@@ -1,3 +1,19 @@
+"""
+This script facilitates the fine-tuning of a language model using QLoRA (Quantized Low-Rank Adaptation)
+adapter tuning.
+
+The main functionalities include:
+- Preparing data from a specified repository with specific file extensions.
+- Tokenizing the data for model training.
+- Loading and configuring a pre-trained language model.
+- Applying PEFT (Parameter-Efficient Fine-Tuning) using QLoRA.
+- Defining training arguments and creating a Trainer instance.
+- Executing the training process with the Trainer.
+
+Requirements:
+- A YAML configuration file that specifies model, training, and data parameters.
+"""
+
 import argparse
 import yaml
 import os
@@ -5,20 +21,28 @@
 import torch
 from datasets import Dataset
 from transformers import Trainer, DataCollatorForLanguageModeling
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     Trainer,
     TrainingArguments,
-    logging,
-    set_seed,
     BitsAndBytesConfig,
 )


-def prepare_data(repo_path, extensions, output_file):
+def prepare_data(repo_path: str, extensions: list, output_file: str):
+    """
+    Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file.
+
+    Args:
+        repo_path: Path to the repository to collect files from.
+        extensions: List of file extensions to include in the data preparation.
+        output_file: Path to the output file where the concatenated content will be saved.
+
+    Returns:
+        A string containing the entire content written to the output file.
+    """

     files = []
     for ext in extensions:
@@ -38,7 +62,17 @@
     return f.read()


-def data_for_training(content, config):
+def data_for_training(content: str, config: dict):
+    """
+    Tokenizes the content and prepares it for language model training, including creating a data collator.
+
+    Args:
+        content: The concatenated text content to be tokenized.
+        config: Dictionary containing the model and training configuration.
+
+    Returns:
+        A tuple containing the tokenized dataset, the data collator, and the tokenizer for language model training.
+    """
     tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"])
     context_length = config["Model"]["context_length"]
     outputs = tokenizer(
@@ -52,7 +86,9 @@
     print(f"Input chunk lengths: {outputs['length']}")
     print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
     ds = Dataset.from_dict(outputs)
-    ds_removed = ds.remove_columns(["attention_mask", "length", "overflow_to_sample_mapping"])
+    ds_removed = ds.remove_columns(
+        ["attention_mask", "length", "overflow_to_sample_mapping"]
+    )
     tokenizer.pad_token = tokenizer.eos_token
     data_collator = DataCollatorForLanguageModeling(
         tokenizer, mlm=config["Training"]["masked_language_modelling"]
@@ -60,7 +96,17 @@
     return ds_removed, data_collator, tokenizer


-def load_base_model(config):
+def load_base_model(config: dict):
+    """
+    Loads the base language model with specified configurations, including quantization settings.
+
+    Args:
+        config: The configuration dictionary containing model and BNB (BitsAndBytes) parameters.
+
+    Returns:
+        PreTrainedModel: The loaded pre-trained language model ready for training.
+    """
+
     compute_dtype = getattr(torch, config["BNB_CONFIG"]["BNB_4BIT_COMPUTE_DTYPE"])

     bnb_config = BitsAndBytesConfig(
@@ -82,7 +128,18 @@
     return model


-def load_peft_model(model, config):
+def load_peft_model(model: object, config: dict):
+    """
+    Applies PEFT (Parameter-Efficient Fine-Tuning) using QLoRA to the given model.
+
+    Args:
+        model: The pre-trained language model to be fine-tuned.
+        config: The configuration dictionary containing LoRA (Low-Rank Adaptation) parameters.
+
+    Returns:
+        PreTrainedModel: The PEFT-configured model ready for training.
+    """
+
     model = prepare_model_for_kbit_training(model)
     peft_config = LoraConfig(
         lora_alpha=config["LORA"]["LORA_ALPHA"],
@@ -97,7 +154,17 @@
     return model


-def create_training_arguments(config):
+def create_training_arguments(config: dict):
+    """
+    Creates and returns the training arguments for the Trainer.
+
+    Args:
+        config: The configuration dictionary containing training arguments.
+
+    Returns:
+        TrainingArguments: The configured training arguments.
+    """
+
     training_args = TrainingArguments(
         output_dir=f"results/{config['TRAINING_ARGUMENTS']['OUTPUT_DIR']}",
         num_train_epochs=3,
@@ -123,7 +190,21 @@
     return training_args


-def create_trainer(tokenizer, train_data, data_collator, model):
+def create_trainer(
+    tokenizer: object, train_data: object, data_collator: object, model: object
+):
+    """
+    Creates a Trainer instance with the provided tokenizer, training data, data collator, and model.
+
+    Args:
+        tokenizer: The tokenizer to be used during training.
+        train_data: The tokenized training dataset.
+        data_collator: The data collator for language modeling.
+        model: The PEFT-configured model to be trained.
+
+    Returns:
+        Trainer: The Trainer instance for model training.
+    """
     training_args = create_training_arguments()
     trainer = Trainer(
         model=model,
@@ -137,6 +218,11 @@


 def main():
+    """
+    The main function that orchestrates the data preparation, model loading,
+    and training processes using the provided YAML configuration.
+    """
+
     parser = argparse.ArgumentParser(
         description="Training script for QLoRA adapter tuning"
     )
diff --git a/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py b/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py
index 149c973..f24d954 100644
--- a/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py
+++ b/graph_rag/graph_retrieval/training_scripts/prompt_tuning/p_tuning.py
@@ -1,3 +1,19 @@
+"""
+This script prepares data from a repository for P-tuning a language model using the PEFT library.
+It reads source files, processes them into tokenized chunks, and trains a language model using the specified configuration.
+
+Functions:
+- prepare_data: Collects files from a repository, concatenates their content, and saves it to an output file.
+- data_for_training: Tokenizes the concatenated content and prepares it for language model training.
+- get_peft_model: Initializes and configures a P-tuning model using the specified configuration.
+- create_training_arguments: Generates training arguments for the Trainer using the configuration settings.
+- create_trainer: Creates a Trainer object with the model, data, and training arguments.
+- main: Parses the YAML configuration file and runs the training process.
+
+Requirements:
+- A YAML configuration file that specifies model, training, and data parameters.
+"""
+
 import argparse
 import yaml
 import os
@@ -9,7 +25,18 @@
 from transformers import TrainingArguments


-def prepare_data(repo_path, extensions, output_file):
+def prepare_data(repo_path: str, extensions: list, output_file: str):
+    """
+    Collects files with specified extensions from a repository, concatenates their content, and writes it to an output file.
+
+    Args:
+        repo_path: Path to the repository to collect files from.
+        extensions: List of file extensions to include in the data preparation.
+        output_file: Path to the output file where the concatenated content will be saved.
+
+    Returns:
+        A string containing the entire content written to the output file.
+    """

     files = []
     for ext in extensions:
@@ -29,7 +56,18 @@
     return f.read()


-def data_for_training(content, config):
+def data_for_training(content: str, config: dict):
+    """
+    Tokenizes the content and prepares it for language model training, including creating a data collator.
+
+    Args:
+        content: The concatenated text content to be tokenized.
+        config: Dictionary containing the model and training configuration.
+
+    Returns:
+        A tuple containing the tokenized dataset and the data collator for language model training.
+    """
+
     tokenizer = AutoTokenizer.from_pretrained(config["Model"]["model"])
     context_length = config["Model"]["context_length"]
     outputs = tokenizer(
@@ -43,7 +81,9 @@
     print(f"Input chunk lengths: {outputs['length']}")
     print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
     ds = Dataset.from_dict(outputs)
-    ds_removed = ds.remove_columns(["attention_mask", "length", "overflow_to_sample_mapping"])
+    ds_removed = ds.remove_columns(
+        ["attention_mask", "length", "overflow_to_sample_mapping"]
+    )
     tokenizer.pad_token = tokenizer.eos_token
     data_collator = DataCollatorForLanguageModeling(
         tokenizer, mlm=config["Training"]["masked_language_modelling"]
@@ -51,7 +91,17 @@
     return ds_removed, data_collator


-def get_peft_model(config):
+def get_peft_model(config: dict):
+    """
+    Initializes and configures a P-tuning model using the specified foundational model and prompt tuning configuration.
+
+    Args:
+        config: Dictionary containing the model and training configuration.
+
+    Returns:
+        A PEFT model configured for P-tuning, ready for training.
+    """
+
     foundational_model = AutoModelForCausalLM.from_pretrained(
         config["Model"]["model"], trust_remote_code=True
     )
@@ -66,7 +116,17 @@
     return peft_model_prompt


-def create_training_arguments(config):
+def create_training_arguments(config: dict):
+    """
+    Creates and configures the training arguments for the Trainer object.
+
+    Args:
+        config: Dictionary containing the training configuration.
+
+    Returns:
+        A TrainingArguments object with the specified settings.
+    """
+
     training_args = TrainingArguments(
         output_dir=config["Training"]["output_dir"],
         save_strategy="steps",
@@ -79,7 +139,22 @@
     return training_args


-def create_trainer(config, train_data, data_collator, model):
+def create_trainer(
+    config: dict, train_data: object, data_collator: object, model: object
+):
+    """
+    Creates a Trainer object for training the model with the provided data and configuration.
+
+    Args:
+        config: Dictionary containing the training configuration.
+        train_data: The tokenized Hugging Face Dataset object to be used for training.
+        data_collator: The data collator for handling the tokenized data during training.
+        model: The P-tuned model to be trained.
+
+    Returns:
+        A Trainer object configured for training the model.
+    """
+
     training_args = create_training_arguments(config)
     trainer = Trainer(
         model=model,
@@ -91,6 +166,10 @@


 def main():
+    """
+    Main function to execute the training pipeline. It parses the YAML configuration file, prepares the data, initializes
+    the model, and starts the training process.
+    """
     parser = argparse.ArgumentParser(description="Training script for P-tuning model")
     parser.add_argument(
         "--config", type=str, required=True, help="Path to the YAML configuration file"