From 6cd065076622c999cc516bc8f09aebc06cd3cf65 Mon Sep 17 00:00:00 2001
From: Remi Delacourt
Date: Mon, 22 Jul 2024 04:00:35 +0000
Subject: [PATCH] rename demo_class to demo

---
 inference/python/peft_demo/demo.py       | 238 +++++++++++++++++++++++
 inference/python/peft_demo/demo_class.py | 231 ----------------------
 2 files changed, 238 insertions(+), 231 deletions(-)
 create mode 100644 inference/python/peft_demo/demo.py
 delete mode 100644 inference/python/peft_demo/demo_class.py

diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py
new file mode 100644
index 0000000000..43e6339184
--- /dev/null
+++ b/inference/python/peft_demo/demo.py
@@ -0,0 +1,238 @@
+import json, random, subprocess
+import os
+from types import SimpleNamespace
+from datasets import load_dataset
+import flexflow.serve as ff
+import matplotlib.pyplot as plt
+
+
+def create_datasets(finetune_dataset_size=2, inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'):
+    """Create the inference and finetuning datasets from the data at https://huggingface.co/datasets/databricks/databricks-dolly-15k.
+    Only the 'open_qa' and 'closed_qa' prompts without context are kept.
+    The datasets are saved to the files given as arguments.
+
+    Keyword arguments:
+    finetune_dataset_size -- the number of prompts to collect
+    inference_file_path -- the file in which to save the inference data
+    finetuning_file_path -- the file in which to save the finetuning data
+    """
+    dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
+    inference_data = []
+    finetuning_data = []
+    for row in dataset:
+        if len(finetuning_data) == finetune_dataset_size:
+            break
+        if ("open_qa" in row['category'] or "closed_qa" in row['category']) and len(row['context']) == 0:
+            inference_data.append(row['instruction'])
+            finetuning_data.append(row['instruction'] + " " + row['response'])
+    with open(inference_file_path, 'w') as file:
+        json.dump(inference_data[:1], file)
+    with open(finetuning_file_path, 'w') as file:
+        json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': '))
+
+
+configs_dict = {
+    "num_gpus": 4,
+    "memory_per_gpu": 14000,
+    "zero_copy_memory_per_node": 40000,
+    "num_cpus": 4,
+    "legion_utility_processors": 4,
+    "data_parallelism_degree": 1,
+    "tensor_parallelism_degree": 1,
+    "pipeline_parallelism_degree": 4,
+    "offload": False,
+    "offload_reserve_space_size": 8 * 1024,  # 8GB
+    "use_4bit_quantization": False,
+    "use_8bit_quantization": False,
+    "enable_peft": True,
+    "peft_activation_reserve_space_size": 1024,  # 1GB
+    "peft_weight_reserve_space_size": 1024,  # 1GB
+    "profiling": False,
+    "inference_debugging": False,
+    "fusion": False,
+    "max_requests_per_batch": 1,
+    "max_sequence_length": 256,
+    "max_tokens_per_batch": 128,
+    "max_training_steps": 10,
+    "seed": 42,
+}
+model_configs = {
+    "base_model": "meta-llama/Meta-Llama-3-8B",
+    "inference_peft_model_id": "goliaro/llama-3-8b-lora",
+    "finetuning_peft_model_id": "flechman/llama-3-8b-lora-dolly",
+    "cache_path": os.environ.get("FF_CACHE_PATH", ""),
+    "refresh_cache": False,
+    "full_precision": True,
+    # relative paths
+    "inference_dataset": "inference_dataset.json",
+    "finetuning_dataset": "finetuning_dataset.json",
+    "output_file": "peft_demo.txt",
+}
+generation_configs = {
+    "do_sample": False,
+    "temperature": 0.9,
+    "topp": 0.8,
+    "topk": 1,
+}
+finetuning_configs = {
+    "learning_rate": 1.0,
+    "momentum": 0.0,
+    "weight_decay": 0.0,
+    "nesterov": False,
+}
+# Merge dictionaries
+configs_dict.update(model_configs)
+configs_dict.update(generation_configs)
+configs_dict.update(finetuning_configs)
+
+
+random.seed(configs_dict["seed"])
+
+create_datasets(inference_file_path=configs_dict["inference_dataset"],
+                finetuning_file_path=configs_dict["finetuning_dataset"])
+
+configs = SimpleNamespace(**configs_dict)
+
+# Clear output file
+with open(configs.output_file, 'w') as file:
+    file.write('')
+
+# Download the base and PEFT inference models
+args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]
+hf_token = input("Please enter your HuggingFace personal access token: ")
+subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
+subprocess.run(['python', '../../utils/download_peft_model.py'] + args)
+
+
+# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
+ff.init(configs_dict)
+
+# Create the FlexFlow LLM
+ff_data_type = (
+    ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF
+)
+llm = ff.LLM(
+    configs.base_model,
+    data_type=ff_data_type,
+    cache_path=configs.cache_path,
+    refresh_cache=configs.refresh_cache,
+    output_file=configs.output_file,
+)
+# Add the inference and/or finetuning LoRA adapters
+lora_inference_config = None
+lora_finetuning_config = None
+if len(configs.inference_dataset) > 0:
+    lora_inference_config = ff.LoraLinearConfig(
+        llm.cache_path,
+        configs.inference_peft_model_id,
+        base_model_name_or_path=configs.base_model
+    )
+    llm.add_peft(lora_inference_config)
+if len(configs.finetuning_dataset) > 0:
+    lora_finetuning_config = ff.LoraLinearConfig(
+        llm.cache_path,
+        configs.finetuning_peft_model_id,
+        trainable=True,
+        init_lora_weights=True,
+        rank=16,
+        lora_alpha=16.0,
+        target_modules=["down_proj"],
+        base_model_name_or_path=configs.base_model,
+        optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,
+        optimizer_kwargs={
+            "learning_rate": configs.learning_rate,
+            "momentum": configs.momentum,
+            "weight_decay": configs.weight_decay,
+            "nesterov": configs.nesterov,
+        },
+    )
+    llm.add_peft(lora_finetuning_config)
+
+# Compile the LLM for inference and load the weights into memory
+generation_config = ff.GenerationConfig(
+    do_sample=configs.do_sample,
+    temperature=configs.temperature,
+    topp=configs.topp,
+    topk=configs.topk
+)
+# Reserve one extra request slot per batch for the finetuning request
+enable_peft_finetuning = len(configs.finetuning_dataset) > 0
+llm.compile(
+    generation_config,
+    enable_peft_finetuning=enable_peft_finetuning,
+    max_requests_per_batch=configs.max_requests_per_batch + int(enable_peft_finetuning),
+    max_seq_length=configs.max_sequence_length,
+    max_tokens_per_batch=configs.max_tokens_per_batch,
+)
+
+
+llm.start_server()
+
+
+# Run inference with the pre-finetuning adapter
+prompts = [s for s in json.load(open(configs.inference_dataset))]
+inference_requests = [
+    ff.Request(
+        ff.RequestType.REQ_INFERENCE,
+        prompt=prompt,
+        max_sequence_length=configs.max_sequence_length,
+        peft_model_id=llm.get_ff_peft_id(lora_inference_config),
+    )
+    for prompt in prompts
+]
+inf_req_res_1 = llm.generate(inference_requests)
+
+
+# Finetune on the Dolly prompts for max_training_steps steps
+finetuning_request = ff.Request(
+    ff.RequestType.REQ_FINETUNING,
+    max_sequence_length=configs.max_sequence_length,
+    peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
+    dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset),
+    max_training_steps=configs.max_training_steps,
+)
+ft_res = llm.generate([finetuning_request])
+
+
+hf_token = input("Please enter your HuggingFace personal access token: ")
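+# Log in with the token entered above, then push the finetuned LoRA adapter
+# to the Hugging Face Hub so it can be pulled back down for inference.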
+subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
+subprocess.run(['python', '../../utils/upload_peft_model.py'] + [configs.finetuning_peft_model_id])
+
+
+# Register the (now finetuned) adapter for inference
+lora_inference_config = ff.LoraLinearConfig(
+    llm.cache_path,
+    configs.finetuning_peft_model_id,
+    base_model_name_or_path=configs.base_model
+)
+llm.add_peft(lora_inference_config)
+
+args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model]
+#hf_token = input("Please enter your HuggingFace personal access token: ")
+subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
+subprocess.run(['python', '../../utils/download_peft_model.py'] + args)
+
+
+# Run inference again, this time with the finetuned adapter
+prompts = [s for s in json.load(open(configs.inference_dataset))]
+inference_requests = [
+    ff.Request(
+        ff.RequestType.REQ_INFERENCE,
+        prompt=prompt,
+        max_sequence_length=configs.max_sequence_length,
+        peft_model_id=llm.get_ff_peft_id(lora_inference_config),
+    )
+    for prompt in prompts
+]
+inf_req_res_2 = llm.generate(inference_requests)
+
+
+llm.stop_server()
+
+
+print("==Inference result before finetuning: ", inf_req_res_1[0].output_text)
+print("==Inference result after finetuning: ", inf_req_res_2[0].output_text)
+
+
+# Plot the finetuning loss curve, one value per training step
+steps = list(range(configs_dict["max_training_steps"]))
+loss_values = ft_res[0].finetuning_losses
+
+plt.figure(figsize=(10, 6))
+plt.plot(steps, loss_values, marker='o', linestyle='-', color='b')
+plt.xlabel('Training step')
+plt.ylabel('Loss')
+# Save the figure; without this the plot is discarded when the script exits
+# (the output filename here is arbitrary)
+plt.savefig('finetuning_loss.png')
\ No newline at end of file
diff --git a/inference/python/peft_demo/demo_class.py b/inference/python/peft_demo/demo_class.py
deleted file mode 100644
index 90ff9c63c2..0000000000
--- a/inference/python/peft_demo/demo_class.py
+++ /dev/null
@@ -1,231 +0,0 @@
-import flexflow.serve as ff
-import json, os, warnings
-from types import SimpleNamespace
-from huggingface_hub import HfFolder
-import random
-import subprocess
-
-
-class FlexFlowDemo(object):
-
-    def __init__(self, configs_dict):
-        self.configs_dict = configs_dict
-        self.configs = SimpleNamespace(**configs_dict)
-        self.llm = None
-        self.server_started = False
-        self.server_stopped = False
-
-        # Clear output file
-        with open(self.configs.output_file, 'w') as file:
-            file.write('')
-
-    def initialize_flexflow(self):
-        if self.llm is None:
-            # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
-            ff.init(self.configs_dict)
-
-            # Create the FlexFlow LLM
-            ff_data_type = (
-                ff.DataType.DT_FLOAT if self.configs.full_precision else ff.DataType.DT_HALF
-            )
-            self.llm = ff.LLM(
-                self.configs.base_model,
-                data_type=ff_data_type,
-                cache_path=self.configs.cache_path,
-                refresh_cache=self.configs.refresh_cache,
-                output_file=self.configs.output_file,
-            )
-            # Add inference and/or finetuning lora
-            self.lora_inference_config = None
-            self.lora_finetuning_config = None
-            if len(self.configs.inference_dataset) > 0:
-                self.lora_inference_config = ff.LoraLinearConfig(
-                    self.llm.cache_path,
-                    self.configs.inference_peft_model_id,
-                    base_model_name_or_path=self.configs.base_model
-                )
-                self.llm.add_peft(self.lora_inference_config)
-            if len(self.configs.finetuning_dataset) > 0:
-                self.lora_finetuning_config = ff.LoraLinearConfig(
-                    self.llm.cache_path,
-                    self.configs.finetuning_peft_model_id,
-                    trainable=True,
-                    init_lora_weights=True,
-                    rank=16,
-                    lora_alpha=16.0,
-                    target_modules = ["down_proj"],
-                    base_model_name_or_path=self.configs.base_model,
-                    optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,
-                    optimizer_kwargs={
-                        "learning_rate": self.configs.learning_rate,
-                        "momentum": self.configs.momentum,
-                        "weight_decay": self.configs.weight_decay,
-                        "nesterov": self.configs.nesterov,
-                    },
-                )
-                self.llm.add_peft(self.lora_finetuning_config)
-
-            # Compile the LLM for inference and load the weights into memory
-            generation_config = ff.GenerationConfig(
-                do_sample=self.configs.do_sample,
-                temperature=self.configs.temperature,
-                topp=self.configs.topp,
-                topk=self.configs.topk
-            )
-            enable_peft_finetuning = len(self.configs.finetuning_dataset) > 0
-            self.llm.compile(
-                generation_config,
-                enable_peft_finetuning=enable_peft_finetuning,
-                max_requests_per_batch=self.configs.max_requests_per_batch+int(enable_peft_finetuning),
-                max_seq_length=self.configs.max_sequence_length,
-                max_tokens_per_batch=self.configs.max_tokens_per_batch,
-            )
-        else:
-            warnings.warn("FlexFlow has already been initialized. The behavior of the program from now on is undefined.")
-
-    def start_server(self):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if not self.server_started and not self.server_stopped:
-            self.llm.start_server()
-            self.server_started = True
-
-    def stop_server(self):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if self.server_started and not self.server_stopped:
-            self.llm.stop_server()
-            self.server_stopped = True
-
-    def generate_inference(self):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if not self.server_started:
-            raise Exception("Server has not started.")
-        if self.server_stopped:
-            raise Exception("Server stopped.")
-
-        if len(self.configs.inference_dataset) > 0:
-            prompts = [s for s in json.load(open(self.configs.inference_dataset))]
-            inference_requests = [
-                ff.Request(
-                    ff.RequestType.REQ_INFERENCE,
-                    prompt=prompt,
-                    max_sequence_length=self.configs.max_sequence_length,
-                    peft_model_id=self.llm.get_ff_peft_id(self.lora_inference_config),
-                )
-                for prompt in prompts
-            ]
-            return self.llm.generate(inference_requests)
-        return None
-
-    def generate_finetuning(self):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if not self.server_started:
-            raise Exception("Server has not started.")
-        if self.server_stopped:
-            raise Exception("Server stopped.")
-
-        if len(self.configs.finetuning_dataset) > 0:
-            finetuning_request = ff.Request(
-                ff.RequestType.REQ_FINETUNING,
-                max_sequence_length=self.configs.max_sequence_length,
-                peft_model_id=self.llm.get_ff_peft_id(self.lora_finetuning_config),
-                dataset_filepath=os.path.join(os.getcwd(), self.configs.finetuning_dataset),
-                max_training_steps=self.configs.max_training_steps,
-            )
-            return self.llm.generate([finetuning_request])
-        return None
-
-    def download_models(self, refresh_cache=True, ask_for_token=False):
-        args = [self.configs.inference_peft_model_id, '--base_model_name', self.configs.base_model]
-        if refresh_cache:
-            args.append('--refresh-cache')
-        if not HfFolder.get_token() or ask_for_token:
-            hf_token = input("Please enter your HuggingFace personal access token: ")
-            subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
-        subprocess.run(['python', '../../utils/download_peft_model.py'] + args)
-
-    def upload_finetuned_model(self, ask_for_token=False):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if self.lora_finetuning_config is not None:
-            args = [self.configs.finetuning_peft_model_id]
-            if not HfFolder.get_token() or ask_for_token:
-                hf_token = input("Please enter your HuggingFace personal access token: ")
-                subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
-            subprocess.run(['python', '../../utils/upload_peft_model.py'] + args)
-
-def main():
-    configs_dict = {
-        "num_gpus": 4,
-        "memory_per_gpu": 14000,
-        "zero_copy_memory_per_node": 40000,
-        "num_cpus": 4,
-        "legion_utility_processors": 4,
-        "data_parallelism_degree": 1,
-        "tensor_parallelism_degree": 1,
-        "pipeline_parallelism_degree": 4,
-        "offload": False,
-        "offload_reserve_space_size": 8 * 1024, # 8GB
-        "use_4bit_quantization": False,
-        "use_8bit_quantization": False,
-        "enable_peft": True,
-        "peft_activation_reserve_space_size": 1024, # 1GB
-        "peft_weight_reserve_space_size": 1024, # 1GB
-        "profiling": False,
-        "inference_debugging": False,
-        "fusion": False,
-        "max_requests_per_batch": 1,
-        "max_sequence_length": 256,
-        "max_tokens_per_batch": 128,
"max_training_steps": 10, - "seed": 42, - } - model_configs = { - "base_model": "meta-llama/Meta-Llama-3-8B", - "inference_peft_model_id": "goliaro/llama-3-8b-lora", - "finetuning_peft_model_id": "goliaro/llama-3-8b-lora-dolly", - "cache_path": os.environ.get("FF_CACHE_PATH", ""), - "refresh_cache": False, - "full_precision": True, - # relative paths - "inference_dataset": "inference_dataset.json", - "finetuning_dataset": "finetuning_dataset.json", - "output_file": "peft_demo.txt", - } - generation_configs = { - "do_sample": False, - "temperature": 0.9, - "topp": 0.8, - "topk": 1, - } - finetuning_configs = { - "learning_rate": 1.0, - "momentum": 0.0, - "weight_decay": 0.0, - "nesterov": False, - } - # Merge dictionaries - configs_dict.update(model_configs) - configs_dict.update(generation_configs) - configs_dict.update(finetuning_configs) - - random.seed(configs_dict["seed"]) - - demo = FlexFlowDemo(configs_dict) - - demo.download_models(ask_for_token=True) - demo.initialize_flexflow() - demo.start_server() - inf_results_1 = demo.generate_inference() - ft_results = demo.generate_finetuning() - demo.upload_finetuned_model(ask_for_token=True) - #inf_results_2 = demo.generate_inference() - demo.stop_server() - #print(inf_results_1[0].output_text) - #print(ft_results[0].finetuning_losses) - -if __name__ == "__main__": - main() \ No newline at end of file