From 6cd065076622c999cc516bc8f09aebc06cd3cf65 Mon Sep 17 00:00:00 2001
From: Remi Delacourt
Date: Mon, 22 Jul 2024 04:00:35 +0000
Subject: [PATCH] rename demo_class to demo

---
 inference/python/peft_demo/demo.py       | 238 +++++++++++++++++++++++
 inference/python/peft_demo/demo_class.py | 231 ----------------------
 2 files changed, 238 insertions(+), 231 deletions(-)
 create mode 100644 inference/python/peft_demo/demo.py
 delete mode 100644 inference/python/peft_demo/demo_class.py

diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py
new file mode 100644
index 0000000000..43e6339184
--- /dev/null
+++ b/inference/python/peft_demo/demo.py
@@ -0,0 +1,238 @@
+import json, random, subprocess
+import os
+from types import SimpleNamespace
+from datasets import load_dataset
+import flexflow.serve as ff
+import matplotlib.pyplot as plt
+
+
+def create_datasets(finetune_dataset_size=2, inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'):
+    """Create the inference and finetuning datasets from the data at https://huggingface.co/datasets/databricks/databricks-dolly-15k.
+    Only the 'open_qa' and 'closed_qa' prompts without context are kept.
+    The datasets are saved to the files given as arguments.
+
+    Keyword arguments:
+    finetune_dataset_size -- the number of prompts to collect
+    inference_file_path -- the file in which to save the inference data
+    finetuning_file_path -- the file in which to save the finetuning data
+    """
+    dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
+    inference_data = []
+    finetuning_data = []
+    for row in dataset:
+        if len(finetuning_data) == finetune_dataset_size:
+            break
+        if ("open_qa" in row['category'] or "closed_qa" in row['category']) and len(row['context']) == 0:
+            inference_data.append(row['instruction'])
+            finetuning_data.append(row['instruction'] + " " + row['response'])
+    with open(inference_file_path, 'w') as file:
+        json.dump(inference_data[:1], file)
+    with open(finetuning_file_path, 'w') as file:
+        json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': '))
+
+
+configs_dict = {
+    "num_gpus": 4,
+    "memory_per_gpu": 14000,
+    "zero_copy_memory_per_node": 40000,
+    "num_cpus": 4,
+    "legion_utility_processors": 4,
+    "data_parallelism_degree": 1,
+    "tensor_parallelism_degree": 1,
+    "pipeline_parallelism_degree": 4,
+    "offload": False,
+    "offload_reserve_space_size": 8 * 1024,  # 8GB
+    "use_4bit_quantization": False,
+    "use_8bit_quantization": False,
+    "enable_peft": True,
+    "peft_activation_reserve_space_size": 1024,  # 1GB
+    "peft_weight_reserve_space_size": 1024,  # 1GB
+    "profiling": False,
+    "inference_debugging": False,
+    "fusion": False,
+    "max_requests_per_batch": 1,
+    "max_sequence_length": 256,
+    "max_tokens_per_batch": 128,
+    "max_training_steps": 10,
+    "seed": 42,
+}
+model_configs = {
+    "base_model": "meta-llama/Meta-Llama-3-8B",
+    "inference_peft_model_id": "goliaro/llama-3-8b-lora",
+    "finetuning_peft_model_id": "flechman/llama-3-8b-lora-dolly",
+    "cache_path": os.environ.get("FF_CACHE_PATH", ""),
+    "refresh_cache": False,
+    "full_precision": True,
+    # relative paths
+    "inference_dataset": "inference_dataset.json",
+    "finetuning_dataset": "finetuning_dataset.json",
+    "output_file": "peft_demo.txt",
+}
+generation_configs = {
+    "do_sample": False,
+    "temperature": 0.9,
+    "topp": 0.8,
+    "topk": 1,
+}
+finetuning_configs = {
+    "learning_rate": 1.0,
+    "momentum": 0.0,
+    "weight_decay": 0.0,
+    "nesterov": False,
+}
+# Merge dictionaries
+configs_dict.update(model_configs)
+configs_dict.update(generation_configs)
+configs_dict.update(finetuning_configs)
+
+
+random.seed(configs_dict["seed"])
+
+create_datasets(inference_file_path=configs_dict["inference_dataset"],
+                finetuning_file_path=configs_dict["finetuning_dataset"])
+
+configs = SimpleNamespace(**configs_dict)
+
+# Clear output file
+with open(configs.output_file, 'w') as file:
+    file.write('')
+
+# Download the base and PEFT inference models
+args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]
+hf_token = input("Please enter your HuggingFace personal access token: ")
+subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
+subprocess.run(['python', '../../utils/download_peft_model.py'] + args)
+
+
+# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
+ff.init(configs_dict)
+
+# Create the FlexFlow LLM
+ff_data_type = (
+    ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF
+)
+llm = ff.LLM(
+    configs.base_model,
+    data_type=ff_data_type,
+    cache_path=configs.cache_path,
+    refresh_cache=configs.refresh_cache,
+    output_file=configs.output_file,
+)
+# Add the inference and/or finetuning LoRA adapters
+lora_inference_config = None
+lora_finetuning_config = None
+if len(configs.inference_dataset) > 0:
+    lora_inference_config = ff.LoraLinearConfig(
+        llm.cache_path,
+        configs.inference_peft_model_id,
+        base_model_name_or_path=configs.base_model
+    )
+    llm.add_peft(lora_inference_config)
+if len(configs.finetuning_dataset) > 0:
+    lora_finetuning_config = ff.LoraLinearConfig(
+        llm.cache_path,
+        configs.finetuning_peft_model_id,
+        trainable=True,
+        init_lora_weights=True,
+        rank=16,
+        lora_alpha=16.0,
+        target_modules=["down_proj"],
+        base_model_name_or_path=configs.base_model,
+        optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,
+        optimizer_kwargs={
+            "learning_rate": configs.learning_rate,
+            "momentum": configs.momentum,
+            "weight_decay": configs.weight_decay,
+            "nesterov": configs.nesterov,
+        },
+    )
+    llm.add_peft(lora_finetuning_config)
+
+# Compile the LLM for inference and load the weights into memory
+generation_config = ff.GenerationConfig(
+    do_sample=configs.do_sample,
+    temperature=configs.temperature,
+    topp=configs.topp,
+    topk=configs.topk
+)
+# Reserve one extra request slot per batch for the finetuning request
+enable_peft_finetuning = len(configs.finetuning_dataset) > 0
+llm.compile(
+    generation_config,
+    enable_peft_finetuning=enable_peft_finetuning,
+    max_requests_per_batch=configs.max_requests_per_batch + int(enable_peft_finetuning),
+    max_seq_length=configs.max_sequence_length,
+    max_tokens_per_batch=configs.max_tokens_per_batch,
+)
+
+
+llm.start_server()
+
+
+# Run inference with the pre-finetuning adapter
+prompts = [s for s in json.load(open(configs.inference_dataset))]
+inference_requests = [
+    ff.Request(
+        ff.RequestType.REQ_INFERENCE,
+        prompt=prompt,
+        max_sequence_length=configs.max_sequence_length,
+        peft_model_id=llm.get_ff_peft_id(lora_inference_config),
+    )
+    for prompt in prompts
+]
+inf_req_res_1 = llm.generate(inference_requests)
+
+
+# Finetune on the Dolly prompts for max_training_steps steps
+finetuning_request = ff.Request(
+    ff.RequestType.REQ_FINETUNING,
+    max_sequence_length=configs.max_sequence_length,
+    peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
+    dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset),
+    max_training_steps=configs.max_training_steps,
+)
+ft_res = llm.generate([finetuning_request])
+
+
+hf_token = input("Please enter your HuggingFace personal access token: ")
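+# Log in with the token entered above, then push the finetuned LoRA adapter
+# to the Hugging Face Hub so it can be pulled back down for inference.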
+subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
+subprocess.run(['python', '../../utils/upload_peft_model.py'] + [configs.finetuning_peft_model_id])
+
+
+# Register the (now finetuned) adapter for inference
+lora_inference_config = ff.LoraLinearConfig(
+    llm.cache_path,
+    configs.finetuning_peft_model_id,
+    base_model_name_or_path=configs.base_model
+)
+llm.add_peft(lora_inference_config)
+
+args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model]
+#hf_token = input("Please enter your HuggingFace personal access token: ")
+subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
+subprocess.run(['python', '../../utils/download_peft_model.py'] + args)
+
+
+# Run inference again, this time with the finetuned adapter
+prompts = [s for s in json.load(open(configs.inference_dataset))]
+inference_requests = [
+    ff.Request(
+        ff.RequestType.REQ_INFERENCE,
+        prompt=prompt,
+        max_sequence_length=configs.max_sequence_length,
+        peft_model_id=llm.get_ff_peft_id(lora_inference_config),
+    )
+    for prompt in prompts
+]
+inf_req_res_2 = llm.generate(inference_requests)
+
+
+llm.stop_server()
+
+
+print("==Inference result before finetuning: ", inf_req_res_1[0].output_text)
+print("==Inference result after finetuning: ", inf_req_res_2[0].output_text)
+
+
+# Plot the finetuning loss curve, one value per training step
+steps = list(range(configs_dict["max_training_steps"]))
+loss_values = ft_res[0].finetuning_losses
+
+plt.figure(figsize=(10, 6))
+plt.plot(steps, loss_values, marker='o', linestyle='-', color='b')
+plt.xlabel('Training step')
+plt.ylabel('Loss')
+# Save the figure; without this the plot is discarded when the script exits
+# (the output filename here is arbitrary)
+plt.savefig('finetuning_loss.png')
\ No newline at end of file
diff --git a/inference/python/peft_demo/demo_class.py b/inference/python/peft_demo/demo_class.py
deleted file mode 100644
index 90ff9c63c2..0000000000
--- a/inference/python/peft_demo/demo_class.py
+++ /dev/null
@@ -1,231 +0,0 @@
-import flexflow.serve as ff
-import json, os, warnings
-from types import SimpleNamespace
-from huggingface_hub import HfFolder
-import random
-import subprocess
-
-
-class FlexFlowDemo(object):
-
-    def __init__(self, configs_dict):
-        self.configs_dict = configs_dict
-        self.configs = SimpleNamespace(**configs_dict)
-        self.llm = None
-        self.server_started = False
-        self.server_stopped = False
-
-        # Clear output file
-        with open(self.configs.output_file, 'w') as file:
-            file.write('')
-
-    def initialize_flexflow(self):
-        if self.llm is None:
-            # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
-            ff.init(self.configs_dict)
-
-            # Create the FlexFlow LLM
-            ff_data_type = (
-                ff.DataType.DT_FLOAT if self.configs.full_precision else ff.DataType.DT_HALF
-            )
-            self.llm = ff.LLM(
-                self.configs.base_model,
-                data_type=ff_data_type,
-                cache_path=self.configs.cache_path,
-                refresh_cache=self.configs.refresh_cache,
-                output_file=self.configs.output_file,
-            )
-            # Add inference and/or finetuning lora
-            self.lora_inference_config = None
-            self.lora_finetuning_config = None
-            if len(self.configs.inference_dataset) > 0:
-                self.lora_inference_config = ff.LoraLinearConfig(
-                    self.llm.cache_path,
-                    self.configs.inference_peft_model_id,
-                    base_model_name_or_path=self.configs.base_model
-                )
-                self.llm.add_peft(self.lora_inference_config)
-            if len(self.configs.finetuning_dataset) > 0:
-                self.lora_finetuning_config = ff.LoraLinearConfig(
-                    self.llm.cache_path,
-                    self.configs.finetuning_peft_model_id,
-                    trainable=True,
-                    init_lora_weights=True,
-                    rank=16,
-                    lora_alpha=16.0,
-                    target_modules = ["down_proj"],
-                    base_model_name_or_path=self.configs.base_model,
-                    optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,
-                    optimizer_kwargs={
-                        "learning_rate": self.configs.learning_rate,
-                        "momentum": self.configs.momentum,
-                        "weight_decay": self.configs.weight_decay,
-                        "nesterov": self.configs.nesterov,
-                    },
-                )
-                self.llm.add_peft(self.lora_finetuning_config)
-
-            # Compile the LLM for inference and load the weights into memory
-            generation_config = ff.GenerationConfig(
-                do_sample=self.configs.do_sample,
-                temperature=self.configs.temperature,
-                topp=self.configs.topp,
-                topk=self.configs.topk
-            )
-            enable_peft_finetuning = len(self.configs.finetuning_dataset) > 0
-            self.llm.compile(
-                generation_config,
-                enable_peft_finetuning=enable_peft_finetuning,
-                max_requests_per_batch=self.configs.max_requests_per_batch+int(enable_peft_finetuning),
-                max_seq_length=self.configs.max_sequence_length,
-                max_tokens_per_batch=self.configs.max_tokens_per_batch,
-            )
-        else:
-            warnings.warn("FlexFlow has already been initialized. The behavior of the program from now on is undefined.")
-
-    def start_server(self):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if not self.server_started and not self.server_stopped:
-            self.llm.start_server()
-            self.server_started = True
-
-    def stop_server(self):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if self.server_started and not self.server_stopped:
-            self.llm.stop_server()
-            self.server_stopped = True
-
-    def generate_inference(self):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if not self.server_started:
-            raise Exception("Server has not started.")
-        if self.server_stopped:
-            raise Exception("Server stopped.")
-
-        if len(self.configs.inference_dataset) > 0:
-            prompts = [s for s in json.load(open(self.configs.inference_dataset))]
-            inference_requests = [
-                ff.Request(
-                    ff.RequestType.REQ_INFERENCE,
-                    prompt=prompt,
-                    max_sequence_length=self.configs.max_sequence_length,
-                    peft_model_id=self.llm.get_ff_peft_id(self.lora_inference_config),
-                )
-                for prompt in prompts
-            ]
-            return self.llm.generate(inference_requests)
-        return None
-
-    def generate_finetuning(self):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if not self.server_started:
-            raise Exception("Server has not started.")
-        if self.server_stopped:
-            raise Exception("Server stopped.")
-
-        if len(self.configs.finetuning_dataset) > 0:
-            finetuning_request = ff.Request(
-                ff.RequestType.REQ_FINETUNING,
-                max_sequence_length=self.configs.max_sequence_length,
-                peft_model_id=self.llm.get_ff_peft_id(self.lora_finetuning_config),
-                dataset_filepath=os.path.join(os.getcwd(), self.configs.finetuning_dataset),
-                max_training_steps=self.configs.max_training_steps,
-            )
-            return self.llm.generate([finetuning_request])
-        return None
-
-    def download_models(self, refresh_cache=True, ask_for_token=False):
-        args = [self.configs.inference_peft_model_id, '--base_model_name', self.configs.base_model]
-        if refresh_cache:
-            args.append('--refresh-cache')
-        if not HfFolder.get_token() or ask_for_token:
-            hf_token = input("Please enter your HuggingFace personal access token: ")
-            subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
-        subprocess.run(['python', '../../utils/download_peft_model.py'] + args)
-
-    def upload_finetuned_model(self, ask_for_token=False):
-        if self.llm is None:
-            raise Exception("FlexFlow has not been initialized.")
-        if self.lora_finetuning_config is not None:
-            args = [self.configs.finetuning_peft_model_id]
-            if not HfFolder.get_token() or ask_for_token:
-                hf_token = input("Please enter your HuggingFace personal access token: ")
-                subprocess.run(['huggingface-cli', 'login', '--token', hf_token])
-            subprocess.run(['python', '../../utils/upload_peft_model.py'] + args)
-
-def main():
-    configs_dict = {
-        "num_gpus": 4,
-        "memory_per_gpu": 14000,
-        "zero_copy_memory_per_node": 40000,
-        "num_cpus": 4,
-        "legion_utility_processors": 4,
-        "data_parallelism_degree": 1,
-        "tensor_parallelism_degree": 1,
-        "pipeline_parallelism_degree": 4,
-        "offload": False,
-        "offload_reserve_space_size": 8 * 1024, # 8GB
-        "use_4bit_quantization": False,
-        "use_8bit_quantization": False,
-        "enable_peft": True,
-        "peft_activation_reserve_space_size": 1024, # 1GB
-        "peft_weight_reserve_space_size": 1024, # 1GB
-        "profiling": False,
-        "inference_debugging": False,
-        "fusion": False,
-        "max_requests_per_batch": 1,
-        "max_sequence_length": 256,
-        "max_tokens_per_batch": 128,
"max_training_steps": 10, - "seed": 42, - } - model_configs = { - "base_model": "meta-llama/Meta-Llama-3-8B", - "inference_peft_model_id": "goliaro/llama-3-8b-lora", - "finetuning_peft_model_id": "goliaro/llama-3-8b-lora-dolly", - "cache_path": os.environ.get("FF_CACHE_PATH", ""), - "refresh_cache": False, - "full_precision": True, - # relative paths - "inference_dataset": "inference_dataset.json", - "finetuning_dataset": "finetuning_dataset.json", - "output_file": "peft_demo.txt", - } - generation_configs = { - "do_sample": False, - "temperature": 0.9, - "topp": 0.8, - "topk": 1, - } - finetuning_configs = { - "learning_rate": 1.0, - "momentum": 0.0, - "weight_decay": 0.0, - "nesterov": False, - } - # Merge dictionaries - configs_dict.update(model_configs) - configs_dict.update(generation_configs) - configs_dict.update(finetuning_configs) - - random.seed(configs_dict["seed"]) - - demo = FlexFlowDemo(configs_dict) - - demo.download_models(ask_for_token=True) - demo.initialize_flexflow() - demo.start_server() - inf_results_1 = demo.generate_inference() - ft_results = demo.generate_finetuning() - demo.upload_finetuned_model(ask_for_token=True) - #inf_results_2 = demo.generate_inference() - demo.stop_server() - #print(inf_results_1[0].output_text) - #print(ft_results[0].finetuning_losses) - -if __name__ == "__main__": - main() \ No newline at end of file