Optimizers python interface #1441

Merged Jul 15, 2024 · 10 commits
82 changes: 81 additions & 1 deletion include/flexflow/flexflow_c.h
@@ -55,6 +55,9 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t);
FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t);
FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t);
FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t);
// FF_NEW_OPAQUE_TYPE(flexflow_lora_optimizer_config_t);
// FF_NEW_OPAQUE_TYPE(flexflow_lora_sgd_optimizer_config_t);
// FF_NEW_OPAQUE_TYPE(flexflow_lora_adam_optimizer_config_t);
FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t);
FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t);

@@ -1050,16 +1053,93 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);
void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
flexflow_model_t model_handle_);

// // -----------------------------------------------------------------------
// // LoraSGDOptimizerConfig
// // -----------------------------------------------------------------------

// flexflow_lora_sgd_optimizer_config_t
// flexflow_lora_sgd_optimizer_config_create(
// double lr, double momentum, bool nesterov, bool weight_decay);

// void flexflow_lora_sgd_optimizer_config_destroy(
// flexflow_lora_sgd_optimizer_config_t handle_);

// // -----------------------------------------------------------------------
// // LoraAdamOptimizerConfig
// // -----------------------------------------------------------------------

// flexflow_lora_adam_optimizer_config_t
// flexflow_lora_adam_optimizer_config_create(double alpha,
// double beta1,
// double beta2,
// double weight_decay,
// double epsilon);

// void flexflow_lora_adam_optimizer_config_destroy(
// flexflow_lora_adam_optimizer_config_t handle_);

// -----------------------------------------------------------------------
// LoraLinearConfig
// -----------------------------------------------------------------------

flexflow_lora_linear_config_t
flexflow_lora_linear_config_create(char const *cache_folder_,
char const *peft_model_id_,
bool trainable,
bool init_lora_weights,
int rank,
float lora_alpha,
float lora_dropout,
int num_target_modules,
char const **target_modules_,
enum OptimizerType optimizer_type,
float sgd_learning_rate,
float sgd_momentum,
bool sgd_nesterov,
float sgd_weight_decay,
float adam_alpha,
float adam_beta1,
float adam_beta2,
float adam_weight_decay,
float adam_epsilon);

void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_);

char const *flexflow_lora_linear_config_get_cache_folder(
flexflow_lora_linear_config_t handle_);

char const *flexflow_lora_linear_config_get_peft_model_id(
flexflow_lora_linear_config_t handle_);

int flexflow_lora_linear_config_get_rank(flexflow_lora_linear_config_t handle_);

float flexflow_lora_linear_config_get_lora_alpha(
flexflow_lora_linear_config_t handle_);

float flexflow_lora_linear_config_get_lora_dropout(
flexflow_lora_linear_config_t handle_);

bool flexflow_lora_linear_config_get_trainable(
flexflow_lora_linear_config_t handle_);

bool flexflow_lora_linear_config_get_init_lora_weights(
flexflow_lora_linear_config_t handle_);

char const **flexflow_lora_linear_config_get_target_modules(
flexflow_lora_linear_config_t handle_, int *num_target_modules);

void flexflow_lora_linear_config_set_lora_alpha(
flexflow_lora_linear_config_t handle_, float value);

void flexflow_lora_linear_config_set_lora_dropout(
flexflow_lora_linear_config_t handle_, float value);

void flexflow_lora_linear_config_set_trainable(
flexflow_lora_linear_config_t handle_, bool value);

void flexflow_lora_linear_config_set_init_lora_weights(
flexflow_lora_linear_config_t handle_, bool value);
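
For orientation, the flattened sgd_*/adam_* arguments above are what the Python binding marshals into a single C call. A minimal sketch of the corresponding Python-side construction, assuming the keyword names used by ff_peft.py later in this PR (the cache path and model ID are example values):

import flexflow.serve as ff  # import path assumed from the PR's examples

lora_config = ff.LoraLinearConfig(
    "~/.cache/flexflow",          # cache_folder_
    "goliaro/llama-160m-lora",    # peft_model_id_
    trainable=True,
    init_lora_weights=False,
    rank=8,
    lora_alpha=8.0,
    lora_dropout=0.0,
    target_modules=["down_proj"],
    optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,
    # The SGD kwargs map onto the sgd_* parameters of the C entry point;
    # the adam_* parameters would be filled instead when Adam is selected.
    optimizer_kwargs={
        "learning_rate": 0.001,
        "momentum": 0.0,
        "weight_decay": 0.0,
        "nesterov": False,
    },
)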

// -----------------------------------------------------------------------
// PEFTModelID
// -----------------------------------------------------------------------
2 changes: 1 addition & 1 deletion include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -22,7 +22,7 @@ struct LoraLinearWeight {
struct LoraLinearModelState {
LoraLinearWeight weights;
LoraOptimizerConfig const *optimizer_config;
float lora_alpha;
};

class LoraLinearMeta : public OpMeta {
61 changes: 47 additions & 14 deletions include/flexflow/ops/lora_linear_params.h
@@ -7,6 +7,10 @@
#include "flexflow/op_meta.h"
#include "flexflow/operator.h"
#include "flexflow/parallel_tensor.h"
#include <filesystem>
#include <fstream>
#include <iostream>
#include <nlohmann/json.hpp>

namespace FlexFlow {

@@ -26,6 +30,9 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig {
friend std::ostream &operator<<(std::ostream &os,
LoraSGDOptimizerConfig const &llc);

NLOHMANN_DEFINE_TYPE_INTRUSIVE(
LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay)

public:
double lr = 0.001f;
double momentum = 0.0f;
@@ -44,6 +51,9 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig {
friend std::ostream &operator<<(std::ostream &os,
LoraAdamOptimizerConfig const &llc);

NLOHMANN_DEFINE_TYPE_INTRUSIVE(
LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon)

public:
// Adam
double alpha = 0.001f;
@@ -53,36 +63,59 @@
double epsilon = 1e-8;
};
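
Since both optimizer configs are now declared with NLOHMANN_DEFINE_TYPE_INTRUSIVE, each serializes to a flat JSON object keyed by member name. A sketch of the expected shapes; the Adam beta1, beta2, and weight_decay defaults are hidden in this hunk, so the usual Adam values are assumed here:

import json

# Assumed serialized forms (member-name keys, per nlohmann's macro).
sgd_config = {"lr": 0.001, "momentum": 0.0, "nesterov": False,
              "weight_decay": 0.0}
adam_config = {"alpha": 0.001, "beta1": 0.9, "beta2": 0.999,
               "weight_decay": 0.0, "epsilon": 1e-8}
print(json.dumps(sgd_config))
print(json.dumps(adam_config))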

// Serialization helpers
template <typename T>
void serialize_to_json_file(T const &obj, fs::path const &filepath);

// Function to deserialize JSON from file and create object
template <typename T>
std::unique_ptr<T> deserialize_from_json_file(fs::path const &filepath);

class LoraLinearConfig {
public:
static const LoraLinearConfig EmptyConfig;
LoraLinearConfig(std::string const &cache_folder_,
std::string const &peft_model_id_,
bool trainable_ = false,
LoraOptimizerConfig *optimizer_config_ = nullptr,
bool init_lora_weights_ = false,
int rank_ = 8,
float lora_alpha_ = 8.0f,
float lora_dropout_ = 0.0f,
std::vector<std::string> const &target_modules_ = {});
// constructor used to support std::unordered_map
LoraLinearConfig();
friend bool operator==(LoraLinearConfig const &lhs,
LoraLinearConfig const &rhs);
friend std::ostream &operator<<(std::ostream &os,
LoraLinearConfig const &llc);

public:
NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig,
cache_folder,
peft_model_id,
rank,
lora_alpha,
lora_dropout,
target_modules,
trainable,
init_lora_weights)

std::string cache_folder;
// Huggingface model ID (for download and/or upload)
std::string peft_model_id;
// Lora parameters
int rank;
float lora_alpha;
float lora_dropout;
std::vector<std::string> target_modules;
// Training parameters
// whether the weights are trainable (fine-tuning scenario) or not
// (inference-only). If set to true, allocate space for the gradients
bool trainable = false;
LoraOptimizerConfig *optimizer_config;
// whether to initialize weights randomly (instead of attempting to load them
// from file)
bool init_lora_weights;
};
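
The intrusive macro above likewise fixes the on-disk layout of LoraLinearConfig: exactly the eight listed members, keyed by name, with the optimizer_config pointer deliberately left out of the JSON. A sketch of the resulting file contents, using example values drawn from ff_peft.py below:

import json

# Assumed serialized form of LoraLinearConfig; the cache_folder value is
# illustrative, not taken from the diff.
lora_config = {
    "cache_folder": "~/.cache/flexflow",
    "peft_model_id": "goliaro/llama-160m-lora",
    "rank": 8,
    "lora_alpha": 8.0,
    "lora_dropout": 0.0,
    "target_modules": ["down_proj"],
    "trainable": True,
    "init_lora_weights": True,
}
print(json.dumps(lora_config, indent=2))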

class LoraLinearParams {
3 changes: 2 additions & 1 deletion inference/peft/peft.cc
@@ -253,7 +253,8 @@ void FlexFlow::top_level_task(Task const *task,
: LoraLinearConfig(file_paths.cache_folder_path,
peft_model_name,
true /*trainable*/,
optim_config,
false /*init_lora_weights*/);

GenerationConfig generationConfig(do_sample, temperature, topp);
RequestManager *rm = RequestManager::get_request_manager();
71 changes: 52 additions & 19 deletions inference/python/ff_peft.py
@@ -59,21 +59,21 @@ def get_configs():
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"inference_debugging": True,
"fusion": True,
"fusion": False,
}
model_configs = {
# required parameters
"base_model": "JackFram/llama-160m",
"peft_model_ids": [
"goliaro/llama-160m-lora",
],
"inference_peft_model_id": "goliaro/llama-160m-lora",
"finetuning_peft_model_id": "goliaro/llama-160m-lora",
# optional parameters
"cache_path": "",
"refresh_cache": False,
"full_precision": False,
"full_precision": True,
"prompt": "",
"finetuning_dataset": os.path.join(
os.path.dirname(os.path.abspath(__file__)), "../prompt/peft.json"
os.path.dirname(os.path.abspath(__file__)),
"../prompt/peft_dataset.json",
),
"output_file": "",
}
@@ -100,19 +100,49 @@ def main():
refresh_cache=configs.refresh_cache,
output_file=configs.output_file,
)
# Add inference and/or finetuning lora
lora_inference_config = None
lora_finetuning_config = None
if len(configs.prompt) > 0:
lora_inference_config = ff.LoraLinearConfig(
llm.cache_path, configs.inference_peft_model_id
)
llm.add_peft(lora_inference_config)
if len(configs.finetuning_dataset) > 0:
# lora_finetuning_config = ff.LoraLinearConfig(
# llm.cache_path,
# configs.finetuning_peft_model_id,
# target_modules=["down_proj"],
# rank=16,
# lora_alpha=16,
# trainable=True,
# init_lora_weights=True,
# optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,
# )
lora_finetuning_config = ff.LoraLinearConfig(
llm.cache_path,
configs.inference_peft_model_id,
trainable=True,
optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,
optimizer_kwargs={
"learning_rate": 1.0,
"momentum": 0.0,
"weight_decay": 0.0,
"nesterov": False,
},
)
llm.add_peft(lora_finetuning_config)

# Compile the LLM for inference and load the weights into memory
generation_config = ff.GenerationConfig(
do_sample=False, temperature=0.9, topp=0.8, topk=1
)
llm.compile(
generation_config,
enable_peft_finetuning=(len(configs.finetuning_dataset) > 0),
max_requests_per_batch=1,
max_seq_length=256,
max_tokens_per_batch=128,
)

llm.start_server()
@@ -123,21 +153,24 @@
prompts = [s for s in json.load(open(configs.prompt))]
inference_requests = [
ff.Request(
ff.RequestType.REQ_INFERENCE,
prompt=prompt,
max_sequence_length=128,
peft_model_id=llm.get_ff_peft_id(lora_inference_config),
)
for prompt in prompts
]
requests += inference_requests
# Finetuning
if len(configs.finetuning_dataset) > 0:
finetuning_request = ff.Request(
ff.RequestType.REQ_FINETUNING,
max_sequence_length=128,
peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
dataset_filepath=configs.finetuning_dataset,
max_training_steps=2,
)
requests.append(finetuning_request)

llm.generate(requests)
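
The example above exercises only the SGD path. Assuming the adam_* parameters of the C API map to optimizer_kwargs keys by their short names, and that the enum exposes an Adam counterpart to OPTIMIZER_TYPE_SGD (assumptions; only the SGD keys appear in this diff), an Adam-based finetuning config would look roughly like this:

# Hypothetical Adam variant; key names and the enum member are assumed
# from the C API's adam_* parameters, not confirmed by this diff.
lora_finetuning_config = ff.LoraLinearConfig(
    llm.cache_path,
    configs.finetuning_peft_model_id,
    trainable=True,
    optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_ADAM,
    optimizer_kwargs={
        "alpha": 0.001,   # base learning rate
        "beta1": 0.9,
        "beta2": 0.999,
        "weight_decay": 0.0,
        "epsilon": 1e-8,
    },
)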

9 changes: 6 additions & 3 deletions inference/utils/download_peft_model.py
@@ -9,7 +9,10 @@ def parse_args():
"--base_model_name", type=str, help="Name of the model to download"
)
parser.add_argument(
"peft_model_ids", type=str, nargs="+", help="Name of the PEFT model(s) to download"
"peft_model_ids",
type=str,
nargs="+",
help="Name of the PEFT model(s) to download",
)
parser.add_argument(
"--cache-folder",
@@ -45,7 +48,6 @@ def main(args):
else:
data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF)


for data_type in data_types:
llm = ff.LLM(
args.base_model_name,
Expand All @@ -54,7 +56,8 @@ def main(args):
refresh_cache=args.refresh_cache,
)
for peft_model_id in args.peft_model_ids:
lora_config = ff.LoraLinearConfig(llm.cache_path, peft_model_id)
llm.add_peft(lora_config)
llm.download_hf_weights_if_needed()
llm.download_hf_config()
llm.download_hf_tokenizer_if_needed()
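
For reference, a typical invocation of the updated script, using the model IDs that appear elsewhere in this PR (argument spellings are taken from the parse_args() calls above; the precision behavior follows the data_types logic in main):

# Example invocation, shown as a comment since it is a shell command:
#
#   python inference/utils/download_peft_model.py \
#       --base_model_name JackFram/llama-160m \
#       goliaro/llama-160m-lora
#
# With no precision flag given, main() falls through to the else branch and
# downloads weights in both full (DT_FLOAT) and half (DT_HALF) precision,
# constructing one LoraLinearConfig per PEFT model ID in each pass.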