From a858088145b5e4b2bbe0bc68a1a0ec59ae9530f5 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko
Date: Sun, 28 May 2023 16:17:26 +0200
Subject: [PATCH 1/5] Huggingface causal lms evaluation script

---
 evaluate_hf.py | 179 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 evaluate_hf.py

diff --git a/evaluate_hf.py b/evaluate_hf.py
new file mode 100644
index 000000000..caf8325f2
--- /dev/null
+++ b/evaluate_hf.py
@@ -0,0 +1,179 @@
+import argparse
+import os
+import torch
+import numpy as np
+import pandas as pd
+from categories import subcategories, categories
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import time
+
+choices = ["A", "B", "C", "D"]
+
+
+def format_subject(subject):
+    l = subject.split("_")
+    s = ""
+    for entry in l:
+        s += " " + entry
+    return s
+
+
+def format_example(df, idx, include_answer=True):
+    prompt = df.iloc[idx, 0]
+    k = df.shape[1] - 2
+    for j in range(k):
+        prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
+    prompt += "\nAnswer:"
+    if include_answer:
+        prompt += " {}\n\n".format(df.iloc[idx, k + 1])
+    return prompt
+
+
+def gen_prompt(train_df, subject, k=-1):
+    prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(
+        format_subject(subject)
+    )
+    if k == -1:
+        k = train_df.shape[0]
+    for i in range(k):
+        prompt += format_example(train_df, i)
+    return prompt
+
+
+@torch.no_grad()
+def eval(args, subject, model, tokenizer, dev_df, test_df):
+    cors = []
+    all_probs = []
+    answers = choices[: test_df.shape[1] - 2]
+
+    for i in range(test_df.shape[0]):
+        # get prompt and make sure it fits
+        k = args.ntrain
+        prompt_end = format_example(test_df, i, include_answer=False)
+        train_prompt = gen_prompt(dev_df, subject, k)
+        prompt = train_prompt + prompt_end
+
+        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+
+        while input_ids.shape[-1] > 2048:
+            k -= 1
+            train_prompt = gen_prompt(dev_df, subject, k)
+            prompt = train_prompt + prompt_end
+            input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+
+        label = test_df.iloc[i, test_df.shape[1] - 1]
+
+        logits = model(input_ids=input_ids).logits[0, -1]
+
+        probs = (
+            torch.nn.functional.softmax(
+                torch.tensor(
+                    [
+                        logits[tokenizer("A").input_ids[-1]],
+                        logits[tokenizer("B").input_ids[-1]],
+                        logits[tokenizer("C").input_ids[-1]],
+                        logits[tokenizer("D").input_ids[-1]],
+                    ]
+                ),
+                dim=0,
+            )
+            .detach()
+            .cpu()
+            .numpy()
+        )
+        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
+
+        cor = pred == label
+        cors.append(cor)
+        all_probs.append(probs)
+
+    acc = np.mean(cors)
+    cors = np.array(cors)
+
+    all_probs = np.array(all_probs)
+    print("Average accuracy {:.3f} - {}".format(acc, subject))
+
+    return cors, acc, all_probs
+
+
+def main(args):
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        torch_dtype=torch.float32,
+        load_in_8bit=False,
+        low_cpu_mem_usage=True,
+        device_map="auto",
+    )
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    model.eval()
+    subjects = sorted(
+        [
+            f.split("_test.csv")[0]
+            for f in os.listdir(os.path.join(args.data_dir, "test"))
+            if "_test.csv" in f
+        ]
+    )
+
+    if not os.path.exists(args.save_dir):
+        os.makedirs(args.save_dir)
+    if not os.path.exists(os.path.join(args.save_dir, "results_{}".format(args.model))):
+        os.makedirs(os.path.join(args.save_dir, "results_{}".format(args.model)))
+
+    all_cors = []
+    subcat_cors = {
+        subcat: [] for subcat_lists in subcategories.values() for subcat in subcat_lists
+    }
+    cat_cors = {cat: [] for cat in categories}
+
+    for subject in subjects:
+        dev_df = pd.read_csv(
+            os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
+        )[: args.ntrain]
+        test_df = pd.read_csv(
+            os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
+        )
+
+        cors, acc, probs = eval(args, subject, model, tokenizer, dev_df, test_df)
+        subcats = subcategories[subject]
+        for subcat in subcats:
+            subcat_cors[subcat].append(cors)
+            for key in categories.keys():
+                if subcat in categories[key]:
+                    cat_cors[key].append(cors)
+        all_cors.append(cors)
+
+        test_df["{}_correct".format(args.model)] = cors
+        for j in range(probs.shape[1]):
+            choice = choices[j]
+            test_df["{}_choice{}_probs".format(args.model, choice)] = probs[:, j]
+        test_df.to_csv(
+            os.path.join(
+                args.save_dir, "results_{}".format(args.model), "{}.csv".format(subject)
+            ),
+            index=None,
+        )
+
+    for subcat in subcat_cors:
+        subcat_acc = np.mean(np.concatenate(subcat_cors[subcat]))
+        print("Average accuracy {:.3f} - {}".format(subcat_acc, subcat))
+
+    for cat in cat_cors:
+        cat_acc = np.mean(np.concatenate(cat_cors[cat]))
+        print("Average accuracy {:.3f} - {}".format(cat_acc, cat))
+    weighted_acc = np.mean(np.concatenate(all_cors))
+    print("Average accuracy: {:.3f}".format(weighted_acc))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ntrain", "-k", type=int, default=5)
+    parser.add_argument("--data_dir", "-d", type=str, default="data")
+    parser.add_argument("--save_dir", "-s", type=str, default="results")
+    parser.add_argument(
+        "--model",
+        "-m",
+        type=str,
+        default="google/flan-t5-small",
+    )
+    args = parser.parse_args()
+    main(args)
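The heart of eval() in the patch above is rank classification: the model is run once per question on the few-shot prompt, and only the next-token logits of the four answer letters are compared, so no free-form generation is needed. The snippet below is a minimal standalone sketch of that idea; the gpt2 checkpoint and the toy prompt are illustrative assumptions, not part of the patch series.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Illustrative model and prompt only; evaluate_hf.py loads the model named
    # on the command line and builds 5-shot MMLU prompts instead.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

    prompt = "Question: 2 + 2 = ?\nA. 3\nB. 4\nC. 5\nD. 6\nAnswer:"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    with torch.no_grad():
        # Logits at the position that follows "Answer:".
        last_logits = model(input_ids=input_ids).logits[0, -1]

    # Score only the four answer letters, as eval() does: take the last token
    # id of each letter and softmax over those four logits.
    choice_ids = [tokenizer(c).input_ids[-1] for c in ["A", "B", "C", "D"]]
    probs = torch.softmax(last_logits[choice_ids].float(), dim=0)
    print(["A", "B", "C", "D"][int(probs.argmax())], probs.tolist())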
From f7b914020de0ca0e99bf6dfc4c42c07e474a11a0 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko
Date: Sun, 28 May 2023 18:45:00 +0200
Subject: [PATCH 2/5] save resulting accuracies as json

---
 evaluate_hf.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/evaluate_hf.py b/evaluate_hf.py
index caf8325f2..6cd2f2461 100644
--- a/evaluate_hf.py
+++ b/evaluate_hf.py
@@ -153,27 +153,30 @@ def main(args):
             index=None,
         )
 
+    results = {"subcategories": {}, "categories": {}}
     for subcat in subcat_cors:
         subcat_acc = np.mean(np.concatenate(subcat_cors[subcat]))
+        results["subcategories"][subcat] = subcat_acc
         print("Average accuracy {:.3f} - {}".format(subcat_acc, subcat))
 
     for cat in cat_cors:
         cat_acc = np.mean(np.concatenate(cat_cors[cat]))
+        results["categories"][cat] = cat_acc
         print("Average accuracy {:.3f} - {}".format(cat_acc, cat))
     weighted_acc = np.mean(np.concatenate(all_cors))
+    results["weighted_accuracy"] = weighted_acc
     print("Average accuracy: {:.3f}".format(weighted_acc))
 
+    results_file = os.path.join(args.save_dir, "accuracies_{}.json".format(args.model.replace("/", "_")))
+    with open(results_file, "w") as f:
+        json.dump(results, f)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--ntrain", "-k", type=int, default=5)
     parser.add_argument("--data_dir", "-d", type=str, default="data")
     parser.add_argument("--save_dir", "-s", type=str, default="results")
-    parser.add_argument(
-        "--model",
-        "-m",
-        type=str,
-        default="google/flan-t5-small",
-    )
+    parser.add_argument("--model", "-m", type=str)
     args = parser.parse_args()
     main(args)

From 82d60d2517520eefc6cebb82c0221578883d244c Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko
Date: Sun, 28 May 2023 18:48:01 +0200
Subject: [PATCH 3/5] use fp16 as default

---
 evaluate_hf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluate_hf.py b/evaluate_hf.py
index 6cd2f2461..659fd68cb 100644
--- a/evaluate_hf.py
+++ b/evaluate_hf.py
@@ -99,7 +99,7 @@ def eval(args, subject, model, tokenizer, dev_df, test_df):
 def main(args):
     model = AutoModelForCausalLM.from_pretrained(
         args.model,
-        torch_dtype=torch.float32,
+        torch_dtype=torch.float16,
         load_in_8bit=False,
         low_cpu_mem_usage=True,
         device_map="auto",
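With the model now running in float16, the four answer logits picked out in eval() are half precision as well. PATCH 4 below upcasts them to float32 before the softmax; a plausible reason is numerical headroom: float16 carries only about three decimal digits of resolution, so nearby logits can collapse to the same value, and depending on the PyTorch version and device a softmax over a half-precision CPU tensor may not be supported at all. A small illustration of the resolution limit (the example values are arbitrary):

    import torch

    # float16 spacing near 12.3 is about 0.008, so these two logits become
    # indistinguishable after the cast, while float32 keeps them apart.
    logits = torch.tensor([12.345, 12.346])
    print(logits.half())   # both values print as 12.3438
    print(logits)          # 12.3450 and 12.3460

    # Upcast before softmax, the same pattern PATCH 4 applies to the four
    # answer logits collected from the model output.
    print(torch.softmax(logits.half().float(), dim=0))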
From d108312c6f0c0724558ee1e1bd38467b772adb29 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko
Date: Sun, 28 May 2023 19:04:49 +0200
Subject: [PATCH 4/5] upcast before softmax

---
 evaluate_hf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluate_hf.py b/evaluate_hf.py
index 659fd68cb..88ae6ca51 100644
--- a/evaluate_hf.py
+++ b/evaluate_hf.py
@@ -74,7 +74,7 @@ def eval(args, subject, model, tokenizer, dev_df, test_df):
                         logits[tokenizer("C").input_ids[-1]],
                         logits[tokenizer("D").input_ids[-1]],
                     ]
-                ),
+                ).float(),
                 dim=0,
             )
             .detach()

From 461a38863ce51d29f657a92c6b90be501be53fc5 Mon Sep 17 00:00:00 2001
From: Oleh Shliazhko
Date: Sun, 28 May 2023 19:56:03 +0200
Subject: [PATCH 5/5] fix imports

---
 evaluate_hf.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/evaluate_hf.py b/evaluate_hf.py
index 88ae6ca51..139954a61 100644
--- a/evaluate_hf.py
+++ b/evaluate_hf.py
@@ -1,11 +1,14 @@
 import argparse
+import json
 import os
-import torch
+import time
+
 import numpy as np
 import pandas as pd
-from categories import subcategories, categories
+import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import time
+
+from categories import categories, subcategories
 
 choices = ["A", "B", "C", "D"]
 
@@ -59,7 +62,9 @@ def eval(args, subject, model, tokenizer, dev_df, test_df):
             k -= 1
             train_prompt = gen_prompt(dev_df, subject, k)
             prompt = train_prompt + prompt_end
-            input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+            input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(
+                model.device
+            )
 
         label = test_df.iloc[i, test_df.shape[1] - 1]
 
@@ -167,7 +172,9 @@ def main(args):
     results["weighted_accuracy"] = weighted_acc
     print("Average accuracy: {:.3f}".format(weighted_acc))
 
-    results_file = os.path.join(args.save_dir, "accuracies_{}.json".format(args.model.replace("/", "_")))
+    results_file = os.path.join(
+        args.save_dir, "accuracies_{}.json".format(args.model.replace("/", "_"))
+    )
     with open(results_file, "w") as f:
         json.dump(results, f)
 
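With the full series applied, a run might look like: python evaluate_hf.py -m facebook/opt-125m -d data -s results -k 5, where the model name is only an example and data/ is assumed to hold the MMLU-style dev/ and test/ CSV folders the script reads. The sketch below shows how the two kinds of output files could then be inspected; the subject name abstract_algebra is likewise an assumed example.

    import json

    import pandas as pd

    model = "facebook/opt-125m"  # example only; whatever was passed via --model

    # Aggregate accuracies written by main(); "/" in the model name is replaced
    # by "_" in the JSON file name.
    with open("results/accuracies_{}.json".format(model.replace("/", "_"))) as f:
        accs = json.load(f)
    print(accs["weighted_accuracy"], accs["categories"])

    # Per-question results for one subject, with the added correctness and
    # per-choice probability columns.
    df = pd.read_csv("results/results_{}/abstract_algebra.csv".format(model))
    print(df["{}_correct".format(model)].mean())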