diff --git a/docs/commands.md b/docs/commands.md
index 85b8e1da6..5bffe3342 100755
--- a/docs/commands.md
+++ b/docs/commands.md
@@ -251,4 +251,26 @@
 pip install httpx==0.23.3
 pip install protobuf==3.20
 ```
 
+## Regression Test
+After each PR, run the regression test to make sure model performance has not regressed. The script evaluates a small subset of tasks on the current branch (and on any comparison branches passed via `--branches`) and prints a markdown summary of scores and runtime:
+
+```bash
+python3 tools/regression.py
+```
+
+```bash
+Already on 'dev/fix_output_path'
+
+|task|llava-onevision-qwen2-0.5b-ov|
+|--|--|
+|ocrbench (dev/fix_output_path)|0.70 ± 0.70|
+|mmmu_val (dev/fix_output_path)|50.00 ± 50.00|
+|ai2d (dev/fix_output_path)|50.00 ± 50.00|
+|muirbench (dev/fix_output_path)|12.50 ± 12.50|
+|videomme (dev/fix_output_path)|2500.00 ± 2500.00|
+
+|branch|runtime|%|
+|--|--|--|
+|dev/fix_output_path|87.7s|100%|
+```
\ No newline at end of file
diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index 67d0f5720..f03d82ae8 100755
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -25,8 +25,6 @@
 from lmms_eval.api.registry import ALL_TASKS
 from lmms_eval.evaluator import request_caching_arg_to_dict
 from lmms_eval.loggers import EvaluationTracker, WandbLogger
-
-# from lmms_eval.logging_utils import WandbLogger
 from lmms_eval.tasks import TaskManager
 from lmms_eval.utils import (
     handle_non_serializable,
@@ -230,7 +228,7 @@ def parse_eval_args() -> argparse.Namespace:
     parser.add_argument(
         "--timezone",
         default="Asia/Singapore",
-        help="Timezone for datetime string, e.g. Asia/Singapore, America/New_York, America/Los_Angeles",
+        help="Timezone for datetime string, e.g. Asia/Singapore, America/New_York, America/Los_Angeles. You can check the full list via `import pytz; print(pytz.common_timezones)`",
     )
     parser.add_argument(
         "--hf_hub_log_args",
@@ -349,7 +347,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
     for args, results in zip(args_list, results_list):
         # cli_evaluate will return none if the process is not the main process (rank 0)
         if results is not None:
-            print_results(args, results)
+            print(f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " f"batch_size: {args.batch_size}")
+            print(make_table(results))
+            if "groups" in results:
+                print(make_table(results, "groups"))
 
     if args.wandb_args:
         wandb_logger.run.finish()
@@ -462,22 +463,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     eval_logger.info(f"Selected Tasks: {task_names}")
 
     request_caching_args = request_caching_arg_to_dict(cache_requests=args.cache_requests)
-
-    # set datetime before evaluation
     datetime_str = utils.get_datetime_str(timezone=args.timezone)
-    if args.output_path:
-        if args.log_samples_suffix and len(args.log_samples_suffix) > 15:
-            eval_logger.warning("The suffix for log_samples is too long. It is recommended to keep it under 15 characters.")
-            args.log_samples_suffix = args.log_samples_suffix[:5] + "..." + args.log_samples_suffix[-5:]
-
-        hash_input = f"{args.model_args}".encode("utf-8")
-        hash_output = hashlib.sha256(hash_input).hexdigest()[:6]
-        path = Path(args.output_path)
-        path = path.expanduser().resolve().joinpath(f"{datetime_str}_{args.log_samples_suffix}_{args.model}_model_args_{hash_output}")
-        args.output_path = path
-
-    elif args.log_samples and not args.output_path:
-        assert args.output_path, "Specify --output_path"
 
     results = evaluator.simple_evaluate(
         model=args.model,
@@ -505,6 +491,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         torch_random_seed=args.seed[2],
         fewshot_random_seed=args.seed[3],
         cli_args=args,
+        datetime_str=datetime_str,
         **request_caching_args,
     )
 
@@ -517,21 +504,30 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         if args.show_config:
             print(dumped)
 
-        if args.output_path:
-            args.output_path.mkdir(parents=True, exist_ok=True)
-            result_file_path = path.joinpath("results.json")
-            if result_file_path.exists():
-                eval_logger.warning(f"Output file {result_file_path} already exists and will be overwritten.")
-
-            result_file_path.open("w").write(dumped)
-            if args.log_samples:
-                for task_name, config in results["configs"].items():
-                    filename = args.output_path.joinpath(f"{task_name}.json")
-                    # Structure the data with 'args' and 'logs' keys
-                    data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
-                    samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
-                    filename.open("w", encoding="utf-8").write(samples_dumped)
-                    eval_logger.info(f"Saved samples to {filename}")
+        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
+
+        # Add W&B logging
+        if args.wandb_args:
+            try:
+                wandb_logger.post_init(results)
+                wandb_logger.log_eval_result()
+                if args.log_samples:
+                    wandb_logger.log_eval_samples(samples)
+            except Exception as e:
+                eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
+
+        evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None, datetime_str=datetime_str)
+
+        if args.log_samples:
+            for task_name, config in results["configs"].items():
+                evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name])
+
+        if evaluation_tracker.push_results_to_hub or evaluation_tracker.push_samples_to_hub:
+            evaluation_tracker.recreate_metadata_card()
+
+        if args.wandb_args:
+            # Tear down wandb run once all the logging is done.
+            wandb_logger.run.finish()
 
         return results, samples
     return None, None
diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
index f6fbfbcf3..881497464 100755
--- a/lmms_eval/evaluator.py
+++ b/lmms_eval/evaluator.py
@@ -76,6 +76,7 @@ def simple_evaluate(
     numpy_random_seed: int = 1234,
     torch_random_seed: int = 1234,
     fewshot_random_seed: int = 1234,
+    datetime_str: str = get_datetime_str(),
     cli_args=None,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -292,7 +293,7 @@ def _adjust_config(task_dict):
             }
         )
         results["git_hash"] = get_git_commit_hash()
-        results["date"] = get_datetime_str()
+        results["date"] = datetime_str
         # add_env_info(results)  # additional environment info to results
         # add_tokenizer_info(results, lm)  # additional info about tokenizer
         return results
diff --git a/lmms_eval/loggers/evaluation_tracker.py b/lmms_eval/loggers/evaluation_tracker.py
index dc3bf1370..f13926b7c 100644
--- a/lmms_eval/loggers/evaluation_tracker.py
+++ b/lmms_eval/loggers/evaluation_tracker.py
@@ -15,6 +15,7 @@
 
 from lmms_eval.utils import (
     eval_logger,
+    get_datetime_str,
     get_file_datetime,
     get_file_task_name,
     get_results_filenames,
@@ -154,7 +155,7 @@ def __init__(
                 eval_logger.warning(f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'.")
 
             if hub_repo_name == "":
-                details_repo_name = details_repo_name if details_repo_name != "" else "lm-eval-results"
+                details_repo_name = details_repo_name if details_repo_name != "" else "lmms-eval-results"
                 results_repo_name = results_repo_name if results_repo_name != "" else details_repo_name
             else:
                 details_repo_name = hub_repo_name
@@ -170,6 +171,7 @@ def save_results_aggregated(
         self,
         results: dict,
         samples: dict,
+        datetime_str: str,
     ) -> None:
         """
         Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
@@ -177,6 +179,7 @@ def save_results_aggregated(
         Args:
             results (dict): The aggregated results to save.
            samples (dict): The samples results to save.
+            datetime_str (str): The datetime string to use for the results file.
         """
         self.general_config_tracker.log_end_time()
 
@@ -205,8 +208,8 @@ def save_results_aggregated(
             path = path.joinpath(self.general_config_tracker.model_name_sanitized)
             path.mkdir(parents=True, exist_ok=True)
 
-            self.date_id = datetime.now().isoformat().replace(":", "-")
-            file_results_aggregated = path.joinpath(f"results_{self.date_id}.json")
+            self.date_id = datetime_str.replace(":", "-")
+            file_results_aggregated = path.joinpath(f"{self.date_id}_results.json")
             file_results_aggregated.open("w", encoding="utf-8").write(dumped)
 
             if self.api and self.push_results_to_hub:
@@ -219,10 +222,10 @@ def save_results_aggregated(
                 )
                 self.api.upload_file(
                     repo_id=repo_id,
-                    path_or_fileobj=str(path.joinpath(f"results_{self.date_id}.json")),
+                    path_or_fileobj=str(path.joinpath(f"{self.date_id}_results.json")),
                     path_in_repo=os.path.join(
                         self.general_config_tracker.model_name,
-                        f"results_{self.date_id}.json",
+                        f"{self.date_id}_results.json",
                     ),
                     repo_type="dataset",
                     commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
@@ -255,18 +258,17 @@ def save_results_samples(
             path = path.joinpath(self.general_config_tracker.model_name_sanitized)
             path.mkdir(parents=True, exist_ok=True)
 
-            file_results_samples = path.joinpath(f"samples_{task_name}_{self.date_id}.jsonl")
+            file_results_samples = path.joinpath(f"{self.date_id}_samples_{task_name}.jsonl")
 
             for sample in samples:
                 # we first need to sanitize arguments and resps
                 # otherwise we won't be able to load the dataset
                 # using the datasets library
                 arguments = {}
-                for i, arg in enumerate(sample["arguments"]):
-                    arguments[f"gen_args_{i}"] = {}
-                    for j, tmp in enumerate(arg):
-                        arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp
+                for key, value in enumerate(sample["arguments"][1]):  # update metadata into args
+                    arguments[key] = value
 
+                sample["input"] = sample["arguments"][0]
                 sample["resps"] = sanitize_list(sample["resps"])
                 sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
                 sample["arguments"] = arguments
diff --git a/lmms_eval/utils.py b/lmms_eval/utils.py
index 17324c8b3..d9452a408 100755
--- a/lmms_eval/utils.py
+++ b/lmms_eval/utils.py
@@ -33,6 +33,7 @@
 import gc
 from itertools import islice
 
+import numpy as np
 import pytz
 import torch
 import transformers
@@ -238,11 +239,14 @@ def get_file_datetime(filename: str) -> str:
     return filename[filename.rfind("_") + 1 :].replace(".jsonl", "")
 
 
-def sanitize_model_name(model_name: str) -> str:
+def sanitize_model_name(model_name: str, full_path: bool = False) -> str:
     """
     Given the model name, returns a sanitized version of it.
     """
-    return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
+    if full_path:
+        return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
+    else:
+        return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name.split("/")[-1])
 
 
 def sanitize_task_name(task_name: str) -> str:
@@ -263,14 +267,14 @@ def get_results_filenames(filenames: List[str]) -> List[str]:
     """
     Extracts filenames that correspond to aggregated results.
     """
-    return [f for f in filenames if "/results_" in f and ".json" in f]
+    return [f for f in filenames if "results" in f and ".json" in f]
 
 
 def get_sample_results_filenames(filenames: List[str]) -> List[str]:
     """
     Extracts filenames that correspond to sample results.
     """
-    return [f for f in filenames if "/samples_" in f and ".json" in f]
+    return [f for f in filenames if "samples" in f and ".json" in f]
 
 
 def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
@@ -588,7 +592,7 @@ def get_datetime_str(timezone="Asia/Singapore"):
     tz = pytz.timezone(timezone)
     utc_now = datetime.datetime.now(datetime.timezone.utc)
     local_time = utc_now.astimezone(tz)
-    return local_time.strftime("%m%d_%H%M")
+    return local_time.strftime("%Y%m%d_%H%M%S")
 
 
 def ignore_constructor(loader, node):
diff --git a/tools/regression.py b/tools/regression.py
new file mode 100644
index 000000000..61ca745da
--- /dev/null
+++ b/tools/regression.py
@@ -0,0 +1,169 @@
+# code from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/scripts/regression.py
+import argparse
+import glob
+import json
+import os
+import subprocess
+import time
+from pathlib import Path
+
+from lmms_eval import utils
+from lmms_eval.api.registry import ALL_TASKS
+
+model_types = ["llava_onevision"]
+vision_models = [
+    "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+]
+
+single_image_tasks = ["ocrbench", "mmmu_val", "ai2d"]
+multi_image_tasks = ["muirbench"]
+video_tasks = ["videomme"]
+# choice_tasks = []
+# perplexity_tasks = []
+# generation_tasks = []
+task_names = single_image_tasks + multi_image_tasks + video_tasks
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--branches", default=[])
+    parser.add_argument("--models", default=vision_models)
+    parser.add_argument("--tasks", default=task_names)
+    parser.add_argument("--acc_norm", type=bool, default=False)
+    parser.add_argument("--perplexity", default=None)
+    # TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000
+    parser.add_argument("--num_fewshot", type=int, default=0)
+    parser.add_argument("--limit", type=float, default=8)
+    # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
+    parser.add_argument("--model", default="llava_onevision")
+    # Use whatever is faster here
+    parser.add_argument("--model_args", default="conv_template=qwen_1_5,model_name=llava_qwen")
+    parser.add_argument("--batch_size", default="1")
+    return parser.parse_args()
+
+
+def eval_models(args, branch=None):
+    if branch is not None:
+        if os.system(f"git checkout {branch}") != 0:
+            return {}, 0
+
+    branch = branch or initial_branch
+
+    start_time = time.time()
+
+    results = {}
+
+    for indx, model in enumerate(args.models):
+        model_type = model_types[indx]
+        model_args = f"pretrained={model},{args.model_args}"
+        tasks = args.tasks
+        batch_size = args.batch_size
+        output_path = f"logs/regression_test/{int(start_time)}-{branch.replace('/', '_')}"
+
+        original_dir = os.getcwd()
+        os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+        command = (
+            f"python3 -m accelerate.commands.launch --main_process_port=12580 --num_processes=8 lmms_eval --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
+            f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
+            f"--batch_size {batch_size} --output_path {output_path}"
+        )
+
+        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
+
+        ret = os.system(command)
+        os.chdir(original_dir)
+
+        json_file_path = find_json_file(output_path)
+
+        if json_file_path and ret == 0:
+            with open(json_file_path, encoding="utf-8") as f:
+                results[model] = json.load(f)
+        else:
+            results[model] = {"results": {}}
+
+    end_time = time.time()
+
+    return results, end_time - start_time
+
+
+def extract_value(args, results, model, task, err=False):
+    if model not in results:
+        return 0
+    results = results[model]["results"]
+    if task not in results:
+        return 0
+    results = results[task]
+    if task == "ai2d":
+        return results["exact_match,flexible-extract"]
+    elif task == "mmmu_val":
+        return results["mmmu_acc,none"]
+    elif task == "ocrbench":
+        return results["ocrbench_accuracy,none"]
+    elif task == "videomme":
+        return results["videomme_percetion_score,none"]
+    elif task == "muirbench":
+        return results["muirbench_score_overall,flexible-extract"]
+    return 0
+
+
+def format_value(args, results, model, task):
+    val = 100 * extract_value(args, results, model, task)
+    err = 100 * extract_value(args, results, model, task, err=True)
+    return f"{val:.2f}{f' ± {err:.2f}' if err != 0 else ''}"
+
+
+def format_diff(args, results1, results2, model, task):
+    val1 = 100 * extract_value(args, results1, model, task)
+    val2 = 100 * extract_value(args, results2, model, task)
+    diff = val2 - val1
+    return f"**+{diff:.2f}**" if diff > 0 else f"{diff:.2f}"
+
+
+def find_json_file(base_path):
+    pattern = os.path.join(base_path, "**", "*_results.json")
+    json_files = glob.glob(pattern, recursive=True)
+    return json_files[0] if json_files else None
+
+
+def main():
+    args = parse_args()
+
+    args.branches = args.branches.split(",") if isinstance(args.branches, str) else args.branches
+    args.models = args.models.split(",") if isinstance(args.models, str) else args.models
+    args.tasks = ALL_TASKS if args.tasks == "all_tasks" else utils.pattern_match(args.tasks.split(","), ALL_TASKS) if isinstance(args.tasks, str) else args.tasks
+
+    global initial_branch
+    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
+
+    # TODO: implement proper timing for each task
+    # TODO: reduce IO by sharing tasks between models?
+
+    results, runtime = eval_models(args)
+    print(results, runtime)
+
+    runs = []
+    for branch in args.branches:
+        runs.append((branch, *eval_models(args, branch)))
+
+    os.system(f"git checkout {initial_branch}")
+
+    print("")
+    print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
+    print(f"|--|{'--|' * len(args.models)}")
+    for task in args.tasks:
+        print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
+        for branch, branch_results, branch_runtime in runs:
+            print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
+            print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
+
+    print("")
+    print("|branch|runtime|%|")
+    print("|--|--|--|")
+    print(f"|{initial_branch}|{runtime:.1f}s|100%|")
+    for branch, _, branch_runtime in runs:
+        print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|")
+
+
+if __name__ == "__main__":
+    main()
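For reference, a minimal usage sketch of the regression script introduced in this patch. It only uses flags that appear in `parse_args()` of `tools/regression.py` above; the branch name `dev/my_feature` is a placeholder, and the evaluation itself runs the 8-process `accelerate` launch hard-coded inside `eval_models`.

```bash
# Run the regression suite on the current branch only (the default behaviour).
python3 tools/regression.py

# Also check out and evaluate one or more comparison branches
# (--branches takes a comma-separated list; "dev/my_feature" is a placeholder).
# The script prints per-task scores, diffs against the current branch, and a
# runtime table, then checks the original branch back out.
python3 tools/regression.py --branches dev/my_feature --limit 8
```

Because `eval_models` runs `git checkout` on the working tree, commit or stash local changes before comparing branches.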